def index(request, phrase_list):
    print("INDEX with phrase list:", phrase_list)
    a = Lemmatizer()
    lemmatised_list = []
    for i, word in enumerate(phrase_list):
        if i:
            lemmatised_list.append(a.lemmatize(word) + " ")
    return Response(json.dumps(lemmatised_list))
def __init__(self):
    this_dir = os.path.dirname(os.path.abspath(__file__))
    self.lemmatizer = Lemmatizer()
    dir = os.path.join(this_dir, "tokenizers/slovene.pickle")
    self.sent_detector = nltk.data.load("file://" + dir)
    self.stopwords = open(
        os.path.join(this_dir, "tokenizers/stopwords.txt"), "rb").read().splitlines()
    self.stopwords = filter(lambda w: not w.startswith("#"), self.stopwords)
    # Convert to unicode
    self.stopwords = [word.decode("utf-8") for word in self.stopwords]
class Summarizer():
    def __init__(self):
        this_dir = os.path.dirname(os.path.abspath(__file__))
        self.lemmatizer = Lemmatizer()
        dir = os.path.join(this_dir, "tokenizers/slovene.pickle")
        self.sent_detector = nltk.data.load("file://" + dir)
        self.stopwords = open(
            os.path.join(this_dir, "tokenizers/stopwords.txt"), "rb").read().splitlines()
        self.stopwords = filter(lambda w: not w.startswith("#"), self.stopwords)
        # Convert to unicode
        self.stopwords = [word.decode("utf-8") for word in self.stopwords]

    def summarize(self, article_text, num_sentences=DEFAULT_SUMMARIZATION_NUMBER):
        # Get words from article
        words = word_tokenize(article_text)
        # Filter non-alphanumeric chars from words
        words = [filter(unicode.isalnum, word) for word in words]
        words = filter(lambda w: len(w) > 0, words)  # Remove empty words
        # Now lemmatize all words
        words = [
            self.lemmatizer.lemmatize(word).lower()
            for word in words if word.lower() not in self.stopwords
        ]
        word_frequencies = FreqDist(words)
        most_frequent = [word[0] for word in word_frequencies.items()[:100]]

        # Now get sentences
        sentences = self.sent_detector.tokenize(article_text)

        wordcountdict = defaultdict(int)
        for word in most_frequent:
            lem_word = self.lemmatizer.lemmatize(word).lower()
            for i in range(0, len(sentences)):
                if lem_word in sentences[i]:
                    wordcountdict[i] += 1
        sorted_wordcounts = sorted(
            wordcountdict.iteritems(), key=operator.itemgetter(1), reverse=True)[:num_sentences]
        return [sentences[num] for num, count in sorted_wordcounts]
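# Usage sketch for the Summarizer class above, assuming its
# tokenizers/slovene.pickle and tokenizers/stopwords.txt resources exist next
# to the module; the article text is only a placeholder.
summarizer = Summarizer()
article_text = u"Prvi stavek o temi. Drugi stavek z vec podrobnostmi. Tretji stavek o necem drugem."
for sentence in summarizer.summarize(article_text, num_sentences=2):
    print(sentence)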
def __init__(self, lemmatize=True):
    self.debug = False
    self.stemmer = PorterStemmer()
    self.lemmatizer = WordNetLemmatizer()
    self.estLemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_ESTONIAN)
    self.lemmatize = lemmatize
    self.stopwords = self.get_stopwords()
def _get_lemmatizer(language: str) -> Callable:
    if language in lemmagen_languages:
        return Lemmatizer(
            dictionary=lemmagen_languages[language.lower()]
        ).lemmatize
    else:
        return get_udipipe_lematizer(language)
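# Hypothetical sketch of the lemmagen_languages mapping that _get_lemmatizer
# expects; only the dictionary constants are confirmed by the other snippets
# here, and lowercase keys are assumed because the function indexes with
# language.lower().
import lemmagen

lemmagen_languages = {
    "slovenian": lemmagen.DICTIONARY_SLOVENE,
    "english": lemmagen.DICTIONARY_ENGLISH,
    "estonian": lemmagen.DICTIONARY_ESTONIAN,
}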
def lemmatizeTokens(tokens):
    lemmatized_tokens = []
    lemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)
    for token in tokens:
        lemmatized_tokens.append(lemmatizer.lemmatize(token))
    return lemmatized_tokens
def removeStopWordsAndLemmatisation(tokens):
    new_content = ""
    stop_words = set(stopwords.words('slovene'))
    lemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)
    for token in tokens:
        if type(token) == tuple:
            x = token[0]
        else:
            x = token
        # Library quirk: Slovenian tokens can carry trailing whitespace, so strip
        # before the stop-word check and lemmatization.
        x = x.strip()
        if x.lower() not in stop_words:
            lemmatizedWord = lemmatizer.lemmatize(x)
            new_content += lemmatizedWord + " "
    return new_content
class Preprocessing:
    def __init__(self):
        self.lemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)
        self.punc_regex = re.compile(r'^[^0-9a-zA-Z]+$')

    def preprocess(self, text, raw=False, keep_stop_words=False):
        # Tokenize
        tokens = word_tokenize(text)
        if not raw:
            # Lemmatize
            tokens = [self.lemmatizer.lemmatize(token) for token in tokens]
            # Convert to lowercase
            tokens = [t.lower() for t in tokens]
        if not keep_stop_words:
            # Remove stopwords and punctuations
            tokens = [
                t for t in tokens
                if t not in stop_words_slovene and not self.punc_regex.match(t)
            ]
        return tokens
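# Usage sketch for the Preprocessing class above; assumes nltk's punkt tokenizer
# data, the lemmagen Slovene dictionary, and a stop_words_slovene set are
# available in the surrounding module.
pre = Preprocessing()
tokens = pre.preprocess("Psi veselo tekajo po parku.")
print(tokens)  # lemmatized, lowercased tokens without stop words or punctuation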
def test_emptystring(self):
    lemmatizer = Lemmatizer()
    lemmatized = lemmatizer.lemmatize("")
    self.assertEqual("", lemmatized)
def test_utf8lemmatize(self):
    lemmatizer = Lemmatizer()
    lemmatized = lemmatizer.lemmatize("čistijo")
    self.assertEqual("čistiti", lemmatized)
import argparse
import nltk
from itertools import groupby
#import gensim
#from gensim.models.doc2vec import TaggedDocument
#from experimentation import compress
import resource

rsrc = resource.RLIMIT_AS
soft, hard = resource.getrlimit(rsrc)
resource.setrlimit(rsrc, (13500000000, hard))  # limit allowed Python memory usage to 13 GB

start_time = time.time()
lemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_ENGLISH)
chkr = SpellChecker("en_US")


def variety_words():
    l_us = read_wordList('word_lists/en_US.dic')
    l_ca = read_wordList('word_lists/en_CA.dic')
    l_au = read_wordList('word_lists/en_AU.dic')
    l_all = l_us & l_ca & l_au
    l_just_us = l_us - l_all
    l_just_ca = l_ca - l_all
    l_just_au = l_au - l_all
    return (l_just_us, l_just_ca, l_just_au)


def generate_output(path, author_id, lang, variety, gender):
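# read_wordList is called above but not shown; a plausible reconstruction that
# loads a Hunspell-style .dic word list into a set (the leading count line and
# any affix flags after '/' are dropped) so the set operations above work.
import io

def read_wordList(path):
    words = set()
    with io.open(path, encoding='utf-8', errors='ignore') as f:
        for line in f:
            word = line.strip().split('/')[0]
            if word and not word.isdigit():
                words.add(word)
    return words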
    result.append(string)
    return result


sl = []
en = []
with open('AGIF_small.tmx') as fp:
    xml = bs(fp, 'lxml-xml')

for cnt, tuv in enumerate(xml.body.find_all('tuv')):
    if tuv.get('xml:lang') == 'en-GB':
        text = tuv.seg.getText().replace('\\n', ' ').replace(
            '\n', ' ').replace('\u2028', ' ').replace('\t', ' ').strip()
        text = re.sub('\\.+', '.', text)
        text = ' '.join(text.split()).lower()
        en.append(text)
    elif tuv.get('xml:lang') == 'sl-SI':
        text = tuv.seg.getText().replace('\\n', ' ').replace(
            '\n', ' ').replace('\u2028', ' ').replace('\t', ' ').strip()
        text = re.sub('\\.+', '.', text)
        text = ' '.join(text.split()).lower()
        sl.append(text)

lemmatizer_en = Lemmatizer(dictionary=lemmagen.DICTIONARY_ENGLISH)
lemmatizer_sl = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)

sl_lemmas = get_lemmas(sl, lemmatizer_sl)
for el in sl_lemmas:
    print(el)
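# get_lemmas is called above but not shown; a hypothetical reconstruction that
# lemmatizes every whitespace-separated token of each segment with the given
# lemmatizer.
def get_lemmas(segments, lemmatizer):
    lemmas = []
    for segment in segments:
        lemmas.append(' '.join(lemmatizer.lemmatize(token) for token in segment.split()))
    return lemmas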
def test_lemmatize(self):
    lemmatizer = Lemmatizer()
    lemmatized = lemmatizer.lemmatize(str("hodimo"))
    self.assertEqual(str("hoditi"), lemmatized)
def test_null(self):
    lemmatizer = Lemmatizer()
    lemmatized = lemmatizer.lemmatize(None)
    self.assertEqual(None, lemmatized)
def test_punctuation(self):
    lemmatizer = Lemmatizer()
    lemmatized = lemmatizer.lemmatize("!\"=`.,/:")
    self.assertEqual("!\"=`.,/:", lemmatized)
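# Sketch of how the test methods above could be collected into a standard
# unittest case; the class name is an assumption, the expected lemmas come from
# the snippets themselves (the default Lemmatizer dictionary is Slovene).
import unittest
from lemmagen.lemmatizer import Lemmatizer


class LemmatizerTest(unittest.TestCase):
    def test_lemmatize(self):
        self.assertEqual("hoditi", Lemmatizer().lemmatize("hodimo"))

    def test_emptystring(self):
        self.assertEqual("", Lemmatizer().lemmatize(""))


if __name__ == '__main__':
    unittest.main()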
class Lemmatization():
    def __init__(self):
        self.lemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)

    def lemmatize(self, token):
        return self.lemmatizer.lemmatize(token)
def createLemmatizedFeatures(data, giza_dict, giza_dict_reversed, cognates=False):
    lemmatizer_en = Lemmatizer(dictionary=lemmagen.DICTIONARY_ENGLISH)
    data['src_term_lemma'] = data['src_term'].map(lambda x: lemmatize(x, lemmatizer_en))
    lemmatizer_sl = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)
    data['tar_term_lemma'] = data['tar_term'].map(lambda x: lemmatize(x, lemmatizer_sl))

    data['term_pair_lemma'] = data['src_term_lemma'] + '\t' + data['tar_term_lemma']
    data['isFirstWordTranslated'] = data['term_pair_lemma'].map(lambda x: isFirstWordTranslated(x, giza_dict))
    data['isLastWordTranslated'] = data['term_pair_lemma'].map(lambda x: isLastWordTranslated(x, giza_dict))
    data['percentageOfTranslatedWords'] = data['term_pair_lemma'].map(lambda x: percentageOfTranslatedWords(x, giza_dict))
    data['percentageOfNotTranslatedWords'] = data['term_pair_lemma'].map(lambda x: percentageOfNotTranslatedWords(x, giza_dict))
    data['longestTranslatedUnitInPercentage'] = data['term_pair_lemma'].map(lambda x: longestTranslatedUnitInPercentage(x, giza_dict))
    data['longestNotTranslatedUnitInPercentage'] = data['term_pair_lemma'].map(lambda x: longestNotTranslatedUnitInPercentage(x, giza_dict))

    data['term_pair_lemma'] = data['tar_term_lemma'] + '\t' + data['src_term_lemma']
    data['isFirstWordTranslated_reversed'] = data['term_pair_lemma'].map(lambda x: isFirstWordTranslated(x, giza_dict_reversed))
    data['isLastWordTranslated_reversed'] = data['term_pair_lemma'].map(lambda x: isLastWordTranslated(x, giza_dict_reversed))
    data['percentageOfTranslatedWords_reversed'] = data['term_pair_lemma'].map(lambda x: percentageOfTranslatedWords(x, giza_dict_reversed))
    data['percentageOfNotTranslatedWords_reversed'] = data['term_pair_lemma'].map(lambda x: percentageOfNotTranslatedWords(x, giza_dict_reversed))
    data['longestTranslatedUnitInPercentage_reversed'] = data['term_pair_lemma'].map(lambda x: longestTranslatedUnitInPercentage(x, giza_dict_reversed))
    data['longestNotTranslatedUnitInPercentage_reversed'] = data['term_pair_lemma'].map(lambda x: longestNotTranslatedUnitInPercentage(x, giza_dict_reversed))

    data['src_term_tr'] = data['src_term'].map(lambda x: transcribe(x, 'en'))
    data['tar_term_tr'] = data['tar_term'].map(lambda x: transcribe(x, 'sl'))
    data['term_pair_tr'] = data['src_term_tr'] + '\t' + data['tar_term_tr']
    data['term_pair'] = data['src_term'] + '\t' + data['tar_term']

    if cognates:
        data['isFirstWordCognate'] = data['term_pair_tr'].map(lambda x: isWordCognate(x, 0))
        data['isLastWordCognate'] = data['term_pair_tr'].map(lambda x: isWordCognate(x, -1))
        data['longestCommonSubstringRatio'] = data['term_pair_tr'].map(lambda x: float(len(longest_common_substring(x))) / max(len(x.split('\t')[0]), len(x.split('\t')[1])))
        data['longestCommonSubsequenceRatio'] = data['term_pair_tr'].map(lambda x: float(len(longest_common_subsequence(x))) / max(len(x.split('\t')[0]), len(x.split('\t')[1])))
        data['dice'] = data['term_pair_tr'].map(lambda x: (2 * float(len(longest_common_substring(x)))) / (len(x.split('\t')[0]) + len(x.split('\t')[1])))
        data['NWD'] = data['term_pair_tr'].map(lambda x: float(len(longest_common_substring(x))) / min(len(x.split('\t')[0]), len(x.split('\t')[1])))
        data['editDistanceNormalized'] = data['term_pair_tr'].map(lambda x: 1 - (float(editdistance.eval(x.split('\t')[0], x.split('\t')[1])) / max(len(x.split('\t')[0]), len(x.split('\t')[1]))))

    data['term_pair_lemma'] = data['src_term_lemma'] + '\t' + data['tar_term_lemma']
    data['isFirstWordCovered'] = data['term_pair_lemma'].map(lambda x: isLemmatizedWordCovered(x, giza_dict, 0))
    data['isLastWordCovered'] = data['term_pair_lemma'].map(lambda x: isLemmatizedWordCovered(x, giza_dict, -1))
    data['percentageOfCoverage'] = data['term_pair_lemma'].map(lambda x: percentageOfCoverage(x, giza_dict))
    data['percentageOfNonCoverage'] = data['term_pair_lemma'].map(lambda x: 1 - percentageOfCoverage(x, giza_dict))
    data['diffBetweenCoverageAndNonCoverage'] = data['percentageOfCoverage'] - data['percentageOfNonCoverage']

    if cognates:
        data['wordLengthMatch'] = data['term_pair'].map(lambda x: wordLengthMatch(x))
        data['sourceTermLength'] = data['term_pair'].map(lambda x: sourceTermLength(x))
        data['targetTermLength'] = data['term_pair'].map(lambda x: targetTermLength(x))

    data['term_pair_lemma'] = data['tar_term_lemma'] + '\t' + data['src_term_lemma']
    data['isFirstWordCovered_reversed'] = data['term_pair_lemma'].map(lambda x: isLemmatizedWordCovered(x, giza_dict_reversed, 0))
    data['isLastWordCovered_reversed'] = data['term_pair_lemma'].map(lambda x: isLemmatizedWordCovered(x, giza_dict_reversed, -1))
    data['percentageOfCoverage_reversed'] = data['term_pair_lemma'].map(lambda x: percentageOfCoverage(x, giza_dict_reversed))
    data['percentageOfNonCoverage_reversed'] = data['term_pair_lemma'].map(lambda x: 1 - percentageOfCoverage(x, giza_dict_reversed))
    data['diffBetweenCoverageAndNonCoverage_reversed'] = data['percentageOfCoverage_reversed'] - data['percentageOfNonCoverage_reversed']
    data['averagePercentageOfTranslatedWords'] = (data['percentageOfTranslatedWords'] + data['percentageOfTranslatedWords_reversed']) / 2

    data = data.drop(['term_pair', 'term_pair_lemma', 'src_term_lemma', 'tar_term_lemma', 'term_pair_tr', 'src_term_tr', 'tar_term_tr'], axis=1)
    #print('feature construction done')
    return data
def __init__(self):
    self.lemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)
def __init__(self):
    self.lemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)
    self.punc_regex = re.compile(r'^[^0-9a-zA-Z]+$')
import sys
sys.path.append("C:/Users/dis/Documents/JanJezersek/EkoSmart/pylemmagen")
from lemmagen.lemmatizer import Lemmatizer

a = Lemmatizer()
for i, word in enumerate(sys.argv):
    if i:
        sys.stdout.write(a.lemmatize(word) + " ")
def arrangeLemmatizedData(input, lemmatization=False, reverse=False):
    dd = defaultdict(list)
    with openio(input, encoding='utf8') as f:
        for line in f:
            line = line.split()
            source, target, score = line[0], line[1], line[2]
            source = source.strip('`’“„,‘')
            target = target.strip('`’“„,‘')
            if lemmatization and not reverse:
                lemmatizer_en = Lemmatizer(dictionary=lemmagen.DICTIONARY_ENGLISH)
                source = lemmatizer_en.lemmatize(source)
                lemmatizer_sl = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)
                target = lemmatizer_sl.lemmatize(target)
            elif lemmatization and reverse:
                lemmatizer_sl = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)
                source = lemmatizer_sl.lemmatize(source)
                lemmatizer_en = Lemmatizer(dictionary=lemmagen.DICTIONARY_ENGLISH)
                target = lemmatizer_en.lemmatize(target)
            dd[source].append((target, score))
    for k, v in dd.items():
        v = sorted(v, key=lambda tup: float(tup[1]), reverse=True)
        new_v = []
        for word, p in v:
            if (len(k) < 4 and len(word) > 5) or (len(word) < 4 and len(k) > 5):
                continue
            if float(p) < 0.05:
                continue
            new_v.append((word, p))
        dd[k] = new_v
    return dd
class Tokenizer(object):
    def __init__(self, lemmatize=True):
        self.debug = False
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.estLemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_ESTONIAN)
        self.lemmatize = lemmatize
        self.stopwords = self.get_stopwords()

    def get_stopwords(self):
        sw = StopWord()
        return set(sw.words)

    def lemstem(self, token):
        if self.lemmatize:
            return self.lemmatizer.lemmatize(token)
        else:
            return self.stemmer.stem(token)

    def extractTokens(self, text):
        try:
            tokens = word_tokenize(text)
        except UnicodeEncodeError:
            tokens = []
        if not tokens:
            return {}
        est_text = self.is_estonian(text)
        token_dict = {}
        for token in tokens:
            token = token.lower()
            # check if string consists of alphabetic characters only
            if not (token.isalpha() and len(token) > 2):
                continue
            try:
                if est_text:
                    lemstem_word = self.estLemmatizer.lemmatize(token)
                else:
                    lemstem_word = self.lemstem(token)
            except Exception:
                lemstem_word = token
            if lemstem_word not in self.stopwords:
                if self.debug:
                    print "{0}: {1}".format(token.encode('utf-8'), lemstem_word.encode('utf-8'))
                if token_dict.has_key(lemstem_word):
                    token_dict[lemstem_word] += 1
                else:
                    token_dict[lemstem_word] = 1
        return token_dict

    def is_estonian(self, text):
        est = False
        try:
            est = detect(text) == 'et'
        except Exception:
            pass
        return est

    def getLectureRecord(self, lectureId):
        try:
            data = Lecture.select().where(Lecture.id == lectureId).get()
            return data
        except Exception:
            return None

    def extractLectureTokens(self, lecture):
        if lecture is None:
            return False
        text = lecture.content
        tokens = self.extractTokens(text)
        sorted_tokens = sorted(tokens.items(), key=operator.itemgetter(1))
        for token in sorted_tokens:
            try:
                with db.transaction() as txn:
                    LectureWord.create(
                        lecture=lecture,
                        word=token[0],
                        count=token[1],
                        active=True,
                        weight=0
                    )
                    txn.commit()
            except peewee.OperationalError as e:
                print "Could not create a record for lecture {0}, word {1}, {2}".format(lecture.id, token[0], e)
            if self.debug:
                print token
        return True

    def getCourseRecord(self, courseId):
        try:
            data = Course.select().where(Course.id == courseId).get()
            return data
        except Exception:
            return None

    def getLectures(self, course):
        lectures = Lecture.select().where(Lecture.course == course)
        return list(lectures)

    def extractCourseTokens(self, lectures):
        print "Lecture count: {0}".format(len(lectures))
        for lecture in lectures:
            print "Lecture: {0}".format(lecture.id)
            self.extractLectureTokens(lecture)

    def getCourses(self, courseId=0):
        if courseId:
            courses = Course.select().where(Course.id == courseId)
        else:
            courses = Course.select()
        return list(courses)

    def extractAllCourseTokens(self):
        for course in self.getCourses():
            print course.id, course.name
            lectures = self.getLectures(course)
            self.extractCourseTokens(lectures)

    def getLectureWords(self, lecture):
        lectureWords = list(LectureWord.select().where(LectureWord.lecture == lecture))
        return lectureWords

    def createCourseTokens(self):
        for course in self.getCourses():
            print "{}: {}".format(course.id, course.name.encode('utf8'))
            token_dict = {}
            lecture_token = {}
            for lecture in self.getLectures(course):
                lectureWords = self.getLectureWords(lecture)
                for lectureWord in lectureWords:
                    if not token_dict.has_key(lectureWord.word):
                        token_dict[lectureWord.word] = 0
                        lecture_token[lectureWord.word] = 0
                    token_dict[lectureWord.word] += lectureWord.count
                    lecture_token[lectureWord.word] += 1
            sorted_tokens = sorted(token_dict.items(), key=operator.itemgetter(1))
            for token in sorted_tokens:
                try:
                    with db.transaction() as txn:
                        CourseWord.create(
                            course=course,
                            word=token[0],
                            count=token[1],
                            active=True,
                            lectures=lecture_token[token[0]]
                        )
                        txn.commit()
                except peewee.OperationalError as e:
                    print "Could not create a record for course {0}, word {1}, {2}".format(course.name.encode('utf8'), token[0].encode('utf8'), e)

    def getCourseWords(self, courseId=0):
        if courseId == 0:
            courseWords = CourseWord.select()
        else:
            courseWords = CourseWord.select().where(CourseWord.course == courseId)
        return list(courseWords)

    def createCorpusTokens(self):
        token_dict = {}
        for courseWord in self.getCourseWords():
            if token_dict.has_key(courseWord.word):
                token_dict[courseWord.word] += courseWord.count
            else:
                token_dict[courseWord.word] = courseWord.count
        sorted_tokens = sorted(token_dict.items(), key=operator.itemgetter(1))
        for token in sorted_tokens:
            print token
            try:
                with db.transaction() as txn:
                    CorpusWord.create(
                        word=token[0],
                        count=token[1],
                        active=True
                    )
                    txn.commit()
            except peewee.OperationalError as e:
                print "Could not create a record for word {}, {}".format(token[0], e)

    def calc_tf(self):
        for course in self.getCourses(55):
            print course.name
            for lecture in self.getLectures(course):
                maxCount = 0
                for lectureWord in self.getLectureWords(lecture):
                    maxCount = max(maxCount, lectureWord.count)
                for lectureWord in self.getLectureWords(lecture):
                    try:
                        with db.transaction():
                            lectureWord.weight = 0.5 + (0.5 * lectureWord.count) / maxCount
                            lectureWord.save()
                    except peewee.OperationalError as e:
                        print e