from collections import defaultdict
from io import open as openio  # Py2/3-compatible open; assumed alias for the openio() call below

import lemmagen
from lemmagen.lemmatizer import Lemmatizer


def arrangeLemmatizedData(input, lemmatization=False, reverse=False):
    dd = defaultdict(list)
    # Build the lemmatizers once instead of once per input line.
    lemmatizer_en = Lemmatizer(dictionary=lemmagen.DICTIONARY_ENGLISH)
    lemmatizer_sl = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)
    with openio(input, encoding='utf8') as f:
        for line in f:
            line = line.split()
            source, target, score = line[0], line[1], line[2]
            source = source.strip('`’“„,‘')
            target = target.strip('`’“„,‘')
            if lemmatization and not reverse:
                source = lemmatizer_en.lemmatize(source)
                target = lemmatizer_sl.lemmatize(target)
            elif lemmatization and reverse:
                source = lemmatizer_sl.lemmatize(source)
                target = lemmatizer_en.lemmatize(target)
            dd[source].append((target, score))
    for k, v in dd.items():
        v = sorted(v, key=lambda tup: float(tup[1]), reverse=True)
        new_v = []
        for word, p in v:
            # Drop pairs whose lengths differ too much, and low-probability entries.
            if (len(k) < 4 and len(word) > 5) or (len(word) < 4 and len(k) > 5):
                continue
            if float(p) < 0.05:
                continue
            new_v.append((word, p))
        dd[k] = new_v
    return dd
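# A minimal usage sketch. The input format is inferred from the parser above:
# one whitespace-separated "source target score" triple per line, as in a
# GIZA++ lexical translation table; 'lex.e2f' is a hypothetical file name.
giza_dict = arrangeLemmatizedData('lex.e2f', lemmatization=True)
for target, score in giza_dict.get('dog', []):
    print(target, score)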
from typing import Callable


def _get_lemmatizer(language: str) -> Callable:
    # Normalize once: the original checked the raw name but indexed the
    # lowercased one. lemmagen_languages and get_udipipe_lematizer are
    # defined elsewhere in the source.
    if language.lower() in lemmagen_languages:
        return Lemmatizer(
            dictionary=lemmagen_languages[language.lower()]
        ).lemmatize
    else:
        return get_udipipe_lematizer(language)
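# Hedged usage sketch: assumes lemmagen_languages maps lowercase language
# names to lemmagen dictionaries, e.g. {'slovene': lemmagen.DICTIONARY_SLOVENE};
# the actual mapping and the UDPipe fallback live elsewhere in the source.
lemmatize = _get_lemmatizer('Slovene')
print(lemmatize('hodimo'))  # expected: 'hoditi'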
def lemmatizeTokens(tokens):
    lemmatized_tokens = []
    lemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)
    for token in tokens:
        lemmatized_tokens.append(lemmatizer.lemmatize(token))
    return lemmatized_tokens
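# Quick demonstration; the expected lemmas follow the unit tests further down
# ('hodimo' -> 'hoditi', 'čistijo' -> 'čistiti').
print(lemmatizeTokens(['hodimo', 'čistijo']))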
import json


def index(request, phrase_list):
    # Response comes from the surrounding web framework (not shown in this snippet).
    print("INDEX with phrase list:", phrase_list)
    a = Lemmatizer()
    lemmatised_list = []
    for i, word in enumerate(phrase_list):
        if i:  # skip the first element of the phrase list
            lemmatised_list.append(a.lemmatize(word) + " ")
    return Response(json.dumps(lemmatised_list))
def __init__(self):
    this_dir = os.path.dirname(os.path.abspath(__file__))
    self.lemmatizer = Lemmatizer()
    dir = os.path.join(this_dir, "tokenizers/slovene.pickle")
    self.sent_detector = nltk.data.load("file://" + dir)
    self.stopwords = open(
        os.path.join(this_dir, "tokenizers/stopwords.txt"), "rb").read().splitlines()
    # The file was read as bytes, so the comment-prefix test must use a bytes
    # literal (startswith("#") on bytes raises TypeError in Python 3).
    self.stopwords = [w for w in self.stopwords if not w.startswith(b"#")]
    # Convert to unicode
    self.stopwords = [word.decode("utf-8") for word in self.stopwords]
def removeStopWordsAndLemmatisation(tokens):
    new_content = ""
    stop_words = set(stopwords.words('slovene'))
    # Build the lemmatizer once instead of once per token.
    lemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)
    for token in tokens:
        if type(token) == tuple:
            x = token[0]
        else:
            x = token
        # Workaround for a library issue: the Slovene stop word entries carry
        # trailing whitespace, so append a space before the membership test.
        x = x + ' '
        if x.lower() not in stop_words:
            x = x.strip()
            lemmatized_word = lemmatizer.lemmatize(x)
            new_content += lemmatized_word + " "
    return new_content
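# Hedged example call: requires the NLTK stop word corpus with Slovene entries
# to be installed locally; the exact output for this sample is not asserted.
print(removeStopWordsAndLemmatisation(['psi', 'in', 'mačke']))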
def test_emptystring(self):
    lemmatizer = Lemmatizer()
    lemmatized = lemmatizer.lemmatize("")
    self.assertEqual("", lemmatized)
def createLemmatizedFeatures(data, giza_dict, giza_dict_reversed, cognates=False):
    lemmatizer_en = Lemmatizer(dictionary=lemmagen.DICTIONARY_ENGLISH)
    data['src_term_lemma'] = data['src_term'].map(lambda x: lemmatize(x, lemmatizer_en))
    lemmatizer_sl = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)
    data['tar_term_lemma'] = data['tar_term'].map(lambda x: lemmatize(x, lemmatizer_sl))

    # Translation features in the source->target direction.
    data['term_pair_lemma'] = data['src_term_lemma'] + '\t' + data['tar_term_lemma']
    data['isFirstWordTranslated'] = data['term_pair_lemma'].map(lambda x: isFirstWordTranslated(x, giza_dict))
    data['isLastWordTranslated'] = data['term_pair_lemma'].map(lambda x: isLastWordTranslated(x, giza_dict))
    data['percentageOfTranslatedWords'] = data['term_pair_lemma'].map(lambda x: percentageOfTranslatedWords(x, giza_dict))
    data['percentageOfNotTranslatedWords'] = data['term_pair_lemma'].map(lambda x: percentageOfNotTranslatedWords(x, giza_dict))
    data['longestTranslatedUnitInPercentage'] = data['term_pair_lemma'].map(lambda x: longestTranslatedUnitInPercentage(x, giza_dict))
    data['longestNotTranslatedUnitInPercentage'] = data['term_pair_lemma'].map(lambda x: longestNotTranslatedUnitInPercentage(x, giza_dict))

    # The same features in the reversed (target->source) direction.
    data['term_pair_lemma'] = data['tar_term_lemma'] + '\t' + data['src_term_lemma']
    data['isFirstWordTranslated_reversed'] = data['term_pair_lemma'].map(lambda x: isFirstWordTranslated(x, giza_dict_reversed))
    data['isLastWordTranslated_reversed'] = data['term_pair_lemma'].map(lambda x: isLastWordTranslated(x, giza_dict_reversed))
    data['percentageOfTranslatedWords_reversed'] = data['term_pair_lemma'].map(lambda x: percentageOfTranslatedWords(x, giza_dict_reversed))
    data['percentageOfNotTranslatedWords_reversed'] = data['term_pair_lemma'].map(lambda x: percentageOfNotTranslatedWords(x, giza_dict_reversed))
    data['longestTranslatedUnitInPercentage_reversed'] = data['term_pair_lemma'].map(lambda x: longestTranslatedUnitInPercentage(x, giza_dict_reversed))
    data['longestNotTranslatedUnitInPercentage_reversed'] = data['term_pair_lemma'].map(lambda x: longestNotTranslatedUnitInPercentage(x, giza_dict_reversed))

    # Transcriptions for cognate-based string-similarity features.
    data['src_term_tr'] = data['src_term'].map(lambda x: transcribe(x, 'en'))
    data['tar_term_tr'] = data['tar_term'].map(lambda x: transcribe(x, 'sl'))
    data['term_pair_tr'] = data['src_term_tr'] + '\t' + data['tar_term_tr']
    data['term_pair'] = data['src_term'] + '\t' + data['tar_term']

    if cognates:
        data['isFirstWordCognate'] = data['term_pair_tr'].map(lambda x: isWordCognate(x, 0))
        data['isLastWordCognate'] = data['term_pair_tr'].map(lambda x: isWordCognate(x, -1))
        data['longestCommonSubstringRatio'] = data['term_pair_tr'].map(lambda x: float(len(longest_common_substring(x))) / max(len(x.split('\t')[0]), len(x.split('\t')[1])))
        data['longestCommonSubsequenceRatio'] = data['term_pair_tr'].map(lambda x: float(len(longest_common_subsequence(x))) / max(len(x.split('\t')[0]), len(x.split('\t')[1])))
        data['dice'] = data['term_pair_tr'].map(lambda x: (2 * float(len(longest_common_substring(x)))) / (len(x.split('\t')[0]) + len(x.split('\t')[1])))
        data['NWD'] = data['term_pair_tr'].map(lambda x: float(len(longest_common_substring(x))) / min(len(x.split('\t')[0]), len(x.split('\t')[1])))
        data['editDistanceNormalized'] = data['term_pair_tr'].map(lambda x: 1 - (float(editdistance.eval(x.split('\t')[0], x.split('\t')[1])) / max(len(x.split('\t')[0]), len(x.split('\t')[1]))))

    # Coverage features, source->target.
    data['term_pair_lemma'] = data['src_term_lemma'] + '\t' + data['tar_term_lemma']
    data['isFirstWordCovered'] = data['term_pair_lemma'].map(lambda x: isLemmatizedWordCovered(x, giza_dict, 0))
    data['isLastWordCovered'] = data['term_pair_lemma'].map(lambda x: isLemmatizedWordCovered(x, giza_dict, -1))
    data['percentageOfCoverage'] = data['term_pair_lemma'].map(lambda x: percentageOfCoverage(x, giza_dict))
    data['percentageOfNonCoverage'] = data['term_pair_lemma'].map(lambda x: 1 - percentageOfCoverage(x, giza_dict))
    data['diffBetweenCoverageAndNonCoverage'] = data['percentageOfCoverage'] - data['percentageOfNonCoverage']

    if cognates:
        data['wordLengthMatch'] = data['term_pair'].map(lambda x: wordLengthMatch(x))
        data['sourceTermLength'] = data['term_pair'].map(lambda x: sourceTermLength(x))
        data['targetTermLength'] = data['term_pair'].map(lambda x: targetTermLength(x))

    # Coverage features, reversed.
    data['term_pair_lemma'] = data['tar_term_lemma'] + '\t' + data['src_term_lemma']
    data['isFirstWordCovered_reversed'] = data['term_pair_lemma'].map(lambda x: isLemmatizedWordCovered(x, giza_dict_reversed, 0))
    data['isLastWordCovered_reversed'] = data['term_pair_lemma'].map(lambda x: isLemmatizedWordCovered(x, giza_dict_reversed, -1))
    data['percentageOfCoverage_reversed'] = data['term_pair_lemma'].map(lambda x: percentageOfCoverage(x, giza_dict_reversed))
    data['percentageOfNonCoverage_reversed'] = data['term_pair_lemma'].map(lambda x: 1 - percentageOfCoverage(x, giza_dict_reversed))
    data['diffBetweenCoverageAndNonCoverage_reversed'] = data['percentageOfCoverage_reversed'] - data['percentageOfNonCoverage_reversed']
    data['averagePercentageOfTranslatedWords'] = (data['percentageOfTranslatedWords'] + data['percentageOfTranslatedWords_reversed']) / 2

    # Drop intermediate helper columns.
    data = data.drop(['term_pair', 'term_pair_lemma', 'src_term_lemma', 'tar_term_lemma', 'term_pair_tr', 'src_term_tr', 'tar_term_tr'], axis=1)
    #print('feature construction done')
    return data
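# Hedged usage sketch: createLemmatizedFeatures expects a pandas DataFrame with
# 'src_term' and 'tar_term' columns plus the two dictionaries built by
# arrangeLemmatizedData above; the helper functions (lemmatize,
# isFirstWordTranslated, transcribe, ...) are defined elsewhere in the source.
import pandas as pd

pairs = pd.DataFrame({'src_term': ['dog'], 'tar_term': ['pes']})
features = createLemmatizedFeatures(pairs, giza_dict, giza_dict_reversed, cognates=True)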
import sys
sys.path.append("C:/Users/dis/Documents/JanJezersek/EkoSmart/pylemmagen")
from lemmagen.lemmatizer import Lemmatizer

a = Lemmatizer()
for i, word in enumerate(sys.argv):
    if i:  # sys.argv[0] is the script name, so skip it
        sys.stdout.write(a.lemmatize(word) + " ")
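# Command-line usage (the script name below is hypothetical); Lemmatizer()
# defaults to the Slovene dictionary, per the unit tests in this collection:
#   python lemmatize_args.py hodimo čistijo
#   hoditi čistiti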
import argparse
import time
import resource
from itertools import groupby

import nltk
import lemmagen
from lemmagen.lemmatizer import Lemmatizer
from enchant.checker import SpellChecker  # assumed import, matching the SpellChecker("en_US") call below
#import gensim
#from gensim.models.doc2vec import TaggedDocument
#from experimentation import compress

# Limit allowed Python memory usage to 13GB.
rsrc = resource.RLIMIT_AS
soft, hard = resource.getrlimit(rsrc)
resource.setrlimit(rsrc, (13500000000, hard))

start_time = time.time()
lemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_ENGLISH)
chkr = SpellChecker("en_US")


def variety_words():
    # read_wordList is defined elsewhere in the source.
    l_us = read_wordList('word_lists/en_US.dic')
    l_ca = read_wordList('word_lists/en_CA.dic')
    l_au = read_wordList('word_lists/en_AU.dic')
    l_all = l_us & l_ca & l_au
    l_just_us = l_us - l_all
    l_just_ca = l_ca - l_all
    l_just_au = l_au - l_all
    return (l_just_us, l_just_ca, l_just_au)


def generate_output(path, author_id, lang, variety, gender):
    ...  # body truncated in the source
import re

from bs4 import BeautifulSoup as bs  # assumed alias, matching the bs(fp, 'lxml-xml') call below
import lemmagen
from lemmagen.lemmatizer import Lemmatizer

# Tail of a helper function whose definition is truncated in the source:
#     result.append(string)
#     return result

sl = []
en = []
with open('AGIF_small.tmx') as fp:
    xml = bs(fp, 'lxml-xml')
    for cnt, tuv in enumerate(xml.body.find_all('tuv')):
        if tuv.get('xml:lang') == 'en-GB':
            text = tuv.seg.getText().replace('\\n', ' ').replace(
                '\n', ' ').replace('\u2028', ' ').replace('\t', ' ').strip()
            text = re.sub('\\.+', '.', text)
            text = ' '.join(text.split()).lower()
            en.append(text)
        elif tuv.get('xml:lang') == 'sl-SI':
            text = tuv.seg.getText().replace('\\n', ' ').replace(
                '\n', ' ').replace('\u2028', ' ').replace('\t', ' ').strip()
            text = re.sub('\\.+', '.', text)
            text = ' '.join(text.split()).lower()
            sl.append(text)

lemmatizer_en = Lemmatizer(dictionary=lemmagen.DICTIONARY_ENGLISH)
lemmatizer_sl = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)
# get_lemmas is defined elsewhere in the source.
sl_lemmas = get_lemmas(sl, lemmatizer_sl)
for el in sl_lemmas:
    print(el)
def test_lemmatize(self):
    lemmatizer = Lemmatizer()
    lemmatized = lemmatizer.lemmatize("hodimo")
    self.assertEqual("hoditi", lemmatized)
def test_null(self):
    lemmatizer = Lemmatizer()
    lemmatized = lemmatizer.lemmatize(None)
    self.assertEqual(None, lemmatized)
def test_punctuation(self):
    lemmatizer = Lemmatizer()
    lemmatized = lemmatizer.lemmatize("!\"=`.,/:")
    self.assertEqual("!\"=`.,/:", lemmatized)
def __init__(self):
    self.lemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)
def test_utf8lemmatize(self):
    lemmatizer = Lemmatizer()
    lemmatized = lemmatizer.lemmatize("čistijo")
    self.assertEqual("čistiti", lemmatized)
def __init__(self):
    self.lemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)
    # Matches tokens that contain no ASCII letters or digits (pure punctuation).
    self.punc_regex = re.compile(r'^[^0-9a-zA-Z]+$')
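# Hedged illustration of punc_regex: it flags tokens made up entirely of
# non-alphanumeric characters. Note the character class is ASCII-only, so a
# bare non-ASCII letter such as 'č' would also be flagged.
print(bool(re.match(r'^[^0-9a-zA-Z]+$', '...')))  # True
print(bool(re.match(r'^[^0-9a-zA-Z]+$', 'pes')))  # False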