def annotate_important_features(self, text, features):
    """Highlight the tokens of ``text`` whose surface form or lemma matches
    one of ``features``.

    Each whitespace-separated token is expanded with the lemmas produced by
    morphological analysis, normalized through ``self._unifier``, and tested
    with ``self.does_match``; matching tokens are wrapped via
    ``self.annotate_color``.  Returns the re-joined, annotated sentence.
    """
    features = self.trim_feature_prefixes(features)
    annotated = []
    for token in text.split():
        # Surface form first, then every lemma candidate from the analyzer.
        candidates = [token]
        candidates.extend(an['lemma'] for an in analyze([token])[0]['analysis'])
        normalized = [self._unifier.unify(c.lower()) for c in candidates]
        hit = self.does_match(normalized, features)
        annotated.append(token if hit is None else self.annotate_color(token, hit))
    return ' '.join(annotated)
def morphy(word):
    """Performs morphological analysis on the `word`.

    Parameters
    ----------
    word : str
        Word to be lemmatized.

    Returns
    -------
    str
        Lemma of the `word`, or None when the analyzer returns no result.
    """
    analyzed = analyze([word])
    if not analyzed:
        return None
    # First lemma candidate of the last analyzed token.
    return analyzed[-1]['analysis'][0]['lemma']
def return_word_stats(keyword_in, sent):
    """Return morphological statistics for ``keyword_in`` as it occurs in ``sent``.

    The sentence is morphologically analyzed and the first token equal to
    ``keyword_in`` (case-insensitive, surrounding punctuation stripped) is
    used.  All values stay None when the keyword is not found.
    """
    out = {
        "POS": None,
        "form": None,
        "lemma": None,
        "ending": None,
        "root": None,
        "root_tokens": None,
    }
    # Estonian quote characters are not covered by string.punctuation.
    strip_chars = string.punctuation + '„”'
    target = keyword_in.lower()
    for item in analyze(sent):
        if item["text"].lower().strip(strip_chars) != target:
            continue
        first = item["analysis"][0]
        out["POS"] = first["partofspeech"]
        out["form"] = first["form"]
        out["ending"] = first["ending"]
        # Compound lemmas come back as "a|b"; keep only the first variant.
        out["lemma"] = first['lemma'].split("|")[0]
        out["root"] = first["root"]
        out["root_tokens"] = first["root_tokens"]
        break
    return out
def filter_words(word, origsentence, model, stats):
    """Split the words nearest to ``word`` in the embedding ``model`` into
    inflected/derived variants ("similar") and synonym candidates.

    Parameters
    ----------
    word : str
        Target word.
    origsentence : str
        Sentence giving morphological context for ``word``.
    model : word2vec-style model
        Must expose ``vocab`` and ``most_similar``.
    stats : dict
        Output of ``return_word_stats``; ``stats['root']`` may be None when
        the word was not found in its sentence.

    Returns
    -------
    dict
        Keys "similar", "synonyms" (lists of lowercase words) and "lemma"
        (lemmas/roots of the original word).
    """
    output = {"similar": [], "synonyms": []}
    # BUG FIX: similar_words used to be unbound (NameError) when neither the
    # word nor any of its lemmas was present in the model vocabulary.
    similar_words = []
    if word.lower() in model.vocab:
        print("using the wordvec for word ", word.lower())
        similar_words = model.most_similar(word.lower(), topn=200)
    else:
        # Fall back to the word's lemmas/roots derived from the sentence.
        analyze_output_temp = analyze(origsentence)
        lemma_words = get_lemma_and_root_word(word, analyze_output_temp)
        for each_one in lemma_words:
            if each_one in model.vocab:
                print("using the wordvec for word ", word)
                similar_words = model.most_similar(each_one, topn=200)
                break
    # Deduplicate while preserving the similarity ranking.
    unique_words = list(OrderedDict.fromkeys(w.lower() for w, _score in similar_words))
    # Remove punctuation
    unique_words = remove_punctuation_words(unique_words)
    root = stats['root']
    for unique_word in unique_words:
        # Forms containing the word or sharing its root are inflections of
        # the same word; everything else is a synonym candidate.
        # BUG FIX: guard against root being None (startswith(None) raised
        # TypeError before).
        if word.lower() in unique_word or (root is not None and unique_word.startswith(root)):
            output["similar"].append(unique_word)
        else:
            output["synonyms"].append(unique_word)
    syn, sim, lemma_and_root = get_lemmas(output["synonyms"], word, origsentence)
    output["synonyms"] = syn
    output["similar"].extend(sim)
    output["lemma"] = lemma_and_root
    return output
def get_lemmas(filtered_words, originalword, originalsentence):
    """Partition ``filtered_words`` by lemma novelty relative to ``originalword``.

    Parameters
    ----------
    filtered_words : list of str
        Candidate words to classify.
    originalword : str
        The word whose lemmas/roots serve as the reference.
    originalsentence : str
        Sentence providing morphological context for ``originalword``.

    Returns
    -------
    tuple
        ``(unique_lemma_words, extra_lemma_words, lemmas_and_roots)`` —
        words with a fresh, unrelated lemma; words whose lemma duplicates
        one already seen or overlaps the original word's lemmas/roots; and
        the lemmas/roots of the original word.
    """
    # Tracks lemmas already encountered (the original word counts as seen).
    seen = {originalword: None}
    analyze_output = analyze(originalsentence)
    lemmas_and_roots = get_lemma_and_root_word(originalword, analyze_output)
    unique_lemma_words = []
    extra_lemma_words = []
    for word in filtered_words:
        try:
            # Compound roots come back as "a|b"; keep the first variant.
            lemma_word = Text(word).roots[0].split("|")[0]
        # BUG FIX: was a bare `except:` — keep the best-effort behaviour but
        # stop swallowing KeyboardInterrupt/SystemExit.
        except Exception:
            print("error for lemma word", word)
            continue
        if lemma_word in seen:
            extra_lemma_words.append(word)
            continue
        seen[lemma_word] = None
        if any(known.lower() in lemma_word.lower() for known in lemmas_and_roots):
            extra_lemma_words.append(word)
        else:
            unique_lemma_words.append(word)
    return unique_lemma_words, extra_lemma_words, lemmas_and_roots
# -*- coding: utf-8 -*- '''Morphological analysis/synthesis example.''' from __future__ import unicode_literals, print_function from estnltk import analyze from pprint import pprint pprint(analyze('Tüünete öötööde allmaaraudteejaam')) from estnltk import Tokenizer from estnltk import PyVabamorfAnalyzer tokenizer = Tokenizer() analyzer = PyVabamorfAnalyzer() text = '''Keeletehnoloogia on arvutilingvistika praktiline pool. Keeletehnoloogid kasutavad arvutilingvistikas välja töötatud teooriaid, et luua rakendusi (nt arvutiprogramme), mis võimaldavad inimkeelt arvuti abil töödelda ja mõista. Tänapäeval on keeletehnoloogia tuntumateks valdkondadeks masintõlge, arvutileksikoloogia, dialoogisüsteemid, kõneanalüüs ja kõnesüntees. ''' # first tokenize and then morphologically analyze morf_analyzed = analyzer(tokenizer(text)) # print some results print (morf_analyzed.lemmas)
# BUG FIX: regex patterns must be raw strings — "\s"/"\d" in plain strings
# rely on Python passing unknown escapes through and raise a
# DeprecationWarning (a SyntaxError in future versions).
literal_regexp = re.compile(r"\s+2\s+LITERAL\s\"(.+)\"")
sense_regexp = re.compile(r"\s+3\s+SENSE\s+(\d+)")

# Parse the wordnet dump given as argv[1] and write one line per
# (synset, literal, sense, analysis-candidate) combination.
# NOTE(review): syn_idx_regexp and pos_regexp are defined elsewhere in this
# file; syn_idx/pos/literal are assumed to appear before the SENSE line.
with codecs.open(argv[1], 'r', encoding='utf-8') as fin, \
     codecs.open("../synset_to_lemma.txt", 'w', encoding='utf-8') as fout:
    for line in fin:
        result = syn_idx_regexp.match(line)
        if result:
            syn_idx = result.group(1)
            continue
        result = pos_regexp.match(line)
        if result:
            pos = result.group(1)
            continue
        result = literal_regexp.match(line)
        if result:
            literal = result.group(1)
            continue
        result = sense_regexp.match(line)
        if result:
            sense = result.group(1)
            # Emit every morphological analysis candidate of the literal.
            lemma_product = analyze([literal])[0]
            for candidate in lemma_product['analysis']:
                form = candidate['form']
                lemma = candidate['lemma']
                cand_pos = candidate['partofspeech']
                fout.write("%s@%s:%s:%02d@%s:%s:%s\n" % (
                    syn_idx, pos, literal, int(sense), lemma, form, cand_pos))
def _lemmatize(self, sentence):
    """Return the unique lemmas of all words in ``sentence``.

    Every analysis candidate of every word contributes its lemma; the
    resulting list is deduplicated (order unspecified, as with the
    original set-based implementation).
    """
    lemmas = set()
    for word_analysis in analyze(sentence):
        for candidate in word_analysis['analysis']:
            lemmas.add(candidate['lemma'])
    return list(lemmas)
# -*- coding: utf-8 -*- '''Morphological analysis/synthesis example.''' from __future__ import unicode_literals, print_function from estnltk import analyze from pprint import pprint pprint(analyze('Tüünete öötööde allmaaraudteejaam')) from estnltk import Tokenizer from estnltk import PyVabamorfAnalyzer tokenizer = Tokenizer() analyzer = PyVabamorfAnalyzer() text = '''Keeletehnoloogia on arvutilingvistika praktiline pool. Keeletehnoloogid kasutavad arvutilingvistikas välja töötatud teooriaid, et luua rakendusi (nt arvutiprogramme), mis võimaldavad inimkeelt arvuti abil töödelda ja mõista. Tänapäeval on keeletehnoloogia tuntumateks valdkondadeks masintõlge, arvutileksikoloogia, dialoogisüsteemid, kõneanalüüs ja kõnesüntees. ''' # first tokenize and then morphologically analyze morf_analyzed = analyzer(tokenizer(text)) # print some results print(morf_analyzed.lemmas) print(morf_analyzed.postags)