def assembleEntry(y):
    """Build one pipe-delimited dictionary article from a parsed wiktionary entry dict.

    y: dict with 'title' and 'entries' (each entry holding partsOfSpeech,
    wordForms, synonyms, pronunciations, senses, etymology).
    Returns: u"title|altform|...\n<article body>".
    Relies on module-level helpers: clean_synonyms, stripHtml, replace_newlines,
    prep_string, escape_characters, unique, roman, en (pattern.en).
    """
    glosses = []
    examples = []
    etymologies = []
    quotations = []
    pronunciations = []
    pronunciation_entries = set()
    partsOfSpeech = []
    partsOfSpeechHeads = []
    etymology_entries = set()
    synonyms = []
    word_forms = []

    # ---- Preprocessing: one pass over the raw entries ----
    for entry in y.get('entries', []):
        # Parts of speech.
        psos = entry.get('partsOfSpeech') or []
        try:
            # FIX: was map(lambda ...); on Python 3 map() is a lazy iterator,
            # so the later truth-test and psos[0] indexing would misbehave/crash.
            psos = [x.replace('proper_noun', 'proper noun') for x in psos]
        except:
            # Debug aid: show the offending data before re-raising.
            print(repr(psos))
            print(y['title'])
            raise
        if psos:
            # FIX: join separator was u" ," (comma-space transposed).
            partsOfSpeech.append(u"<B>" + u", ".join(psos) + u"</B>")
            partsOfSpeechHeads.append(psos[0])
        else:
            partsOfSpeech.append("")
            partsOfSpeechHeads.append("")

        # Word forms.
        elems = []
        for wf in entry.get('wordForms') or []:
            form = wf.get('form')
            if form:
                elems.append(form)
        word_forms.append(elems)

        # Synonyms.
        synonyms.append(clean_synonyms(entry.get('synonyms', [])))

        # Pronunciations (deduplicated across entries by their text).
        elems = []
        elem = ""
        for pronunciation in entry.get('pronunciations', []):
            text = pronunciation.get('text')
            if text:
                if text not in pronunciation_entries:
                    pronunciation_entries.add(text)
                    elem += text
                    note = pronunciation.get('note')
                    if note:
                        elem += " (" + note + ")"
                    elems.append(elem)
                    elem = ""
        pronunciations.append(", ".join(elems))

        # Senses: glosses plus per-sense example/quotation lists.
        gloss_entry = []
        example_entry = []
        quote_entry = []
        for sense in entry.get('senses') or []:
            gloss_entry.append(stripHtml(sense.get('gloss', '')))
            example_entry.append([
                replace_newlines(stripHtml(example.get('example', '')))
                for example in sense.get('examples', [])])
            quote_entry.append([
                replace_newlines(stripHtml(quote.get('quote', '')))
                for quote in sense.get('quotations', [])])
        glosses.append(gloss_entry)
        examples.append(example_entry)
        quotations.append(quote_entry)

        # Etymology, deduplicated: repeats become '' so indexes stay aligned.
        etymology_text = stripHtml(entry.get('etymology', ''))
        if etymology_text not in etymology_entries:
            etymology_entries.add(etymology_text)
            etymologies.append(etymology_text)
        else:
            etymologies.append('')

    # ---- Assemble the article string ----
    s = u""
    # s += y['title'] + "\t"

    # Pronunciations: a single shared one goes up front; otherwise each
    # entry prints its own (entry_pronuncs flag).
    entry_pronuncs = False
    pronunciations_filtered = list(filter(None, pronunciations))
    if len(pronunciations_filtered) == 1:
        s += u" " + pronunciations_filtered[0] + "<BR>"
    else:
        entry_pronuncs = True

    # Entries & glosses. Roman numerals only when there are multiple entries.
    single_entry = len(glosses) == 1
    for (entry_num, entry_glosses) in enumerate(glosses, 1):
        if entry_num >= 2:
            s += "<BR>"
        if not single_entry:
            s += u"{0}. ".format(roman.int_to_roman(entry_num))
        if entry_pronuncs:
            s += prep_string(pronunciations[entry_num - 1])
        s += partsOfSpeech[entry_num - 1]

        # Irregular word forms: only shown when they differ from the regular
        # -ed / -s pattern.
        pos = partsOfSpeechHeads[entry_num - 1]
        word = y['title']
        if pos == "verb":
            p = en.conjugate(word, 'p')
            pp = en.conjugate(word, 'ppart')
            if p != word + 'ed' or pp != word + 'ed':
                s += u" (p. " + p + u", pp. " + pp + u")"
        elif pos == "noun":
            pl = en.pluralize(word)
            if pl != word + u's':
                s += u" (pl. " + pl + ")"
        elif pos == "adjective":
            pass

        # Glosses: numbered only when an entry has several.
        single_gloss = len(entry_glosses) == 1
        for (gloss_num, gloss) in enumerate(entry_glosses, 1):
            if not single_gloss:
                s += u" {0:d}.".format(gloss_num)
            s += u" {0}".format(gloss)
        s += prep_string(
            ", ".join(synonyms[entry_num - 1]) + u"."
            if synonyms[entry_num - 1] else "",
            " Synonyms: ")

    # Etymologies: one shared block, numbered by entry when they differ.
    etymologies_filtered = [etym for etym in etymologies if etym]
    if etymologies_filtered:
        s += '<BR><BR><B>Etymology:</B>'
        if len(etymologies_filtered) == 1:
            s += etymologies_filtered[0]
        else:
            for i in range(0, len(glosses)):
                if etymologies[i]:
                    s += u" {0}. {1}".format(
                        roman.int_to_roman(i + 1), etymologies[i])

    # Examples and quotations, flattened across entries and senses.
    # (Renamed comprehension variables: the original shadowed `examples`.)
    examples_flat = [ex for ent in examples for sense_exs in ent
                     for ex in sense_exs if ex]
    if examples_flat:
        s += u"<BR><BR><B>Examples:</B>"
        for (num_example, example) in enumerate(examples_flat, 1):
            if len(examples_flat) == 1:
                s += " " + example
            else:
                s += u" {0:d}. {1}".format(num_example, example)
    quotes_flat = [q for ent in quotations for sense_qs in ent
                   for q in sense_qs if q]
    if quotes_flat:
        s += u"<BR><BR><B>Quotations:</B>"
        for (num_quote, quote) in enumerate(quotes_flat, 1):
            if len(quotes_flat) == 1:
                s += u" " + quote
            else:
                s += u" {0:d}. {1}".format(num_quote, quote)

    s = escape_characters(s)

    # Headword line: title plus all lookup-able alternative forms.
    word_forms_flat = [form for entry in word_forms for form in entry if form]
    titles = [y['title']]
    titles.extend(word_forms_flat)
    if 'verb' in partsOfSpeechHeads:
        titles.extend(en.lexeme(y['title']))
    if 'noun' in partsOfSpeechHeads:
        titles.append(en.pluralize(y['title']))
    if 'adjective' in partsOfSpeechHeads:
        adj_forms = [en.comparative(y['title']), en.superlative(y['title'])]
        # Drop periphrastic forms ("more X" / "most X") — not single lookup keys.
        adj_forms = [form for form in adj_forms if len(form.split(' ')) == 1]
        titles.extend(adj_forms)
    titles = unique(titles)

    if s.strip() == "":
        s = "Empty article."
    s = u'|'.join(titles) + u"\n" + s.strip()
    # return escape_characters(contract_tabs(s))
    return s
def getInflections(key):
    """Return the set of inflected forms of *key* that appear in `wordlist`.

    Uses pattern.en's lexeme() (verb conjugations) and pluralize(), then
    intersects with the module-level `wordlist`. Non-alphabetic keys yield
    an empty set.
    """
    inflections = set()
    if key.isalpha():
        try:
            try:
                # Warm-up call: pattern.en can raise on its very first
                # invocation (lazy initialisation quirk); ignore that failure.
                lexeme(key)
            except Exception:
                pass
            # BUG FIX: the original did inflections.add(lexeme(key)) — adding
            # a *list* to a set raises TypeError (unhashable), which the outer
            # except swallowed, so the function always returned an empty set.
            # update() inserts each inflection individually.
            inflections.update(lexeme(key))   # all lexeme inflections
            inflections.add(pluralize(key))   # plural form
            inflections.intersection_update(wordlist)  # keep known words only
            print(inflections)
        except Exception:
            pass  # best-effort: unknown words simply yield what we have so far
    return inflections
def testBasic():
    # Smoke test for pattern.en basics (Python 2 print-statement syntax).
    from pattern.en import referenced
    print referenced('hour')  # word with its indefinite article prepended
    from pattern.en import conjugate, lemma, lexeme
    print lexeme('purr')              # all inflections of the verb
    print lemma('purring')            # base form (infinitive)
    print conjugate('purred', '3sg')  # he / she / it
def tenseChecker(sentence,ind):#sentence is a list of words, ind = index of verb
    # Score candidate tenses of the verb at `ind` by n-gram frequency.
    # Spawns one threadOutput worker per candidate tense; the workers fill
    # the module-level fourgram_freq / trigram_freq1 dicts, which are read
    # here after joining. NOTE(review): `temp = four` / `temp = tri` alias
    # the n-gram list, so every thread shares (and mutates) the same list —
    # presumably threadOutput copies it immediately; confirm.
    global threads
    trigrams=[]
    fourgrams=[]
    fourgrams=getFourgrams(sentence,ind)
    if fourgrams:
        # Prefer 4-gram context when available.
        for four,i in fourgrams:
            key = four[i]
            temp = four
            word_tenses=lexeme(key)  # all conjugations of the verb
            for tense in word_tenses:
                temp[i]=tense
                # One worker per candidate tense; results accumulate in
                # the global fourgram_freq (False selects the 4-gram table).
                t = threading.Thread(target = threadOutput, args = (temp,tense,False, ))
                t.start()
                threads.append(t)
        # Wait for every worker before reading the shared frequency table.
        for t in threads:
            t.join()
        sorted_tenses_4 = sorted(fourgram_freq.items(), key = lambda x:x[1], reverse=True)
        return sorted_tenses_4
    else:
        # Fall back to trigram context.
        trigrams=getTrigrams(sentence,ind)
        for tri,i in trigrams:
            key = tri[i]
            temp = tri
            word_tenses=lexeme(key)
            for tense in word_tenses:
                temp[i]=tense
                t = threading.Thread(target = threadOutput, args = (temp,tense,True, ))
                t.start()
                threads.append(t)
        for t in threads:
            t.join()
        # Tenses ranked most-frequent first.
        sorted_tenses = sorted(trigram_freq1.items(), key = lambda x:x[1], reverse=True)
        return sorted_tenses
    def test_lexeme(self):
        # Assert all inflections of "be", including the negated contractions
        # pattern.en appends after the plain forms.
        v = en.lexeme("be")
        self.assertEqual(v, [
            "be", "am", "are", "is", "being",
            "was", "were", "been",
            "am not", "aren't", "isn't", "wasn't", "weren't"
        ])
        # Unknown verbs fall back to the regular -ies / -ing / -ied rules.
        v = en.lexeme("imaginerify")
        self.assertEqual(v, [
            "imaginerify", "imaginerifies", "imaginerifying", "imaginerified"
        ])
        print "pattern.en.inflect.lexeme()"  # Python 2 progress marker
def verbs_to_does(textlist):
    """Return the tokens of *textlist* with bare base-form verbs converted to
    third-person singular (e.g. "run" -> "runs", "have" -> "has")."""
    newlist = []
    text = join_temp_text(textlist)
    # change to incorporate pattern also?
    doc = nlp(text)  # spaCy pipeline from enclosing module
    for idx, token in enumerate(doc):
        # Convert only when: tagged VERB; not preceded by "to"/"are"/"is" or a
        # determiner (or it is the first token); not Titlecase (proper-ish);
        # and already in base form (lemma == surface form).
        if token.pos_ == 'VERB' and ((idx != 0 and token.nbor(-1).text not in ["to","are","is"] and not token.nbor(-1).pos_ == "DET") or (idx == 0)) and not token.text.istitle() and lemma(token.text) == token.text:
            # lexeme()[1] is the 3rd-person-singular form when present.
            if len(lexeme(token.text)) >= 2:
                newlist.append(lexeme(token.text)[1])
            elif token.text == "have":
                newlist.append("has")  # "have" yields a short lexeme list; special-case
        else:
            newlist.append(token.text)
    return newlist
def infer_direction(sen, default, _retries=1):
    """Classify sentence *sen* as reporting an increase or a decrease.

    Builds positive/negative word lists from WordNet synonyms of
    "increase"/"decrease", expands them with pattern.en inflections, then
    counts occurrences in *sen*. Returns "Significantly increased",
    "Significantly decreased", or *default* on a tie.

    _retries (new, backward-compatible): bounded retry budget for the
    pattern.en expansion step, replacing the original unbounded recursion.
    """
    all_nw = []
    all_pw = []
    nw = ["decrease"]
    pw = ["increase"]
    # Expand each seed word with the lemma names of all its WordNet synsets.
    for i in range(len(nw)):
        neg_words = list(
            reduce((lambda y, x: np.append(y, x.lemma_names())),
                   wordnet.synsets(nw[i]), []))
        pos_words = list(
            reduce((lambda y, x: np.append(y, x.lemma_names())),
                   wordnet.synsets(pw[i]), []))
        all_nw.extend(neg_words)
        all_pw.extend(pos_words)
    # Add inflected forms of every word (pattern.en lexeme()).
    try:
        all_nw = list(
            reduce((lambda y, x: np.append(y, lexeme(x))), all_nw, []))
        all_pw = list(
            reduce((lambda y, x: np.append(y, lexeme(x))), all_pw, []))
    except Exception:
        print("Error. Continue.")
        # BUG FIX: the original retried via unbounded recursion; a persistent
        # failure therefore recursed until RecursionError. Retry at most
        # `_retries` times (pattern.en is known to fail only on its first
        # call), then fall back to the caller-supplied default.
        if _retries > 0:
            return infer_direction(sen, default, _retries - 1)
        return default
    # Remove duplicates.
    all_nw = [x for x in iter(set(all_nw))]
    all_pw = [x for x in iter(set(all_pw))]
    neg = 0
    pos = 0
    # Count negative-direction words present in the sentence.
    # (FIX: the original comments on these two loops were swapped.)
    for word in all_nw:
        if word in sen:
            neg += 1
    # Count positive-direction words present in the sentence.
    for word in all_pw:
        if word in sen:
            pos += 1
    if (pos > neg):
        return "Significantly increased"
    elif (neg > pos):
        return "Significantly decreased"
    else:
        return default
def getSynonyms(word, part):
    """Return synonyms of *word* inflected to match the Penn tag *part*.

    Verb tags (VB*) are looked up by lemma and re-inflected via lexeme();
    NN/NNS are singularized/pluralized; anything else passes through.
    Falls back to [word] when the thesaurus has no entry.
    """
    synonyms = []
    # Verbs are looked up in base form; other words as-is.
    wordToTry = lemma(word) if part[0] == 'V' else word
    synList = dictionary.synonym(wordToTry)
    if synList is None:
        return [word]
    for syn in synList:
        if " " not in syn:  # skip multi-word synonyms
            if part == "VB" or part == "VBP":
                synonyms.append(lemma(syn))
            elif part in ("VBD", "VBG", "VBN", "VBZ"):
                # PERF: hoisted — the original called lexeme(syn) twice per
                # branch (once in the guard, once for the index).
                lex = lexeme(syn)
                if part == "VBD" and len(lex) > 3:
                    synonyms.append(lex[3])      # past tense
                elif part == "VBG" and len(lex) > 0:
                    synonyms.append(lex[0])      # base form for gerund slot
                elif part == "VBN" and len(lex) > 3:
                    synonyms.append(lex[-1])     # past participle
                elif part == "VBZ" and len(lex) > 1:
                    synonyms.append(lex[1])      # 3rd person singular
                else:
                    # Too few forms to inflect: keep the synonym unchanged,
                    # matching the original's fall-through to the final else.
                    synonyms.append(syn)
            elif part == "NN" and syn[-2:] != "ss":
                synonyms.append(singularize(syn))
            elif part == "NNS":
                synonyms.append(pluralize(syn))
            else:
                synonyms.append(syn)
    return list(set(synonyms))
def fix_caption(str):
    # Rewrite a generated caption: drop a leading article, conjugate two-word
    # verb phrases to 3rd-person singular, and strip a trailing period.
    # NOTE(review): the parameter shadows the builtin `str`.
    s = parsetree(str, lemmata=True)
    string = ''
    for sentence in s:
        if "and a" in str:
            # Compound captions are passed through unchanged.
            string = str+' '
        else:
            for i, chunk in enumerate(sentence.chunks):
                if chunk.type == 'VP' and len(chunk) == 2:
                    # Two-word verb phrase: replace with lexeme()[1]
                    # (3rd-person-singular form of the second word).
                    verb = chunk[1].string
                    string += lexeme(verb)[1]+' '
                else:
                    for j, w in enumerate(chunk.words):
                        # Skip a leading "a"/"A" article at sentence start.
                        if i == 0 and j == 0 and (w.string == 'a' or w.string == 'A'):
                            print('chuk', chunk)
                            pass
                        else:
                            string = string + w.string+' '
    # Capitalize first letter; [1:-1] drops the trailing space.
    string = string[:1].upper() + string[1:-1]
    if string.startswith('A'):
        # NOTE(review): assumes the remainder is "A x..."; index 2 skips
        # "A " and re-capitalizes — breaks on strings shorter than 3 chars.
        string = string[2].upper() + string[3:]
    if string.endswith('.'):
        string = string[:-1]
    return string
def perturb(self, word, tag): res = "" # pertube verb if 'V' in tag: vs = pe.lexeme(word) res = choice(vs) while (res == word or len(res) > len(word)) and (vs[0] != word): res = choice(vs) if vs[0] == word: res = vs[1] #pertube plural/singlar noun if 'NNS' == tag: res = pe.singularize(word) if res == word: res = word[:-1] if len(res) > 0: return (res, word, (0, len(res))) else: #if the perturbed result is empty, we just randomly remove some chars in the word removeLen = randint(1, min(len(word) - 1, 3)) lenw = len(word) removestart = lenw - removeLen return (word[:removestart] + word[removestart + removeLen:], word, (0, lenw - removeLen))
    def procesar_ejercicio_verbos(self, texto):
        # Build a verb-conjugation exercise from *texto*: every detected verb
        # is replaced by a numbered blank, and each blank gets an
        # ItemEjercicioVerbos carrying the answer, the candidate conjugations
        # (from pattern.en lexeme(), filtered), its global index, its absolute
        # token position, and its tense.
        # Returns {'texto': exercise text, 'items': [ItemEjercicioVerbos, ...]}.
        parrafos = texto.split('\n')       # paragraphs
        posicion_inicial = 0               # running token offset across paragraphs
        cant_verbos = 0                    # running verb count (global numbering)
        texto_ejercicio = []
        items_ejercicio = []
        for parrafo in parrafos:
            tokens = nltk.word_tokenize(parrafo)
            lista_verbos = vb.obtener_verbos(parrafo)  # verbs found in this paragraph
            for idx, verbo in enumerate(lista_verbos):
                # All conjugations of the verb, then filtered to plausible ones.
                conjugaciones = lexeme(verbo['token'])
                conjugaciones = vb.filtrar_conjugaciones(verbo, conjugaciones)
                tiempo_verbal = vb.obtener_tiempo(verbo['pos_tag'])  # tense from POS tag
                item = ItemEjercicioVerbos(
                    verbo['token'], conjugaciones, str(idx + cant_verbos),
                    verbo['posicion'] + posicion_inicial, tiempo_verbal)
                items_ejercicio.append(item)
            # Replace the verbs with blanks numbered from cant_verbos onward.
            texto_ejercicio.append(
                orac.sustituir_verbos(tokens, lista_verbos, cant_verbos))
            posicion_inicial = posicion_inicial + len(tokens)
            cant_verbos = cant_verbos + len(lista_verbos)
        ejercicio = {
            'texto': '\n'.join(texto_ejercicio),
            'items': items_ejercicio
        }
        return ejercicio
def process(s):
    """POS-tag *s* and replace every verb token with the joined list of its
    alternatives: auxiliaries expand to the whole Aux list, other verbs to
    their lexeme() conjugations. Non-verbs pass through unchanged."""
    tagged = nltk.pos_tag(nltk.word_tokenize(s))
    out = []
    for token, pos in tagged:
        if pos not in Verbs:
            out.append(token)
        elif token in Aux:
            # Auxiliary verbs share one fixed candidate list.
            out.append(joinlist(Aux))
        else:
            # Regular verbs expand to all of their conjugations.
            out.append(joinlist(lexeme(token)))
    return " ".join(out)
def conjugateVerbs(sentence, tense):
    """Use parse trees to identify the verbs in a phrase. Assume the first
    word in the phrase is guaranteed to be a verb. Return the phrase with
    each verb converted to the desired tense.

    Raises KeyError for any tense other than "infinitive"/"continuous".
    """
    if not sentence:
        return None
    """pattern-en's conjugation() often does not work, but lexeme() generates conjugations in a predictable order"""
    lexeme_indicies = {"infinitive": 0, "continuous": 2}
    t = lexeme_indicies[tense.lower()]
    words = en.parsetree(sentence)[0]
    # Force-convert the first word (assumed verb) even if the tagger missed it.
    words[0].string = en.lexeme(words[0].string)[t]
    for word in words:
        # NOTE(review): this loop revisits word 0, so the already-converted
        # first word is conjugated a second time when tagged V* — harmless
        # only if lexeme() is idempotent at index t; confirm.
        if word.type[0] == "V":
            word.string = en.lexeme(word.string)[t]
    return words.string
def getArticle(article):
    """Fetch sentences related to *article* = (year, title).

    Runs Stanford Open IE tagging on the title, extracts a
    (subject, relation, object) triple — preferring the last tag, falling
    back to the first — pulls the subject's wikipedia article, and keeps
    the sentences mentioning any relation inflection or object token.
    Returns {'title', 'sentences', 'year'} or None on any failure
    (deliberate best-effort, as in the original).
    """
    try:
        #chunks = gc.getChunks(article)
        tags = tag.getTags(article[1])
        try:
            # Prefer the last extracted triple...
            subject, relations, objects = _openie_tag_to_query(tags[-1])
            content = wp.getArticle(subject)
        except:
            # ...fall back to the first one if that fails.
            # (Original duplicated this whole block verbatim; now shared
            # via _openie_tag_to_query.)
            subject, relations, objects = _openie_tag_to_query(tags[0])
            content = wp.getArticle(subject)
        rawSentences = nltk.tokenize.sent_tokenize(content)
        sentences = []
        # Keep every sentence that mentions an object token or a relation form.
        for sentence in rawSentences:
            for word in objects:
                if word in sentence:
                    sentences.append(sentence)
            for word in relations:
                if word in sentence:
                    sentences.append(sentence)
        sentences = list(set(sentences))  # dedupe (order not preserved)
        return {'title': article[1], 'sentences': sentences, 'year': article[0]}
    except:
        return


def _openie_tag_to_query(tag_entry):
    """Turn one Open IE tag dict into (subject, relation lexemes, object tokens)."""
    subject = tag_entry['subject']
    relation = tag_entry['relation']
    objects = tag_entry['object'].split(' ')
    # Lemmatize the relation, then expand to all its conjugations.
    relations = en.lexeme(
        nltk.stem.wordnet.WordNetLemmatizer().lemmatize(relation))
    return subject, relations, objects
def check_pos(pos_tag, word):
    """Register inflected variants of *word* according to its POS tag."""
    if pos_tag == 'NN':
        # Singular noun: record its plural.
        add(pluralize(word), word, False, "plural")
    elif pos_tag == 'VB':
        # Base-form verb: record every conjugation.
        for form in lexeme(word):
            add(form, word, False, "conjugation")
    elif pos_tag == 'JJ':
        # Adjective: record comparative and superlative forms.
        add(comparative(word), word, False, "comparative")
        add(superlative(word), word, False, "superlative")
def regex_or_list_maker(verb_list): """makes a regex from the list of words passed to it""" # add alternative spellings from dictionaries.word_transforms import usa_convert from pattern.en import lexeme uk_convert = {v: k for k, v in usa_convert.items()} to_add_to_verb_list = [] for w in verb_list: if w in usa_convert.keys(): to_add_to_verb_list.append(usa_convert[w]) for w in verb_list: if w in uk_convert.keys(): to_add_to_verb_list.append(uk_convert[w]) verb_list = sorted(list(set(verb_list + to_add_to_verb_list))) verbforms = [] for w in verb_list: forms = [form.replace("n't", "").replace(" not", "") for form in lexeme(w)] for f in forms: verbforms.append(f) # deal with contractions if w == 'be': be_conts = [r"'m", r"'re", r"'s"] for cont in be_conts: verbforms.append(cont) if w == "have": have_conts = [r"'d", r"'s", r"'ve"] for cont in have_conts: verbforms.append(cont) to_add = [] for w in verbforms: if w in usa_convert.keys(): to_add.append(usa_convert[w]) for w in verbforms: if w in uk_convert.keys(): to_add.append(uk_convert[w]) verbforms = sorted(list(set(verbforms + to_add))) t = [] # ensure unicode for w in verbforms: if type(w) != unicode: t.append(unicode(w, 'utf-8', errors = 'ignore')) else: t.append(w) verbforms = t if not regex: return verbforms else: return r'(?i)\b(' + r'|'.join(verbforms) + r')\b'
def mangle_agreement(correct_sentence):
    """Given a correct sentence, return a sentence or sentences with a
    subject-verb agreement error.

    Method: for every verb (VB* tag) in the sentence, substitute each of its
    other conjugations (pattern.en lexeme()) and emit the resulting sentence.
    Substitutions that keep the same tense, or are merely the negated form
    of the original, are skipped. This assumes any single-verb form change
    makes the sentence wrong — not always true (e.g. "Jack and Jill went up
    the hill" also accepts "go", and multi-verb sentences constrain each
    other), so a downstream model should treat these as noisy labels.
    """
    bad_sents = []
    doc = nlp(correct_sentence)  # spaCy pipeline from enclosing module
    verbs = [(i, v) for (i, v) in enumerate(doc) if v.tag_.startswith('VB')]
    for i, v in verbs:
        for alt_verb in lexeme(doc[i].text):
            if alt_verb == doc[i].text:
                continue  # same as the original, skip it
            if (tenses(alt_verb) == tenses(v.text) or
                    (alt_verb.startswith(v.text) and alt_verb.endswith("n't"))):
                continue  # same tense, or negated version of the original
            # Rebuild the sentence with the substituted verb.
            new_sent = str(doc[:i]) + " {} ".format(alt_verb) + str(
                doc[i + 1:])
            new_sent = new_sent.replace(' ,', ',')  # fix space before comma
            bad_sents.append(new_sent)
    return bad_sents
def inflected_forms(self, syn, desc): try: word, pos, _ = desc if pos == 'Verb': from pattern.en import lexeme return [w for w in reversed(lexeme(word)) if w != word] elif pos == 'Noun': from pattern.en import pluralize return [pluralize(word)] elif pos == 'Adjective': from pattern.en import comparative, superlative return [comparative(word), superlative(word)] else: return [] except ImportError: raise MessageException('General', 'unavailable', 'WordData[_, "InflectedForms"]', 'pattern')
def fix_caption(str):
    """Rebuild a caption: skip the first word of the first chunk (usually an
    article), conjugate two-word verb phrases via lexeme()[1], and return
    the result capitalized without the trailing space."""
    tree = parsetree(str, lemmata=True)
    out = ''
    for sentence in tree:
        for chunk_idx, chunk in enumerate(sentence.chunks):
            if chunk.type == 'VP' and len(chunk) == 2:
                print('@!!')
                # Two-word verb phrase: use the 3rd-person-singular form.
                out += lexeme(chunk[1].string)[1] + ' '
                continue
            for word_idx, w in enumerate(chunk.words):
                if chunk_idx == 0 and word_idx == 0:
                    continue  # drop the leading word of the sentence
                out = out + w.string + ' '
    # Capitalize; [1:-1] also strips the trailing space.
    return out[:1].upper() + out[1:-1]
def get_ngram(words, word_pos):
    # Extract n-grams (n = 2..5) from the word segment, keeping only those
    # containing a content word; verbs additionally fan out into one n-gram
    # per possible conjugation.
    ngrams = []
    for n in range(2,6):
        for i in range(len(words)-n+1):
            for w in words[i:i+n]:
                if word_pos[w] in content_Words:  # only keep n-grams with a content word
                    if word_pos[w] == 'V':
                        tense = lexeme(w)  # list of possible verb forms: be => is, was, been...
                        for verb in tense:
                            # Fresh copy per variant so earlier variants
                            # aren't overwritten.
                            Slice = words[i:i+n]
                            Slice[Slice.index(w)] = verb
                            ngrams.append(Slice)
                    else:
                        ngrams.append(words[i:i+n])
                    # NOTE(review): `continue` here is a no-op (last statement
                    # of the loop body); one n-gram is emitted per content
                    # word it contains, so duplicates are possible.
                    continue
    #print ngrams
    return ngrams
def tenseChecker(sentence, ind):
    #sentence is a list of words, ind = index of verb
    # Rank candidate tenses of the verb at `ind` by trigram and fourgram
    # frequency (trigramFreq lookups), printing both rankings.
    trigrams = []
    fourgrams = []
    fourgram_freq = {}
    # A fourgram window exists only when there are >= 4 words around ind.
    b = len(sentence[ind - 1:ind + 3]) >= 4
    if b:
        fourgrams = getFourgram(sentence, ind)
    trigram_freq = {}
    trigrams = getTrigrams(sentence, ind)
    for tri, i in trigrams:
        key = tri[i]
        # NOTE(review): temp aliases tri — the trigram list itself is mutated
        # below; presumably trigramFreq only reads it, confirm.
        temp = tri
        word_tenses = lexeme(key)  # all conjugations of the verb
        for tense in word_tenses:
            temp[i] = tense
            # Accumulate the frequency of each candidate tense in context.
            if tense in trigram_freq.keys():
                trigram_freq[tense] += trigramFreq(temp)
            else:
                trigram_freq[tense] = trigramFreq(temp)
    sorted_tenses = sorted(trigram_freq.items(),
                           key=lambda x: x[1],
                           reverse=True)
    # NOTE(review): two latent bugs here — `word_tenses` leaks from the
    # trigram loop (NameError when `trigrams` is empty), and `fourgrams[0]`
    # raises IndexError whenever b was False and fourgrams stayed [].
    for tense in word_tenses:
        temp = fourgrams[0]
        temp[fourgrams[1]] = tense
        if tense in fourgram_freq.keys():
            fourgram_freq[tense] += trigramFreq(temp)
        else:
            fourgram_freq[tense] = trigramFreq(temp)
    sorted_tenses_4 = sorted(fourgram_freq.items(),
                             key=lambda x: x[1],
                             reverse=True)
    print(sorted_tenses)
    print(sorted_tenses_4)
def keywordChecker(book, genres):
    # Score a book description against genre keyword lists.
    # genres: {genre: [(keyword, points), ...]}. Returns {genre: [points...,
    # aggregate]} where the aggregate (count * mean points) is appended last.
    # NOTE(review): keywords are fed to re.finditer unescaped — regex
    # metacharacters in a keyword would change matching.
    genre_scores = {}
    for each in genres.keys():
        genre_scores[each] = []
    for genre, words in genres.items():
        found = []
        #words is list of (keyword, points) pairs for the genre
        for wd in words:
            word = wd[0]
            #wd[1] is the point value of that word for a genre
            for a in re.finditer(word, book['description']):
                genre_scores[genre].append(wd[1])
                if a.group(0) not in found:
                    found.append(a.group(0))  # don't record duplicates
            # A root word can surface in other forms (e.g. 'fought' for
            # 'fight'). lexeme(word) lists those forms; direct substring
            # search above already covers forms that *start with* the root
            # ('fight', 'fights', 'fighting'), so only search forms that do
            # not contain the root itself.
            allWords = lexeme(word)
            for each in allWords:
                if word not in each:  # skip forms already covered above
                    for b in re.finditer(each, book['description']):
                        genre_scores[genre].append(wd[1])
                        if b.group(0) not in found:
                            found.append(b.group(0))
    # Append the aggregate score (match count * integer mean) per genre.
    for k,v in genre_scores.items():
        if len(v) != 0:
            genre_scores[k].append(len(v) * int(sum(v)/len(v)))
    return genre_scores
def get_synomyms_token(token):
    """Collect candidate variants of *token*: same-stem words, lemmas,
    WordNet synonym lemmas, and verb conjugations. Returns a deduplicated
    numpy array of strings (order unspecified)."""
    candidates = [token]
    # Words sharing the token's stem.
    stem = stemmer.stem(token)
    if stem in stem2words:
        candidates.extend(stem2words[stem])
    # Lemmas under verb, adjective, and default POS readings.
    lemmas = {lemmatizer.lemmatize(token, 'v'),
              lemmatizer.lemmatize(token, pos="a"),
              lemmatizer.lemmatize(token)}
    candidates.extend(list(lemmas))
    # WordNet synonym lemma names.
    for synset in wordnet.synsets(token):
        for lem in synset.lemmas():
            candidates.append(lem.name())
    # Verb conjugations (pattern.en).
    candidates.extend(lexeme(token))
    synonyms = np.array([elm for elm in set(candidates)])
    return synonyms
    def get_questions(self):
        # Turn the parsed sentence (NP subject + VP) into a question:
        # Who/When/Where for recognizable subjects, otherwise invert with
        # "be" or the matching auxiliary.
        z = self.getText()
        (subj,vp) = (z['NP'][0], z['VP'][0])
        from pattern.en import lexeme, lemma, tenses
        import nltk, re
        tagged = nltk.pos_tag(nltk.word_tokenize(subj + " " + vp))
        verb = ""
        sense = supersense(subj)  # supersense tag of the subject
        # Person subject (or pronoun) -> "Who ...?"
        if(sense[0][2][-6:] == 'person' or sense[0][1] == 'PRP'):
            return ("Who " + vp + "?")
        # Time expression or a 4-digit year -> "When ...?"
        elif(sense[0][2][-4:] == 'time' or re.match("[1|2]\d\d\d", subj)):
            return ("When " + vp + "?")
        # Location with a locative preposition -> "Where ...?"
        # NOTE(review): `.lower` is missing parentheses — a bound method is
        # compared against the string list, so this clause is always False.
        elif(sense[0][2][-8:] == 'location' and ('PP' in z and z['PP'].split()[0].lower in ["on", "in", "at", "over", "to"])):
            return ("Where " + vp + "?")
        aux = ["Will","Shall","May","Might","Can","Could","Must","Should","Would","Do","Does","Did"]
        # Iterating in reverse and overwriting leaves `verb` = the FIRST
        # V-tagged token of the sentence.
        for i in reversed(tagged):
            if(i[1][0] == 'V'):
                verb = i[0]
        # Forms of "be" invert directly: "Is the dog ...?"
        if((u'' + verb) in lexeme("is")):
            return (verb.capitalize() + " " + subj.lower() + vp[len(verb):] + "?")
        else:
            # Otherwise pick the auxiliary whose tense matches the verb's.
            for x in aux:
                if(tenses(x)[0] == tenses(verb)[0]):
                    return (x + " " + subj.lower() + " " + lemma(verb) + vp[len(verb):] + "?")
def make_morph_set(lemma, pos):
    """Return the set of surface forms of *lemma* for WordNet POS code
    'n' (noun), 'v' (verb) or 'a' (adjective); any other POS yields just
    the lemma itself."""
    if pos == 'n':
        # Noun: lemma plus its plural.
        return set([lemma, en.pluralize(lemma)])
    if pos == 'v':
        # Verb: every conjugation, plus the lemma itself.
        forms = set(en.lexeme(lemma))
        forms.add(lemma)
        return forms
    if pos == 'a':
        # Adjective: add one-word comparative/superlative only
        # (periphrastic "more X"/"most X" forms are excluded).
        forms = set([lemma])
        comp = en.comparative(lemma)
        if comp and not comp.startswith('more '):
            forms.add(comp)
        sup = en.superlative(lemma)
        if sup and not sup.startswith('most '):
            forms.add(sup)
        return forms
    return set([lemma])
def check_gf1(gt_sent, recon_sent):
    """Score a reconstruction against the ground-truth sentence.

    The verb is assumed at gt_sent[-3] and the object at gt_sent[-1].
    Awards 0.5 for reproducing the verb in any conjugation (lexeme()) and
    0.5 for the object in singular or plural form. Returns 0.0-1.0.
    """
    verb_forms = lexeme(gt_sent[-3])  # every conjugation of the GT verb
    obj_forms = [singularize(gt_sent[-1]), pluralize(gt_sent[-1])]
    score = 0.0
    # Reward recovering the verb.
    if any(tok in verb_forms for tok in recon_sent):
        score += 0.5
    # Reward recovering the object.
    if any(tok in obj_forms for tok in recon_sent):
        score += 0.5
    return score
print # COMPARATIVE & SUPERLATIVE ADJECTIVES # ------------------------------------ # The comparative() and superlative() commands give the comparative/superlative form of an adjective. # Words with three or more syllables are simply preceded by "more" or "most". for word in ["gentle", "big", "pretty", "hurt", "important", "bad"]: print word, "=>", comparative(word), "=>", superlative(word) print print # VERB CONJUGATION # ---------------- # The lexeme() command returns a list of all possible verb inflections. # The lemma() command returns the base form (infinitive) of a verb. print "lexeme:", lexeme("be") print "lemma:", lemma("was") # The conjugate() command inflects a verb to another tense. # The tense can be given as a constant, e.g. # INFINITIVE, PRESENT_1ST_PERSON_SINGULAR PRESENT_PLURAL, PAST_PARTICIPLE, ... # or as an abbreviated alias: inf, 1sg, 2sg, 3sg, pl, part, 1sgp, 2sgp, 3sgp, ppl, ppart. print conjugate("being", tense="1sg", negated=False) # Prefer the full constants for code that will be reused/shared. # The tenses() command returns a list of all tenses for the given verb form. # For example: tenses("are") => ['present 2nd person singular', 'present plural'] # You can then check if a tense constant is in the list. # This will also work with aliases, even though they are not explicitly in the list. from pattern.en import PRESENT_PLURAL
# Count how many keywords (or any of their inflections / WordNet-related
# forms) appear in a tokenized answer.
import io
from google.cloud import vision
import re
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import wordnet as wn
from pattern.en import conjugate, lemma, lexeme, PRESENT, SG
import math

answer = ['i', 'am', 'eat', 'eating', 'in', 'a', 'mine', 'mined']
keywords = ['eat', 'mine', 'shine']
extended_keywords = []
keywords_matched = 0
# Expand each keyword into conjugations plus WordNet lemma variants.
for word in keywords:
    temp_list = []
    temp_list.extend(lexeme(word))  # verb conjugations (pattern.en)
    for happy_lemma in wn.lemmas(word):  # for each lemma of the word in WordNet
        temp_list.append(happy_lemma.name())  # the lemma itself
        for related_lemma in happy_lemma.derivationally_related_forms():  # each related lemma
            temp_list.append(related_lemma.name())
    temp_list = list(set(temp_list))  # dedupe the expansion
    extended_keywords.append((word, temp_list))
keywords_dictionary = {key: value for (key, value) in extended_keywords}
# print(keywords_dictionary)
# A keyword counts once per expanded form found in the answer.
for (keyword, keyword_list) in keywords_dictionary.items():
    for word in keyword_list:
        if word in answer:
            keywords_matched = keywords_matched + 1
def verb_vocab(tcm = None, postagger = None, min_length=0):
    """
    Return all verbs found in wordnet in various inflected forms.

    Yields a set of (form, POS tag, class) triples, where class is either a
    tcm-predicted class or the synset name. Forms that WordNet's lemmatizer
    cannot map back to the lemma are discarded as likely inflection errors.
    """
    if not postagger:
        postagger = BackoffTagger.from_pickle()
    getpostag = lambda word : postagger.tag([word])[0][1]
    # Most of the time lexeme() returns 4 or 5 words, inflected as declared below.
    # To avoid assumptions on the tagset used, we query the tags using easy examples
    # (verb give). These POS tags are then bound to lexeme's results.
    infinitive_pos = getpostag("give")
    present_pos    = getpostag("gives")
    pres_prog_pos  = getpostag("giving")
    past_pos       = getpostag("gave")
    past_prog_pos  = getpostag("given")
    # Three possibilities for the tag sequence, depending on how many
    # inflections lexeme() produced for a verb.
    tenses3 = [infinitive_pos, present_pos, pres_prog_pos]
    tenses4 = tenses3 + [past_pos]
    tenses5 = tenses4 + [past_prog_pos]
    verbs = set()
    for lemma in wn.all_lemma_names(pos = 'v'):
        if len(lemma) < min_length:
            continue
        if '_' in lemma:
            continue  # skip multi-word lemmas
        forms = lexeme(lemma)  # all possible conjugations of this verb (lemma)
        if len(forms) == 3:
            forms = zip(forms, tenses3)
        elif len(forms) == 4:
            forms = zip(forms, tenses4)
        elif len(forms) == 5:
            forms = zip(forms, tenses5)
        else:
            # This step can introduce errors, as getpostag isn't
            # guaranteed to return a verb tag.
            forms = [(form, getpostag(form)) for form in forms]
        # Ignore forms that do not map back to lemma by wordnet's
        # lemmatizer, as they are likely erroneous.
        forms = list(filter(lambda form: lemma in wn._morphy(form[0], 'v'), forms))
        # Class labels: tcm predictions if a model was supplied, else synset names.
        if tcm is not None:
            classes = [classy for syn in wn.synsets(lemma, 'v') for classy in tcm.predict(syn)]
        else:
            classes = [syn.name() for syn in wn.synsets(lemma, 'v')]
        for classy in classes:
            for form, postag in forms:
                if not postag:
                    log.warning("{} has POS==None".format(form))
                    continue
                if postag[0] == 'n':
                    # dirty hack to avoid inconsistency introduced by postagger
                    continue
                verbs.add((form, postag, classy))
                if "'" in form:
                    # Also add the apostrophe-less spelling (couldn't -> couldnt).
                    verbs.add((form.replace("'", ""), postag, classy))
    return verbs
print() # COMPARATIVE & SUPERLATIVE ADJECTIVES # ------------------------------------ # The comparative() and superlative() functions give the comparative/superlative form of an adjective. # Words with three or more syllables are simply preceded by "more" or "most". for word in ["gentle", "big", "pretty", "hurt", "important", "bad"]: print(word, "=>", comparative(word), "=>", superlative(word)) print() print() # VERB CONJUGATION # ---------------- # The lexeme() function returns a list of all possible verb inflections. # The lemma() function returns the base form (infinitive) of a verb. print("lexeme:", lexeme("be")) print("lemma:", lemma("was")) print() # The conjugate() function inflects a verb to another tense. # You can supply: # - tense : INFINITIVE, PRESENT, PAST, # - person: 1, 2, 3 or None, # - number: SINGULAR, PLURAL, # - mood : INDICATIVE, IMPERATIVE, # - aspect: IMPERFECTIVE, PROGRESSIVE. # The tense can also be given as an abbreviated alias, e.g., # inf, 1sg, 2sg, 3sg, pl, part, 1sgp, 2sgp, 3sgp, ppl, ppart. from pattern.en import PRESENT, SINGULAR print(conjugate("being", tense=PRESENT, person=1, number=SINGULAR, negated=False)) print(conjugate("being", tense="1sg", negated=False))
print(singularize("our", pos=ADJECTIVE))

print("")

# COMPARATIVE & SUPERLATIVE ADJECTIVES
# ------------------------------------
# The comparative() and superlative() functions give the comparative/superlative form of an adjective.
# Words with three or more syllables are simply preceded by "more" or "most".
for word in ["gentle", "big", "pretty", "hurt", "important", "bad"]:
    print("%s => %s => %s" % (word, comparative(word), superlative(word)))

print("")

# VERB CONJUGATION
# ----------------
# The lexeme() function returns a list of all possible verb inflections.
# The lemma() function returns the base form (infinitive) of a verb.
print("lexeme: %s" % lexeme("be"))
print("lemma: %s" % lemma("was"))
print("")

# The conjugate() function inflects a verb to another tense.
# You can supply:
# - tense : INFINITIVE, PRESENT, PAST,
# - person: 1, 2, 3 or None,
# - number: SINGULAR, PLURAL,
# - mood  : INDICATIVE, IMPERATIVE,
# - aspect: IMPERFECTIVE, PROGRESSIVE.
# The tense can also be given as an abbreviated alias, e.g.,
# inf, 1sg, 2sg, 3sg, pl, part, 1sgp, 2sgp, 3sgp, ppl, ppart.
from pattern.en import PRESENT, SINGULAR
print(conjugate("being", tense=PRESENT, person=1, number=SINGULAR, negated=False))
print(conjugate("being", tense="1sg", negated=False))
from pattern.en import conjugate, lemma, lexeme, PRESENT, SG

# Print every inflection of the verb whose form is "gave".
print(lexeme('gave'))
# Indefinite articles.
print(referenced('university'))
print(referenced('hour'))

# Pluralization / singularization.
from pattern.en import pluralize, singularize
print(pluralize('child'))
print(singularize('wolves'))

# Adjective comparison.
from pattern.en import comparative, superlative
print(comparative('bad'))
print(superlative('bad'))

# Verb conjugation.
from pattern.en import conjugate, lemma, lexeme
print(lexeme('purr'))
print(lemma('purring'))
print(conjugate('purred', '3sg'))  # he / she / it

# NOTE(review): the block below duplicates the three calls above verbatim.
from pattern.en import conjugate, lemma, lexeme
print(lexeme('purr'))
print(lemma('purring'))
print(conjugate('purred', '3sg'))  # he / she / it

# German grammatical gender (pattern.de).
from pattern.de import gender, MALE, FEMALE, NEUTRAL
print(gender('Katze'))
print(gender('Mesa'))

from pattern.en import verbs, conjugate, PARTICIPLE
#Indefinite article print article('university') print article('hour') print referenced('university') print referenced('hour') #singularity print pluralize('child') print singularize('wolves') # print print lexeme('run') print lemma('running') print conjugate('purred', '3sg') print PAST in tenses('purred') # 'p' in tenses() also works. print (PAST, 1, PL) in tenses('purred') print 'Quantification' print quantify(['goose', 'goose', 'duck', 'chicken', 'chicken', 'chicken']) print quantify('carrot', amount=90) print quantify({'carrot': 100, 'parrot': 20}) print 'ngrams' print ngrams("I am eating a pizza.", n=2)
def get_marks(data, image_file): keywords_matched = 0 #maximum_marks = 5 maximum_marks = data[0] keywords = [] keywords = data[3].split(',') # keywords=['data','mine','database','characterization','knowledge','background','task','classify','associate','visualize','predict','cluster'] expected_keywords = len(keywords) #expected_no_of_words = 200 expected_no_of_words = data[1] #expected_no_of_sentences = 15 expected_no_of_sentences = data[2] # extended_keywords = [] # for word in keywords: # for syn in wn.synsets(word): # for l in syn.lemmas(): # extended_keywords.append(l.name()) forms = [ ] #We'll store the derivational forms in a set to eliminate duplicates for word in keywords: for happy_lemma in wn.lemmas(word): #for each "happy" lemma in WordNet forms.append(happy_lemma.name()) #add the lemma itself for related_lemma in happy_lemma.derivationally_related_forms( ): #for each related lemma forms.append(related_lemma.name()) #add the related lemma verb = [] for word in keywords: verb.extend(lexeme(word)) # keywords.extend(extended_keywords) keywords.extend(forms) keywords.extend(verb) keywords = [x.lower() for x in keywords] keywords = list(set(keywords)) # print(keywords) with io.open(image_file, 'rb') as image_file: content = image_file.read() image = vision.types.Image(content=content) response = client.text_detection(image=image) texts = response.text_annotations string = texts[0].description.replace( '\n', ' ').lower() #for converting to lower case string = re.sub('[^A-Za-z0-9.]+', ' ', string) #for eliminating special character print string word_list = word_tokenize(string) #for word spliting no_of_words = len(word_list) if no_of_words > expected_no_of_words: no_of_words = expected_no_of_words no_of_sentences = len(sent_tokenize(string)) if no_of_sentences > expected_no_of_sentences: no_of_sentences = expected_no_of_sentences print 'no_of_words', no_of_words print 'no_of_sentences', no_of_sentences for keyword in keywords: if (keyword in word_list): 
keywords_matched = keywords_matched + 1 if keywords_matched > expected_keywords: keywords_matched = expected_keywords print 'keywords matched', keywords_matched keywords_percentage = 0.55 * (keywords_matched / expected_keywords) word_percentage = 0.35 * (no_of_words / expected_no_of_words) sentence_percentage = 0.10 * (no_of_sentences / expected_no_of_sentences) print 'keywords_percentage', keywords_percentage print 'word_percentage', word_percentage print 'sentence_percentage', sentence_percentage total_marks = maximum_marks * (keywords_percentage + word_percentage + sentence_percentage) total_marks = round(total_marks, 1) digit = total_marks * 10 if (digit % 10 < 5): total_marks = math.floor(total_marks) if (digit % 10 > 5): total_marks = math.ceil(total_marks) print 'total_marks', total_marks return total_marks
print # COMPARATIVE & SUPERLATIVE ADJECTIVES # ------------------------------------ # The comparative() and superlative() functions give the comparative/superlative form of an adjective. # Words with three or more syllables are simply preceded by "more" or "most". for word in ["gentle", "big", "pretty", "hurt", "important", "bad"]: print word, "=>", comparative(word), "=>", superlative(word) print print # VERB CONJUGATION # ---------------- # The lexeme() function returns a list of all possible verb inflections. # The lemma() function returns the base form (infinitive) of a verb. print "lexeme:", lexeme("be") print "lemma:", lemma("was") print # The conjugate() function inflects a verb to another tense. # You can supply: # - tense : INFINITIVE, PRESENT, PAST, # - person: 1, 2, 3 or None, # - number: SINGULAR, PLURAL, # - mood : INDICATIVE, IMPERATIVE, # - aspect: IMPERFECTIVE, PROGRESSIVE. # The tense can also be given as an abbreviated alias, e.g., # inf, 1sg, 2sg, 3sg, pl, part, 1sgp, 2sgp, 3sgp, ppl, ppart. from pattern.en import PRESENT, SINGULAR print conjugate("being", tense=PRESENT,
def get_verb_tense_forms(self, verb):
    """Return every inflected form of *verb*, as produced by lexeme()."""
    # Pure delegation; no instance state is read or written.
    return lexeme(verb)
inflections.intersection_update(wordlist) print(inflections) except: pass # print("Unexpected error") return inflections keyfile = open("english_inflections.txt", "w", encoding='utf-8') try: print(lexeme('be')) except: pass print(lexeme('be')) print(lexeme('conclusion')) print(lexeme('harlot')) print(pluralize('be')) print(pluralize('conclusion')) print(pluralize('harlot')) for k in keys: # print(lexeme(k)) # print(pluralize(k))
<mbp:frameset> <mbp:slave-frame display="bottom" device="all" breadth="auto" leftmargin="0" rightmargin="0" bottommargin="0" topmargin="0"> <div align="center" bgcolor="yellow"/> <a onclick="index_search()">Dictionary Search</a> </div> </mbp:slave-frame> <mbp:pagebreak/> """) dt, dd = r.split('\t',1) if not UTFINDEX: dt = normalizeUnicode(dt,'cp1252') dd = normalizeUnicode(dd,'cp1252') dtstrip = normalizeUnicode( dt ).strip() dd = dd.replace("\\\\","\\").replace("\\n","<br/>\n") forms = Set(lexeme(dt)) forms.add(pluralize(dt)) toremove = Set() for w in forms: if w not in wordlist: # print("Remove %s" % w) toremove.add(w) removed += len(toremove) forms.difference_update(toremove) inflections = '' if len(forms): inflections = '\t\t\t<idx:infl>\n' for f in forms:
<mbp:frameset> <mbp:slave-frame display="bottom" device="all" breadth="auto" leftmargin="0" rightmargin="0" bottommargin="0" topmargin="0"> <div align="center" bgcolor="yellow"/> <a onclick="index_search()">Dictionary Search</a> </div> </mbp:slave-frame> <mbp:pagebreak/> """) dt, dd = r.split('\t', 1) if not UTFINDEX: dt = normalizeUnicode(dt, 'cp1252') dd = normalizeUnicode(dd, 'cp1252') dtstrip = normalizeUnicode(dt).strip() dd = dd.replace("\\\\", "\\").replace("\\n", "<br/>\n") forms = Set(lexeme(dt)) forms.add(pluralize(dt)) toremove = Set() for w in forms: if w not in wordlist: # print("Remove %s" % w) toremove.add(w) removed += len(toremove) forms.difference_update(toremove) inflections = '' if len(forms): inflections = '\t\t\t<idx:infl>\n' for f in forms:
if options.m2_raw == '' or options.pos == '': sys.stderr.write("Usage: python make_verbforms.py -l [gec lemma file] -f [gec raw file] -p [pos file] > [output file]\n") exit(1) else: pass pos_lines = codecs.open(options.pos, 'r', 'utf8').readlines() sent_lines = codecs.open(options.m2_raw, 'r', 'utf8').readlines() lemma_lines = codecs.open(options.lemma_file, 'r', 'utf8').readlines() verbforms = {} for lemma_line, pos_line, sent_line in zip(lemma_lines, pos_lines, sent_lines): for lem, pos, word in zip(lemma_line.strip().split(),pos_line.strip().split(), sent_line.strip().split()): word = word.lower() word_caps_all = word.upper() word_caps = word.capitalize() if word in verbforms: continue if pos.startswith('VB'): vforms = [] lem = lem.lower() vforms = lexeme(lem) vforms = get_conjugations(lem) #vforms = [c for c in vforms if (("n't" not in c) and (len(c.split())==1) and d.check(c))] for v_aspect in vforms: v,aspect = v_aspect.split('|||') verbforms[v] = vforms else: pass # not a noun or noun_pl for k, v in sorted(verbforms.iteritems()): print k, ' '.join(v)