Example #1
def assembleEntry(y):
    glosses = []
    examples = []
    etymologies = []
    quotations = []
    pronunciations = []
    pronunciation_entries = set()
    partsOfSpeech = []
    partsOfSpeechHeads = []
    etymology_entries = set()
    synonyms = []
    word_forms = []

    # Preprocessing
    for entry in y.get('entries', []):
        # Parts of speech
        psos = entry.get('partsOfSpeech') or []
        try:
            psos = [x.replace('proper_noun', 'proper noun') for x in psos]
        except Exception:
            print(repr(psos))
            print(y['title'])
            raise
        if psos:
            partsOfSpeech.append(u"<B>" + u", ".join(psos) + u"</B>")
            partsOfSpeechHeads.append(psos[0])
        else:
            partsOfSpeech.append("")
            partsOfSpeechHeads.append("")

        # Word forms
        elems = []
        for wf in entry.get('wordForms') or []:
            form = wf.get('form')
            if form:
                elems.append(form)
        word_forms.append(elems)

        # Synonyms
        synonyms.append(clean_synonyms(entry.get('synonyms', [])))

        # Pronunciations
        elems = []
        elem = ""
        # print(entry.get('pronunciations', []))
        for pronunciation in entry.get('pronunciations', []):
            text = pronunciation.get('text')
            if text:
                if text not in pronunciation_entries:
                    pronunciation_entries.add(text)
                    elem += text
                    note = pronunciation.get('note')
                    if note:
                        elem += " (" + note + ")"
                    elems.append(elem)
                    elem = ""
        pronunciations.append(", ".join(elems))
        # print(repr(pronunciations[-1]))

        # Senses
        gloss_entry = []
        example_entry = []
        quote_entry = []
        for sense in entry.get('senses') or []:
            gloss_entry.append(stripHtml(sense.get('gloss', '')))
            example_entry.append([replace_newlines(stripHtml(example.get('example', ''))) for example in sense.get('examples', [])])
            quote_entry.append([replace_newlines(stripHtml(quote.get('quote', ''))) for quote in sense.get('quotations', [])])
        glosses.append(gloss_entry)
        examples.append(example_entry)
        quotations.append(quote_entry)

        etymology_text = stripHtml(entry.get('etymology', ''))
        if etymology_text not in etymology_entries:
            etymology_entries.add(etymology_text)
            etymologies.append(etymology_text)
        else:
            etymologies.append('')

    # Assemble string

    # Title
    s = u""
    # s += y['title'] + "\t"

    # Pronunciations
    entry_pronuncs = False
    # pronunciations_filtered = [text for entry in pronunciations for text in entry]
    pronunciations_filtered = list(filter(None, pronunciations))
    if len(pronunciations_filtered) == 1:
        s += u" " + pronunciations_filtered[0] + "<BR>"
    else:
        entry_pronuncs = True

    # Entries & glosses
    single_entry = len(glosses) == 1
    for (entry_num, entry_glosses) in enumerate(glosses, 1):
        if entry_num >= 2:
            s += "<BR>"
        if not single_entry:
            s +=u"{0}. ".format(roman.int_to_roman(entry_num))
        if entry_pronuncs:
            s += prep_string(pronunciations[entry_num - 1])
        s += partsOfSpeech[entry_num - 1]

        # Handle word forms
        pos = partsOfSpeechHeads[entry_num - 1]
        word = y['title']
        if pos == "verb":
            p = en.conjugate(word, 'p')
            pp = en.conjugate(word, 'ppart')
            if p != word + 'ed' or pp != word + 'ed':
                s += u" (p. " + p + u", pp. " + pp + u")"
        elif pos == "noun":
            pl = en.pluralize(word)
            if pl != word + u's':
                s += u" (pl. " + pl + ")"
        elif pos == "adjective":
            pass

        # Glosses
        single_gloss = len(entry_glosses) == 1
        for (gloss_num, gloss) in enumerate(entry_glosses, 1):
            if not single_gloss:
                s += u" {0:d}.".format(gloss_num)
            # else:
            #     s += u":"
            s += u" {0}".format(gloss)
        s += prep_string(", ".join(synonyms[entry_num - 1]) + u"." if synonyms[entry_num - 1] else "", " Synonyms: ")
        # s += prep_string(etymologies[entry_num - 1], u" Etymology: ")

    # Etymologies
    etymologies_filtered = [etym for etym in etymologies if etym]
    if etymologies_filtered:
        s += '<BR><BR><B>Etymology:</B>'
        if len(etymologies_filtered) == 1:
            s += etymologies_filtered[0]
        else:
            for i in range(0, len(glosses)):
                if etymologies[i]:
                    s += u" {0}. {1}".format(roman.int_to_roman(i + 1), etymologies[i])

    # Examples and Quotes
    examples_flat = [example for entry in examples for sense_examples in entry for example in sense_examples if example]
    if examples_flat:
        s += u"<BR><BR><B>Examples:</B>"
        for (num_example, example) in enumerate(examples_flat, 1):
            if len(examples_flat) == 1:
                s += " " + example
            else:
                s += u" {0:d}. {1}".format(num_example, example)

    quotes_flat = [quote for entry in quotations for sense_quotes in entry for quote in sense_quotes if quote]
    if quotes_flat:
        s += u"<BR><BR><B>Quotations:</B>"
        for (num_quote, quote) in enumerate(quotes_flat, 1):
            if len(quotes_flat) == 1:
                s += u" " + quote
            else:
                s += u" {0:d}. {1}".format(num_quote, quote)

    s = escape_characters(s)

    word_forms_flat = [form for entry in word_forms for form in entry if form]
    titles = [y['title']]
    titles.extend(word_forms_flat)
    if 'verb' in partsOfSpeechHeads:
        titles.extend(en.lexeme(y['title']))
    if 'noun' in partsOfSpeechHeads:
        titles.append(en.pluralize(y['title']))
    if 'adjective' in partsOfSpeechHeads:
        adj_forms = [en.comparative(y['title']), en.superlative(y['title'])]
        adj_forms = [form for form in adj_forms if len(form.split(' ')) == 1]
        titles.extend(adj_forms)
    titles = unique(titles)

    if s.strip() == "":
        s = "Empty article."
    s = u'|'.join(titles) + u"\n" + s.strip()

    # return escape_characters(contract_tabs(s))
    return s
Example #2
def getInflections(key):

    inflections = set()
    # print('"%s"' % key)

    if key.isalpha():

        try:
            # pattern.en's first call can raise on some Python versions
            # (a generator warm-up quirk); call once and ignore any error
            try:
                lexeme(key)
            except Exception:
                pass

            inflections.update(lexeme(key))  # all lexeme inflections of the word
            inflections.add(pluralize(key))  # add the plural form

            inflections.intersection_update(wordlist)

            print(inflections)

        except:
            pass
            # print("Unexpected error")

    return inflections
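
# Hypothetical usage (the result is filtered by the external `wordlist`):
#   getInflections("run")  ->  e.g. {"run", "runs", "running", "ran"}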
Example #3
def testBasic():
    from pattern.en import referenced
    print referenced('hour')
    
    from pattern.en import conjugate, lemma, lexeme
    print lexeme('purr')
    print lemma('purring')
    print conjugate('purred', '3sg') # he / she / it
Example #5

def tenseChecker(sentence, ind):  # sentence is a list of words, ind = index of verb
	global threads
	trigrams=[]
	fourgrams=[]
	fourgrams=getFourgrams(sentence,ind)
	# print("OOOOKKK")
	# print(fourgrams)
	if fourgrams:
		# print("FOUR")
		for four,i in fourgrams:
			key = four[i]
			temp = four
		# print(tri)
			word_tenses=lexeme(key)
		# print(word_tenses)
			for tense in word_tenses:
				temp[i] = tense
				# pass a copy: temp is mutated on the next iteration,
				# and the worker thread may not have read it yet
				t = threading.Thread(target=threadOutput, args=(list(temp), tense, False))
				# t.setDaemon(True)
				t.start()
				threads.append(t)

		for t in threads:
			t.join()

		sorted_tenses_4 = sorted(fourgram_freq.items(), key = lambda x:x[1], reverse=True)
		# print(sorted_tenses_4)
		return sorted_tenses_4

	else:
		# trigram_freq1={}
		# print("THREE")
		trigrams=getTrigrams(sentence,ind)
	# print(trigrams)
		for tri,i in trigrams:
			key = tri[i]
			temp = tri
		# print(tri)
			word_tenses=lexeme(key)
		# print(word_tenses)
			for tense in word_tenses:
				temp[i] = tense
				# pass a copy, as above
				t = threading.Thread(target=threadOutput, args=(list(temp), tense, True))
				# t.setDaemon(True)
				t.start()
				threads.append(t)
				
		for t in threads:
			t.join()

		sorted_tenses = sorted(trigram_freq1.items(), key = lambda x:x[1], reverse=True)
		# print(sorted_tenses)
		return sorted_tenses
Example #6
 def test_lexeme(self):
     # Assert all inflections of "be".
     v = en.lexeme("be")
     self.assertEqual(v, [
         "be", "am", "are", "is", "being", "was", "were", "been", "am not",
         "aren't", "isn't", "wasn't", "weren't"
     ])
     v = en.lexeme("imaginerify")
     self.assertEqual(v, [
         "imaginerify", "imaginerifies", "imaginerifying", "imaginerified"
     ])
     print "pattern.en.inflect.lexeme()"
Example #7
def verbs_to_does(textlist):
    newlist = []
    text = join_temp_text(textlist)
    # change to incorporate pattern also?
    doc = nlp(text) 
    for idx, token in enumerate(doc):
        # print(token, token.pos_)
        if (token.pos_ == 'VERB'
                and (idx == 0
                     or (token.nbor(-1).text not in ["to", "are", "is"]
                         and token.nbor(-1).pos_ != "DET"))
                and not token.text.istitle()
                and lemma(token.text) == token.text):
            if len(lexeme(token.text)) >= 2:
                newlist.append(lexeme(token.text)[1])  # lexeme()[1] is the 3sg form
        elif token.text == "have":
            newlist.append("has")
        else:
            newlist.append(token.text)
    return newlist
Example #9
def infer_direction(sen, default):
    all_nw = []
    all_pw = []

    nw = ["decrease"]
    pw = ["increase"]

    # expand each seed word with its WordNet lemma names
    for i in range(len(nw)):
        neg_words = list(
            reduce((lambda y, x: np.append(y, x.lemma_names())),
                   wordnet.synsets(nw[i]), []))
        pos_words = list(
            reduce((lambda y, x: np.append(y, x.lemma_names())),
                   wordnet.synsets(pw[i]), []))
        all_nw.extend(neg_words)
        all_pw.extend(pos_words)

    # add in different forms of the word based on English rules
    try:
        all_nw = list(
            reduce((lambda y, x: np.append(y, lexeme(x))), all_nw, []))
        all_pw = list(
            reduce((lambda y, x: np.append(y, lexeme(x))), all_pw, []))
    except Exception:
        # pattern.en's first call can raise on some Python versions;
        # retrying once is usually enough
        print("Error. Continue.")
        return infer_direction(sen, default)

    # remove duplicates
    all_nw = list(set(all_nw))
    all_pw = list(set(all_pw))

    neg = 0
    pos = 0

    # count negative words present in the sentence
    for word in all_nw:
        if word in sen:
            neg += 1

    # count positive words present in the sentence
    for word in all_pw:
        if word in sen:
            pos += 1

    if (pos > neg):
        return "Significantly increased"
    elif (neg > pos):
        return "Significantly decreased"
    else:
        return default
Example #10

def getSynonyms(word, part):
    synonyms = []
    wordToTry = lemma(word) if part[0] == 'V' else word
    synList = dictionary.synonym(wordToTry)
    if synList is None:
        return [word]
    for syn in synList:
        if " " not in syn:
            if part == "VB" or part == "VBP":
                synonyms.append(lemma(syn))
            elif part == "VBD" and len(lexeme(syn)) > 3:
                synonyms.append(lexeme(syn)[3])
            elif part == "VBG" and len(lexeme(syn)) > 0:
                synonyms.append(lexeme(syn)[0])
            elif part == "VBN" and len(lexeme(syn)) > 3:
                synonyms.append(lexeme(syn)[-1])
            elif part == "VBZ" and len(lexeme(syn)) > 1:
                synonyms.append(lexeme(syn)[1])
            elif part == "NN" and syn[-2:] != "ss":
                synonyms.append(singularize(syn))
            elif part == "NNS":
                synonyms.append(pluralize(syn))
            else:
                synonyms.append(syn)
    return list(set(synonyms))
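
# Hypothetical usage (assumes `dictionary` exposes a synonym() lookup, e.g. PyDictionary):
#   getSynonyms("ran", "VBD")  ->  past-tense synonyms, e.g. ["raced", "sprinted", ...]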
Example #11
def fix_caption(caption):  # renamed from `str` to avoid shadowing the built-in
    s = parsetree(caption, lemmata=True)
    string = ''
    for sentence in s:
        if "and a" in caption:
            string = caption + ' '
        else:
            for i, chunk in enumerate(sentence.chunks):
                if chunk.type == 'VP' and len(chunk) == 2:
                    verb = chunk[1].string
                    string += lexeme(verb)[1]+' '
                else:
                    for j, w in enumerate(chunk.words):
                        if i == 0 and j == 0 and (w.string == 'a' or w.string == 'A'):
                            print('chunk', chunk)
                        else:
                            string = string + w.string+' '

    string = string[:1].upper() + string[1:-1]
    if string.startswith('A'):
      string = string[2].upper() + string[3:]
    if string.endswith('.'):
      string = string[:-1]
    return string
Example #12
    def perturb(self, word, tag):
        res = ""
        # perturb verb
        if 'V' in tag:
            vs = pe.lexeme(word)
            res = choice(vs)

            while (res == word or len(res) > len(word)) and (vs[0] != word):
                res = choice(vs)
            if vs[0] == word:
                res = vs[1]

        # perturb plural/singular noun
        if 'NNS' == tag:
            res = pe.singularize(word)
            if res == word:
                res = word[:-1]

        if len(res) > 0:
            return (res, word, (0, len(res)))
        else:
            #if the perturbed result is empty, we just randomly remove some chars in the word
            removeLen = randint(1, min(len(word) - 1, 3))
            lenw = len(word)
            removestart = lenw - removeLen
            return (word[:removestart] + word[removestart + removeLen:], word,
                    (0, lenw - removeLen))
Example #13
 def procesar_ejercicio_verbos(self, texto):
     parrafos = texto.split('\n')
     posicion_inicial = 0
     cant_verbos = 0
     texto_ejercicio = []
     items_ejercicio = []
     for parrafo in parrafos:
         tokens = nltk.word_tokenize(parrafo)
         lista_verbos = vb.obtener_verbos(parrafo)
         for idx, verbo in enumerate(lista_verbos):
             conjugaciones = lexeme(verbo['token'])
             conjugaciones = vb.filtrar_conjugaciones(verbo, conjugaciones)
             tiempo_verbal = vb.obtener_tiempo(verbo['pos_tag'])
             item = ItemEjercicioVerbos(
                 verbo['token'], conjugaciones, str(idx + cant_verbos),
                 verbo['posicion'] + posicion_inicial, tiempo_verbal)
             items_ejercicio.append(item)
         texto_ejercicio.append(
             orac.sustituir_verbos(tokens, lista_verbos, cant_verbos))
         posicion_inicial = posicion_inicial + len(tokens)
         cant_verbos = cant_verbos + len(lista_verbos)
     ejercicio = {
         'texto': '\n'.join(texto_ejercicio),
         'items': items_ejercicio
     }
     return ejercicio
Example #14
def process(s):
    # x=parse(s,tokenize=True,tags=True,chunks=True,encoding='utf-8')
    # lis=x.split(" ")
    lis = nltk.pos_tag(nltk.word_tokenize(s))
    l = []
    # print(lis)
    for ele in lis:
        y = ele[1]
        word = ele[0]
        # if word in WHfam:
        # 	l=l+[joinlist(WHfam)]
        # if word in Demons:
        # 	l=l+[joinlist(Demons)]
        # if word.lower() in Poss:
        # 	for xyz in Possessives:
        # 		if word.lower() in xyz:
        # 			l=l+[joinlist(xyz)]
        # 			break
        if y in Verbs:
            # print(word)
            if word in Aux:
                l = l + [joinlist(Aux)]
            else:
                # print("Hi")
                l = l + [joinlist(lexeme(word))]
        else:
            l = l + [word]
    return " ".join(l)
Example #15
def conjugateVerbs(sentence, tense):
    """Use parse trees to identify the verbs in a phrase. Assume the
	first word in the phrase is guaranteed to be a verb. Return the
	phrase with each verb converted to the desired tense."""
    if not sentence: return None
    """pattern-en's conjugation() often does not work, 
	but lexeme() generates conjugations in a predictable order"""
    lexeme_indicies = {"infinitive": 0, "continuous": 2}
    t = lexeme_indicies[tense.lower()]

    words = en.parsetree(sentence)[0]
    words[0].string = en.lexeme(words[0].string)[t]

    for word in words:
        if word.type[0] == "V":
            word.string = en.lexeme(word.string)[t]

    return words.string
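
# Hypothetical usage, relying on lexeme()'s ordering (infinitive at index 0,
# the -ing form at index 2):
#   conjugateVerbs("run the race", "continuous")  ->  "running the race"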
Example #16
def getArticle(article):
    try:
        #chunks = gc.getChunks(article)
        tags =  tag.getTags(article[1])
        #if tags == []:
        try:
            # use the most recent set of Stanford Open IE tags
            subject = tags[-1]['subject']
            relation = tags[-1]['relation']
            objects = tags[-1]['object']
            objects = objects.split(' ')

            relations = []
            relations.append(nltk.stem.wordnet.WordNetLemmatizer().lemmatize(relation))
            relations = en.lexeme(relations[0])
            content = wp.getArticle(subject)
        except Exception:
            # fall back to the first set of Stanford Open IE tags
            subject = tags[0]['subject']
            relation = tags[0]['relation']
            objects = tags[0]['object']
            objects = objects.split(' ')
            relations = []
            relations.append(nltk.stem.wordnet.WordNetLemmatizer().lemmatize(relation))
            relations = en.lexeme(relations[0])
            content = wp.getArticle(subject)

        #objects = objects.split()
        rawSentences = nltk.tokenize.sent_tokenize(content)#sent.getSentences(content)
        sentences = []
        for sentence in rawSentences:
            for word in objects:
                if word in sentence:
                    sentences.append(sentence)
            for word in relations:
                if word in sentence:
                    sentences.append(sentence)

        sentences = list(set(sentences))
        return {'title':article[1], 'sentences':sentences, 'year':article[0]}
    except:
        return
Example #17

def check_pos(pos_tag, word):
    if pos_tag == 'NN':
        add(pluralize(word), word, False, "plural")
    elif pos_tag == 'VB':
        for lex in lexeme(word):
            add(lex, word, False, "conjugation")
    elif pos_tag == 'JJ':
        comp = comparative(word)
        add(comp, word, False, "comparative")
        sup = superlative(word)
        add(sup, word, False, "superlative")
Example #18
    def regex_or_list_maker(verb_list):
        """makes a regex from the list of words passed to it"""
        # add alternative spellings
        from dictionaries.word_transforms import usa_convert
        from pattern.en import lexeme
        uk_convert = {v: k for k, v in usa_convert.items()}
        to_add_to_verb_list = []
        for w in verb_list:
            if w in usa_convert:
                to_add_to_verb_list.append(usa_convert[w])
        for w in verb_list:
            if w in uk_convert:
                to_add_to_verb_list.append(uk_convert[w])
        verb_list = sorted(set(verb_list + to_add_to_verb_list))

        verbforms = []
        for w in verb_list:
            forms = [form.replace("n't", "").replace(" not", "") for form in lexeme(w)]
            for f in forms:
                verbforms.append(f)
            # deal with contractions
            if w == 'be':
                be_conts = [r"'m", r"'re", r"'s"]
                for cont in be_conts:
                    verbforms.append(cont)
            if w == "have":
                have_conts = [r"'d", r"'s", r"'ve"]
                for cont in have_conts:
                    verbforms.append(cont)

        to_add = []
        for w in verbforms:
            if w in usa_convert:
                to_add.append(usa_convert[w])
        for w in verbforms:
            if w in uk_convert:
                to_add.append(uk_convert[w])
        verbforms = sorted(set(verbforms + to_add))
        t = []

        # ensure unicode
        for w in verbforms:
            if type(w) != unicode:
                t.append(unicode(w, 'utf-8', errors = 'ignore'))
            else:
                t.append(w)
        verbforms = t
        
        if not regex:
            return verbforms
        else:
            return r'(?i)\b(' + r'|'.join(verbforms) + r')\b'
Example #19
def mangle_agreement(correct_sentence):
    """Given a correct sentence, return a sentence or sentences with a subject
    verb agreement error"""
    # # Examples
    #
    # Back in the 1800s, people were much shorter and much stronger.
    # This sentence begins with the introductory phrase, 'back in the 1800s'
    # which means that it should have the past tense verb. Any other verb would
    # be incorrect.
    #
    #
    # Jack and jill went up the hill.
    # This sentence is different; 'go' would also be correct. If it began with
    # 'Yesterday', a single-word introductory phrase requiring no comma, only
    # 'went' would be acceptable.
    #
    #
    # The man in the checkered shirt danced his warrior dance to show that
    # he was the most dominant male in the room.
    # This sentence has multiple verbs. If the sentence ended at the word dance,
    # changing 'danced' to 'dances' would be acceptable, but since the sentence
    # continues we cannot make this change -- 'was' agrees with 'danced' but not
    # with 'dances'.  This is a shifty tense error, a classic subject verb
    # agreement error.
    #
    # # Our Method
    #
    # Right now, we will assume that any change in verb form of a single verb in
    # a sentence is incorrect.  As demonstrated above, this is not always true.
    # We hope that since any model created off of this data will use a
    # confidence interval to determine likelihood of a subject-verb agreement
    # error, that some number can be found for which the model excels.
    #
    # It would also be possible to use a rule based learner to evaluate single
    # verb sentences, and only evaluating more complex sentences with the
    # tensorflow model.

    bad_sents = []
    doc = nlp(correct_sentence)
    verbs = [(i, v) for (i, v) in enumerate(doc) if v.tag_.startswith('VB')]
    for i, v in verbs:
        for alt_verb in lexeme(doc[i].text):
            if alt_verb == doc[i].text:
                continue  # Same as the original, skip it
            if (tenses(alt_verb) == tenses(v.text) or
                (alt_verb.startswith(v.text) and alt_verb.endswith("n't"))):
                continue  # Negated version of the original, skip it
            new_sent = str(doc[:i]) + " {} ".format(alt_verb) + str(
                doc[i + 1:])
            new_sent = new_sent.replace(' ,', ',')  # fix space before comma
            bad_sents.append(new_sent)
    return bad_sents
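
# Hypothetical usage (assumes spaCy is loaded as `nlp` and pattern.en's
# lexeme()/tenses() are imported):
#   mangle_agreement("Jack and Jill went up the hill.")
#   -> ["Jack and Jill go up the hill.", "Jack and Jill goes up the hill.", ...]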
Example #21
File: natlang.py  Project: gjvnq/Mathics
 def inflected_forms(self, syn, desc):
     try:
         word, pos, _ = desc
         if pos == 'Verb':
             from pattern.en import lexeme
             return [w for w in reversed(lexeme(word)) if w != word]
         elif pos == 'Noun':
             from pattern.en import pluralize
             return [pluralize(word)]
         elif pos == 'Adjective':
             from pattern.en import comparative, superlative
             return [comparative(word), superlative(word)]
         else:
             return []
     except ImportError:
         raise MessageException('General', 'unavailable', 'WordData[_, "InflectedForms"]', 'pattern')
Example #22
def fix_caption(str):
    s = parsetree(str, lemmata=True)
    string = ''
    for sentence in s:
        for i, chunk in enumerate(sentence.chunks):
            if chunk.type == 'VP' and len(chunk) == 2:
                print('@!!')
                verb = chunk[1].string
                string += lexeme(verb)[1] + ' '
            else:
                for j, w in enumerate(chunk.words):
                    if i == 0 and j == 0:
                        pass
                    else:
                        string = string + w.string + ' '

    return string[:1].upper() + string[1:-1]
Example #23
def get_ngram(words, word_pos):  # extract n-grams from the text segment
    ngrams = []
    for n in range(2,6):
        for i in range(len(words)-n+1):
            for w in words[i:i+n]:
                if word_pos[w] in content_Words:  # keep only n-grams containing a content word
                    if word_pos[w] == 'V':
                        tense = lexeme(w)  # list of possible verb forms: be => is, was, been...
                        for verb in tense:
                            Slice = words[i:i+n]
                            Slice[Slice.index(w)] = verb
                            ngrams.append(Slice)
                    else:
                        ngrams.append(words[i:i+n])
                    continue
    #print ngrams
    return ngrams
Example #24
def tenseChecker(sentence,
                 ind):  #sentence is a list of words, ind = index of verb
    trigrams = []
    fourgrams = []
    fourgram_freq = {}
    b = len(sentence[ind - 1:ind + 3]) >= 4
    if b:
        fourgrams = getFourgram(sentence, ind)
    trigram_freq = {}
    trigrams = getTrigrams(sentence, ind)
    # print(trigrams)
    for tri, i in trigrams:
        key = tri[i]
        temp = tri
        # print(tri)
        word_tenses = lexeme(key)
        # print(word_tenses)
        for tense in word_tenses:
            temp[i] = tense
            # print(temp)
            if tense in trigram_freq.keys():
                trigram_freq[tense] += trigramFreq(temp)
                # print(trigramFreq(temp))
            else:
                trigram_freq[tense] = trigramFreq(temp)
    sorted_tenses = sorted(trigram_freq.items(),
                           key=lambda x: x[1],
                           reverse=True)

    if fourgrams:  # guard: empty when the sentence window was too short
        for tense in word_tenses:
            temp = fourgrams[0]
            temp[fourgrams[1]] = tense
            if tense in fourgram_freq.keys():
                fourgram_freq[tense] += trigramFreq(temp)
                # print(trigramFreq(temp))
            else:
                fourgram_freq[tense] = trigramFreq(temp)
    sorted_tenses_4 = sorted(fourgram_freq.items(),
                             key=lambda x: x[1],
                             reverse=True)

    print(sorted_tenses)
    print(sorted_tenses_4)
Example #25

def keywordChecker(book, genres):

		genre_scores = {}
		for each in genres.keys():
			genre_scores[each] = []

		for genre, words in genres.items():

			found = []
			#words is list of words in each genre

			for wd in words: #for each word in a list
				word =  wd[0]
				
				#wd[1] is the point value of that word

				for a in re.finditer(word, book['description']):
					genre_scores[genre].append(wd[1])
					if a.group(0) not in found:
						found.append(a.group(0)) #don't append duplicates because that's a waste of space. 

				# There can also be other forms of a root word. For example, 'fought' is a form of 'fight'.
				# Use the NLP module pattern.en to call lexeme(word) to get the list of possible forms of that word. Works for 2 word keywords too
				allWords = lexeme(word)
				for each in allWords:
					# The direct description search above already covers forms that start with the root ('fight', 'fights', 'fighting', ...).
					# Searching the lexeme forms here catches cases such as 'fought', which is a form of 'fight' but does not start with 'fight'.
					if word not in each: #don't search the forms that have been covered ('fight', 'fighting', etc)
						for b in re.finditer(each, book['description']):
							genre_scores[genre].append(wd[1]) 
							if b.group(0) not in found:
								found.append(b.group(0)) #don't append duplicates because that's a waste of space. 
						# if each in book['description']:
						# 	genre_scores[genre].append(wd[1])
						# 	found.append(each)

		# now, calculate score averages	
		# save score averages as last value in dict for each genre
		for k,v in genre_scores.items():
			if len(v) != 0:
				genre_scores[k].append(len(v) * int(sum(v)/len(v)))

		return genre_scores
Example #26
def get_synomyms_token(token):
    stem = stemmer.stem(token)
    synonyms_ = [token]
    if stem in stem2words:
        words = stem2words[stem]
        synonyms_.extend(words)

    w1 = lemmatizer.lemmatize(token, 'v')
    w2 = lemmatizer.lemmatize(token, pos="a")
    w3 = lemmatizer.lemmatize(token)
    w = {w1, w2, w3}
    synonyms_.extend(list(w))

    #synonyms_ = [token]

    for syn in wordnet.synsets(token):
        for l in syn.lemmas():
            synonyms_.append(l.name())

    synonyms_.extend(lexeme(token))
    synonyms = np.array([elm for elm in set(synonyms_)])

    return synonyms
Example #27
	def get_questions(self):
		z = self.getText()
		(subj, vp) = (z['NP'][0], z['VP'][0])
		from pattern.en import lexeme, lemma, tenses
		import nltk, re
		tagged = nltk.pos_tag(nltk.word_tokenize(subj + " " + vp))
		verb = ""
		sense = supersense(subj)
		if sense[0][2][-6:] == 'person' or sense[0][1] == 'PRP':
			return "Who " + vp + "?"
		elif sense[0][2][-4:] == 'time' or re.match(r"[12]\d\d\d", subj):
			return "When " + vp + "?"
		elif (sense[0][2][-8:] == 'location' and
				'PP' in z and z['PP'].split()[0].lower() in ["on", "in", "at", "over", "to"]):
			return "Where " + vp + "?"
		aux = ["Will", "Shall", "May", "Might", "Can", "Could", "Must", "Should", "Would", "Do", "Does", "Did"]
		for i in reversed(tagged):
			if i[1][0] == 'V':
				verb = i[0]
		if (u'' + verb) in lexeme("is"):
			return verb.capitalize() + " " + subj.lower() + vp[len(verb):] + "?"
		else:
			for x in aux:
				if tenses(x)[0] == tenses(verb)[0]:
					return x + " " + subj.lower() + " " + lemma(verb) + vp[len(verb):] + "?"
Example #28
def make_morph_set(lemma, pos):
    if pos == 'n':
      return set([lemma, en.pluralize(lemma)])
    elif pos == 'v':
      m = set(en.lexeme(lemma))
      m.add(lemma)
      return m
    elif pos == 'a':
      m = set([lemma])

      c = en.comparative(lemma)

      if c and not c.startswith('more '):
          m.add(c)

      s = en.superlative(lemma)

      if s and not s.startswith('most '):
          m.add(s)

      return m
    else:
      return set([lemma])
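
# Hypothetical usage:
#   make_morph_set("city", 'n')  ->  {"city", "cities"}
#   make_morph_set("run", 'v')   ->  {"run", "runs", "running", "ran"}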
Example #29
def check_gf1(gt_sent, recon_sent):
    ### Check for the verb-object tuple; heuristics: match the singular/plural versions
    ### of the object, and any conjugation of the verb (the shortest verb has length 3, e.g. hug :)
    gt_verb = gt_sent[-3]

    verbs = lexeme(gt_verb)  ###returns a list of conjug. verbs
    # print("gt sent is:", gt_sent)
    # print("recon sent is:", recon_sent)
    #    print("list of gt verb alternatives:",verbs)
    gt_obj = gt_sent[-1]
    # print("gt obj is:",gt_obj)
    # print("last word in recon sent:",recon_sent[-1])
    # print("gt verb is:", gt_verb)
    objs = [singularize(gt_obj), pluralize(gt_obj)
            ]  ###return a list with pos1 sing.obj and pos2 pl.obj.
    score = 0.0
    if any(i in verbs for i in recon_sent):
        ###  Reward learning the verb
        score = score + 0.5
    if any(i in objs for i in recon_sent):
        ### Reward learning the object
        score = score + 0.5

    return score
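
# Hypothetical usage: 0.5 for recovering the verb (any conjugation) plus
# 0.5 for the object (singular or plural):
#   check_gf1(["the", "dog", "eats", "a", "bone"], ["a", "dog", "ate", "bones"])  ->  1.0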
Example #30
print

# COMPARATIVE & SUPERLATIVE ADJECTIVES
# ------------------------------------
# The comparative() and superlative() commands give the comparative/superlative form of an adjective.
# Words with three or more syllables are simply preceded by "more" or "most".
for word in ["gentle", "big", "pretty", "hurt", "important", "bad"]:
    print word, "=>", comparative(word), "=>", superlative(word)
print
print

# VERB CONJUGATION
# ----------------
# The lexeme() command returns a list of all possible verb inflections.
# The lemma() command returns the base form (infinitive) of a verb.
print "lexeme:", lexeme("be")
print "lemma:", lemma("was")

# The conjugate() command inflects a verb to another tense.
# The tense can be given as a constant, e.g.
# INFINITIVE, PRESENT_1ST_PERSON_SINGULAR, PRESENT_PLURAL, PAST_PARTICIPLE, ...
# or as an abbreviated alias: inf, 1sg, 2sg, 3sg, pl, part, 1sgp, 2sgp, 3sgp, ppl, ppart.
print conjugate("being", tense="1sg", negated=False)

# Prefer the full constants for code that will be reused/shared.

# The tenses() command returns a list of all tenses for the given verb form.
# For example: tenses("are") => ['present 2nd person singular', 'present plural']
# You can then check if a tense constant is in the list.
# This will also work with aliases, even though they are not explicitly in the list.
from pattern.en import PRESENT_PLURAL
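
# A plausible completion of the truncated snippet above (assumed, not from the source):
print tenses("are")                    # all tenses of "are", as tuples
print "pl" in tenses("are")            # check by alias
print PRESENT_PLURAL in tenses("are")  # check by constant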
Example #31
import io
from google.cloud import vision
import re
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import wordnet as wn
from pattern.en import conjugate, lemma, lexeme, PRESENT, SG
import math

answer = ['i', 'am', 'eat', 'eating', 'in', 'a', 'mine', 'mined']
keywords = ['eat', 'mine', 'shine']
extended_keywords = []
keywords_matched = 0
for word in keywords:
    temp_list = []
    temp_list.extend(lexeme(word))
    for word_lemma in wn.lemmas(word):  # for each WordNet lemma of the keyword
        temp_list.append(word_lemma.name())  # add the lemma itself
        for related_lemma in word_lemma.derivationally_related_forms():  # for each related lemma
            temp_list.append(related_lemma.name())  # add the related lemma
    temp_list = list(set(temp_list))
    extended_keywords.append((word, temp_list))

keywords_dictionary = {key: value for (key, value) in extended_keywords}
# print(keywords_dictionary)

for (keyword, keyword_list) in keywords_dictionary.items():
    for word in keyword_list:
        if word in answer:
            keywords_matched = keywords_matched + 1
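
# Note: this counts every matched form, so a single keyword can contribute
# more than once (here both "eat" and "eating" match the keyword "eat"):
print(keywords_matched)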
Example #32
def verb_vocab(tcm = None, postagger = None, min_length=0):
    """
    Return all verbs found in wordnet in various inflected forms.
    """
    if not postagger:
        postagger = BackoffTagger.from_pickle()

    getpostag = lambda word : postagger.tag([word])[0][1]

    # Most of the time lexeme() returns 4 or 5 words, inflected as declared below
    # To avoid assumptions on the tagset used, we query the tags using easy examples
    # (verb give). These POS tags are then bound to lexeme's results.
    infinitive_pos = getpostag("give")
    present_pos    = getpostag("gives")
    pres_part_pos  = getpostag("giving")  # present participle
    past_pos       = getpostag("gave")
    past_part_pos  = getpostag("given")   # past participle

    # three possibilities for the length of lexeme()'s return value,
    # depending on how many distinct forms a verb has
    tenses3 = [infinitive_pos, present_pos, pres_part_pos]
    tenses4 = tenses3 + [past_pos]
    tenses5 = tenses4 + [past_part_pos]

    verbs = set()

    for lemma in wn.all_lemma_names(pos = 'v'):
        if len(lemma) < min_length:
            continue
        if '_' in lemma:
            continue

        forms = lexeme(lemma) # all possible conjugations of this verb (lemma)

        if len(forms) == 3:
            forms = zip(forms, tenses3)
        elif len(forms) == 4:
            forms = zip(forms, tenses4)
        elif len(forms) == 5:
            forms = zip(forms, tenses5)
        else:
            # this step can introduce errors, as getpostag isn't
            # guaranteed to return a verb tag
            forms = [(form, getpostag(form)) for form in forms]

        # ignore forms that do not map back to lemma by wordnet's
        # lemmatizer, as they are likely erroneous
        forms = list(filter(lambda form: lemma in wn._morphy(form[0], 'v'), forms))

        if tcm is not None:
            classes = [classy for syn in wn.synsets(lemma, 'v') for classy in tcm.predict(syn)]
        else:
            classes = [syn.name() for syn in wn.synsets(lemma, 'v')]

        for classy in classes:
            for form, postag in forms:
                if not postag:
                    log.warning("{} has POS==None".format(form))
                    continue
                if postag[0] == 'n': # dirty hack to avoid inconsistency introduced by postagger
                    continue
                verbs.add((form, postag, classy))
                if "'" in form:  # remove ' (couldn't -> couldnt)
                    verbs.add((form.replace("'", ""), postag, classy))

    return verbs
Example #33
print()

# COMPARATIVE & SUPERLATIVE ADJECTIVES
# ------------------------------------
# The comparative() and superlative() functions give the comparative/superlative form of an adjective.
# Words with three or more syllables are simply preceded by "more" or "most".
for word in ["gentle", "big", "pretty", "hurt", "important", "bad"]:
    print(word, "=>", comparative(word), "=>", superlative(word))
print()
print()

# VERB CONJUGATION
# ----------------
# The lexeme() function returns a list of all possible verb inflections.
# The lemma() function returns the base form (infinitive) of a verb.
print("lexeme:", lexeme("be"))
print("lemma:", lemma("was"))
print()

# The conjugate() function inflects a verb to another tense.
# You can supply: 
# - tense : INFINITIVE, PRESENT, PAST, 
# - person: 1, 2, 3 or None, 
# - number: SINGULAR, PLURAL,
# - mood  : INDICATIVE, IMPERATIVE,
# - aspect: IMPERFECTIVE, PROGRESSIVE.
# The tense can also be given as an abbreviated alias, e.g., 
# inf, 1sg, 2sg, 3sg, pl, part, 1sgp, 2sgp, 3sgp, ppl, ppart.
from pattern.en import PRESENT, SINGULAR
print(conjugate("being", tense=PRESENT, person=1, number=SINGULAR, negated=False))
print(conjugate("being", tense="1sg", negated=False))
Example #34
print(singularize("our", pos=ADJECTIVE))
print("")

# COMPARATIVE & SUPERLATIVE ADJECTIVES
# ------------------------------------
# The comparative() and superlative() functions give the comparative/superlative form of an adjective.
# Words with three or more syllables are simply preceded by "more" or "most".
for word in ["gentle", "big", "pretty", "hurt", "important", "bad"]:
    print("%s => %s => %s" % (word, comparative(word), superlative(word)))
print("")

# VERB CONJUGATION
# ----------------
# The lexeme() function returns a list of all possible verb inflections.
# The lemma() function returns the base form (infinitive) of a verb.
print("lexeme: %s" % lexeme("be"))
print("lemma: %s" % lemma("was"))
print("")

# The conjugate() function inflects a verb to another tense.
# You can supply:
# - tense : INFINITIVE, PRESENT, PAST,
# - person: 1, 2, 3 or None,
# - number: SINGULAR, PLURAL,
# - mood  : INDICATIVE, IMPERATIVE,
# - aspect: IMPERFECTIVE, PROGRESSIVE.
# The tense can also be given as an abbreviated alias, e.g.,
# inf, 1sg, 2sg, 3sg, pl, part, 1sgp, 2sgp, 3sgp, ppl, ppart.
from pattern.en import PRESENT, SINGULAR
print(conjugate("being", tense=PRESENT, person=1, number=SINGULAR, negated=False))
print(conjugate("being", tense="1sg", negated=False))
Example #35
from pattern.en import conjugate, lemma, lexeme, referenced, PRESENT, SG

print(lexeme('gave'))
print(referenced('university'))
print(referenced('hour'))

from pattern.en import pluralize, singularize

print(pluralize('child'))
print(singularize('wolves'))

from pattern.en import comparative, superlative

print(comparative('bad'))
print(superlative('bad'))

from pattern.en import conjugate, lemma, lexeme

print(lexeme('purr'))
print(lemma('purring'))
print(conjugate('purred', '3sg'))  # he / she / it

from pattern.de import gender, MALE, FEMALE, NEUTRAL
print(gender('Katze'))
print(gender('Mesa'))

Example #36

from pattern.en import verbs, conjugate, PARTICIPLE
#Indefinite article
print article('university')
print article('hour')

print referenced('university')
print referenced('hour')


#singularity
print pluralize('child')
print singularize('wolves')

# Verb conjugation
print
print lexeme('run')
print lemma('running')
print conjugate('purred', '3sg')
print PAST in tenses('purred') # 'p' in tenses() also works.
print (PAST, 1, PL) in tenses('purred') 

print 'Quantification'

print quantify(['goose', 'goose', 'duck', 'chicken', 'chicken', 'chicken'])
print quantify('carrot', amount=90)
print quantify({'carrot': 100, 'parrot': 20})

print 'ngrams'
print ngrams("I am eating a pizza.", n=2)

Example #38
def get_marks(data, image_file):

    keywords_matched = 0
    #maximum_marks = 5
    maximum_marks = data[0]

    keywords = []
    keywords = data[3].split(',')
    # keywords=['data','mine','database','characterization','knowledge','background','task','classify','associate','visualize','predict','cluster']
    expected_keywords = len(keywords)

    #expected_no_of_words = 200
    expected_no_of_words = data[1]

    #expected_no_of_sentences = 15
    expected_no_of_sentences = data[2]

    # extended_keywords = []
    # for word in keywords:
    #     for syn in wn.synsets(word):
    #         for l in syn.lemmas():
    #             extended_keywords.append(l.name())

    forms = []  # collect derivational forms; duplicates are removed by set() later
    for word in keywords:
        for word_lemma in wn.lemmas(word):  # for each WordNet lemma of the keyword
            forms.append(word_lemma.name())  # add the lemma itself
            for related_lemma in word_lemma.derivationally_related_forms():  # for each related lemma
                forms.append(related_lemma.name())  # add the related lemma

    verb = []
    for word in keywords:
        verb.extend(lexeme(word))

    # keywords.extend(extended_keywords)
    keywords.extend(forms)
    keywords.extend(verb)

    keywords = [x.lower() for x in keywords]
    keywords = list(set(keywords))
    # print(keywords)
    with io.open(image_file, 'rb') as image_file:
        content = image_file.read()
    image = vision.types.Image(content=content)

    response = client.text_detection(image=image)
    texts = response.text_annotations
    string = texts[0].description.replace('\n', ' ').lower()  # convert to lower case
    string = re.sub('[^A-Za-z0-9.]+', ' ', string)  # eliminate special characters

    print string

    word_list = word_tokenize(string)  #for word spliting
    no_of_words = len(word_list)
    if no_of_words > expected_no_of_words:
        no_of_words = expected_no_of_words

    no_of_sentences = len(sent_tokenize(string))
    if no_of_sentences > expected_no_of_sentences:
        no_of_sentences = expected_no_of_sentences
    print 'no_of_words', no_of_words
    print 'no_of_sentences', no_of_sentences

    for keyword in keywords:
        if (keyword in word_list):
            keywords_matched = keywords_matched + 1
    if keywords_matched > expected_keywords:
        keywords_matched = expected_keywords
    print 'keywords matched', keywords_matched

    # float() guards against integer division under Python 2
    keywords_percentage = 0.55 * (float(keywords_matched) / expected_keywords)
    word_percentage = 0.35 * (float(no_of_words) / expected_no_of_words)
    sentence_percentage = 0.10 * (float(no_of_sentences) / expected_no_of_sentences)

    print 'keywords_percentage', keywords_percentage
    print 'word_percentage', word_percentage
    print 'sentence_percentage', sentence_percentage

    total_marks = maximum_marks * (keywords_percentage + word_percentage +
                                   sentence_percentage)
    total_marks = round(total_marks, 1)
    # round to the nearest whole mark, keeping an exact .5 as a half mark
    digit = total_marks * 10
    if (digit % 10 < 5):
        total_marks = math.floor(total_marks)
    if (digit % 10 > 5):
        total_marks = math.ceil(total_marks)
    print 'total_marks', total_marks
    return total_marks
Example #39
print

# COMPARATIVE & SUPERLATIVE ADJECTIVES
# ------------------------------------
# The comparative() and superlative() functions give the comparative/superlative form of an adjective.
# Words with three or more syllables are simply preceded by "more" or "most".
for word in ["gentle", "big", "pretty", "hurt", "important", "bad"]:
    print word, "=>", comparative(word), "=>", superlative(word)
print
print

# VERB CONJUGATION
# ----------------
# The lexeme() function returns a list of all possible verb inflections.
# The lemma() function returns the base form (infinitive) of a verb.
print "lexeme:", lexeme("be")
print "lemma:", lemma("was")
print

# The conjugate() function inflects a verb to another tense.
# You can supply:
# - tense : INFINITIVE, PRESENT, PAST,
# - person: 1, 2, 3 or None,
# - number: SINGULAR, PLURAL,
# - mood  : INDICATIVE, IMPERATIVE,
# - aspect: IMPERFECTIVE, PROGRESSIVE.
# The tense can also be given as an abbreviated alias, e.g.,
# inf, 1sg, 2sg, 3sg, pl, part, 1sgp, 2sgp, 3sgp, ppl, ppart.
from pattern.en import PRESENT, SINGULAR
print conjugate("being",
                tense=PRESENT,
    def get_verb_tense_forms(self, verb):
        all_tense_forms = lexeme(verb)

        return all_tense_forms
Example #41

keyfile = open("english_inflections.txt", "w", encoding='utf-8')

try:
    lexeme('be')  # warm-up: pattern.en's first call can raise on some Python versions
except Exception:
    pass

print(lexeme('be'))
print(lexeme('conclusion'))
print(lexeme('harlot'))

print(pluralize('be'))
print(pluralize('conclusion'))
print(pluralize('harlot'))

for k in keys:
    # print(lexeme(k))
    # print(pluralize(k))
    pass
Example #43
File: tab2opf.py  Project: tuan188/tudien
    <mbp:frameset>
      <mbp:slave-frame display="bottom" device="all" breadth="auto" leftmargin="0" rightmargin="0" bottommargin="0" topmargin="0">
        <div align="center" bgcolor="yellow"/>
        <a onclick="index_search()">Dictionary Search</a>
        </div>
      </mbp:slave-frame>
      <mbp:pagebreak/>
""")

    dt, dd = r.split('\t', 1)
    if not UTFINDEX:
        dt = normalizeUnicode(dt, 'cp1252')
        dd = normalizeUnicode(dd, 'cp1252')
    dtstrip = normalizeUnicode(dt).strip()
    dd = dd.replace("\\\\", "\\").replace("\\n", "<br/>\n")
    forms = Set(lexeme(dt))
    forms.add(pluralize(dt))

    toremove = Set()
    for w in forms:
        if w not in wordlist:
            #            print("Remove %s" % w)
            toremove.add(w)
    removed += len(toremove)

    forms.difference_update(toremove)

    inflections = ''
    if len(forms):
        inflections = '\t\t\t<idx:infl>\n'
        for f in forms:
Example #44
    if options.m2_raw == '' or options.pos == '':
        sys.stderr.write("Usage: python make_verbforms.py -l [gec lemma file] -f [gec raw file] -p [pos file] > [output file]\n")
        exit(1)
    else:
        pass
    pos_lines = codecs.open(options.pos, 'r', 'utf8').readlines()
    sent_lines = codecs.open(options.m2_raw, 'r', 'utf8').readlines()
    lemma_lines = codecs.open(options.lemma_file, 'r', 'utf8').readlines()
    verbforms = {}
    for lemma_line, pos_line, sent_line in zip(lemma_lines, pos_lines, sent_lines):
        for lem, pos, word in zip(lemma_line.strip().split(),pos_line.strip().split(), sent_line.strip().split()):
            word = word.lower()
            word_caps_all = word.upper()
            word_caps = word.capitalize()
            if word in verbforms:
                continue
            if pos.startswith('VB'):
                lem = lem.lower()
                # get_conjugations() supersedes plain lexeme(lem) here:
                # its entries carry an aspect tag ("form|||aspect")
                vforms = get_conjugations(lem)
                #vforms = [c for c in vforms if (("n't" not in c) and (len(c.split())==1) and d.check(c))]
                for v_aspect in vforms:
                    v,aspect = v_aspect.split('|||')
                    verbforms[v] = vforms
            else:
                pass  # not a verb

    for k, v in sorted(verbforms.iteritems()):
        print k, ' '.join(v)