Exemplo n.º 1
0
def dict_ingest(path_to_dict):
    noun = []
    verb = []
    adjective = []
    adverb = []
    miscel = []
    f = open(path_to_dict, 'r')
    for l in f:
        word = l.strip()
        if en.is_noun(word):
            noun.append(word)
        elif en.is_verb(word):
            verb.append(word)
        elif en.is_adjective(word):
            adjective.append(word)
        elif en.is_adverb(word):
            adverb.append(word)
        else:
            miscel.append(word)
    print noun[:5]
    print verb[:5]
    print adjective[:5]
    print adverb[:5]
    print miscel[:5]
    return noun, verb, adjective, adverb, miscel
Exemplo n.º 2
0
def dict_ingest(path_to_dict):
    noun = []
    verb = []
    adjective = []
    adverb = []
    miscel = []
    f = open(path_to_dict,'r')
    for l in f:
        word = l.strip()
        if en.is_noun(word):
            noun.append(word)
        elif en.is_verb(word):
            verb.append(word)
        elif en.is_adjective(word):
            adjective.append(word)
        elif en.is_adverb(word):
            adverb.append(word)
        else:
            miscel.append(word)
    print noun[:5]
    print verb[:5]
    print adjective[:5]
    print adverb[:5]
    print miscel[:5]
    return noun, verb, adjective, adverb, miscel
Exemplo n.º 3
0
def verse(word):

    """Creates a small rhyme for a given word.

    The rhyme is based on WordNet's description for the word.
    This description is eloquated (alliterated or antonated), incorporated.

    """

    g = en.noun.gloss(word)
    words = g.split(" ")

    for i in range(len(words)):

        w = words[i]
        w = w.replace("\"", "")

        if en.is_noun(w):
            w = eloquate(w)

        # NOTE(review): random(100) is presumably NodeBox's random(), giving
        # a value in [0, 100) -- roughly a 40% chance to decorate the word.
        if random(100) > 60:

            if en.is_noun(w): w = incorporate(w).upper()
            if en.is_verb(w): w = incorporate(w, VERB)
            if en.is_adjective(w): w = incorporate(w, ADJECTIVE)

        # BUG FIX: the original appended "\n" to words[i] and then overwrote
        # words[i] with w on the next line, discarding the line break.
        # Appending to w keeps the intended break after every third word.
        if i > 0 and i % 3 == 0:
            w = w + "\n"

        words[i] = w

    g = " ".join(words)
    g = g.replace("type A ", "!")
    g = g.replace("group A ", "!")
    return g
Exemplo n.º 4
0
def simplify_word(a):
    """Reduce a word to a simpler base form.

    Tries, in order: the verb's present tense, the noun's singular, then
    the word itself if it is already a recognized part of speech.
    Returns '' when the word cannot be classified at all.
    """
    # Verb route first; en.verb.present raises for words missing from its
    # lookup table, so failure just falls through to the noun route.
    try:
        try_present_verb = en.verb.present(a)
        if en.is_verb(try_present_verb):
            return try_present_verb
    except Exception:  # narrowed from a bare except: (kept best-effort behavior)
        pass

    # Noun route: singularize and re-check.
    try_singular_noun = en.noun.singular(a)
    if en.is_noun(try_singular_noun):
        return try_singular_noun

    # Already recognizable as noun/verb/adjective/adverb/connective.
    if en.is_noun(a) or en.is_verb(a) or en.is_adjective(a) or en.is_adverb(a) or en.is_connective(a):
        return a

    return ''
Exemplo n.º 5
0
def verse(word):

    """Creates a small rhyme for a given word.

    The rhyme is based on WordNet's description for the word.
    This description is eloquated (alliterated or antonated), incorporated.

    """

    g = en.noun.gloss(word)
    words = g.split(" ")

    for i in range(len(words)):

        w = words[i]
        w = w.replace("\"", "")

        if en.is_noun(w):
            w = eloquate(w)

        # NOTE(review): random(100) is presumably NodeBox's random(), giving
        # a value in [0, 100) -- roughly a 40% chance to decorate the word.
        if random(100) > 60:

            if en.is_noun(w): w = incorporate(w).upper()
            if en.is_verb(w): w = incorporate(w, VERB)
            if en.is_adjective(w): w = incorporate(w, ADJECTIVE)

        # BUG FIX: the original wrote the "\n" into words[i] and then
        # overwrote words[i] with w, losing the break. Append it to w.
        if i > 0 and i % 3 == 0:
            w = w + "\n"

        words[i] = w

    g = " ".join(words)
    g = g.replace("type A ", "!")
    g = g.replace("group A ", "!")
    return g
Exemplo n.º 6
0
    def find_grammatical_kind(self):
        """Label every word of the sentence with its grammatical kind.

        Commas are stripped first; each word gets the first matching label
        from the check table below, or "unclear" when nothing matches.
        Returns the list of labels, one per word.
        """
        sentence = re.sub(",", "", self.get_sentence())  # delete all commas

        # Ordered (predicate, label) pairs -- order matters: the first hit wins.
        checks = (
            (en.noun.is_emotion, "emotion"),
            (en.is_connective, "connective"),
            (en.is_verb, "verb"),
            (en.is_adjective, "adjective"),
            (en.is_noun, "noun"),
            (en.is_persuasive, "persuasive"),
            (en.is_number, "number"),
        )

        labels = []
        for token in sentence.split(" "):
            for predicate, label in checks:
                if predicate(token):
                    labels.append(label)
                    break
            else:
                labels.append("unclear")

        return labels
Exemplo n.º 7
0
    def singular_to_plural(self):
        """Pluralize every word of the sentence, preserving comma grouping.

        Each comma-separated chunk is pluralized word by word and the
        chunks are rejoined with ", ". Returns the rebuilt sentence.
        """
        final_list = []
        st = self.get_sentence()

        list_seperate_by_comma = st.split(",")  # divide the sentence to list of strings by all the ','
        for chunk in list_seperate_by_comma:

            # BUG FIX: the original tested chunk[0] == " ", which raises
            # IndexError on an empty chunk (e.g. from ",,"); startswith is
            # safe and behaves identically on non-empty chunks.
            if chunk.startswith(" "):
                chunk = chunk[1:]
            m = chunk.split(" ")  # split each sentence to list of words

            plural_list = []

            # (renamed from 'each', which shadowed the outer loop variable)
            for word in m:
                if en.is_noun(word):
                    word = en.noun.plural(word)
                elif en.is_adjective(word):
                    word = en.adjective.plural(word)
                elif en.is_connective(word):
                    word = self.my_inflect.plural(word)
                elif en.is_persuasive(word):
                    word = en.persuasive.plural(word)
                plural_list.append(word)

            plural_list = " ".join(plural_list)  # convert each list to string
            final_list.append(plural_list)

        final_list = ", ".join(final_list)
        return final_list
Exemplo n.º 8
0
def translate_x_of_assertion(brain, a):
    """Render an "X_of"-style assertion as a natural-language sentence.

    The relation name is assumed to end in "_of" (or similar 3-char
    suffix) -- the prefix is what remains after dropping the last three
    characters. The prefix's part of speech decides the sentence shape:
    noun ("... was a <prefix> of ..."), verb ("... <prefix> <owner> of
    ..."), or adjective ("... was <prefix> of ...").

    NOTE(review): 'brain', 'a', get_tense, is_plural, list_*_naturally and
    add_end_marks are project helpers not visible here -- their exact
    semantics are assumed from usage.
    """
    # Drop the trailing "_of" to get the relation's head word.
    prefix = a.relation[:-3]
    prefix_article = en.noun.article(prefix)
    # prefix_article_only = prefix_article.split(" ")[0]
    verb = get_tense(a, "was", a.l, brain)

    toReturn = ""
    if en.is_noun(en.noun.singular(prefix)):
        # Noun prefix: agree in number with the left-hand side.
        if is_plural(a.l, brain):
            prefix_article = en.noun.plural(prefix)
        toReturn = list_concepts_naturally(
            brain, a.l
        ) + " " + verb + " " + prefix_article + " of " + list_words_naturally(
            a.r)
    elif en.is_verb(
            en.verb.infinitive(prefix)) and en.verb.infinitive(prefix) != "":
        # Verb prefix: name the owner if the assertion has one.
        if hasattr(a, "owner") and len(a.owner) > 0:
            owner = list_concepts_naturally(brain, a.owner)
        else:
            owner = "everyone"
        toReturn = list_concepts_naturally(
            brain, a.l
        ) + " " + prefix + " " + owner + " of " + list_concepts_naturally(
            brain, a.r)
    elif en.is_adjective(prefix):
        # TODO for capable_of >> deal with action, action_object, action_recipient...
        # Similar for used_for >> when used_for is action / verbs
        toReturn = list_concepts_naturally(
            brain,
            a.l) + " " + verb + " " + prefix + " of " + list_words_naturally(
                a.r)
    # NOTE(review): if no branch matched, toReturn is "" and only the end
    # marks are added.
    toReturn = add_end_marks(a, toReturn)
    return toReturn
Exemplo n.º 9
0
def simplify_word(a):
    """Reduce a word to a simpler base form.

    Tries, in order: the verb's present tense, the noun's singular, then
    the word itself if it is already a recognized part of speech.
    Returns '' when the word cannot be classified at all.
    """
    # Verb route first; en.verb.present raises for unknown words, in which
    # case we simply fall through to the noun route.
    try:
        try_present_verb = en.verb.present(a)
        if en.is_verb(try_present_verb):
            return try_present_verb
    except Exception:  # narrowed from a bare except: (kept best-effort behavior)
        pass

    # Noun route: singularize and re-check.
    try_singular_noun = en.noun.singular(a)
    if en.is_noun(try_singular_noun):
        return try_singular_noun

    # Already recognizable as noun/verb/adjective/adverb/connective.
    if en.is_noun(a) or en.is_verb(a) or en.is_adjective(a) or en.is_adverb(
            a) or en.is_connective(a):
        return a

    return ''
Exemplo n.º 10
0
 def is_a_expression(self, word):
     """Return a truthy value when *word* is a hashtag, a negation, a
     content word (noun/adjective/verb/adverb) or an orality marker.

     Preserves the original or-chain semantics: the first truthy
     predicate result is returned, otherwise the last (falsy) result.
     """
     predicates = (self.is_a_hash_tag, self.is_negation, en.is_noun,
                   en.is_adjective, en.is_verb, en.is_adverb,
                   self.is_orality)
     for predicate in predicates:
         result = predicate(word)
         if result:
             return result
     return result
Exemplo n.º 11
0
 def is_a_expression(self, word):
     """True-ish when *word* is a hashtag, negation, content word, or
     orality marker.

     Mirrors the original short-circuit 'or' chain: returns the first
     truthy predicate value, or the final falsy one when nothing matches.
     """
     checks = (self.is_a_hash_tag, self.is_negation, en.is_noun,
               en.is_adjective, en.is_verb, en.is_adverb, self.is_orality)
     for check in checks:
         outcome = check(word)
         if outcome:
             return outcome
     return outcome
Exemplo n.º 12
0
def adjectives(list):
    """Return the adjectives found in a list of words.

    Each entry is stripped of surrounding whitespace before being tested
    with en.is_adjective.
    """
    stripped = (word.strip() for word in list)
    return [word for word in stripped if en.is_adjective(word)]
Exemplo n.º 13
0
Arquivo: views.py Projeto: mitnk/mc
def get_gloss(word):
    """Return the WordNet gloss for *word*, picked by part of speech.

    Checks verb, adjective, adverb, then noun (in that priority order);
    falls back to the generic wordnet gloss when none match.
    """
    dispatch = (
        (en.is_verb, en.verb),
        (en.is_adjective, en.adjective),
        (en.is_adverb, en.adverb),
        (en.is_noun, en.noun),
    )
    for matches, pos_module in dispatch:
        if matches(word):
            return pos_module.gloss(word)
    return en.wordnet.gloss(word)
Exemplo n.º 14
0
    def giveNearestEmotion(self, word):
        """Return the nearest emotion for *word* via its part of speech.

        Tries verb, adverb, then adjective senses (first match wins) and
        falls back to the noun sense.
        """
        dispatch = (
            (en.is_verb, en.verb),
            (en.is_adverb, en.adverb),
            (en.is_adjective, en.adjective),
        )
        for matches, pos_module in dispatch:
            if matches(word):
                return pos_module.is_emotion(word, boolean=False)
        return en.noun.is_emotion(word, boolean=False)
Exemplo n.º 15
0
def adjectives(list):
    """Collect the adjectives from a list of words.

    Whitespace is stripped from each entry before the en.is_adjective
    test; matching words are returned in their original order.
    """
    found = []
    for raw in list:
        candidate = raw.strip()
        if en.is_adjective(candidate):
            found.append(candidate)
    return found
Exemplo n.º 16
0
def generate_word(list, pos):
    """Return a word of part-of-speech *pos*, sometimes minting a new one.

    With probability `percentage_chance` (module-level constant, not
    visible here) a new word is derived from WordNet relatives of a random
    word in *list* and appended to *list*; otherwise a random existing
    word is returned.

    NOTE(review): when the new-word branch is taken, the outer
    `while True` retries until a usable word is found -- if none ever
    qualifies, this loops forever.
    """
    #% chance to generate new word
    if random.random() < percentage_chance:
        #repeat until word = pos
        while True:
            #get all synsets of random word in list
            synsets = wn.synsets(list[random.randint(0, len(list) - 1)], pos=pos)
            #get random synset
            synset = synsets[random.randint(0, len(synsets) - 1)]
            # NOTE(review): randint(0, 3) is inclusive, so values 2 and 3
            # intentionally (?) leave the synset unchanged -- confirm.
            ran = random.randint(0,3)
            if ran == 0 and synset.hypernyms():
                synset = synset.hypernyms()[random.randint(0, len(synset.hypernyms()) - 1)]
            elif ran == 1 and synset.hyponyms():
                synset = synset.hyponyms()[random.randint(0, len(synset.hyponyms()) - 1)]
            #get random name from synset that does not contain an _ or - (these make the lib go insane)
            #words = the names of the synset
            words = synset.lemma_names()
            #this loop is to make sure an infinite loop does not occur
            #where you are picking from all invalid choices
            while len(words) > 0:
                word = words[random.randint(0, len(words) - 1)]
                if "_" not in word and "-" not in word:
                    break
                else:
                    words.remove(word)
                    continue
            #if words doesn't have words in it, pick a new word from beginning
            if(len(words) == 0):
                continue
            # The candidate must agree with the requested part of speech
            # under the 'en' library as well as WordNet.
            if ((pos == wn.NOUN and en.is_noun(word)) or 
                (pos == wn.VERB and en.is_verb(word)) or
                (pos == wn.ADJ and en.is_adjective(word))):
                
                #fix word based on pos
                #if verb, make sure the verb has a conjugation,
                #if it does, or is not a verb, the word gets appended to the word array,
                #and a word is returned 
                if pos == wn.VERB:
                    try:
                        en.verb.present(word, person=3, negate=False)
                    except KeyError:
                        continue
                    else:
                        if word not in list:
                            list.append(word)
                        return word
                else:
                    if word not in list:
                        list.append(word)
                    return word
    else:
        #just select a random word from the existing ones
        return list[random.randint(0, len(list) - 1)]
Exemplo n.º 17
0
Arquivo: views.py Projeto: mitnk/mc
def normalize(word):
    """Heuristically reduce *word* to a base form.

    Tries, in order: verb -> present tense; noun -> singular; common
    derivational suffix strippings (-er/-r/-ment/-ness on nouns, -ly on
    adverbs, -ory/-ive/-er/-r on adjectives). Each rewrite is accepted
    only if it changes the word AND the result passes the relevant
    part-of-speech check. Returns *word* unchanged when nothing applies.
    """
    ## TODO: make this function nicer (UT, shorter).

    ## all verb to present
    # en.verb.present raises KeyError for words missing from its table.
    try:
        new_word = en.verb.present(word)
        if new_word != word and en.is_verb(new_word):
            return new_word
    except KeyError:
        pass

    new_word = en.noun.singular(word)
    if new_word != word and en.is_noun(new_word):
        return new_word

    if en.is_noun(word):
        # e.g. "teacher" -> "teach"
        new_word = re.sub(r'er$', '', word)
        if new_word != word and en.is_verb(new_word):
            return new_word
        # e.g. "maker" -> "make"
        new_word = re.sub(r'r$', '', word)
        if new_word != word and en.is_verb(new_word):
            return new_word
        # e.g. "payment" -> "pay"
        new_word = re.sub(r'ment$', '', word)
        if new_word != word and en.is_verb(new_word):
            return new_word
        # NOTE(review): pattern lacks the '$' anchor, so "ness" is removed
        # anywhere in the word, not just at the end -- possibly a bug.
        new_word = re.sub(r'ness', '', word)
        if new_word != word and en.is_adjective(new_word):
            return new_word

    ## adv to adj
    ## TODO: is there a quick way to do this in "en" libs
    new_word = re.sub(r'ly$', '', word)
    if new_word != word and en.is_adjective(new_word):
        return new_word

    # e.g. "simply" -> "simple": re-add a trailing 'e' after dropping 'ly'.
    if word.endswith('ly'):
        new_word = re.sub(r'ly$', '', word) + 'e'
        if new_word != word and en.is_adjective(new_word):
            return new_word

    if en.is_adjective(word):
        # e.g. "advisory" -> "advise"
        new_word = re.sub(r'ory$', '', word) + 'e'
        if new_word != word and en.is_verb(new_word):
            return new_word
        # e.g. "creative" -> "create"
        new_word = re.sub(r'ive$', '', word) + 'e'
        if new_word != word and en.is_verb(new_word):
            return new_word
        new_word = re.sub(r'ive$', '', word)
        if new_word != word and en.is_verb(new_word):
            return new_word
        # comparative forms, e.g. "bigger" -> "big" (roughly)
        new_word = re.sub(r'er$', '', word)
        if new_word != word and en.is_adjective(new_word):
            return new_word
        new_word = re.sub(r'r$', '', word)
        if new_word != word and en.is_adjective(new_word):
            return new_word

    return word
Exemplo n.º 18
0
def get_article(word, tokens, index):
    """Return the article token that should precede *word*.

    *tokens* is the token list and *index* the position of *word* in it.
    When the preceding token is 'a'/'an' and *word* is a content word,
    the article is recomputed via en's noun.article; otherwise the
    preceding token is returned unchanged.
    """
    # Guard: no preceding token to inspect.
    if index <= 0:
        return tokens[0]

    previous = index - 1
    is_content_word = is_noun(word) or is_adjective(word) or is_adverb(word)
    if not is_content_word:
        return tokens[previous]

    if tokens[previous] in ('a', 'an'):
        # noun.article returns e.g. "an apple"; keep only the article.
        return noun.article(word).split()[0]

    return tokens[previous]
Exemplo n.º 19
0
def simplify_word(a):
    """Return a simplified form of *a*, falling back to the word itself.

    Order of attempts: keep words that are already a known part of
    speech, then try the verb's present tense, then the noun's singular.
    Unclassifiable words are recorded in the module-level otherwordlist.
    """
    # Already recognizable as noun/verb/adjective/adverb/connective.
    if en.is_noun(a) or en.is_verb(a) or en.is_adjective(a) or en.is_adverb(a) or en.is_connective(a):
        return a
    # Verb route; en.verb.present raises for unknown words. Computed once
    # (the original called it twice and discarded the en.is_verb result).
    try:
        present = en.verb.present(a)
        en.is_verb(present)  # result unused in the original; kept for identical behavior
        return present
    except Exception:  # narrowed from a bare except:
        pass

    # Noun route: singularize once and re-check.
    singular = en.noun.singular(a)
    if en.is_noun(singular):
        return singular
    # Unclassifiable: remember it and return unchanged.
    otherwordlist.append(a)
    return a
Exemplo n.º 20
0
 def getcategory(self,word):
     """Return a WordNet-style POS tag for *word*: 'v', 'n', 'a' or None.

     Verbs take priority: if en.verb.present succeeds and is truthy the
     word is treated as a verb.
     """
     try:
         if(en.verb.present(word)):
             return("v")
     except Exception:  # narrowed from a bare except: present() raises on unknown words
         pass

     #Check if it is a noun
     if(en.is_noun(word)):
         return("n")

     #Check if it is an adjective
     elif(en.is_adjective(word)):
         return("a")

     else:
         return(None)
Exemplo n.º 21
0
    def getcategory(self, word):
        """Return a WordNet-style POS tag for *word*: 'v', 'n', 'a' or None.

        Verbs get the highest priority: a successful (truthy)
        en.verb.present lookup classifies the word as a verb.
        """
        try:
            if (en.verb.present(word)):
                return ("v")
        except Exception:  # narrowed from a bare except: present() raises on unknown words
            pass

        #Check if it is a noun
        if (en.is_noun(word)):
            return ("n")

        #Check if it is an adjective
        elif (en.is_adjective(word)):
            return ("a")

        else:
            return (None)
Exemplo n.º 22
0
def convertVerb(srclst):
    dstlst = []
    itemnew = ""
    for item in srclst:
        #print(item)  ############################when nos lib give error
        #if (item.endswith("ed") or item.endswith("ing")) \
        if en.is_verb(item) \
            and (not en.is_noun(item)) \
            and (not en.is_adjective(item)) \
            and (not en.is_adverb(item)) \
            and (item not in WIERDWORDS):
            try:
                itemnew = en.verb.present(item)
            except:
                print "unrecognized word:", item
                itemnew = item
        else:
            itemnew = item
        dstlst.append(itemnew)
    return dstlst
Exemplo n.º 23
0
def simplify_word(a):
    """Return a simplified form of *a*, falling back to the word itself.

    Order of attempts: verb present tense, noun singular, then the word
    as-is if it is a recognized part of speech. Unclassifiable words are
    appended to the module-level otherwordlist.
    """
    # Verb route; en.verb.present raises for unknown words. Computed once
    # (the original called it twice and discarded the en.is_verb result).
    try:
        present = en.verb.present(a)
        en.is_verb(present)  # result unused in the original; kept for identical behavior
        return present
    except Exception:  # narrowed from a bare except:
        pass

    # Noun route: singularize once and re-check.
    singular = en.noun.singular(a)
    if en.is_noun(singular):
        return singular

    # Already recognizable as noun/verb/adjective/adverb/connective.
    if en.is_noun(a) or en.is_verb(a) or en.is_adjective(a) or en.is_adverb(a) or en.is_connective(a):
        return a

    # Unclassifiable: remember it and return unchanged.
    otherwordlist.append(a)
    return a
Exemplo n.º 24
0
def translate_x_of_assertion(brain,a):
    """Render an "X_of"-style assertion as a natural-language sentence.

    The relation's last three characters (assumed "_of") are dropped to
    obtain a prefix; its part of speech picks the sentence template:
    noun, verb (with owner), or adjective.

    NOTE(review): 'brain', 'a', get_tense, is_plural, list_*_naturally and
    add_end_marks are project helpers not visible here -- semantics
    assumed from usage.
    """
    # Drop the trailing "_of" to get the relation's head word.
    prefix = a.relation[:-3]
    prefix_article = en.noun.article(prefix)
    # prefix_article_only = prefix_article.split(" ")[0]
    verb = get_tense(a, "was", a.l, brain)

    toReturn = ""
    if en.is_noun(en.noun.singular(prefix)):
        # Noun prefix: agree in number with the left-hand side.
        if is_plural(a.l, brain):
            prefix_article = en.noun.plural(prefix)
        toReturn = list_concepts_naturally(brain,a.l) + " "+verb+" " + prefix_article + " of " + list_words_naturally(a.r)
    elif en.is_verb(en.verb.infinitive(prefix)) and en.verb.infinitive(prefix) !="":
        # Verb prefix: name the owner when the assertion carries one.
        if hasattr(a,"owner") and len(a.owner)>0:
            owner = list_concepts_naturally(brain, a.owner)
        else:
            owner = "everyone"
        toReturn = list_concepts_naturally(brain, a.l) + " "+prefix +" "+owner+ " of " + list_concepts_naturally(brain, a.r)
    elif en.is_adjective(prefix):
        # TODO for capable_of >> deal with action, action_object, action_recipient...
        # Similar for used_for >> when used_for is action / verbs
        toReturn = list_concepts_naturally(brain,a.l) + " "+verb+" " + prefix + " of " + list_words_naturally(a.r)
    # NOTE(review): when no branch matches, toReturn stays "" and only the
    # end marks are appended.
    toReturn = add_end_marks(a, toReturn)
    return toReturn
Exemplo n.º 25
0
def get_adj(tokens_tagged):
    """Extract adjectives from POS-tagged sentences.

    *tokens_tagged* is a list of sentences, each a list of (word, tag)
    pairs. Tokens tagged 'J*' and starting with a letter are collected,
    counted case-insensitively, filtered through en.is_adjective, and
    returned as [word, count, occurrences] triples sorted by frequency
    (descending).
    """
    non_alpha = re.compile(r'[^a-zA-Z]')
    adj = []
    for sent_idx, sentence in enumerate(tokens_tagged):
        for word_idx, (w, t) in enumerate(sentence):
            if t and t.startswith('J') and not non_alpha.match(w):
                adj.append([w, t, unicode(w.lower()), sent_idx, word_idx])

    # Frequency of each lowercased form.
    counts = defaultdict(int)
    for entry in adj:
        counts[entry[2]] += 1

    ranked = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
    ranked = [(w, c) for (w, c) in ranked if en.is_adjective(w)]

    adj_all = []
    for (w, c) in ranked:
        occurrences = [entry for entry in adj if entry[2] == w]
        adj_all.append([w, c, occurrences])

    return adj_all
    def __init__(self, w, isTop): #maybe add time of post, what subreddit it came from?
        """Count parts of speech in word list *w*.

        Per-instance counters (verb/noun/adjective/connective/other) are
        filled, and the module-level top* or bot* tallies are updated
        depending on *isTop*.
        """
        self.words = w
        # Per-instance counters (stray semicolons from the original removed).
        self.verbCount = 0
        self.nounCount = 0
        self.adjCount = 0
        self.connectiveCount = 0
        self.other = 0

        # Module-level tallies. (The original declared 'global topVerb'
        # twice; the duplicate is removed.)
        global topVerb
        global topNoun
        global topAdj
        global topCon
        global topOther
        global topCount

        global botVerb
        global botNoun
        global botAdj
        global botCon
        global botOther
        global botCount

        self.count = 0
        for word in self.words:
            self.count += 1
            fixedWord = unicode(word).lower()
            if en.is_verb(fixedWord):
                if(isTop):
                    topVerb += 1
                else:
                    botVerb += 1

                self.verbCount += 1
            elif en.is_noun(fixedWord):
                if(isTop):
                    topNoun += 1
                else:
                    botNoun += 1

                self.nounCount += 1
            elif en.is_adjective(fixedWord):
                if(isTop):
                    topAdj += 1
                else:
                    botAdj += 1

                self.adjCount += 1
            elif en.is_connective(fixedWord):
                if(isTop):
                    topCon += 1
                else:
                    botCon += 1

                self.connectiveCount += 1
            else:
                if(isTop):
                    topOther += 1
                else:
                    botOther += 1

                self.other += 1
        if isTop:
            topCount += self.count
        else:
            botCount += self.count
Exemplo n.º 27
0
import re
import en

if __name__ == "__main__":
    # Smoke-test a few part-of-speech helpers from the 'en' toolkit.
    sentence = "The day after today, before yesterday. And in pase years, later"
    print(en.is_adjective("accomplished"))
    print(en.is_noun("wizard"))
    print(en.is_verb("accomplish"))
    print(en.parser.sentence_tag(sentence))
    # Result intentionally discarded, as in the original.
    en.parser.matches(sentence, "JJ NN")
Exemplo n.º 28
0
def get_frequncy_dist(dir_path):
    """Build a filtered word-frequency distribution from .srt files.

    Pipeline: count alphabetic words in every .srt file under *dir_path*;
    lemmatize; drop stop words / names / Swadesh words; keep only words in
    WordNet; drop unwanted POS tags (folding some inflections onto their
    base form); drop basic-English vocabulary and a personal word list.
    Every discarded word is logged to <dir_path>\\wfd.csv with a reason,
    and surviving words are written there as 'lexicon' rows.

    Returns the final {word: frequency} dict.

    NOTE(review): path handling uses '\\\\' separators, so this is
    Windows-only as written. 'logger', 'punctuation', base.app_root and
    lexical_diversity_for_freq come from elsewhere in the project.
    """
    files = os.listdir(dir_path)

    all_words = 0
    words_wt_freq = {}   
    '''get words'''
    for filename in files:
        if (filename.endswith('.srt')):
            file_handler = open(dir_path + '\\' + filename, 'r')
            for line in file_handler :
                for word in line.strip().split():
                    sword = word.strip(punctuation)
                    if (sword.isalpha()):
                        lword = sword.lower()
                        words_wt_freq[lword] = words_wt_freq.get(lword, 0) + 1
                        all_words += 1
            file_handler.close()
    logger.debug('# all words: ' + str (all_words - 1))
    logger.debug('# unique words: ' + str (len(words_wt_freq.keys())))
    lexical_diversity_for_freq(words_wt_freq.values())
    
    # Fold inflected forms onto their WordNet lemma, summing frequencies.
    lemmatized_words_wt_freq = {}
    for word in words_wt_freq.keys():
        lemmatized_word = nltk.WordNetLemmatizer().lemmatize(word)
        if (word != lemmatized_word and lemmatized_word != None):
            lemmatized_words_wt_freq[lemmatized_word] = lemmatized_words_wt_freq.get(lemmatized_word, 0) + words_wt_freq.get(word)
            #print(lemmatized_word, word)
        else:
            lemmatized_words_wt_freq[word] = words_wt_freq.get(word)
    lemmatized_size = len(lemmatized_words_wt_freq.keys())            
    logger.debug ('# words after lemmatized: ' + str (lemmatized_size) + " diff: " + str (len(words_wt_freq.keys()) - lemmatized_size))
    lexical_diversity_for_freq(lemmatized_words_wt_freq.values())
    words_wt_freq = {} # Save memory

    
    # Build the ignore list: stop words, first names, Swadesh core vocabulary.
    stopwords_en = stopwords.words('english')
    male_names = names.words('male.txt')
    female_names = names.words('female.txt')
    comparative = swadesh.words('en')
    ignore_list = [] ;
    ignore_list.extend(stopwords_en)
    ignore_list.extend(male_names)
    ignore_list.extend(female_names)
    ignore_list.extend(comparative)            
    filtered_words = []

    # Discarded words are logged to the CSV with the reason for removal.
    out_file = open(dir_path + '\\wfd.csv', 'w')
    out_file.write ('Word, Type, Frequency \n')
        
    for word in lemmatized_words_wt_freq.keys():
        if len(word) > 2 and word not in ignore_list:
            filtered_words.append(word)   
        else:
            out_file.write(word + ',stop words,' + str(lemmatized_words_wt_freq.get(word)) + '\n')
    logger.debug ('# words after filtering stop words: ' + str (len(filtered_words)) + " diff: " + str (len(lemmatized_words_wt_freq.keys()) - len(filtered_words)))
    ignore_list = [] #save memory

    '''wordnet has 155k'''                                 
    usual_words = []
    for word in  filtered_words:
        if (len(wordnet.synsets(word)) != 0):
            usual_words.append(word)
        else:
            out_file.write(word + ',not in wordnet,' + str(lemmatized_words_wt_freq.get(word)) + '\n')
    logger.debug ('# words after filtering unused words: ' + str (len(usual_words)) + " diff: " + str (lemmatized_size - len(usual_words)))
    filtered_words = [] # save memory 

    # Keep adverbs/adjectives/verbs/nouns; fold a few inflections
    # (plural/3rd-person -s, -ing, superlative -est) onto base forms.
    tag_filtered_words_wt_freq = {}
    words_wt_tags = nltk.pos_tag(usual_words)
    for (word, tag) in words_wt_tags:
        if (tag not in ['EX', 'DET', 'CNJ', 'FW', 'MD', 'NP', 'NUM', 'PRO', 'P', 'TO', 'UH', 'WH', 'WP', 'NNP', 'MOD']):
            if(en.is_adverb(word)):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]  
                #print ('ADV,' + word)
            elif (en.is_adjective(word)):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]  
                #print ('ADJ,' + word)
            elif (en.is_verb(word)):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]  
                #print ('VB,' + word)
            elif (en.is_noun(word)):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]  
                #print ('N,' + word) 
            else:
                if (tag in ['VBZ', 'NNS']):
                    if word.endswith('s'):
                        new_word = word[:-1]
                        tag_filtered_words_wt_freq[new_word] = lemmatized_words_wt_freq[word] + tag_filtered_words_wt_freq.get(new_word, 0)
                        #print (word , new_word,tag)    
                elif (tag == 'VBG'):
                    new_word = en.verb.infinitive(word)
                    if new_word != None and word != new_word:
                        tag_filtered_words_wt_freq[new_word] = lemmatized_words_wt_freq[word] + tag_filtered_words_wt_freq.get(new_word, 0)
                elif (tag == 'JJS'):
                    if word.endswith('est'):
                        new_word = word[:-3]
                        tag_filtered_words_wt_freq[new_word] = lemmatized_words_wt_freq[word] + tag_filtered_words_wt_freq.get(new_word, 0)     
                else:
                    tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]        
                    #print (word,tag)   
        else:
            out_file.write(word + ',unwanted pos,' + str(lemmatized_words_wt_freq.get(word)) + '\n')
    logger.debug ('# words after filtering unwanted pos:' + str (len(tag_filtered_words_wt_freq.keys())) + " diff: " + str (len(usual_words) - len(tag_filtered_words_wt_freq.keys())))
    lexical_diversity_for_freq(tag_filtered_words_wt_freq.values())
    lemmatized_words_wt_freq = {} # save memory
    usual_words = [] #save memory

    # Remove words in en's basic-English vocabulary.
    basic_english_vocab = en.basic.words
    non_basic_words = set(tag_filtered_words_wt_freq.keys()).difference(basic_english_vocab)
    non_basic_words_wt_freq = {}
    for non_basic_word in non_basic_words:
        non_basic_words_wt_freq[non_basic_word] = tag_filtered_words_wt_freq[non_basic_word] 
    words_in_both = set(tag_filtered_words_wt_freq.keys()).intersection(basic_english_vocab)
    for word in words_in_both:
        out_file.write(word + ',en.basic.words,' + str(tag_filtered_words_wt_freq.get(word)) + '\n')
    logger.debug ('# words after filtering basic words: ' + str (len(non_basic_words_wt_freq.keys())) + " diff: " + str (len(tag_filtered_words_wt_freq.keys()) - len(non_basic_words_wt_freq.keys())))
    lexical_diversity_for_freq(non_basic_words_wt_freq.values())
    tag_filtered_words_wt_freq = {} #save memory


    # Finally, remove words from the user's own basic_words.csv list.
    fh = open(os.path.join(base.app_root(), 'etc\\basic_words.csv'), 'r')
    my_words = [word.lower() for line in fh for word in line.strip().split()]
    fh.close()
    new_words = set(non_basic_words).difference(my_words)
    words_in_both = set(non_basic_words).intersection(my_words)
    for word in words_in_both:
        out_file.write(word + ',en.basic.words.mine,' + str(non_basic_words_wt_freq.get(word)) + '\n')    
    new_words_wt_freq = {}
    for new_word in new_words:
        new_words_wt_freq[new_word] = non_basic_words_wt_freq[new_word] 
    logger.debug ('# words after filtering my words: ' + str (len(new_words_wt_freq.keys())) + " diff: " + str (len(non_basic_words_wt_freq.keys()) - len(new_words_wt_freq.keys())))
    lexical_diversity_for_freq(new_words_wt_freq.values())
    
    # Survivors are written out sorted by (frequency, word).
    sorted_words = sorted(new_words_wt_freq.items(), key=itemgetter(1, 0))
    for (word, frequency) in sorted_words:
        out_file.write (word + ',lexicon,' + str(frequency) + '\n')
    out_file.close()
    
    return new_words_wt_freq
Exemplo n.º 29
0
 def is_major(word):
     """Truthy when *word* is a verb, adjective, adverb, or modal verb.

     Mirrors the original or-chain: each check runs only if the previous
     ones were falsy, and the first truthy result is what gets returned.
     """
     result = en.is_verb(word)
     if not result:
         result = en.is_adjective(word)
     if not result:
         result = en.is_adverb(word)
     if not result:
         result = word in MODAL_VERBS
     return result
Exemplo n.º 30
0
def autoPlural(word):
	"""Pluralize *word*: adjective pluralization for adjectives,
	noun pluralization for everything else."""
	if not en.is_adjective(word):
		return en.plural.noun_plural(word)
	return en.plural.adjective_plural(word)
Exemplo n.º 31
0
def valid_pos(word):
    """Accept words that are a known part of speech or at least 7 chars.

    Returns False only for short words (< 7 characters) that are not a
    noun, verb, adjective, or adverb.
    """
    known_pos = (is_noun(word) or is_verb(word) or is_adjective(word)
                 or is_adverb(word))
    if not known_pos and len(word) < 7:
        return False

    return True