import en  # NodeBox Linguistics English module, used by all functions below


def dict_ingest(path_to_dict):
    """Bucket the words in a dictionary file (one word per line) by part of speech."""
    noun = []
    verb = []
    adjective = []
    adverb = []
    miscel = []
    f = open(path_to_dict, 'r')
    for l in f:
        word = l.strip()
        if en.is_noun(word):
            noun.append(word)
        elif en.is_verb(word):
            verb.append(word)
        elif en.is_adjective(word):
            adjective.append(word)
        elif en.is_adverb(word):
            adverb.append(word)
        else:
            miscel.append(word)
    f.close()
    print noun[:5]
    print verb[:5]
    print adjective[:5]
    print adverb[:5]
    print miscel[:5]
    return noun, verb, adjective, adverb, miscel
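# A minimal usage sketch for dict_ingest, assuming a hypothetical plain-text
# word list at 'wordlist.txt'; the function prints a sample of each bucket
# and returns the five lists.
nouns, verbs, adjectives, adverbs, misc = dict_ingest('wordlist.txt')
print len(nouns), "nouns,", len(verbs), "verbs,", len(misc), "unclassified"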
def simplify_word(a):
    # print "[{0}] analyzing word: {1}".format(time.ctime().split()[3], a),
    # Test whether the word is a verb; if so, return its present tense.
    try:
        try_present_verb = en.verb.present(a)
        if en.is_verb(try_present_verb):
            return try_present_verb
    except:  # not a verb; keep checking
        pass
    # Test whether the word is a noun; if so, return its singular form.
    try_singular_noun = en.noun.singular(a)
    if en.is_noun(try_singular_noun):
        return try_singular_noun
    # If the word is already recognized as a noun, verb, adjective,
    # adverb, or connective, return it unchanged.
    if en.is_noun(a) or en.is_verb(a) or en.is_adjective(a) \
            or en.is_adverb(a) or en.is_connective(a):
        return a
    return ''
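# Expected behavior of simplify_word on a few tokens (outputs assume the
# standard NodeBox word lists):
print simplify_word('running')  # -> 'run'    (verb reduced to present tense)
print simplify_word('apples')   # -> 'apple'  (noun reduced to singular)
print simplify_word('qwerty')   # -> ''       (not in the lexicon)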
def is_a_expression(self, word):
    return self.is_a_hash_tag(word) \
        or self.is_negation(word) \
        or en.is_noun(word) \
        or en.is_adjective(word) \
        or en.is_verb(word) \
        or en.is_adverb(word) \
        or self.is_orality(word)
def giveNearestEmotion(self, word):
    if en.is_verb(word):
        return en.verb.is_emotion(word, boolean=False)
    if en.is_adverb(word):
        return en.adverb.is_emotion(word, boolean=False)
    if en.is_adjective(word):
        return en.adjective.is_emotion(word, boolean=False)
    return en.noun.is_emotion(word, boolean=False)
def get_gloss(word):
    if en.is_verb(word):
        return en.verb.gloss(word)
    elif en.is_adjective(word):
        return en.adjective.gloss(word)
    elif en.is_adverb(word):
        return en.adverb.gloss(word)
    elif en.is_noun(word):
        return en.noun.gloss(word)
    else:
        return en.wordnet.gloss(word)
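# Example: get_gloss returns the dictionary definition matching the word's
# detected part of speech (the exact wording depends on the bundled WordNet
# data).
print get_gloss('run')    # verb sense of "run"
print get_gloss('happy')  # adjective sense of "happy"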
def get_article(word, tokens, index):
    article_index = index - 1
    if index <= 0:
        return tokens[0]
    if not is_noun(word) and not is_adjective(word) and not is_adverb(word):
        return tokens[article_index]
    if tokens[article_index] == 'a' or tokens[article_index] == 'an':
        proper_article = noun.article(word).split()[0]
        return proper_article
    return tokens[article_index]
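# A small sketch for get_article, assuming the bare is_noun/noun names come
# from a star import of the en module; the indefinite article preceding a
# noun is corrected via noun.article().
tokens = ['a', 'apple', 'fell']
print get_article('apple', tokens, 1)  # -> 'an'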
def simplify_word(a):
    # If the word is already recognized as a noun, verb, adjective,
    # adverb, or connective, return it unchanged.
    if en.is_noun(a) or en.is_verb(a) or en.is_adjective(a) \
            or en.is_adverb(a) or en.is_connective(a):
        return a
    try:
        # Test whether it is a verb; if so, return its present tense.
        present = en.verb.present(a)
        if en.is_verb(present):
            return present
    except:  # not a verb; keep checking
        pass
    # Test whether it is a noun; if so, return its singular form.
    if en.is_noun(en.noun.singular(a)):
        return en.noun.singular(a)
    # otherwordlist is a module-level list collecting unrecognized words.
    otherwordlist.append(a)
    # print a
    return a
def convertVerb(srclst):
    dstlst = []
    itemnew = ""
    for item in srclst:
        # Normalize pure verbs to their present tense; skip anything that can
        # also be a noun, adjective, or adverb, and skip known problem words.
        # (An earlier heuristic tested item.endswith("ed")/("ing") for cases
        # where the NodeBox lib raised errors.)
        if en.is_verb(item) \
                and (not en.is_noun(item)) \
                and (not en.is_adjective(item)) \
                and (not en.is_adverb(item)) \
                and (item not in WIERDWORDS):
            try:
                itemnew = en.verb.present(item)
            except:
                print "unrecognized word:", item
                itemnew = item
        else:
            itemnew = item
        dstlst.append(itemnew)
    return dstlst
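# Trying convertVerb with a hypothetical empty WIERDWORDS blacklist; verbs
# collapse to present tense while words that double as nouns pass through.
WIERDWORDS = []
print convertVerb(['walked', 'tables', 'gave'])  # expected: ['walk', 'tables', 'give']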
def simplify_word(a):
    try:
        # Test whether it is a verb; if so, return its present tense.
        present = en.verb.present(a)
        if en.is_verb(present):
            return present
    except:  # not a verb; keep checking
        pass
    # Test whether it is a noun; if so, return its singular form.
    if en.is_noun(en.noun.singular(a)):
        return en.noun.singular(a)
    # If the word is already recognized as a noun, verb, adjective,
    # adverb, or connective, return it unchanged.
    if en.is_noun(a) or en.is_verb(a) or en.is_adjective(a) \
            or en.is_adverb(a) or en.is_connective(a):
        return a
    # otherwordlist is a module-level list collecting unrecognized words.
    otherwordlist.append(a)
    return a
def valid_pos(word):
    if not is_noun(word) and not is_verb(word) and not is_adjective(word) \
            and not is_adverb(word) and len(word) < 7:
        return False
    return True
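# Quick checks for valid_pos: short words with no recognized part of speech
# are rejected, everything else passes (expected outputs):
print valid_pos('dog')    # -> True  (recognized noun)
print valid_pos('xyzzy')  # -> False (no POS match and shorter than 7 chars)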
def is_major(word):
    return en.is_verb(word) or en.is_adjective(word) or \
        en.is_adverb(word) or (word in MODAL_VERBS)
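# is_major needs a MODAL_VERBS collection from the surrounding module; a
# hypothetical definition for illustration:
MODAL_VERBS = set(['can', 'could', 'may', 'might', 'must',
                   'shall', 'should', 'will', 'would'])
print is_major('quickly')  # -> True  (adverb)
print is_major('would')    # -> True  (modal verb)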
# Assumed context for the function below: `logger`, `base.app_root()`, and
# lexical_diversity_for_freq() are provided by the surrounding module.
import os
import nltk
from string import punctuation
from operator import itemgetter
from nltk.corpus import stopwords, names, swadesh, wordnet


def get_frequncy_dist(dir_path):
    files = os.listdir(dir_path)
    all_words = 0
    words_wt_freq = {}
    # get words
    for filename in files:
        if filename.endswith('.srt'):
            file_handler = open(dir_path + '\\' + filename, 'r')
            for line in file_handler:
                for word in line.strip().split():
                    sword = word.strip(punctuation)
                    if sword.isalpha():
                        lword = sword.lower()
                        words_wt_freq[lword] = words_wt_freq.get(lword, 0) + 1
                        all_words += 1
            file_handler.close()
    logger.debug('# all words: ' + str(all_words - 1))
    logger.debug('# unique words: ' + str(len(words_wt_freq.keys())))
    lexical_diversity_for_freq(words_wt_freq.values())

    # Collapse inflected forms with the WordNet lemmatizer.
    lemmatized_words_wt_freq = {}
    for word in words_wt_freq.keys():
        lemmatized_word = nltk.WordNetLemmatizer().lemmatize(word)
        if word != lemmatized_word and lemmatized_word != None:
            lemmatized_words_wt_freq[lemmatized_word] = \
                lemmatized_words_wt_freq.get(lemmatized_word, 0) + words_wt_freq.get(word)
        else:
            lemmatized_words_wt_freq[word] = words_wt_freq.get(word)
    lemmatized_size = len(lemmatized_words_wt_freq.keys())
    logger.debug('# words after lemmatized: ' + str(lemmatized_size) +
                 " diff: " + str(len(words_wt_freq.keys()) - lemmatized_size))
    lexical_diversity_for_freq(lemmatized_words_wt_freq.values())
    words_wt_freq = {}  # save memory

    # Drop stop words, given names, and the Swadesh comparative list.
    stopwords_en = stopwords.words('english')
    male_names = names.words('male.txt')
    female_names = names.words('female.txt')
    comparative = swadesh.words('en')
    ignore_list = []
    ignore_list.extend(stopwords_en)
    ignore_list.extend(male_names)
    ignore_list.extend(female_names)
    ignore_list.extend(comparative)
    filtered_words = []
    out_file = open(dir_path + '\\wfd.csv', 'w')
    out_file.write('Word, Type, Frequency \n')
    for word in lemmatized_words_wt_freq.keys():
        if len(word) > 2 and word not in ignore_list:
            filtered_words.append(word)
        else:
            out_file.write(word + ',stop words,' + str(lemmatized_words_wt_freq.get(word)) + '\n')
    logger.debug('# words after filtering stop words: ' + str(len(filtered_words)) +
                 " diff: " + str(len(lemmatized_words_wt_freq.keys()) - len(filtered_words)))
    ignore_list = []  # save memory

    # wordnet has ~155k words; keep only words it knows.
    usual_words = []
    for word in filtered_words:
        if len(wordnet.synsets(word)) != 0:
            usual_words.append(word)
        else:
            out_file.write(word + ',not in wordnet,' + str(lemmatized_words_wt_freq.get(word)) + '\n')
    logger.debug('# words after filtering unused words: ' + str(len(usual_words)) +
                 " diff: " + str(lemmatized_size - len(usual_words)))
    filtered_words = []  # save memory

    # Keep content words; fold a few inflected tags back onto their base form.
    tag_filtered_words_wt_freq = {}
    words_wt_tags = nltk.pos_tag(usual_words)
    for (word, tag) in words_wt_tags:
        if tag not in ['EX', 'DET', 'CNJ', 'FW', 'MD', 'NP', 'NUM', 'PRO',
                       'P', 'TO', 'UH', 'WH', 'WP', 'NNP', 'MOD']:
            if en.is_adverb(word):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]
            elif en.is_adjective(word):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]
            elif en.is_verb(word):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]
            elif en.is_noun(word):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]
            else:
                if tag in ['VBZ', 'NNS']:
                    if word.endswith('s'):
                        new_word = word[:-1]
                        tag_filtered_words_wt_freq[new_word] = \
                            lemmatized_words_wt_freq[word] + tag_filtered_words_wt_freq.get(new_word, 0)
                elif tag == 'VBG':
                    new_word = en.verb.infinitive(word)
                    if new_word != None and word != new_word:
                        tag_filtered_words_wt_freq[new_word] = \
                            lemmatized_words_wt_freq[word] + tag_filtered_words_wt_freq.get(new_word, 0)
                elif tag == 'JJS':
                    if word.endswith('est'):
                        new_word = word[:-3]
                        tag_filtered_words_wt_freq[new_word] = \
                            lemmatized_words_wt_freq[word] + tag_filtered_words_wt_freq.get(new_word, 0)
                else:
                    tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]
        else:
            out_file.write(word + ',unwanted pos,' + str(lemmatized_words_wt_freq.get(word)) + '\n')
    logger.debug('# words after filtering unwanted pos:' + str(len(tag_filtered_words_wt_freq.keys())) +
                 " diff: " + str(len(usual_words) - len(tag_filtered_words_wt_freq.keys())))
    lexical_diversity_for_freq(tag_filtered_words_wt_freq.values())
    lemmatized_words_wt_freq = {}  # save memory
    usual_words = []  # save memory

    # Remove Ogden's basic English vocabulary.
    basic_english_vocab = en.basic.words
    non_basic_words = set(tag_filtered_words_wt_freq.keys()).difference(basic_english_vocab)
    non_basic_words_wt_freq = {}
    for non_basic_word in non_basic_words:
        non_basic_words_wt_freq[non_basic_word] = tag_filtered_words_wt_freq[non_basic_word]
    words_in_both = set(tag_filtered_words_wt_freq.keys()).intersection(basic_english_vocab)
    for word in words_in_both:
        out_file.write(word + ',en.basic.words,' + str(tag_filtered_words_wt_freq.get(word)) + '\n')
    logger.debug('# words after filtering basic words: ' + str(len(non_basic_words_wt_freq.keys())) +
                 " diff: " + str(len(tag_filtered_words_wt_freq.keys()) - len(non_basic_words_wt_freq.keys())))
    lexical_diversity_for_freq(non_basic_words_wt_freq.values())
    tag_filtered_words_wt_freq = {}  # save memory

    # Remove the user's own known-word list.
    fh = open(os.path.join(base.app_root(), 'etc\\basic_words.csv'), 'r')
    my_words = [word.lower() for line in fh for word in line.strip().split()]
    fh.close()
    new_words = set(non_basic_words).difference(my_words)
    words_in_both = set(non_basic_words).intersection(my_words)
    for word in words_in_both:
        out_file.write(word + ',en.basic.words.mine,' + str(non_basic_words_wt_freq.get(word)) + '\n')
    new_words_wt_freq = {}
    for new_word in new_words:
        new_words_wt_freq[new_word] = non_basic_words_wt_freq[new_word]
    logger.debug('# words after filtering my words: ' + str(len(new_words_wt_freq.keys())) +
                 " diff: " + str(len(non_basic_words_wt_freq.keys()) - len(new_words_wt_freq.keys())))
    lexical_diversity_for_freq(new_words_wt_freq.values())

    sorted_words = sorted(new_words_wt_freq.items(), key=itemgetter(1, 0))
    for (word, frequency) in sorted_words:
        out_file.write(word + ',lexicon,' + str(frequency) + '\n')
    out_file.close()
    return new_words_wt_freq
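# A minimal usage sketch: point get_frequncy_dist at a directory of .srt
# subtitle files (hypothetical Windows path); it writes wfd.csv next to them
# and returns the filtered word -> frequency map.
rare_words = get_frequncy_dist('C:\\subtitles')
for word, freq in sorted(rare_words.items(), key=itemgetter(1), reverse=True)[:10]:
    print word, freq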