def __freqs_dict(self, raw_text):
    t_start = time()
    print('Making filtered text...')
    stopset = set(stopwords.words('russian'))
    ad = AlphabetDetector()
    # Lower-case before filtering so capitalized stop words are caught too.
    tokens = [t.lower() for t in word_tokenize(raw_text)]
    tokens_filtered = [w for w in tokens
                       if w not in stopset
                       and w not in self.__custom_stopwords
                       and w.isalpha()
                       and len(w) >= self.__min_word_len
                       and ad.is_cyrillic(w)]
    freqs_tokenized_text = FreqDist(tokens_filtered)
    freqs_most_common = OrderedDict(freqs_tokenized_text.most_common(self.__max_words))
    # Build a string that repeats each word as many times as it occurs.
    res_text = ''
    for word, freq in freqs_most_common.items():
        res_text += (word + ' ') * freq
    t_end = time()
    print("TIME = %.2f s" % (t_end - t_start))
    return res_text
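A minimal standalone sketch of the same filtering pipeline outside the class, assuming NLTK's Russian stop-word list and the alphabet-detector package are available; custom_stopwords, min_word_len, max_words and raw_text are hypothetical stand-ins for the class attributes used above.

from collections import OrderedDict
from nltk import FreqDist, word_tokenize
from nltk.corpus import stopwords
from alphabet_detector import AlphabetDetector

# Hypothetical stand-ins for the class attributes used above.
custom_stopwords = {'это'}
min_word_len = 3
max_words = 50

raw_text = 'Пример текста для примера частотного словаря текста'
stopset = set(stopwords.words('russian'))
ad = AlphabetDetector()
tokens = [t.lower() for t in word_tokenize(raw_text)]
filtered = [w for w in tokens
            if w not in stopset and w not in custom_stopwords
            and w.isalpha() and len(w) >= min_word_len and ad.is_cyrillic(w)]
freqs = OrderedDict(FreqDist(filtered).most_common(max_words))
print(freqs)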
def answer_six():
    from nltk.book import FreqDist
    # moby_tokens is assumed to be defined earlier in the notebook.
    dictionary = FreqDist(moby_tokens)
    sorted_dict = sorted(dictionary.items(), key=lambda x: x[1], reverse=True)
    result = [(v, k) for k, v in sorted_dict if v > 2000 and k.isalpha()]
    return result
def answer_six():
    words = [word for word in moby_tokens if word.isalpha()]
    dist = FreqDist(words)
    s = pd.Series(data=dist)
    s = s.sort_values(ascending=False)
    s = s.where(s > 2000).dropna()
    return list(zip(s, s.index))[:20]
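The answer_* functions here assume a pre-built moby_tokens list; a hedged sketch of how such a token list can be prepared, using NLTK's bundled copy of Moby Dick as a stand-in for the course's local moby.txt (an assumption).

import nltk
from nltk.corpus import gutenberg

# Assumed setup: the original notebook builds moby_tokens from a local moby.txt;
# NLTK's bundled Melville text is used here instead.
moby_raw = gutenberg.raw('melville-moby_dick.txt')
moby_tokens = nltk.word_tokenize(moby_raw)
print(len(moby_tokens))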
def extract_monthly_user_CV_and_num_tweets(user_list):
    # Collect cleaned tweet tokens per user and per month, then write one JSON line
    # per (user, month) with that user's word frequencies (one output file per month).
    user_monthly_tweets = defaultdict(lambda: defaultdict(list))
    cnt_all_tweets = 0
    with codecs.open(F_IN, 'r', encoding='utf8') as input_file:
        for line in input_file:
            cnt_all_tweets += 1
            line = line.split()
            user = line[0]
            #if user not in user_list:
            #    continue
            UTS = int(line[4])
            month = datetime.datetime.utcfromtimestamp(UTS).month
            tweet = line[5:]
            user_monthly_tweets[user][month] += clean(tweet)
            if cnt_all_tweets % 100000 == 0:
                print(tweet, clean(tweet))
                print("Processed %d tweets" % cnt_all_tweets)
    for user in user_monthly_tweets:
        for MO in user_monthly_tweets[user]:
            output_file = f_out_list[MO]
            usr_tweets_json = {}
            usr_tweets_json['_id'] = str(user)
            usr_tweets_json['txt'] = [{el[0]: el[1]}
                                      for el in FreqDist(user_monthly_tweets[user][MO]).items()]
            output_file.write(json.dumps(usr_tweets_json, ensure_ascii=False) + '\n')
def answer_five():
    from nltk.book import FreqDist
    dictionary = FreqDist(moby_tokens)
    sorted_dict = sorted(dictionary.items(), key=lambda x: len(x[0]), reverse=True)
    w, _ = sorted_dict[0]
    return (w, len(w))
def answer_eight():
    from nltk.book import FreqDist
    part_of_speech = nltk.pos_tag(moby_tokens)
    tags = [t for _, t in part_of_speech]
    dictionary = FreqDist(tags)
    sorted_dict = sorted(dictionary.items(), key=lambda x: x[1], reverse=True)
    return sorted_dict[:5]
def word_frenquency_static(data, rate):
    # Build the word frequency distribution for one rating value.
    texts = list(data[data['rate'] == rate]['review'])  # reviews with the given rating
    wordlist = []
    for text in texts:
        wordlist.extend(text.lower().strip().split())
    fdist = FreqDist(wordlist)
    return fdist
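A small usage sketch with a made-up DataFrame; the column names 'rate' and 'review' follow the function above, the data itself is illustrative only.

import pandas as pd
from nltk import FreqDist

# Illustrative data only.
data = pd.DataFrame({
    'rate': [5, 5, 1],
    'review': ['great phone great battery', 'great screen', 'bad battery'],
})
fdist_5 = word_frenquency_static(data, 5)
print(fdist_5.most_common(3))  # e.g. [('great', 3), ('phone', 1), ('battery', 1)]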
def answer_four():
    from nltk.book import FreqDist
    token_dict = FreqDist(moby_tokens)
    new_list = []
    for val in token_dict.keys():
        if len(val) > 5 and token_dict[val] > 150:
            new_list.append(val)
    new_list.sort()
    return new_list
def answer_three():
    from nltk.book import FreqDist
    import operator
    token_dict = FreqDist(moby_tokens)
    sorted_token_dict = sorted(token_dict.items(), key=operator.itemgetter(1))
    lst = sorted_token_dict[-20:]
    lst.reverse()
    return lst
def answer_four():
    from nltk.book import FreqDist
    token_dict = FreqDist(moby_tokens)
    res_lis = []
    for w in token_dict.keys():
        if len(w) > 5 and token_dict[w] > 150:
            res_lis.append(w)
    res_lis.sort()
    return res_lis
def get_most_common_words(string, common_words_number=settings.COMMON_WORDS_NUMBER):
    """
    * Sanitizes and removes all stop words
    * Returns a list with the most common words
    """
    cleaned_data = remove_stop_words(sanitize_string(string))
    common_words = FreqDist(cleaned_data).most_common(common_words_number)
    return [i[0] for i in common_words]
def get_most_common_words(string, common_words_number=3):
    """
    * Sanitizes and removes all stop words
    * Returns a list with the most common words
    """
    cleaned_data = sanitize_string(string)
    cleaned_data = remove_stop_words(cleaned_data)
    common_words = FreqDist(cleaned_data).most_common(common_words_number)
    return [i[0] for i in common_words]
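A hedged sketch of how the helpers used above might look; sanitize_string and remove_stop_words are project-specific, so these minimal versions are assumptions for illustration only.

import re
from nltk import FreqDist
from nltk.corpus import stopwords

def sanitize_string(string):
    # Assumed behaviour: lower-case and keep only alphabetic word tokens.
    return re.findall(r'[a-z]+', string.lower())

def remove_stop_words(tokens):
    # Assumed behaviour: drop English stop words.
    stopset = set(stopwords.words('english'))
    return [t for t in tokens if t not in stopset]

print(get_most_common_words('the cat sat on the cat mat', common_words_number=2))
# e.g. ['cat', 'sat']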
def answer_three():
    from nltk.book import FreqDist
    import operator
    token_dict = FreqDist(moby_tokens)
    sorted_token_dict = sorted(token_dict.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_token_dict[0:20]
def answer_five():
    from nltk.book import FreqDist
    token_dict = FreqDist(moby_tokens)
    max_len = 0
    max_word = ''
    for w in token_dict.keys():
        if len(w) > max_len:
            max_word = w
            max_len = len(w)
    return (max_word, max_len)
def answer_six():
    import operator
    from nltk.book import FreqDist
    token_dict = FreqDist(moby_tokens)
    res_lis = {}
    for w in token_dict.keys():
        if w.isalpha() and token_dict[w] > 2000:
            res_lis[w] = token_dict[w]
    sorted_res_list = sorted(res_lis.items(), key=operator.itemgetter(1), reverse=True)
    result = [(f, w) for w, f in sorted_res_list]
    return result
def answer_six():
    import operator
    from nltk.book import FreqDist
    token_dict = FreqDist(moby_tokens)
    liss = {}
    for w in token_dict.keys():
        if w.isalpha() and token_dict[w] > 2000:
            liss[w] = token_dict[w]
    sortedlist = sorted(liss.items(), key=operator.itemgetter(1), reverse=True)
    finale = [(f, w) for w, f in sortedlist]
    return finale
def extract_phrases(corpus, requested_terms=None, term_freq_threshold=1, spell_correction=False):
    bigram_docs = extract_bigrams(corpus, spell_correction)
    # Consolidate the bigrams of the corpus; the per-document bigrams are kept
    # because they are also needed in further processing.
    corpus_bigram = list(chain(*bigram_docs))
    # Frequency of every bigram in the corpus.
    corpus_freq = dict(FreqDist(corpus_bigram).items())
    corpus_bigram = None
    # Stitch related phrases in the corpus together.
    corpus_freq = stitch_related_phrases(bigram_docs, corpus_freq, term_freq_threshold)
    bigram_docs = None
    # Sort the corpus in descending order of frequency.
    corpus_freq = sorted(corpus_freq.items(), key=lambda x: x[1], reverse=True)
    corpus_freq = dict(corpus_freq)
    # Initialize stop words, including title-cased variants and punctuation.
    stop_words = nltk.corpus.stopwords.words('english')
    stop_words.extend(" ".join(stop_words).title().split())
    stop_words.extend(string.punctuation)
    stop_words = set(stop_words)
    phrases = list(corpus_freq)
    corpus_freq = None
    # Remove stop words from each phrase.
    phrases = list(map(lambda x: set(x).difference(stop_words), phrases))
    phrases = set(map(lambda x: tuple(x), phrases))
    if () in phrases:
        phrases.remove(())
    phrases = list(phrases)
    if requested_terms is not None:
        phrases = phrases[:requested_terms]
    # Find nouns among the remaining phrases.
    annotated_data = list(map(lambda x: nltk.pos_tag([*x]) if len(x) > 1 else 'NA', phrases))
    noun_terms = list(filter(lambda x: x is not None and len(x) > 1,
                             map(lambda x: filter_nouns(x), annotated_data)))
    return phrases, noun_terms
def exercise1():
    print("Part a")
    answer = {}
    # Find the plural nouns.
    plural_nouns = [w.lower() for (w, t) in brown.tagged_words() if t.startswith('NNS')]
    lemmatizer = nltk.WordNetLemmatizer()
    lemmatized_singular_nouns = [lemmatizer.lemmatize(w) for w in set(plural_nouns)]
    singular_nouns = [w.lower() for (w, t) in brown.tagged_words()
                      if t.startswith('NN') and w in lemmatized_singular_nouns]
    fdist_pnouns = FreqDist(plural_nouns)
    fdist_lsnouns = FreqDist(singular_nouns)
    # Keep plural forms that are more frequent than their singular form.
    for (singular_word, singular_count) in fdist_lsnouns.most_common(100):
        for (word, count) in fdist_pnouns.most_common(100):
            if singular_word == lemmatizer.lemmatize(word) and count > singular_count:
                answer[word] = count
    print(sorted(answer.items(), key=operator.itemgetter(1), reverse=True)[0:5])

    print("Part b")
    tags = [t for (w, t) in brown.tagged_words()]
    fdist_tags = FreqDist(tags)
    print(fdist_tags.most_common(5))

    print("Part c")
    for genre in ['humor', 'romance', 'government']:
        print("The genre is:", genre)
        preceding_tags = []
        four_grams = list(ngrams(brown.tagged_words(categories=genre), 4))
        for my_list in four_grams:
            if len(my_list) == 4 and my_list[3][1].startswith('NN'):
                preceding_tags.append(my_list[0][1] + " " + my_list[1][1] + " " + my_list[2][1])
        print("The most common tags which precede NN are:")
        fdist_mctags = FreqDist(preceding_tags)
        print(fdist_mctags.most_common(5))
def user_tweet_text_2_mongo(f_in, f_out):
    # Write one JSON line per user with that user's word frequencies.
    user_tweets = collect_tweet_text_per_user(f_in)
    output_file = codecs.open(f_out, 'w', encoding='utf8')
    for usr in user_tweets:
        usr_tweets_json = {}
        usr_tweets_json['_id'] = str(usr)
        usr_tweets_json['txt'] = [{el[0]: el[1]} for el in FreqDist(user_tweets[usr]).items()]
        output_file.write(json.dumps(usr_tweets_json, ensure_ascii=False) + '\n')
    print("User with most words had: ", len(max(user_tweets.values(), key=len)))
def answer_four():
    from nltk.book import FreqDist
    dictionary = FreqDist(moby_tokens)
    freqwords = [w for w in dictionary if len(w) > 5 and dictionary[w] > 150]
    return sorted(freqwords)
def answer_three():
    from nltk.book import FreqDist
    dictionary = FreqDist(moby_tokens)
    sorted_dict = sorted(dictionary.items(), key=lambda x: x[1], reverse=True)
    return sorted_dict[:20]
from nltk.tag import pos_tag

# t is assumed to hold the raw text of the input file, read earlier.
s = nltk.pos_tag(nltk.word_tokenize(t))
print(s)

# Remove the selected verb forms from the input.
file_without_verbs = [word for word, tag in s if tag not in ('VBG', 'VBZ', 'VBN')]
z = ' '.join(file_without_verbs)  # z is the text without those verbs
print(z)

s1 = nltk.pos_tag(nltk.word_tokenize(z))
print(s1)  # the selected verb forms no longer appear in the output

# Word frequency of the remaining words (FreqDist over the token list,
# not over the raw string, which would count characters instead).
fdist = FreqDist(file_without_verbs)
print(fdist)
q = fdist.most_common(5)
print(q)

def tokens(text):
    """Get all words from the corpus."""
    return re.findall('[a-z]+', text.lower())

WORD_COUNTS = collections.Counter(tokens(z))
print(WORD_COUNTS)
print(WORD_COUNTS.most_common(5))
def main():
    # Parse user input.
    parser = ap.ArgumentParser()
    parser.add_argument('-i', '--input', nargs='?', type=str, required=True, help='Input filename.')
    parser.add_argument('-c', '--concordance', nargs='?', type=str, default=None, help='Word concordance.')
    parser.add_argument('-d', '--dispersion', nargs='*', type=str, default=None, help='Word dispersion.')
    parser.add_argument('-f', '--frequency', nargs='?', type=int, default=None, help='Word frequency.')
    parser.add_argument('-a', '--acro', action='store_true', help='Acronyms only.')
    args = parser.parse_args()

    with open(args.input, 'r') as f:
        plain = f.read()
    plain = remove_comments(plain)
    words = nltk.word_tokenize(plain)
    if args.acro:
        words = [w for w in words if is_acro(w)]
    print('%d unique words out of %d total words.' % (len(set(words)), len(words)))
    text = nltk.Text(words)

    if args.concordance is not None:
        text.concordance(args.concordance)
        return
    if args.dispersion is not None:
        text.dispersion_plot(args.dispersion)
        return
    if args.frequency is not None:
        freq = FreqDist(text)
        for i, f in enumerate(freq.most_common(args.frequency)):
            print('%9d%9d %s' % (i, f[1], f[0]))
        freq.plot(args.frequency)
for classifier in label:
    g.write(str(classifier) + '\n')
else:
    for line in g:
        label.append(int(line.strip('\n')))
g.close()

big_feature_list = []
for feat in features:
    big_feature_list.append(' '.join(word for word in feat))

big_feature_list, label = shuffle(big_feature_list, label, random_state=5)
label = label.tolist()
#data = [features, opening_prices]

# Build a flat token corpus to inspect the most frequent terms.
corpus = []
for document in big_feature_list.tolist():
    corpus += document.split()
fdist = FreqDist(corpus)
#print(fdist.most_common(200))
#s = open('stopwords.txt', 'w')
#for pair in fdist.most_common(500):
#    s.write(str(pair[0]) + '\n')
#s.close()

xform = count_vect.fit_transform(big_feature_list)
xform = preprocessing.scale(xform, with_mean=False)
#tform = count_vect.fit_transform(opening_prices)
#xform = hstack([xform, tform])
#xform = coo_matrix.tocsr(xform)
f.close()
e.close()
#print(xform)

# V = 12
M = count // V
def frequency_distribution(text):
    # FreqDist can be imported from nltk directly, which avoids the side effects
    # (loading and printing the example texts) of importing nltk.book.
    from nltk import FreqDist
    return FreqDist(text)
# Word Tokenization
word_lists = []
for sentences in df['sentence_tokenize']:
    for sentence in sentences:
        words = word_tokenize(sentence)
        word_lists.append(words)
df['word_tokenize'] = word_lists

# Saving word tokenization to file
df.to_csv(path + str(j) + '_word.csv', index=False)
print('Saved file to disk.')

for word_list in word_lists:
    freq = FreqDist(word_list)
    # freq.plot(10)

freq = FreqDist(word_list)
freq_dict = dict(freq)
freq_words = list(freq_dict.keys())
frequencies = list(freq_dict.values())
freq_df = pd.DataFrame(list(zip(freq_words, frequencies)), columns=['Word', 'Freq'])
freq_df.to_csv(path + str(j) + '_frequency.csv', index=False)
print('Saved file to disk.')

# Stopwords and Non Stopwords
stop = []
non_stopwords = []
for word_list in word_lists:
if data['Abstract'][i] != 'N':
    s = s + ' ' + data['Abstract'][i]
else:
    data['Abstract'][i] = ''

# In[415]:

s = re.sub('[^A-Za-z0-9 ]+', '', s)

# Calculate word frequencies using the nltk library.
from nltk.book import FreqDist

# newlist is the list of all words.
newlist = s.split()
# Normalize capitals: "The" and "the" count as the same word.
for i in range(len(newlist)):
    newlist[i] = newlist[i].lower()
fdist = FreqDist(newlist)

# Extract the 10 most frequent words.
remlist = fdist.most_common(10)
# print(remlist)
rem = [pair[0] for pair in remlist]
# print(rem)

# TASK 2

# In[416]:

# Remove the 10 most frequent words from the whole corpus.
newlist = [w for w in newlist if w.lower() not in rem]
def most_freq_words(self, text, number):
    word_freq = FreqDist(text)
    words_counts = word_freq.most_common(number)
    words = [pair[0] for pair in words_counts]
    return words
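The method above is a thin wrapper over FreqDist.most_common; an equivalent standalone call on an illustrative token list:

from nltk import FreqDist

# Standalone equivalent of the method above.
text = ['spam', 'ham', 'spam', 'eggs', 'spam', 'ham']
word_freq = FreqDist(text)
print([pair[0] for pair in word_freq.most_common(2)])  # ['spam', 'ham']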
# ◑ Write a function that takes a list of words (containing duplicates) and returns a list of words
# (with no duplicates) sorted by decreasing frequency. E.g. if the input list contained 10 instances
# of the word table and 9 instances of the word chair, then table would appear before chair in the
# output list.
from nltk.book import FreqDist

words = ['this', 'is', 'my', 'list', 'of', 'list', 'of', 'list', 'is', 'this', 'of', 'list',
         'of', 'list', 'of', 'list', 'of', 'list', 'of', 'words']
fdist = FreqDist(words)
length = len(set(fdist))
# most_common() already returns items in decreasing order of frequency,
# so no reversal is needed.
answer = [pair[0] for pair in fdist.most_common(length)]
print(answer)
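The exercise asks for a function, so here is the same logic wrapped up; the name sort_by_decreasing_frequency is my own.

from nltk import FreqDist

def sort_by_decreasing_frequency(words):
    # Unique words, most frequent first; most_common() already sorts this way.
    return [word for word, count in FreqDist(words).most_common()]

print(sort_by_decreasing_frequency(['table'] * 10 + ['chair'] * 9))  # ['table', 'chair']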
rate_unique.sort()
rate_unique = list(rate_unique)

# Count word frequencies for each rating value, and for all reviews combined.
for i in rate_unique:
    path = "../data/wordFrequency/word_" + str(i) + ".json"
    with open(path, "w", encoding="utf-8") as f:
        json.dump(word_frenquency_static(data, i), f)

# word_all: word frequencies over all reviews, regardless of rating.
texts = list(data['review'])
wordlist = []
for text in texts:
    wordlist.extend(text.lower().strip().split())
fdist_all = FreqDist(wordlist)
path = "../data/wordFrequency/word_all.json"
with open(path, "w", encoding="utf-8") as f:
    json.dump(fdist_all, f)

# Open the saved file and try reading it back.
f = open(path, 'r')
a = f.read()
dict_fdist_all = eval(a)
print('ok', dict_fdist_all['ok'])

'''
2. Compute the distribution probability of each word in each class of text, and save the results.
'''
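A hedged alternative to the eval-based read-back above: since the file is written with json.dump, json.load round-trips it without eval (the path reuses the word_all.json written above).

import json

with open("../data/wordFrequency/word_all.json", 'r', encoding="utf-8") as f:
    dict_fdist_all = json.load(f)
print('ok', dict_fdist_all.get('ok'))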