def freq_dist(input, filtering_functions=None, plot=False, limit=None, return_counts=False):
    """Count the items of ``input`` and return them ordered by frequency.

    Takes a list of words (hashtags, keywords, anything), normalises every
    item, counts the occurrences and optionally plots the distribution.

    Args:
        input: Iterable of strings to count.
        filtering_functions: ORDERED sequence ``[f1, ..., fn]`` of
            one-argument functions. Every item is run through ``f1``, then
            ``f2``, ... before counting; ``str.lower`` and ``str.strip`` are
            always applied afterwards.
        plot: When true, draw the distribution with NLTK's built-in plot
            (restricted to ``limit`` samples when ``limit`` is truthy).
        limit: When truthy, keep only the ``limit`` most common items.
        return_counts: When true return ``(word, count)`` tuples, otherwise
            return only the words.

    Returns:
        A list sorted by descending count; ties are sorted alphabetically.
    """
    # A fresh list per call: a mutable default would be shared between calls.
    filters = list(filtering_functions or []) + [str.lower, str.strip]
    words = input
    for f in filters:
        words = map(f, words)
    nltk_fdist = FreqDist(list(words))

    if plot:
        # Use NLTK's built-in plotting before flattening the data structure.
        if limit:
            nltk_fdist.plot(limit)
        else:
            nltk_fdist.plot()

    # Highest count first; equally counted items in alphabetical order.
    fdist = sorted(nltk_fdist.items(), key=lambda x: (-x[1], x[0]))
    if limit:
        # The original assigned the slice to a misspelt name, so the limit
        # never reached the returned list.
        fdist = fdist[:limit]
    if not return_counts:
        fdist = [word for word, _ in fdist]
    return fdist
def process_tweets(hashtag, addl_stops=None):
    """Collect, clean and plot the words of all stored tweets for a hashtag.

    Every file in the current working directory whose name starts with
    ``hashtag`` is read and split into raw tweet records. The text field of
    each record is cut out by string search, tokenised, lower-cased and
    filtered against the module-level ``stops`` plus ``addl_stops``.

    Args:
        hashtag: File-name prefix of the chunked tweet files.
        addl_stops: Optional extra stop words to drop.

    Returns:
        The list of kept (lower-cased) words across all analysed tweets.
    """
    count = 0
    good_count = 0
    words_to_plot = []
    # Build the stop-word set once instead of once per token.
    stop_set = set(stops).union(addl_stops or [])

    # Iterate through all chunked files with the relevant hashtag.
    for fname in os.listdir(os.getcwd()):
        if not fname.startswith(hashtag):
            continue
        with open(fname, 'r') as data_file:
            data = data_file.read()
        # Records are split on a raw separator; the JSON is not parsed.
        for tweet in data.split("\n\x00,"):
            count += 1
            # Cut the text field out by position between the "text" and
            # "source" keys.
            text = tweet[tweet.find("text\":") + 7:tweet.find(",\"source\"") - 1]
            # Skip tweets containing an escaped Unicode sequence. The literal
            # is written with an explicit backslash so it is the same two
            # characters on Python 2 and valid syntax on Python 3.
            if text.find('\\u') >= 0:
                continue
            good_count += 1
            # Tokenize and keep the lower-cased non-stop words.
            words_to_plot.extend(
                w.lower() for w in word_tokenize(text) if w.lower() not in stop_set
            )

    # Frequency histogram of the 50 most common words plus a summary line.
    fdist = FreqDist(words_to_plot)
    fdist.plot(50)
    print("for " + hashtag + ' we collected ' + str(count) + ' tweets out of which ' + str(good_count) + " will be analyzed")
    return words_to_plot
def thong_ke_nganh_nghe_ung_vien(nganh_nghe):
    """Print statistics about job categories and plot the 20 most frequent.

    Each entry of ``nganh_nghe`` is normalised by a fixed chain of string
    replacements, the distinct categories are printed, categories occurring
    more than 100 times are printed with their counts, and a frequency plot
    is drawn.
    """
    from matplotlib import pyplot as bieudo

    # Replacement pairs, applied in this exact order to every entry.
    replacements = (
        ('nhân viên kinh doanh', "Kinh doanh"),
        ("nhân viên tư vấn", "Tư vấn"),
        ("Kinh doanh Kinh doanh bất động sản", "Kinh doanh"),
        ("Kinh doanh bất động sản", "Bất động sản"),
        ("Kinh doanh Kinh doanh", "Kinh doanh"),
    )

    def _normalise(raw):
        for old, new in replacements:
            raw = raw.replace(old, new)
        return raw.strip()

    # Normalised category of every posting, then the distinct categories.
    list_nganh_nghe = [_normalise(nghe) for nghe in nganh_nghe]
    set_list_nganh_nghe = set(list_nganh_nghe)

    for category in set_list_nganh_nghe:
        print(category)
    print("----------------------------------------------------------------")
    print('Thống kê số lượng bài tuyển dụng theo nghành nghề: ')
    for category in set_list_nganh_nghe:
        occurrences = list_nganh_nghe.count(category)
        if occurrences > 100:
            print(str(category).strip(), ":", occurrences)
    print(
        "-------------------------Thống kê ngành nghề--------------------------------"
    )
    print("Có tổng số ngành nghề: ", len(set_list_nganh_nghe))

    # Frequency of occurrence, plotted for the top 20 categories.
    fdist_nganh_nghe = FreqDist(list_nganh_nghe)
    fdist_nganh_nghe.plot(20)
class Article:
    """A piece of text with helpers for word-frequency inspection.

    ``tokenize()`` must be called before ``print_frequencies()`` or
    ``plot_frequencies()``, because it is what creates ``self.freq``.
    """

    def __init__(self, text):
        self.text = text

    def tokenize(self):
        """Tokenise ``self.text``; store ``self.tokens`` and ``self.freq``."""
        self.tokens = self.txt_to_tokens(self.text)
        self.freq = FreqDist(self.tokens)

    def print_frequencies(self, n=20):
        """Print the ``n`` most common tokens with their counts."""
        for key, value in self.freq.most_common(n):
            print(key, ": ", value)

    def plot_frequencies(self):
        """Plot the (non-cumulative) counts of the 20 most common tokens."""
        self.freq.plot(20, cumulative=False)

    @staticmethod
    def txt_to_tokens(text):
        """Split text, keep purely alphabetic words and drop stop words.

        The text is lower-cased and split on whitespace. Returns the list of
        remaining words in their original order.
        """
        tokens = [word for word in text.lower().split() if word.isalpha()]
        nltk.download("stopwords")
        # A set gives O(1) membership tests; the original scanned the whole
        # stop-word list for every token.
        stop_words = set(stopwords.words("english"))
        return [word for word in tokens if word not in stop_words]
def visualize_words(
        text, tokenized_stop_words, cleanup=False, color=None
):
    """Show a word cloud and a frequency plot side by side.

    Reference: https://www.datacamp.com/community/tutorials/wordcloud-python

    ``text`` and ``tokenized_stop_words`` are handed to ``process_text``; the
    resulting tokens feed one ``FreqDist`` that drives both the word cloud
    (left subplot) and the top-30 frequency plot (right subplot), so the two
    figures share the same word statistics. ``color`` may be "blue", "red",
    "yellow" or "purple" to recolour the cloud; any other value leaves the
    default colours. The 30 most common words are printed at the end.
    """
    plt.figure(figsize=(18, 6), dpi=80)
    tokens = process_text(text, tokenized_stop_words, clean=cleanup)
    fdist = FreqDist(tokens)

    # Left panel: word cloud built from the frequency table.
    plt.subplot(1, 2, 1)
    cloud_factory = WordCloud(background_color="white")
    wordcloud = cloud_factory.generate_from_frequencies(fdist)
    if color == "blue":
        recolor_with = blue_color_func
    elif color == "red":
        recolor_with = red_color_func
    elif color == "yellow":
        recolor_with = yellow_color_func
    elif color == "purple":
        recolor_with = purple_color_func
    else:
        recolor_with = None
    if recolor_with is not None:
        wordcloud = wordcloud.recolor(color_func=recolor_with)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")

    # Right panel: frequency distribution of the 30 most common words.
    plt.subplot(1, 2, 2)
    fdist.plot(30, cumulative=False)
    plt.show()
    print(fdist.most_common(30))
def plot_freq_dist(texts, n_gram=1, num_words=25):
    """
    Plot the frequency distribution of the most common terms in a text.

    Args:
        texts: string of texts; it is split on single spaces
        n_gram: 2 counts bigrams, any other value counts single tokens
        num_words: number of terms to be shown, defaulted to 25
    """
    tokens = texts.split(' ')
    samples = bigrams(tokens) if n_gram == 2 else tokens
    fdist = FreqDist(samples)

    # Figure set-up: size, tick label size and (empty) axis labels.
    plt.figure(figsize=(17, 7))
    for tick_group in ('xtick', 'ytick'):
        plt.rc(tick_group, labelsize=15)
    plt.xlabel('', fontsize=18)
    plt.ylabel('', fontsize=18)

    fdist.plot(num_words)
def wordFreqDist(num, toxic_wordsSW):
    """Build, plot and return the frequency distribution of the given words.

    Args:
        num: number of most frequent samples to plot.
        toxic_wordsSW: iterable of words to count.

    Returns:
        The ``FreqDist`` built from ``toxic_wordsSW``.
    """
    from nltk import FreqDist

    distribution = FreqDist(toxic_wordsSW)
    distribution.plot(num)
    return distribution
def word_count(document):
    """Print and plot the most frequent word stems of a document.

    The words obtained from ``document["content"]`` are stemmed with
    ``EnglishStemmer``; the 50 most common stems are printed as
    ``stem;count`` lines and the top 30 are plotted.
    """
    raw_words = get_words(document["content"])
    stemmer = EnglishStemmer()
    fdist = FreqDist([stemmer.stem(raw) for raw in raw_words])
    for entry in fdist.most_common(50):
        print(u'{};{}'.format(*entry))
    fdist.plot(30, cumulative=False)
def plot_frequency_distribution(self, n):
    """
    @brief Plots the cumulative frequency distribution of the tokenized text.

    @param self The object; its ``tokenized_text`` attribute is counted
    @param n The desired number of most frequent words (must be an int)
    """
    assert isinstance(n, int)
    FreqDist(self.tokenized_text).plot(n, cumulative=True)
def words_length_distribution(text):
    """Plot the distribution of token lengths and save it as ``wldis.png``.

    The tokens come from ``del_stopwords(text)``. Returns the 10 most common
    lengths as ``(length, count)`` pairs.
    """
    tokens = del_stopwords(text)
    plt.ion()
    figure = plt.figure(figsize=(10, 4))
    figure.suptitle("WORDS LENGTH DISTRIBUTION")
    # Extra bottom margin for the x tick labels.
    plt.gcf().subplots_adjust(bottom=0.15)
    length_counts = FreqDist(map(len, tokens))
    length_counts.plot(30, cumulative=False)
    figure.savefig('wldis.png', bbox_inches="tight")
    return length_counts.most_common(10)
def words_distribution(text):
    """Plot the distribution of lemmas and save it as ``wdis.png``.

    The samples come from ``lemmas(text)``. Returns the 10 most common ones
    as ``(sample, count)`` pairs.
    """
    samples = lemmas(text)
    plt.ion()
    figure = plt.figure(figsize=(10, 4))
    figure.suptitle("WORDS DISTRIBUTION")
    # Extra bottom margin for the x tick labels.
    plt.gcf().subplots_adjust(bottom=0.15)
    counts = FreqDist(samples)
    counts.plot(30, cumulative=False)
    figure.savefig('wdis.png', bbox_inches="tight")
    return counts.most_common(10)
def main():
    """Print and plot the frequency distribution of ``book.text1``."""
    # Frequency distribution class.
    fd_text1 = FreqDist(book.text1)
    print('Объект частотного распределения: {}'.format(fd_text1))
    top_fifty = fd_text1.most_common(50)
    print('50 наиболее встречаемых слов: {}'.format(top_fifty))
    fd_text1.plot(50, cumulative=True)
def calc_freq_dist(tokens):
    """Plot the 20 most frequent alphabetic, non-stop-word tokens.

    Args:
        tokens: iterable of string tokens.

    Tokens that are English stop words or not purely alphabetic are dropped
    before counting; the plot is non-cumulative.
    """
    # Load the stop words once. The original re-read the list for every
    # token and removed rejects with list.remove, which is quadratic.
    stop_words = set(stopwords.words('english'))
    cleaned_tokens = [
        token for token in tokens
        if token.isalpha() and token not in stop_words
    ]
    freq = FreqDist(cleaned_tokens)
    freq.plot(20, cumulative=False)
def NAICS_word_freq():
    """Plot the 10 most frequent words in the distinct NAICS display labels.

    Reads ``data/Part 1.csv``, takes the unique values of the
    ``NAICS.display-label`` column, concatenates them (each followed by a
    space) and counts the pieces obtained by splitting on single spaces.
    """
    part1 = pd.read_csv('data/Part 1.csv', low_memory=False)
    # Verified already to be all strings.
    labels = list(set(part1['NAICS.display-label']))
    # The original prepended each label, i.e. built the text in reverse order.
    labels_str = ''.join(item + " " for item in reversed(labels))
    freqDist = FreqDist(labels_str.split(" "))
    freqDist.plot(10)
def plot_freqdist_from_series(pd_series, tokenizer_obj=default_tk,
                              stop_words_list=gen_stop_words,
                              title='Term Frequency distribution',
                              num_terms=20, figsize=(10, 10),
                              ngram_number=1, lower_case=True):
    """Plot the n-gram frequency distribution of a Series of documents.

    Args:
        pd_series - a standalone Pandas Series or a dataframe column,
            e.g. df.job_description
        tokenizer_obj - (obj) object exposing ``tokenize(str)``
        stop_words_list - (list of str) words to exclude; a word is dropped
            when either it or its lower-cased form is in the list
        title - (str) title of the plot
        num_terms - (int) how many of the top terms to plot, default 20
        figsize - (tuple of 2 integers) size of the matplotlib figure
        ngram_number - (int) n-gram size, passed straight to ``nltk.ngrams``
        lower_case - (bool) whether kept words are lower-cased

    Returns:
        f_dist_dict - (dict) n-gram tuples as keys; frequency as value
    """
    all_text_lst = []
    for document in pd_series.tolist():
        kept = []
        for word in tokenizer_obj.tokenize(document):
            lowered = word.lower()
            if lowered in stop_words_list or word in stop_words_list:
                continue
            kept.append(lowered if lower_case else word)
        # Round-trip through a space-joined string, exactly as the original
        # text buffer did, so tokens containing spaces are split again.
        terms = ' '.join(kept).split(' ') if kept else []
        all_text_lst.extend(nltk.ngrams(terms, n=ngram_number))

    f_dist = FreqDist(all_text_lst)
    f_dist_dict = dict(f_dist)
    plt.figure(figsize=figsize)
    plt.title(title)
    f_dist.plot(num_terms)
    plt.show()
    return f_dist_dict
def generar_grafico2(self, lista_datos):
    """Plot the 30 most frequent tokens found in nested survey answers.

    ``lista_datos`` is iterated three levels deep; every innermost element
    is concatenated (followed by a space) into one string, which is then
    tokenised with ``nltk.word_tokenize``. The samples and their counts are
    printed before plotting.
    """
    import nltk
    from nltk import FreqDist

    lista_unica = "".join(
        palabra + " "
        for respuesta_encuesta in lista_datos
        for respuesta_pregunta in respuesta_encuesta
        for palabra in respuesta_pregunta
    )
    fdist = FreqDist(nltk.word_tokenize(lista_unica))
    print(fdist.keys())
    print(fdist.values())
    fdist.plot(30, cumulative=False)
def fdistByGroup(self, values, title, dataset):
    """Save one top-25 frequency plot per row of ``values``.

    For every ``(index, row)`` of ``values.iterrows()`` the row's
    ``'tokens'`` entry is counted, plotted with a title made of ``title``,
    the index and ``dataset``, and saved under the name returned by
    ``self.mkFileName``.
    """
    for group, row in values.iterrows():
        group_counts = FreqDist(row['tokens'])
        plt.ion()
        plot_title = title + " (" + group + ") " + dataset
        group_counts.plot(25, cumulative=False, title=plot_title)
        plt.tight_layout()
        plt.savefig(self.mkFileName("wfdist" + dataset + " " + group))
        plt.ioff()
        plt.close()
def fdistCumulative(self, values, title, dataset):
    """Save a top-25 frequency plot over all token lists combined.

    Every list in ``values['tokens']`` is merged into one list, which is
    counted, plotted (non-cumulative counts) and saved under the name
    returned by ``self.mkFileName``.
    """
    merged_tokens = []
    for token_list in values['tokens']:
        merged_tokens.extend(token_list)
    combined_counts = FreqDist(merged_tokens)
    plt.ion()
    combined_counts.plot(25, cumulative=False, title=title + " " + dataset)
    plt.tight_layout()
    plt.savefig(self.mkFileName("wfdist" + dataset))
    plt.ioff()
    plt.close()
def text_show(words_lists):
    """
    Text analysis: print every word count, then show a frequency line plot
    of the top 20 words and a word cloud built from all words.
    """
    freq = FreqDist(words_lists)
    for key, val in freq.items():
        print(':'.join((str(key), str(val))))

    # Line-plot visualisation.
    freq.plot(20, cumulative=False)

    # Word-cloud visualisation.
    joined = ' '.join(words_lists)
    cloud = WordCloud().generate(joined)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
def create_freqdist_plot(artist, songs):
    """Plot the 50 most frequent words across the lyrics of ``songs``.

    Args:
        artist: artist name handed to ``get_artist``.
        songs: iterable of song titles handed to ``get_song``.

    Lyrics that cannot be fetched are reported and skipped (best effort).
    Newlines are removed from the lyrics, which are then split on single
    spaces.
    """
    wordlist = []
    for song in songs:
        try:
            # Use the requested artist; the original hard-coded "eminem" and
            # ignored the parameter.
            lyrics = get_lyrics(get_artist(artist), get_song(song))
            wordlist.extend(lyrics.replace("\n", "").split(" "))
            print("Got lyrics for ", song)
        except Exception:
            # Deliberately best-effort, but no longer swallows
            # KeyboardInterrupt/SystemExit as the bare except did.
            print("Error getting lyrics for ", song)
            continue
    plt.figure(figsize=(19, 10))
    freqDist = FreqDist(wordlist)
    freqDist.plot(50)
def analyze(inputfile):
    """Print the first cleaned words of a text file and plot the top 50.

    Args:
        inputfile: path of the text file to read.

    The text is tokenised and lower-cased, punctuation is stripped from
    every token, and non-alphabetic tokens, English stop words and the junk
    token 'nt' are dropped.
    """
    # The context manager closes the file even if reading fails.
    with open(inputfile, "rt") as handle:
        text = handle.read()

    # Split into lower-cased words.
    tokens = [w.lower() for w in word_tokenize(text)]
    # Remove punctuation from each word.
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # Keep alphabetic tokens that are neither stop words nor junk.
    stop_words = set(stopwords.words('english'))
    junk_words = ['nt']
    words = [
        w for w in stripped
        if w.isalpha() and w not in stop_words and w not in junk_words
    ]
    print(words[:100])

    freqDist = FreqDist(words)
    # The return value of plot() is printed, as in the original.
    print(freqDist.plot(50))
def program(pages):
    """Scrape job summaries and plot the 20 most frequent keywords.

    For every suffix in ``pages`` an indeed.com search URL is built from the
    module-level ``job`` and ``city``, and the text of every
    ``div.summary`` element is collected. The summaries are tokenised with
    the module-level ``tokenizer``, filtered against ``stop_words``,
    lower-cased, and words contained in ``job`` or ``city`` are dropped.

    Returns whatever ``FreqDist.plot`` returns.
    """
    final_list = []
    for p in pages:
        link = f"https://www.indeed.com/jobs?q={job}&l={city}{p}"
        document = requests.get(link).text
        soup = BeautifulSoup(document, 'html.parser')
        final_list.extend(
            tag.text.strip()
            for tag in soup.find_all("div", {'class': 'summary'})
        )

    cleaned_list = []
    for summary in final_list:
        for token in tokenizer.tokenize(summary):
            if token in stop_words:
                continue
            lowered = token.lower()
            if lowered not in job and lowered not in city:
                cleaned_list.append(lowered)

    fdist1 = FreqDist(cleaned_list)
    stop = timeit.default_timer()
    # ``start`` is expected to be set at module level before the call.
    print('Time: ', stop - start)
    return fdist1.plot(
        20,
        title=
        f"Top 20 keywords for {job} jobs on indeed.com\nCity: {city}\nnumber of postings={len(pages)*10}"
    )
def freq(tokens, n=None):
    '''
    Return the n most frequent tokens (20 when n is None).

    Also plots the cumulative frequency distribution of the top 50 tokens
    and prints the selected (token, count) pairs.
    '''
    fdist2 = FreqDist(tokens)
    fdist2.plot(50, cumulative=True)
    # most_common() is ordered by frequency. Slicing items() only worked
    # while items() returned a frequency-sorted list; on NLTK 3 / Python 3
    # it is a dict view and cannot be sliced.
    top = fdist2.most_common(20 if n is None else n)
    print(top)
    return [token for token, _ in top]
def getFrequency(moviename, subfilepath):
    """Plot and save the 10 most frequent words of a subtitle file.

    Args:
        moviename: used in the plot title and as the PNG file name stem.
        subfilepath: path of the subtitle text file.

    All-uppercase tokens are printed and skipped. The rest are lower-cased
    and stripped of punctuation; non-alphabetic tokens, English stop words
    and a fixed list of junk tokens are dropped.
    """
    # The context manager closes the file even if reading fails.
    with open(subfilepath, "rt") as subtitle_file:
        text = subtitle_file.read()

    # Split into words; drop uppercase words (usually sounds), echoing them.
    tokens = []
    for token in word_tokenize(text):
        if token.isupper():
            print(token)
        else:
            tokens.append(token)

    # Lower-case, then remove punctuation from each word.
    tokens = [w.lower() for w in tokens]
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # Remove remaining tokens that are not alphabetic.
    words = [word for word in stripped if word.isalpha()]
    # Filter out stop words.
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if w not in stop_words]
    # Until a custom tokenizer exists, exclude contraction fragments.
    junk_words = ['nt', 'na', 'gon', 'won', 'got', 'get', 'go', 'la']
    words = [w for w in words if w not in junk_words]

    freqDist = FreqDist(words)

    fig = plt.figure(figsize=(10, 4))
    plt.gcf().subplots_adjust(bottom=0.15)  # to avoid x-ticks cut-off
    plt.xlabel('words', fontsize=18)
    plt.ylabel('times said', fontsize=16)
    # Plot the distribution directly; the original made an identical copy.
    freqDist.plot(10, cumulative=False,
                  title="Most Frequently Used Words in " + moviename)
    plt.show()
    fig.savefig(moviename + '.png', bbox_inches="tight")
def main():
    """Plot the 30 most frequent filtered tokens of ``NBA_Warriors.txt``.

    The file is read with newlines replaced by spaces, decoded as UTF-8,
    tokenised, passed through ``TweetFilter.filter`` and counted.
    """
    obj = TweetFilter()
    with open("NBA_Warriors.txt", "r") as myFile:
        raw = myFile.read()
    data = unicode(raw.replace('\n', ' '), 'utf-8')

    # Tokenize every word, then let the filter object clean the tokens.
    filtered_tokens = obj.filter(word_tokenize(data))

    # Frequency distribution over the whole word list; plot the top 30.
    FreqDist(filtered_tokens).plot(30, cumulative=False)
def q5(cleaned_corpus_tokens, x):
    """Plot and return the ``x`` most frequent tokens.

    Args:
        cleaned_corpus_tokens: iterable of tokens to count.
        x: number of top tokens to plot and return.

    Returns:
        List of ``(token, count)`` pairs from ``FreqDist.most_common(x)``.
    """
    distribution = FreqDist(cleaned_corpus_tokens)
    most_frequent = distribution.most_common(x)
    distribution.plot(x)
    return most_frequent
def fenci(data):
    """Print keywords of a text and plot its 50 most frequent segments.

    All whitespace and markdown image links are removed from ``data``
    first. Keywords are extracted twice (TF-IDF and TextRank, 50 each) and
    printed comma-separated; the ``jieba.cut`` segments are then counted.
    """
    without_spaces = re.sub(r'\s+', '', data)
    data = re.sub(r'\!\[.*?\]\(.*?\)', '', without_spaces)
    seg_list = jieba.cut(data)

    # Keyword extraction based on the TF-IDF algorithm.
    print(','.join(jieba.analyse.extract_tags(data, topK=50)))
    # Keyword extraction based on the TextRank algorithm.
    print(','.join(jieba.analyse.textrank(data, topK=50)))

    fdist = FreqDist(list(seg_list))
    fdist.plot(50)
def q3(corpus, list_of_files, x):
    """Plot and return the ``x`` most frequent tokens of a corpus.

    Args:
        corpus: corpus object handed to ``get_corpus_tokens``.
        list_of_files: file ids handed to ``get_corpus_tokens``.
        x: number of top tokens to plot and return.

    Returns:
        List of ``(token, count)`` pairs from ``FreqDist.most_common(x)``.
    """
    # All tokens in the corpus, as produced by the helper.
    all_tokens = get_corpus_tokens(corpus, list_of_files)
    distribution = FreqDist(all_tokens)
    most_frequent = distribution.most_common(x)
    distribution.plot(x)
    return most_frequent
def plot(tokens: Iterable[str], *, first_n: int=None, omit: Iterable[str]=None):
    """Plot the frequency distribution of the clean tokens.

    A token is clean when it is longer than two characters, purely
    alphabetic, and neither in ``omit`` nor an English stop word.

    Args:
        tokens: tokens to count.
        first_n: number of most frequent samples to plot; all when None.
        omit: extra tokens to exclude. Any iterable is accepted and it is
            left unmodified.
    """
    # Copy into a private set. The original extended the caller's list in
    # place, growing it on every call and failing for tuples or sets.
    excluded = set(omit) if omit is not None else set()
    excluded.update(stopwords.words("english"))

    def is_clean(s: str):
        return len(s) > 2 and s.isalpha() and s not in excluded

    freq = FreqDist(filter(is_clean, tokens))
    if first_n is None:
        freq.plot()
    else:
        freq.plot(first_n)
def lemmas_distribution_rus(dist):
    """Plot how many distinct word forms of ``dist`` map to each lemma.

    Lemmas are looked up in ``literature/processed_vocabulary``
    (``word:lemma`` lines). Words missing there are analysed with
    pymorphy2, using the normal form of the first parse; for those the
    word, its lemma and the running lemma count are printed. Each key of
    ``dist`` adds one to its lemma. The top 100 lemmas are plotted.
    """
    dict_file = codecs.open('literature/processed_vocabulary', encoding='utf-8')
    dict_text = dict_file.readlines()
    dict_file.close()

    import pymorphy2
    morph = pymorphy2.MorphAnalyzer()
    from collections import defaultdict
    lemmas_dist = defaultdict(int)

    # word -> lemma; the second field is stored exactly as split from the line.
    dict_dict = {}
    for entry in dict_text:
        fields = entry.split(':')
        dict_dict[fields[0]] = fields[1]

    for word in dist.keys():
        if word in dict_dict:
            lemmas_dist[dict_dict[word]] += 1
            continue
        parses = morph.parse(word)
        if len(parses) > 0:
            print(word)
            print(parses[0].normal_form)
            lemmas_dist[parses[0].normal_form] += 1
            print(lemmas_dist[parses[0].normal_form])

    lemmas_dist = FreqDist(lemmas_dist)
    lemmas_dist.plot(100)
class CountFreq(object):
    """Noun frequency counter over the text file ``new1.txt``."""

    def __init__(self, *args, **kwargs):
        self.txt_file = codecs.open('new1.txt', encoding='utf-8')
        self.stop_words = stopwords.words('english')
        self.clean_words = []
        self.loose_words = loose_words

    def clean_text(self):
        '''
        Read the remaining lines of new1.txt, lower-case them, split them on
        whitespace and drop the stop words.

        :return: the flat list of all clean words collected so far.
        '''
        stop = set(self.stop_words)
        for line in self.txt_file:
            # Extend the flat list directly. The original appended per-line
            # lists and flattened afterwards, which split already-flattened
            # words into single characters on a second call.
            self.clean_words.extend(
                word for word in line.lower().split() if word not in stop
            )
        return self.clean_words

    def word_freq(self):
        '''
        Single-word frequency without context, restricted to words tagged
        'NN' by the tagger ``pt`` and not listed in ``self.loose_words``.

        :return: the frequency distribution, obj (also kept as self.fdist).
        '''
        classified_text = pt(self.clean_words)
        noun_descriptor = [word for word, pos in classified_text if pos == 'NN']
        revised_noun_descriptor = [
            word for word in noun_descriptor if word not in self.loose_words
        ]
        self.fdist = FreqDist(revised_noun_descriptor)
        return self.fdist

    def graph_freq(self, cumulative):
        '''
        Plot the 100 most frequent nouns; requires word_freq() to have run.

        :param cumulative: Boolean value, when true the cumulative counts
            are plotted
        :return: whatever FreqDist.plot returns
        '''
        return self.fdist.plot(100, cumulative=cumulative)
def stem(word):
    """Return ``word`` with one of the listed suffixes removed, if present."""
    # Group 1 is the lazily matched stem, group 2 the optional suffix;
    # the first (stem, suffix) tuple found is used.
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem
def lexical_diversity(text):
    """Return the ratio of total items to distinct items in ``text``."""
    return len(text) / len(set(text))
# Title tokens after stop-word removal and lemmatisation
# (lemma / remove_stopwords / text_title are defined outside this excerpt).
nostop_title = lemma(remove_stopwords(text_title))
nltk.Text(nostop_title).collocations()
# Frequency distribution of text
fdist_title = FreqDist(nostop_title)
fdist_title.most_common(50)
fdist_title.max()
fdist_title.plot(50, cumulative=True)#plot
fdist_title.plot(50)
# Number of distinct words in the title tokens.
total_words = len(set(nostop_title))
print("The total number of words in title of KD is: " + str(total_words))
# Average occurrences per distinct word: total samples / distinct words.
avg_words = fdist_title.N()/total_words
print("Each word appears in title of KD is: " + str(int(avg_words)))
# process for text
f = open('kdtext.txt', encoding="latin-1")
raw_text = f.read()
# type
type(raw_text)
tokens = word_tokenize(raw_text)
type(tokens)
len(tokens)
import json
from textstat.textstat import textstat
from nltk import FreqDist
from matplotlib.pyplot import *

filename = 'bieber-raw-test.json'
READ = 'rb'
TEXT=1

# Context managers close both files; the original left the handles open.
with open('stopwords', READ) as stopwords_file:
    stopwords = stopwords_file.read().splitlines()
with open(filename, READ) as tweets_file:
    tweets = json.load(tweets_file)

# Flatten the text of all tweets into one whitespace-split word list.
# (Retweets are not identified here, despite the original comment.)
words = ' '.join([tweet['text'] for tweet in tweets]).split()

# Plot the 20 most frequent words.
fdist = FreqDist(words)
fdist.plot(20)
tight_layout()
# NOTE(review): notebook excerpt; the final statement is cut off in this view.
len(all_words)
# Drop stop words, then keep only words of at least 5 characters.
without_stopwords = [w for w in all_words if not w in stop_words]
len(without_stopwords)
words_len = [w for w in without_stopwords if not len(w) < 5]
len(words_len)
freq_dist = FreqDist([word for word in words_len])
# Frequency plot of the 50 most common remaining words.
plt.figure(figsize=(12, 6))
plt.title(f'Frequency Distribution (Insincere Questions, Top 50 words)')
freq_dist.plot(50, marker='|', markersize=20)
plt.show()
# Word cloud driven by the same frequency table.
from wordcloud import WordCloud
wordcloud = WordCloud(
    background_color='White').generate_from_frequencies(freq_dist)
plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
"""## Splitting dataset into train and test"""
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X, y,
## MAYBE USE THIS? # remove small words # elected not to use this finder2.apply_ngram_filter(lambda w1, w2: len(w1) < 2) scored = finder2.score_ngrams(bigram_measures.raw_freq) for bscore in scored[:20]: print(bscore) # need to stem, but realy only want to stem "horse" and "horses" # First list the top 50 words by frequency (normalized by the length of the document) bbDist = FreqDist(sbbWords) bbDist2 = DictionaryProbDist(bbDist, normalize=True) bbDist2.prob('black') bbDist2.prob('horse') bbDist.plot(50) # need to make second number number / len(sbbWords) bbItems = bbDist.most_common(50) # Show the normalized probability for item in bbItems: print(item) # King of the Wind Frequency Distribution kwDist = FreqDist(skwWords) kwDist2 = DictionaryProbDist(kwDist, normalize=True) kwDist2.prob('said') kwDist2.prob('agba') kwDist.plot(50) # need to make second number number / len(skwWords) kwItems = kwDist.most_common(50) for item in kwItems:
def plot_frequency_distribution(text, number_of_words):
    """Plot the frequency distribution of ``text`` in two ways.

    Args:
        text: iterable of samples to count.
        number_of_words: number of most frequent samples to plot.

    Uses NLTK's built-in plot first, then ``plot_freqdist_freq`` on the same
    distribution.
    """
    distribution = FreqDist(text)
    distribution.plot(number_of_words)
    plot_freqdist_freq(distribution, number_of_words)
# NOTE(review): Python 2 script excerpt (print statements); the vectorizer,
# transformer and data names are defined outside this view.
print count_vect.vocabulary_.get(u'algorithm')
# text classification algorithm
clf = SGDClassifier().fit(X_train_tfidf, train_labels)
# clf = svm.SVC().fit(X_train_tfidf, train_labels)
# clf = svm.SVC(kernel='linear', probability=True, class_weight='auto').fit(X_train_tfidf, train_labels)
# convert the test data into tf-idf vectors
X_new_counts = count_vect.transform(test_gotg)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
# predict the test data
predicted = clf.predict(X_new_tfidf)
# print the predicted label of every test item and collect the labels
score_arr = []
for category in predicted:
    print category
    score_arr.append(category)
# check accuracy: predict on the training sentences as well
X_old_counts = count_vect.transform(train_sentences)
X_old_tfidf = tfidf_transformer.transform(X_old_counts)
predicted_train = clf.predict(X_old_tfidf)
# print 'Akurasi:'
# print np.mean(predicted == test_labels)
# Distribution of the predicted labels over the test data.
score_fd = FreqDist(score_arr)
score_fd.plot(cumulative=False)
# for doc, category in zip(docs_new, predicted):
# print('%r,%s' % (doc, predicted))
# We can find the FIRST position of given word: myText.index('about') # Frequency distribution from nltk import FreqDist fdist1 = FreqDist(text1) vocabulary = fdist1.keys() frequencies = fdist1.values() fdist1['whale'] fdist1.plot(20) fdist1.plot(20, cumulative = True) # List comprehension # Counting the number of characters in each word in a text [len(w) for w in text1] # Bigram function returns a list of bigrams from nltk import bigrams, trigrams bigrams(myText2) trigrams(myText2) bigramsText1 = bigrams(text1) # bigramsText1[0] is the tuple containing the first bigram
from nltk.corpus import brown
brown.words()
# Find the frequency of each word in a text
fd = FreqDist(brown.words())
# Find the most frequent words in a text:
# http://stackoverflow.com/questions/268272/getting-key-with-maximum-value-in-dictionary
import operator
# NOTE(review): iteritems() exists only on Python 2 dictionaries.
max(fd.iteritems(), key=operator.itemgetter(1))
sorted(fd.iteritems(), key=operator.itemgetter(1), reverse=True)[:10]
# Or use the wrapper function
fd.most_common(10)
# plot the most frequent words
fd.plot(10)
fd.plot(10, cumulative=True)
# See the words with lowest frequency (these words are called hapaxes)
fd.hapaxes()
# Count all the words
len(text1)
# count unique words
len(set(text1))
# count unique words, irrespective of word case
len(set(w.lower() for w in text1))
# Find the words that are more than 15 characters long
words = set(brown.words())
def freq(remstop):
    """Plot the cumulative frequency of the 50 most common tokens.

    Args:
        remstop: iterable of tokens (stop words already removed).
    """
    fdist2 = FreqDist(remstop)
    # The original also sliced fdist2.items()[:20] into an unused variable.
    # That fails on NLTK 3 / Python 3, where items() is a dict view, so the
    # dead statement is dropped.
    fdist2.plot(50, cumulative=True)
#!/usr/bin/python # coding: utf-8 # 2013/03/20 from nltk import FreqDist fdist = FreqDist(samples) # samples で指定されたデータの頻度分布を生成 fdist.inc(sample) # sampleで指定されたデータの数を1増やす fdist['データ'] # 指定されたデータの出現数 fdist.freq('データ') # 指定されたデータの頻度 fdist.N() # サンプルの総数 fdist.keys() # 頻度の順にソートされたサンプル for sample in fdist: # 頻度の順にサンプルをイテレート pass fdist.max() # 数の最も多いサンプル fdist.tabulate() # 頻度分布を表形式で表示 fdist.plot() # 頻度分布をプロット fdist.plot(cumulative=True) # 累積頻度をプロット fdist1 < fdist2 # fdist1のサンプルの頻度がfdist2 より少ないかをテスト
def _expt0_answerhist():
    """Plot the 50 most frequent samples of ``answerhist()``."""
    FreqDist(answerhist()).plot(50)
# soup = BeautifulSoup(open(path)) # chapter = soup.findAll(text=True)[0] file = open(path) chapter = file.read() chapter_tuple = (chapter, 'real') words = [ w.lower() for w in word_tokenize(chapter) ] real_chapters.append(chapter_tuple) real_words.extend(words) word_total = len(real_words) harry_total = real_words.count('harry') fd = FreqDist(real_words) fd.plot(26) # filtered_real_words = [ w.lower() for w in real_words if w.isalpha() ] filtered_real_words = [ w for w in real_words if w.isalpha() and w not in stop ] Rowling = filtered_real_words fd = FreqDist(filtered_real_words) fd.plot(26) file = open('ao_hp_stories.jl') ao_chapters = [] ao_words = [] AO3 = [] AO3_normed = []
# NOTE(review): Python 2 script excerpt (print statement below).
preprocessedStory = preprocess(storytext)
tokens = nltk.word_tokenize(preprocessedStory)
print tokens[0:20]
len(tokens)
# Remove English stop words from the tokens.
stop = stopwords.words('english')
remstop = [i for i in tokens if i not in stop]
remstop[0:20]
len(remstop) # 5810 tokens --> 2670 without stopwords
# Frequency distribution of the remaining tokens; cumulative plot of the top 50.
fdist2 = FreqDist(remstop)
print(fdist2)
fdist2.most_common()[:20]
fdist2.plot(50, cumulative=True)
# Turn it into an nltk text object
SpeechText = nltk.Text(tokens)
# Concordance views for selected words (lines = maximum lines shown).
SpeechText.concordance('america', lines=47)
SpeechText.concordance('negro', lines=38)
SpeechText.concordance('nation', lines=38)
SpeechText.concordance('white', lines=38)
SpeechText.concordance('negroes', lines=38)
SpeechText.concordance('struggle', lines=38)
SpeechText.concordance('justice', lines=38)
SpeechText.concordance('problems', lines=38)
SpeechText.concordance('freedom', lines=38)
SpeechText.concordance('rights', lines=38)
def create_dist(nltk_text, stopwords):
    """Build, plot and return the distribution of the kept word forms.

    A word is kept when it has at least 3 characters, is alphanumeric and
    its lower-cased form is not in ``stopwords``; kept words are counted
    lower-cased. The top 50 are plotted and the number of distinct word
    forms is printed.
    """
    def _keep(word):
        return len(word) >= 3 and word.isalnum() and word.lower() not in stopwords

    dist = FreqDist(word.lower() for word in nltk_text if _keep(word))
    dist.plot(50)
    print("Number of wordforms" + str(len(dist)))
    return dist
from nltk.corpus import words
from nltk import ConditionalFreqDist as CFreqDist , FreqDist

# Word lengths conditioned on the first character of each word.
first_letter_and_length = [(word[0], len(word)) for word in words.words()]
cfd = CFreqDist(first_letter_and_length)
cfd.plot()

# Plain distribution of first characters.
first_letters = [word[0] for word in words.words()]
fd = FreqDist(first_letters)
fd.plot()
# Word and sentence tokenization tokenized_sentences = sent_tokenize(webtext.raw(file_path)) #tokenized_words = reduce(operator.concat, [word_tokenize(s) for s in tokenized_sentences]) tokenizer = RegexpTokenizer(r'\w+') stop = stopwords.words('english') + list(string.punctuation) raw_tokens = tokenizer.tokenize(webtext.raw(file_path).lower()) tokens = [i for i in raw_tokens if i not in stop] # Convert to nltk text text = Text(tokens) # Freq dist fdist = FreqDist(text) fdist.plot(num_of_words_to_plot, cumulative = False) scarlet_commons = [word for word, counts in fdist.most_common(num_of_words_compare)] print('Most common words for Study in Scarlet:\n', fdist.most_common(num_of_words_to_plot), '\n') # Moby Dick frequencies moby_raw_text = gutenberg.raw(moby_file_name) moby_tokens = tokenizer.tokenize(moby_raw_text.lower()) moby_text = Text([w for w in moby_tokens if w not in stop]) fdist_moby = FreqDist(moby_text) moby_commons = [word for word, counts in fdist_moby.most_common(num_of_words_compare)] print('Most common words for Moby Dick:\n', fdist_moby.most_common(num_of_words_compare)) # Frequencies comparison ## In scarlet but not in moby diff_scarlet_vs_moby = [word for word in scarlet_commons if word not in moby_commons] print('=============\nIn Study in scarlet, but not in Moby Dick: ', ', '.join(diff_scarlet_vs_moby))
# lemma def lemma(text): lmtzr = WordNetLemmatizer() return [lmtzr.lemmatize(w) for w in text] nostop_title = lemma(remove_stopwords(text_title)) # check the collocations of text nostop_title = nltk.Text(nostop_title) nostop_title.collocations() fdist_title = FreqDist(nostop_title) # Frequency distribution of text fdist_title.most_common(50) # most common 50 fdist_title['science'] # return count of a given word fdist_title.max() # max counts fdist_title.plot(50, cumulative=True) # plot fdist_title.plot(50) fdist_title.tabulate(50) # tabulate total_words = len(set(nostop_title)) print("The total number of words in title of dsc is: " + str(total_words)) avg_words = fdist_title.N() / total_words print("Each word appears in title of dsc is: " + str(int(avg_words))) # bigrams, trigrams from nltk import bigrams from nltk import trigrams word_pair = list(bigrams(nostop_title)) word_triple = list(trigrams(nostop_title)) bigrams_title = FreqDist(word_pair) trigrams_title = FreqDist(word_triple) bigrams_title.most_common(50)
# NOTE(review): Python 2 script excerpt (print statements).
# Length of every word in text1.
word_len = [len(w) for w in text1]
print word_len
# Example Description
# fdist = FreqDist(samples) create a frequency distribution containing the given samples
# fdist[sample] += 1 increment the count for this sample
# fdist['monstrous'] count of the number of times a given sample occurred
# fdist.freq('monstrous') frequency of a given sample
# fdist.N() total number of samples
# fdist.most_common(n) the n most common samples and their frequencies
# for sample in fdist: iterate over the samples
# fdist.max() sample with the greatest count
# fdist.tabulate() tabulate the frequency distribution
# fdist.plot() graphical plot of the frequency distribution
# fdist.plot(cumulative=True) cumulative plot of the frequency distribution
# fdist1 |= fdist2 update fdist1 with counts from fdist2
# fdist1 < fdist2 test if samples in fdist1 occur less frequently than in fdist2
# Distribution of word lengths in text1.
fdlist = FreqDist(len(w) for w in text1)
print dict(fdlist)
print fdlist.most_common(3)
print fdlist.max()
# Number of words of length 2.
print fdlist[2]
# NOTE(review): this prints the return value of tabulate() as well.
print fdlist.tabulate()
fdlist.plot()
fdlist.plot(cumulative=True)
# NOTE(review): Python 2 script excerpt (print statements; items()/values()
# results are sliced as lists).
import matplotlib
import string
exclude = set(string.punctuation)
# Read the comments file line by line and split every line on whitespace.
with open("YT_Comment_Output.txt", "rb") as f:
    lines = [line.rstrip() for line in f]
splits = [line.split() for line in lines]
# Flatten the per-line word lists into one list.
some_upper = [item for sublist in splits for item in sublist]
#replace BOM w known stopword
BOM_gone = [word.replace('\xef\xbb\xbf', 'i') for word in some_upper]
# Strip punctuation characters from every word.
punct_gone = []
for word in BOM_gone:
    punct_gone.append(''.join(ch for ch in word if ch not in exclude))
YT_comment_words = [word.lower() for word in punct_gone]
# Stop words come from a local file, one per line.
with open('stopwords.txt', 'rb') as f:
    stopwords = [line.rstrip() for line in f]
print YT_comment_words[:10]
print stopwords[:10]
filtered_words = [w for w in YT_comment_words if not w in stopwords]
print filtered_words[:10]
# Frequency distribution of the filtered words; plot the top 30.
fd = FreqDist(filtered_words)
print fd.values()[:10]
print fd
fd.plot(30)
# Getting the frequency distribution of individual words in the reviews fd = FreqDist() for word in cleaned_reviews: fd[word] += 1 # Examining the top 5 most frequent words fd.most_common(5) # In[ ]: # Plotting the top 50 most frequent words plt.figure(figsize = (10, 5)) fd.plot(50) plt.show() # ### Observations # Of the 50 most frequent words across customer reviews, six reveal food preferences: **chicken, biryani, veg, pizza, rice, paneer**. The only negative word in the top 50 is "bad". # # Factors contributing to restaurant experience are mentioned in the following (descending) order of frequency: place > taste > service > time > ambience > staff > quality > delivery > menu > quantity > friendly. # # Now let us repeat the analysis on a bi-gram level. Bi-grams are pairs of words which can provide better context than individual words. # In[ ]: # Generating bigrams from the reviews bigrams = bigrams(cleaned_reviews)
# 或者使用 Counter 来实现 from collections import Counter print(Counter(fd).most_common(5)) # [(',', 6750), ('the', 3120), ('to', 2775), ('.', 2741), ('and', 2739)] # 简奥斯丁的小说 Persuasion 总共包含 98171 字和 6141 个唯一单词. 此外, 最常见的词例是逗号, 接着是单词 the. # 如果你对海量的语料库进行统计, 将每个单词的出现次数和单词出现的频率由高到低记录在表中, 我们可以直观地发现列表中词频和词序的关系. # 事实上, 齐普夫(Zipf)证明了这个关系可以表达为数学表达式, 例如: 对于任意给定单词, f * r = k(正比于 k); # f 是词频, r 是词的排列, 或者是在排序后列表中的词序, 而 k 则是一个常数. # 复杂的公式为: f * r = 1 / log(N); N 为所有单词的总数 # 举个例子, 第五高频的词应该比第十高频的词的出现次数要多两倍. 在 NLP 文献中, 以上的关系通常被称为 "齐普夫定律(Zipf’s Law)" . # 即使由齐普夫定律描述的数学关系不一定完全准确, 但它依然对于人类语言中单词分布的刻画很有用——词序小的词很常出现, # 而稍微词序大一点的则较为少出现, 词序非常大的词则几乎没有怎么出现; 相关的 log-log 关系如图 1, 可以很清晰地发现我们语料库中对应的扩展关系 ################################################################## ## 使用 NLTK 对齐普夫定律进行作图 import matplotlib.pyplot as plt fd = FreqDist(gutenberg.words(gutenberg.fileids())) # 统计 Gutenberg 中每个词例数量 print(fd) # <FreqDist with 51156 samples and 2621613 outcomes>; 5166 个非重复, 2621613 个 token fd.plot(50, title='hello', cumulative=True) # 累加 fd.plot(50) # 前 50 对应的出现次数 ## 传统 matplotlib 方法, 和上面对比, 可以使用 loglog() freqs = [] # 初始化两个空列表来存放词序和词频 for word, rank in fd.most_common(500): freqs.append(rank) # 计算排名前 500 的词的出现次数 plt.subplot(2, 1, 1); plt.plot(range(500), freqs) plt.subplot(2, 1, 2); plt.loglog(range(500), freqs) # 在 log-log 图中展示词序和词频的关系 plt.xlabel('rank(r)', fontsize=14, fontweight='bold') plt.ylabel('frequenly(f)', fontsize=14, fontweight='bold') plt.grid(True) plt.show()
# We can find the FIRST position of given word: myText.index('about') # Frequency distribution from nltk import FreqDist fdist1 = FreqDist(text1) vocabulary = fdist1.keys() frequencies = fdist1.values() fdist1['whale'] fdist1.plot(20) fdist1.plot(20, cumulative=True) # Finding the really long words (using Pythons list comprehension): V = set(text1) [w for w in V if len(w) > 15] # Note that the variable w in the list comprehension is just a dummy. The following gives the same result [whatever for whatever in V if len(whatever) > 15] # Finding the long words (more than seven letters) that appear more than seven times [w for w in V if len(w) > 7 and fdist1[w] > 7] # Counting the number of characters in each word in a text
# NOTE(review): Python 2 script excerpt (print statements); CFD and FD are
# imported outside this view.
data = (('woman', ('this', 'bought')), ('man', ('this', 'looked')))
c = CFD(data)
# Returns a FreqDist object
print c['woman']
# The original note calls the next two statements equivalent.
# NOTE(review): indexing gives a count while freq() gives a relative
# frequency - confirm they only coincide because there is a single sample.
print c['woman'][('this', 'bought')]
print c['woman'].freq(('this', 'bought'))
# Try to access a condition that does not exist
print c['a']
# Test the result of the same data under FD
f = FD(data)
print f
# Try the result of initialising without any data
print CFD()
#===============================================================================
# Test plot
#===============================================================================
import matplotlib
import matplotlib.pyplot as plt
f.plot(cumulative=True)