def fun10():
    """Frequency distribution."""
    fdist1 = FreqDist(text1)
    # print(fdist1)
    vocabulary1 = fdist1.keys()
    # print(list(vocabulary1)[:50])
    fdist1.plot(50, cumulative=True)
def main():
    index = get_index("index.data")
    results = bfs('Obama', 'GAB', index)
    print_results(results)
    fdistAB = FreqDist([rel.A() for rel in results] + [rel.B() for rel in results])
    fdistAB.plot(10)
def main():
    argparser = argparse.ArgumentParser(description='text file')
    argparser.add_argument('file', type=str,
                           help='file to produce frequency distribution for')
    args = argparser.parse_args()
    # toker = WhitespaceTokenizer()
    f = open(args.file)
    text = f.read()
    print(text)
    # note: FreqDist over a raw string counts individual characters, not words
    fdist = FreqDist(text)
    print(fdist.freq('28') * 100)
    fdist.plot()
def testFunc():
    fw = open("./MZI/data.doc", "r", encoding="utf8")
    text = fw.read()
    tockens = getWordList(text)
    print(len(set(tockens)))

    from nltk.probability import FreqDist
    from nltk.util import bigrams

    fdist = FreqDist(w for w in tockens if len(w) > 1)
    fdist.tabulate(50)

    big = list(bigrams(w for w in tockens if len(w) > 1))
    print(big[:100])

    fdist = FreqDist(str(w) for w in big)
    fdist.tabulate(10)
    fdist.plot(50)
def create_enhanced_dale_chall_list(self):
    # list of sites used to create the list of most frequent words
    alexa_list = ['Google', 'Facebook', 'YouTube', 'Yahoo!', 'Wikipedia', 'Microsoft',
                  'Amazon', 'Twitter', 'LinkedIn', 'Wordpress', 'Ebay', 'Apple', 'Paypal',
                  'Imdb', 'Tumblr', 'Disney', 'BBC', 'Livejasmin', 'Craigslist', 'Ask']

    # bring all privacy texts into one list
    corpus = []
    data = get_all_policies()
    for site in data:
        if site in alexa_list:
            corpus.append(data[site]["text"])

    # split that text into a list of words
    t = textanalyzer("eng")
    words = t.getWords("".join(corpus))

    # open the Dale-Chall word list
    dale_chall_list = open('../nltk_contrib/dale_chall_wordlist.txt').read().split(';')

    # build a text consisting of the words of the 20 privacy policies,
    # dropping all words that are on the Dale-Chall list of easy words
    new_corpus = []
    for word in words:
        if word.lower() not in dale_chall_list and word not in alexa_list:
            new_corpus.append(word.lower())

    # create a frequency distribution of this list of words and plot it
    fdist = FreqDist(new_corpus)
    fdist.plot(80, cumulative=True)

    # make a list of the words that make up 33% (cumulative) of the words
    # that are not on the Dale-Chall list; iterate most frequent first
    most_frequ = []
    cum_percentage = 0.0
    for sample, _ in fdist.most_common():
        cum_percentage += fdist.freq(sample)
        most_frequ.append(sample)
        if cum_percentage > 0.33:
            break

    # write those words into a file
    privacy_file = open("privacy_wordlist.txt", "w")
    privacy_file.write(";".join(most_frequ))
def genre_properties(sorted_genres, data):
    # create an empty genre_tokens dict to hold genre name:token-list pairs
    genre_tokens = {}
    # boolean check to see if we have already gone through and tokenized everything
    files_exist = os.path.isfile("data/top_genres.txt")
    # initialize the genre_tokens keys outside of the if statement so it can be used in both cases
    for i in range(0, 5):
        # keys are simply the top 5 genres
        genre_tokens[sorted_genres[i]] = []
    # if for some reason the data files don't exist, go through the process of creating them (takes ~3 minutes)
    if not files_exist:
        print("\nThe data files don't exist, beginning tokenization process, grab some coffee...")
        # grab the nltk corpus stopwords
        stopWords = set(stopwords.words('english'))
        # add in some extra noise words we don't care about (a couple may be missing)
        noiseWords = ["{{Expand section}}", ",", ".", "(", "[", "{", ")", "]", "}",
                      ":", ";", "&", "'", '"', "'s", "``", "''", "n't", "`", '’']
        # store the start time so we can keep track of how long this process takes
        t1 = time.time()
        # iterate through the dataset; this is largely the same structure as in top_genres, so comments are not repeated
        for row in data.itertuples(index=True):
            # strip the genre string of quotes and brackets
            genre_str = str(getattr(row, 'genres'))
            genre_str = genre_str[1:-1]
            genre_str = genre_str.replace('"', '')
            # no trimming is needed on summary strings as there was for genre strings
            summary_str = str(getattr(row, 'summary'))
            # tokenize the summary string
            tokens_raw = word_tokenize(summary_str)
            # create an empty token list to fill with filtered tokens
            tokens_processed = []
            # filter the raw token list
            for word in tokens_raw:
                # we only care about words that are in neither the stopWords nor the noiseWords list
                if word not in stopWords and word not in noiseWords:
                    tokens_processed.append(word)
            # for each of the film's genres...
            for genre in genre_str.split(', '):
                # if the genre is one of the top 5 genres in the genre_tokens dict...
                if genre in genre_tokens:
                    # extend the film's filtered tokens onto genre_tokens' token list for the current genre
                    genre_tokens.get(genre).extend(tokens_processed)
        # grab the stop time and alert the user of progress
        t2 = time.time()
        print("Tokenization completed in " + str(t2 - t1) + " seconds.\n")
        # to make sure we never have to do that again, store all of the data in .txt files
        # first store the top 5 genres in the file "top_genres.txt", one genre per line
        top_genres_file = open("data/top_genres.txt", "w")
        for i in range(0, 5):
            # grab the genre name
            genre = sorted_genres[i]
            # write it to a line with a newline break
            top_genres_file.write("%s\n" % genre)
            # using that genre name, create a file "<genre>.txt" to store all of that genre's tokens
            genre_file = open("data/%s.txt" % genre, "w")
            # for all of the tokens in that genre's entry of the genre_tokens dict...
            for token in genre_tokens.get(genre):
                # write each token on a new line
                genre_file.write("%s\n" % token)
            # close the genre file inside the for loop since the same variable is reused for all 5 genre files
            genre_file.close()
        # finally close the top_genres file
        top_genres_file.close()
    # otherwise the data files already exist and no tokenization is needed; this should be the normal case
    else:
        print("\nThe data files exist, beginning token loading:")
        # first open the file listing the top 5 genres
        top_genres_file = open("data/top_genres.txt", "r")
        # iterate over each line in the file
        for index, line in enumerate(top_genres_file):
            # initialize the genre_tokens dict with the top 5 genres as keys and empty token lists as values
            genre_tokens[sorted_genres[index]] = []
        # close the top genres file
        top_genres_file.close()
        # iterate over each of the genre keys in the genre_tokens dict that was just loaded
        for genre in genre_tokens:
            print("Loading the " + str(genre) + ".txt file...")
            # open the associated file for each genre
            genre_file = open("data/%s.txt" % genre, "r")
            # iterate over each line of the file
            for index, line in enumerate(genre_file):
                # trim the newline character from the line
                trimmed_line = line.replace("\n", "")
                # append the line (token) to the corresponding token list for the current genre
                genre_tokens.get(genre).append(trimmed_line)
            # close the genre file
            genre_file.close()
        # done loading the data files; proceed with the genre characterization
        print("Done loading!\n")

    # at this point we have a genre_tokens dict with a complete summary-token list for each genre
    # create a dict, genre_fdists, to store the genre:FreqDist pairs for each genre
    genre_fdists = {}
    print("Creating frequency distributions for each genre:")
    # for each of our top genres
    for genre in genre_tokens.keys():
        # print the genre and the number of tokens it has
        print("Total " + str(genre) + " tokens to consider: " + str(len(genre_tokens.get(genre))) + "...")
        # calculate the frequency distribution of all the genre's tokens
        fdist = FreqDist(genre_tokens.get(genre))
        # add the genre:FreqDist pair to the genre_fdists dict
        genre_fdists[genre] = fdist
        # next, plot the top 50 most frequent tokens
        fig_path = str("plots/%s_fdist.png" % genre)
        # only handle plotting if for some reason the plot doesn't already exist
        if not os.path.isfile(fig_path):
            # alert the user of what's happening since matplotlib lets the user adjust bounds through a GUI
            print(str(genre) + " FreqDist plot does not exist, creating and displaying it now...")
            # alert the user of how to save the plot so this process is no longer run
            print("To skip this process in the future, save the figure as `plots/Genre Name_fdist`")
            # plot the top 50 freq dist samples
            fdist.plot(50, cumulative=True)
    print("Done calculating frequency distributions!\n")

    # now that we have the freq dists for each genre, do some more analytics
    # to begin, find & store the common set of tokens shared between all genres' top-50 freq dist samples
    common_set = []
    print("Finding the common set of words across all genre frequency distributions:")
    # loop through each of the top genres
    for i in range(0, 5):
        # grab the current genre name from the sorted_genres list passed into this function
        genre = sorted_genres[i]
        # find the top 50 most common samples in the current genre's freq dist
        top_current = genre_fdists[genre].most_common(50)
        # initialize an empty temporary list whose contents will extend the common_set list
        new_commons = []
        print("Now computing common set additions from " + str(genre) + " genre...")
        # inner loop to compare each genre against every other genre, O(n^2); further work should be done to improve this
        for j in range(0, 5):
            # if the genre we are comparing against is the current genre, skip it
            if j == i:
                continue
            # grab the name of the genre to compare against
            compare_genre = sorted_genres[j]
            # grab the compare genre's frequency distribution
            top_compare_raw = genre_fdists[compare_genre].most_common(50)
            # since most_common returns (sample, count) tuples, strip out just the sample
            # note: there may be an nltk method to do this, but none was found in the documentation
            top_compare_filtered = []
            # for all the sample tuples...
            for sample in top_compare_raw:
                # grab just the sample name
                top_compare_filtered.append(sample[0])
            # now compare the current genre's samples against the compare genre's samples
            for sample in top_current:
                # if the current sample name is in the top 50 sample names from the compare genre...
                if sample[0] in top_compare_filtered:
                    # then add it to the new_commons list
                    new_commons.append(sample[0])
        # update the common_set list with samples that aren't already in it
        for sample in new_commons:
            # if the sample from new_commons isn't already in the common_set list...
            if sample not in common_set:
                # then add the sample to common_set
                common_set.append(sample)
    # we have now computed a common set of words shared in some combination across the top genres
    print("A common set has been found! Across all genres " + str(len(common_set)) + " words are shared, they are:")
    print(common_set)

    # now find the unique set of words for each genre; this should give some insight into genre characteristics
    print("\nComputing the unique sets for each genre...")
    # for each genre
    for genre in genre_tokens.keys():
        # initialize an empty unique-set list
        unique_set = []
        # grab the top 50 most common words for the genre from its frequency distribution
        top_current = genre_fdists[genre].most_common(50)
        # go through each of the top 50 words
        for sample in top_current:
            # and if any of the top 50 words aren't in the common set...
            if sample[0] not in common_set:
                # add them to the unique set
                unique_set.append(sample[0])
        # finally, for each genre print that genre's name and its unique set
        print(str(genre) + "'s unique set: " + str(unique_set))
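# Not part of the original script: a minimal sketch of how the O(n^2) pairwise
# comparison above could be tightened with plain Python set operations, assuming
# the same genre_fdists dict and sorted_genres list are in scope.
top_words = {g: {w for w, _ in genre_fdists[g].most_common(50)} for g in sorted_genres[:5]}
# a word is "common" if it appears in the top 50 of at least two genres
common = {w for g, words in top_words.items()
          for other, other_words in top_words.items() if g != other
          for w in words & other_words}
# a genre's "unique" words are its top-50 words that are not common
unique = {g: words - common for g, words in top_words.items()}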
stop_words = stopwords.words('english')
# the two most common contractions that aren't in stopwords.words,
# as well as words used to denote the sections of a song
stop_words.extend(['im', 'ill', 'verse', 'hook', 'chorus', 'bridge'])
stop_words = set(stop_words)  # element removal is faster using a set than a list

word_tokens = nltk.word_tokenize(str(eastlyrics_punct))
word_tokens = [w.lower() for w in word_tokens]
allwords = [w for w in word_tokens if w not in stop_words]
fdeast = FreqDist(allwords)
fdeast.plot(20, cumulative=False)

word_tokens = nltk.word_tokenize(str(southlyrics_punct))
word_tokens = [w.lower() for w in word_tokens]
allwords = [w for w in word_tokens if w not in stop_words]
fdsouth = FreqDist(allwords)
fdsouth.plot(20, cumulative=False)
from nltk.probability import FreqDist

with open('christ-and-satan.txt') as f:
    cs_text = f.read()

word_list = cs_text.split()
first_letter = [word[0] for word in word_list if word[0].isalpha()]
letter_dist = FreqDist(first_letter)
letter_dist.plot(4, cumulative=True)
# nltk.download('averaged_perceptron_tagger')
# Sample
# print(nltk.pos_tag(flat_word_token[0:10]))
# print(nltk.pos_tag(flat_sent_token[0:10]))

##### Find Frequency Distribution #####
# Find frequency of words
fdist_word = FreqDist(words)
fdist_word.most_common(50)
# Plot frequency graph
fdist_word.plot(50)

# Find frequency of sentences
fdist_sent = FreqDist(sents)
fdist_sent.most_common(10)  # TELLING
# Plot frequency graph (sentences)
fdist_sent.plot(10)

# Frequency of (word) STEMS
fdist_stem_word = FreqDist(stems)
fdist_stem_word.most_common(50)

# Frequency of (word) LEMMAS
fdist_lemmas_word = FreqDist(lemmas)
fdist_lemmas_word.most_common(50)
def fdistribution(self, tokenized_words):
    fdist = FreqDist(tokenized_words)
    fdist.plot(30, cumulative=False)
    plt.show()
def FreqDistPlot(data, show=10):
    fdist1 = FreqDist(data)
    fdist1.plot(show, cumulative=True)
# -*- coding: utf-8 -*-
from nltk.probability import FreqDist
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import LineTokenizer

FIRST = 0
END = 150

corpus_root = './data'
fileids = 'data_title_sample'
wordlists = PlaintextCorpusReader(corpus_root, fileids,
                                  sent_tokenizer=LineTokenizer(),
                                  encoding='utf-8')
tokens = []
for word in wordlists.words():
    try:
        tokens += [word.lower()]
    except:
        pass

fdist = FreqDist(tokens)
fdist.plot(FIRST, END)

for k, v in fdist.items():
    print "{} {}".format(k.encode("utf-8"), v)
def plot_html_results(self, lemmatized_list_by_verb_noun_adj_adv, number_of_cat):
    fdist = FreqDist(w for w in lemmatized_list_by_verb_noun_adj_adv)
    fdist.plot(number_of_cat)
# Creating main text object based on the Wall Street Journal corpus,
# setting all words to lowercase and removing non-alphabetical entries
myText = [word.lower() for word in text7 if word.isalpha()]

# Creating a set based on myText, without repetitions
myTextSet = set(myText)

# Creating a frequency distribution from myText
fdMyText = FreqDist(myText)

# Creating a histogram and copying it to file, in order of appearance
histogram = ["%s - %s" % (word, fdMyText[word]) for word in myTextSet]
fileObj = open("histogram.txt", "w")
for wordInfo in histogram:
    fileObj.write("%s\n" % (wordInfo))
fileObj.close()

# Creating a list of the words sorted from most frequent to least frequent
# and copying it to file
sortedList = [word for word, _ in fdMyText.most_common()]
fileObj = open("sortedHistogram.txt", "w")
for word in sortedList:
    fileObj.write("%s - %d\n" % (word, fdMyText[word]))
fileObj.close()

# Only showing the 50 most frequent words in the plot because of limited monitor space
fdMyText.plot(50)
Tokens = word_tokenize(dataset)
print(Tokens)

# Number of tokens in the dataset
len(Tokens)

# Frequency of occurrence of distinct elements
from nltk.probability import FreqDist
fdist = FreqDist()
for word in Tokens:
    fdist[word.lower()] += 1
fdist
fdist.plot(20)

# -------------------------Stemming----------------------------------------
from nltk.stem import PorterStemmer
pst = PorterStemmer()
pst.stem("having")

# -------------Remove the stop words---------------------
import nltk.corpus
# Listing the stopwords present in the English language
stopwords = nltk.corpus.stopwords.words('english')
stopwords[0:10]
# Getting rid of stopwords
def plot_freq_dist(words, num_words=20):
    '''Frequency distribution'''
    fdist = FreqDist(words)
    fdist.plot(num_words, cumulative=False)
def get10TopKeyWords(self, tabWord):
    allWordDist = FreqDist(tabWord)
    allWordDist.plot(20)
    return 100 * count / total

print(lexical_diversity(text3))
print(lexical_diversity(text5))
print(percentage(4, 5))
print(percentage(text4.count('a'), len(text4)))

# %%
fdist1 = FreqDist(text1)
fdist1
vocabulary1 = fdist1.keys()
print(vocabulary1)
print(fdist1['whale'])

# %%
fdist1.plot(50, cumulative=True)

# %%
list(fdist1.items())[0:5]

# %%
fdist1.freq('monstrous')

# %%
# Total number of samples
fdist1.N()

# %%
fdist1

# %%
A = set(allwords)
longwords = [w for w in A if len(w) > 12]  # all words longer than 12 characters
print(sorted(longwords))

from nltk.probability import FreqDist, ConditionalFreqDist
"""
FreqDist: builds a frequency distribution of the given data
B(): number of distinct words
N(): total number of words
tabulate(20): display the first 20 entries as a table
fd2.plot(20, cumulative=True): the cumulative parameter accumulates the counts
"""
fd2 = FreqDist([sx.lower() for sx in allwords if sx.isalpha()])
print("Number of distinct words: %d" % fd2.B())
print("Total number of words: %d" % fd2.N())
fd2.tabulate(20)  # display the first 20 entries as a table
fd2.plot(20)
fd2.plot(20, cumulative=True)

"""
freq('the'): relative frequency of the word 'the'
ConditionalFreqDist(): conditional frequency distribution, used to study
systematic differences between categories
"""
from nltk.corpus import inaugural
print(fd2.freq('the'))  # relative frequency of the word 'the'
cfd = ConditionalFreqDist((fileid, len(w))
                          for fileid in inaugural.fileids()
                          for w in inaugural.words(fileid)
                          if fileid > '1980' and fileid < '2010')
print(cfd.items())
cfd.plot()
                   raw_data_lyrics, left_on='Track_Name', right_on='Track.Name')
del dataset['Track.Name']
dataset['Lyrics'] = dataset['Lyrics'].astype(str)
dataset
dataset = dataset.drop(dataset.index[[30, 22]])
dataset['Lyrics'] = dataset['Lyrics'].str.lower().replace(r'\n', ' ')
dataset['Lyrics']

tokens = dataset['Lyrics'].fillna("").map(nltk.word_tokenize)
allWords = []
for wordList in tokens:
    allWords += wordList

fdist = FreqDist(allWords)
fdist.plot(30, cumulative=False)
plt.show()

stop_words_en = set(stopwords.words("english"))
stop_words_es = set(stopwords.words("spanish"))
punctuations = list(string.punctuation)
allWords = [i for i in allWords if i not in punctuations]
forbidden = ['oh', "'s", 'yo', "'ll", 'el', "'re", "'m", "oh-oh", "'d", "n't",
             "``", "ooh", "uah", "'em", "'ve", "eh", "pa", "brr", "yeah"]
filtered_sent = []
for w in allWords:
    if (w not in stop_words_en) and (w not in stop_words_es):
        filtered_sent.append(w)
filter_ = []
for w in filtered_sent:
alice_mask = np.array(Image.open('c:\\Temp\\alice.jpg'))
wc = WordCloud(font_path='c:\\windows\\fonts\\NanumGothic.ttf',
               relative_scaling=0.2,
               mask=alice_mask,
               background_color='white',
               min_font_size=1,
               max_words=2000).generate_from_frequencies(tmp_data)
plt.figure(figsize=(8, 8))
plt.imshow(wc)
plt.axis('off')
plt.show()

# Plot as a graph
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import matplotlib
# import matplotlib.rc as rc
# font_location = 'C:\\Windows\\Fonts\\gulim.ttc'
# font_name = fm.FontProperties(fname=font_location).get_name()
# matplotlib.rc('font', family='font_name')
import nltk

plt.figure(figsize=(20, 4))
from nltk.probability import FreqDist
g_data4 = FreqDist(data3)
g_data4.plot(50)
#*******************************************************************************
# Question: Are there differences between word-length frequencies of converted
#           vs. unconverted requests?
# Answer: No
# Correlation Coefficient:
#*******************************************************************************
print('Begin calculating word length frequencies...')

cnvtText = ' '.join([item['request_text'] for item in data
                     if len(item['request_text']) > 0 and item['requester_received_pizza'] == 1])
wl1 = [len(word) for word in nltk.word_tokenize(cnvtText) if word.isalpha()]
wl1fd = FreqDist(wl1)
if graphs == 'yes':
    wl1fd.plot()
## 4, 3, 2, 5, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 18
print('...Word length frequencies for successful requests have been plotted.')

uncnvtText = ' '.join([item['request_text'] for item in data
                       if len(item['request_text']) > 0 and item['requester_received_pizza'] == 0])
wl2 = [len(word) for word in nltk.word_tokenize(uncnvtText) if word.isalpha()]
wl2fd = FreqDist(wl2)
if graphs == 'yes':
    wl2fd.plot()
## 4, 3, 2, 5, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 17, 35, 20
print('...Word length frequencies for unsuccessful requests have been plotted.')
#*******************************************************************************
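# Not part of the original analysis: a minimal sketch of how the blank
# "Correlation Coefficient" in the header above could be filled in, by comparing
# the two word-length count vectors directly (assumes scipy is installed).
from scipy.stats import spearmanr

lengths = sorted(set(wl1fd) | set(wl2fd))        # every word length seen in either group
counts1 = [wl1fd[length] for length in lengths]  # counts for converted requests
counts2 = [wl2fd[length] for length in lengths]  # counts for unconverted requests
rho, p_value = spearmanr(counts1, counts2)
print('Spearman correlation of word-length counts: %.3f (p=%.3g)' % (rho, p_value))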
sentence_tokenize = sent_tokenize(text)
w_tokenize = word_tokenize(text)
print(sentence_tokenize)
print("\n")
print(w_tokenize)

# finding the frequency of each word
from nltk.probability import FreqDist
fdis = FreqDist(w_tokenize)
print(fdis)
a = fdis.most_common(2)
print(a)

# plotting each word and its frequency
import matplotlib.pyplot as plt
fdis.plot(30, cumulative=True)
plt.show()

# listing all stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
print(stop_words)

# removing stopwords
refined_sent = []
for w in w_tokenize:
    if w not in stop_words:
        refined_sent.append(w)
print(refined_sent)

# converting into stem words
from nltk.stem import PorterStemmer
          len(tokenized_word_without_Stopwords)))

# # Frequency Distribution

# In[80]:
from nltk.probability import FreqDist
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

# In[81]:
# Frequency Distribution Plot
fdist = FreqDist(tokenized_word_without_Stopwords)
print(fdist)
fdist.plot(50, cumulative=False)  # plot the 50 highest-frequency words
plt.figure(figsize=(50, 50))
plt.show()

# # POS tagging (Parts of Speech)

# In[38]:
import nltk
# nltk.download('averaged_perceptron_tagger')

# In[116]:
tagged = nltk.pos_tag(tokenized_word_without_Stopwords)  # need to use split()
tagged[0:20]
    tagged = nltk.pos_tag(tokens)
    adj = [w for w, t in tagged if 'JJ' in t]
    return adj

# Extracting only 'noun' words
train['Noun'] = train['Cleaned_str'].apply(find_noun)
text_noun = train['Noun'].apply(' '.join)
text_noun = ' '.join(text_noun)
text_noun = text_noun.split()  # to list form
len(text_noun)  # 710916

# Frequency of commonly used noun words
Freq_words = FreqDist(text_noun)
Freq_words.most_common(60)
Freq_words.plot(30)

# Extracting only 'adjective' words
train['Adjective'] = train['Cleaned_str'].apply(find_adj)
text_adj = train['Adjective'].apply(' '.join)
text_adj = ' '.join(text_adj)
text_adj = text_adj.split()  # to list form
len(text_adj)  # 349119

# Frequency of commonly used adjective words
Freq_words = FreqDist(text_adj)
Freq_words.most_common(60)
Freq_words.plot(30)

# Most frequently used words
Freq_words = FreqDist(text_clean)
tokens = [word.replace(',', '') for word in tokens]
tokens = [word for word in tokens if ('*' not in word) and
          ("''" != word) and ("``" != word) and
          (word != 'description') and (word != 'dtype') and
          (word != 'object') and (word != "'s")]
print("\nDocument contains a total of", len(tokens), " terms.")

token_num = FreqDist(tokens)
for pos, frequency in token_num.most_common(20):
    print('{:<15s}:{:>4d}'.format(pos, frequency))

# POS tagging
tagged_tokens = nltk.pos_tag(tokens)
pos_list = [word[1] for word in tagged_tokens if word[1] != ":" and word[1] != "."]
pos_dist = FreqDist(pos_list)
pos_dist.plot(title="Parts of Speech")
for pos, frequency in pos_dist.most_common(pos_dist.N()):
    print('{:<15s}:{:>4d}'.format(pos, frequency))

# Removing stop words
stop = stopwords.words('english') + list(string.punctuation)
stop_tokens = [word for word in tagged_tokens if word[0] not in stop]
# Removing single-character words and simple punctuation
# (each item is a (token, tag) pair, so the length test is on word[0])
stop_tokens = [word for word in stop_tokens if len(word[0]) > 1]
# Removing numbers and the possessive "'s"
stop_tokens = [word for word in stop_tokens
               if (not word[0].replace('.', '', 1).isnumeric()) and word[0] != "'s"]
token_dist = FreqDist(stop_tokens)
print("\nCorpus contains", len(token_dist.items()), " unique terms after removing stop words.\n")
from nltk import word_tokenize, Text
from nltk.probability import FreqDist

tokens = ""
with open(u'monte_cristo.txt', 'r', encoding="utf8") as con:
    contents = con.read()
    tokens = word_tokenize(contents)

processed = Text(tokens)
fdist = FreqDist(processed)
processed.collocations()
print(fdist.most_common(50))
fdist.plot(50)
# Compute the percentage of hapax legomena occurrences and the longest among them
hapax_legomenas = fdist.hapaxes()  # get the list of words that appear just once in the corpus
hapax_legomena_counts = len(hapax_legomenas)  # get their count
percentage_of_hapax_legomena = (hapax_legomena_counts / no_of_tokens) * 100  # compute percentage
print("Percentage of Hapax Legomena Occurrences", percentage_of_hapax_legomena)
max_len_happax_legomena = max([len(word) for word in hapax_legomenas])
print("Longest Hapax Legomena are",
      [word for word in hapax_legomenas if len(word) == max_len_happax_legomena])

# Compute the percentage of dis legomena occurrences and the longest among them
dis_legomenas = [key for key, value in fdist.items() if value == 2]  # words that occur exactly twice
dis_legomena_counts = len(dis_legomenas) * 2  # get their counts
percentage_of_dis_legomena = (dis_legomena_counts / no_of_tokens) * 100  # compute percentage
print("Percentage of Dis Legomena Occurrences", percentage_of_dis_legomena)
max_len_dis_legomena = max([len(word) for word in dis_legomenas])
print("Longest Dis Legomena are",
      [word for word in dis_legomenas if len(word) == max_len_dis_legomena])

# Plot the r vs Nr graph
fdist.plot(50)

# Compute the log-scaled version of r vs Nr
log_rvsNr = {log(key): log(value) for key, value in (fdist.r_Nr()).items() if value != 0}

# Plot the graph of log(r) vs log(Nr)
plot.plot(list(log_rvsNr.keys()), list(log_rvsNr.values()), 'r.')
plot.axis([-1, 11, -1, 11])
plot.xlabel('log(r)')
plot.ylabel('log(Nr)')
plot.title('log(r) vs log(Nr) Brown Corpus')
plot.show()
        stemmed_word = stemmer.stem(word)  # stem the word
        stemmed_words.append(stemmed_word)
    return stemmed_words

sw = stem_words(rs)

# frequency of use of each word
fdist = FreqDist(rs)
frequency_frame = pd.DataFrame(fdist.most_common(30), columns=["mots", "frequences"])

# Frequency distribution plot
import matplotlib.pyplot as plt
fdist.plot(30)
plt.show()

# Step 2: retrieve the most frequently used words
print(fdist.most_common(30))
dico = {}
for key, value in fdist.most_common(50):
    if key not in dico:
        dico[key] = [value]
    else:
        dico[key].append(value)
print(dico)
def print_freq_dist(in_tokens):
    """Plot the frequency distribution for tokens in in_tokens."""
    text = nltk.Text(in_tokens)
    fdist = FreqDist(text)
    fdist.plot(100, cumulative=False)
rawt1 = re.sub(r'(www.[a-z]*.[a-z]*)', '', rawt1)
# removing digits
rawt1 = re.sub(r'[\d]*', '', rawt1)
# removing chapter names
rawt1 = re.sub(r'(i|ii|iii|iv|v|vi|vii|viii|ix|x|xi|xii)\.[ _a-z:]*', '', rawt1)
# removing punctuation
rawt1 = remove_punctuation(rawt1)

t1_tokenized = word_tokenize(rawt1)
counts = Counter(t1_tokenized)
print("Number of distinct words " + str(len(counts)))
print("Number of tokens " + str(len(t1_tokenized)))
print("Number of characters " + str(len(rawt1)))
print(t1_tokenized)

fdist = FreqDist(t1_tokenized)
fdist.plot(30, cumulative=False)
plt.show()

# In[2]:
word_cloud_dict = Counter(t1_tokenized)
wordcloud = WordCloud(width=1000, height=1000, background_color='white',
                      stopwords=None).generate_from_frequencies(word_cloud_dict)
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

# In[3]:
        for w in filtered_sent:
            root_words.append(ps.stem(w))
    if ln_choice == "lem":
        for w in filtered_sent:
            root_words.append(lem.lemmatize(w))

    # Remove integer-only tokens
    no_integers = [x for x in root_words
                   if not (x.isdigit() or x[0] == '-' and x[1:].isdigit())]

    # Frequency distribution of words in the text, with plotting
    fdist = FreqDist(no_integers)
    fig = plt.figure(figsize=(10, 5))
    plt.gcf().subplots_adjust(bottom=0.25)  # to avoid x-ticks being cut off
    fdist.plot(30, cumulative=False,
               title="Top 30 most common words in cluster {}".format(c))
    plt.show()
    fig.savefig(pathc + "/Most common words in louvain cluster {}_{}_lexicon.pdf".format(c, ln_choice))

# tokenize the text and plot freq dist plots
if c_choice == 'k':
    for c in sorted(plt_data["l_clusters"].unique()):
        c_df = plt_data[plt_data.k_clusters == c]
        small_df = c_df[["paper_id", "title"]]
        small_df.to_csv(pathcdf + "/papers in k-means cluster {}.csv".format(c), index=False)
        # Filter the text
def graph_word():
    fdist = FreqDist(word_tokenize(read_file))
    print(fdist)
    fdist.plot(101, cumulative=False)
    plt.show()
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import matplotlib.pyplot as plt
import re as r
from nltk.corpus import stopwords

text = "Hello Mr Smith how are you doing today? The weather is great and city is awesome. The sky is pinkish-blue. You shouldn't eat cardboard"
tokenized_text = sent_tokenize(text)
text1 = text.lower()

# removing punctuation
rptext = r.sub(r'[^\w\s]+', '', text1)
wordtokenize = word_tokenize(rptext)
rpwordtoken = word_tokenize(rptext)

# word frequencies (tokenization has to happen before building the FreqDist)
freqword = FreqDist(wordtokenize)
freqword.most_common(6)
freqword.plot(20, cumulative=False)

# stopwords
# import nltk
# nltk.download("stopwords")
swords = set(stopwords.words('english'))
swords
def plot_words(wordList):
    fDist = FreqDist(wordList)
    # print(fDist.most_common())
    print("Total number of words: ", fDist.N())
    print("Number of distinct words: ", fDist.B())
    fDist.plot(10)
        count = 0
        for category in good_word_dict_2:
            if any(map(lambda word: word in sentence, good_word_dict_2[category])):
                count += 1
                continue
        if count == 2:
            sentence_list.append(str(sentence))

data = '\n'.join(sentence_list)
words = word_tokenize(data)

# Pre-cleaning
spread = FreqDist(words)
spread.plot(50)
for word, freq in spread.most_common(100):
    print(u'{};{}'.format(word, freq))

# Cleaning
words = [w.lower() for w in words if w.isalpha()]
stop_words = set(stopwords.words('english'))
words_clean = [w for w in words if w not in stop_words]

# Post-cleaning
spread = FreqDist(words_clean)
spread.plot(50)

f = open(
    "C:\\Users\\satvi\\Documents\\GitHub\\HIselector\\Satvik\\Bag of Words\\relevant_sentences.txt",
    "w",
def drawFreqMap(words):
    fdist = FreqDist(words)
    fdist.plot(20)
content = open(file_name, 'rb').read()
cutedText = " ".join(jieba.cut(content))
# nltkText = nltk.corpus.gutenberg.raw(cutedText)
fd = FreqDist(cutedText)
items = fd.items()
print items[:30]
# fd.plot()
# print cutedText
print dir(cutedText)
# print dir(nltkText)
print cutedText.count(u'ÃÏ¿Ì')

tags = jieba.analyse.extract_tags(content, topK=30)
fd = FreqDist(tags)
for keyword in tags:
    print "result of ", keyword
    count = cutedText.count(keyword)
    print count
    fd[keyword] = count
    # cutedText.split().concordance(keyword)
print fd

from pylab import *
mpl.rcParams['font.sans-serif'] = ['SimHei']
plt.xlabel(u'')
plt.ylabel(u'¥Œ ˝')
plt.title(u'')
fd.plot()
def plot_freq_dist(dict, firsts=100, cumulative=False):
    dist = FreqDist(dict.dfs)
    dist.plot(firsts, cumulative=cumulative)
    plt.show()
ztokenizer = nltk.RegexpTokenizer(r"\w+")
text_token = ztokenizer.tokenize(df2)

# convert to lower case
tokens = [w.lower() for w in text_token]

# remove punctuation from each word
import string
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in tokens]

# remove remaining tokens that are not alphabetic
words = [word for word in stripped if word.isalpha()]

# filter out stop words
sw_list = ['months', 'year', 'years', 'com', 'linkedin', 'linkedin', 'comwww',
           'india', 'new', 'technology', 'gmail']
stopword = stopwords.words('english')
# extend the standard stop word list with the custom entries
stopword.extend(sw_list)
words = [w for w in words if w not in stopword]

# graphical presentation of the most used words
fdist1 = FreqDist(words)
# print(fdist1)
fdist1.plot(20)
    tokens[n] = [word for word in tokens[n] if ('*' not in word) and
                 word != "''" and word != "``"]
    # Remove punctuation (rebuild the list so the substitution actually takes effect)
    tokens[n] = [re.sub(r'[^\w\d\s]+', '', word) for word in tokens[n]]
    print("\nDocument " + str(n) + " contains a total of", len(tokens[n]), " terms.")

# POS tagging
tagged_tokens = {}
for n in range(1, 9):
    tagged_tokens.update({n: nltk.pos_tag(tokens[n])})
    pos_list = [word[1] for word in tagged_tokens[n] if word[1] != ":" and word[1] != "."]
    pos_dist = FreqDist(pos_list)
    pos_dist.plot(title="Parts of Speech: Document " + str(n))
    for pos, frequency in pos_dist.most_common(pos_dist.N()):
        print('{:<15s}:{:>4d}'.format(pos, frequency))

# Remove stop words
stop = stopwords.words('english') + list(string.punctuation)
stop_tokens = {}
for n in range(1, 9):
    stop_tokens.update({n: [word for word in tagged_tokens[n] if word[0] not in stop]})
    # Remove single-character words and simple punctuation
    # (each item is a (token, tag) pair, so the length test is on word[0])
    stop_tokens[n] = [word for word in stop_tokens[n] if len(word[0]) > 1]
    # Remove numbers and the possessive "'s"
    stop_tokens[n] = [word for word in stop_tokens[n]
                      if (not word[0].replace('.', '', 1).isnumeric()) and word[0] != "'s"]
# def content_fraction(text):
#     stopwords = nltk.corpus.stopwords.words('spanish')
#     content = [w for w in text if w.lower() not in stopwords]
#     return len(content) / len(text)
# content_fraction(text)

# Step 4: stem words

# SOME INITIAL EXPLORATIONS OF THE TEXT
sorted(set(text))                   # displays sorted unique words
fdist = FreqDist(text)              # creates a frequency distribution of the words
vocabulary = fdist.keys()           # the frequency distribution's vocabulary
list(vocabulary)[:50]               # displays 50 vocabulary items
fdist.plot(50, cumulative=True)     # frequency distribution of the 50 most frequent words
text.collocations()                 # common word collocations

# APPROACH 1: POINTWISE MUTUAL INFORMATION (PMI)
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
# quadgram_measures = nltk.collocations.QuadgramAssocMeasures()
finder_bi = BigramCollocationFinder.from_words(text)
finder_tri = TrigramCollocationFinder.from_words(text)
finder_quad = QuadgramCollocationFinder.from_words(text)
print(state_union_text.count("war"))
state_union_text.concordance("economy")
state_union_text.similar("economy")
state_union_text.common_contexts(["economy", "jobs"])

from nltk.probability import FreqDist
fdist = FreqDist(state_union_text)
result = fdist.most_common(15)
result

from nltk.corpus import stopwords
stopwords.words("english")
filtered = [w for w in state_union.words() if w not in stopwords.words("english")]
len(filtered)

fdist_filtered = FreqDist(filtered)
fdist_filtered.most_common(20)
fdist_filtered.freq("good") / fdist_filtered.freq("bad")
fdist_filtered.freq("bad") / fdist_filtered.freq("evil")
fdist_filtered.plot(30)