def print_results(self):
    print()
    print("int-binned log-likelihood distributions:")
    ll_fdist = FreqDist(self.res["dlist_dist"])
    ll_fdist.tabulate()
    print()
    print(self.res["cm"])
    print("{:<30}{:>.3%}".format("Majority Class Prior Prob: ", self.res["prior_probability"]))
    print("{:<30}{:>}".format("Majority Class Label: ", self.majority_label))
    print()
    print("{:<30}{:>.3%}".format("Accuracy: ", self.res["accuracy"]))
    print("{:<30}{:>.3%}".format("Error: ", self.res["error"]))
    print("{:<30}{:>.3%}".format("Error Reduction / Baseline: ", self.res["error_reduction"]))
    print()
    print("{:<7}{:<23}{:>.3%}".format(self.root_star, "Precision: ", self.res["root_star_precision"]))
    print("{:<7}{:<23}{:>.3%}".format(self.root, "Precision: ", self.res["root_precision"]))
    print("{:<7}{:<23}{:>.3%}".format(self.root_star, "Recall: ", self.res["root_star_recall"]))
    print("{:<7}{:<23}{:>.3%}".format(self.root, "Recall: ", self.res["root_recall"]))
    print()
    print("{:<30}{:>.3%}".format("Macro Precision: ", self.res["macro_precision"]))
    print("{:<30}{:>.3%}".format("Macro Recall: ", self.res["macro_recall"]))
    print()
    print("Top Ten Rules:")
    for rule in self.decision_list[:10]:
        print("{:<30}{:>.4}".format(rule[0], rule[1]))
    print()
    print("3 Correct:")
    for item in self.res["correct"][:3]:
        print("Correctly Predicted: {} \n Rule: {}, log-likelihood: {} \n {}"
              .format(item[0], item[2][0], item[2][1], " ".join(item[3])))
    print()
    print("3 Incorrect:")
    for item in self.res["incorrect"][:3]:
        print("Predicted: {}, was actually: {} \n Rule: {}, log-likelihood: {} \n {}"
              .format(item[0], item[1], item[2][0], item[2][1], " ".join(item[3])))
def analyze_comments(projects):
    # Assumes module-level imports: from nltk.probability import FreqDist
    # and import matplotlib.pyplot as plt.
    words = []
    word_counts = []
    word_counts_en = []
    word_counts_en_no_stop = []
    nr_projects_with_comments = 0
    for project in projects:
        project_words = project.get_comment_words()
        if len(project_words) != 0:
            words.extend(project_words)
        project_word_counts, project_word_counts_en, project_word_counts_en_no_stop = \
            project.get_comment_word_counts()
        word_counts.extend(project_word_counts)
        word_counts_en.extend(project_word_counts_en)
        word_counts_en_no_stop.extend(project_word_counts_en_no_stop)
        if len(project_word_counts) != 0:
            nr_projects_with_comments += 1

    print(word_counts_en_no_stop)
    get_comment_statistics(word_counts, word_counts_en, nr_projects_with_comments)

    fdist = FreqDist(words)
    print(fdist.most_common(20))
    fdist.plot(20, cumulative=False)
    plt.savefig("WordFreqDist.png")
    plt.savefig("WordFreqDist.pdf")
    plt.show()
    fdist.tabulate()
    return word_counts_en_no_stop
def plot_freq(self, corpus, patt, n):
    # Read every file in the corpus directory that matches the pattern.
    wordlists = PlaintextCorpusReader(corpus, patt)
    words = []
    for fileid in wordlists.fileids():
        words.extend(wordlists.words(fileid))
    # Frequency distribution over lower-cased alphabetic tokens only.
    fre = FreqDist(word.lower() for word in words if word.isalpha())
    fre.tabulate(n)
    return fre.plot(n)
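# A minimal standalone sketch of the same pattern as plot_freq above, assuming a
# hypothetical directory "./corpus" of plain-text files (the path and top-n value
# are illustrative, not from the original code).
from nltk.corpus import PlaintextCorpusReader
from nltk.probability import FreqDist

reader = PlaintextCorpusReader("./corpus", r".*\.txt")
all_words = []
for fid in reader.fileids():
    all_words.extend(reader.words(fid))

fd = FreqDist(w.lower() for w in all_words if w.isalpha())
fd.tabulate(20)   # print the 20 most frequent words as a table
fd.plot(20)       # line plot of the same top-20 counts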
def cut_sentence():
    raw_str = open(r'D:\nltk_data\corpora\gutenberg\austen-persuasion.txt', 'r').read()
    # raw_str = ' Professional -Self Healing Cutting Mat is a cut above the rest! :) Alvin Professional -Self Alvin Professional -Self'
    cutted = WordPunctTokenizer().tokenize(raw_str)
    # Optional stopword filtering, kept commented out as in the original:
    # w = []
    # for x in cutted:
    #     if x not in stopwords.words('english'):
    #         w.append(x)
    # filtered = [w for w in cutted if w not in stopwords.words('english')]
    # print(w)
    # print(cutted)
    col = FreqDist(cutted)
    # col.plot()
    col.tabulate()
def testFunc():
    fw = open("./MZI/data.doc", "r", encoding="utf8")
    text = fw.read()
    tokens = getWordList(text)
    print(len(set(tokens)))

    from nltk.probability import FreqDist
    from nltk.util import bigrams

    # Unigram distribution over tokens longer than one character.
    fdist = FreqDist(w for w in tokens if len(w) > 1)
    fdist.tabulate(50)

    # Bigram distribution over the same filtered tokens.
    big = list(bigrams(w for w in tokens if len(w) > 1))
    print(big[:100])
    fdist = FreqDist(str(w) for w in big)
    fdist.tabulate(10)
    fdist.plot(50)
def Freq_Dist(text):
    tokens = normalize_tokenize(text)
    # Manual word count as a plain dictionary.
    wubDict = {}
    for word in tokens:
        if word in wubDict:
            wubDict[word] = wubDict[word] + 1
        else:
            wubDict[word] = 1
    # Sanity check: one entry per distinct token, and the counts sum to the
    # number of tokens.
    if len(wubDict) == len(set(tokens)) and sum(wubDict.values()) == len(tokens):
        print("Our Dictionary and our Tokens match!")
    else:
        print('We have an error in the length of the dictionary and the tokens.')

    from nltk.probability import FreqDist
    wubFD = FreqDist(word for word in tokens)
    print(wubFD.items())
    print('\nOur top 10 most frequent words are:\n\t')
    wubFD.tabulate(10)   # tabulate() prints directly and returns None
    import matplotlib   # FreqDist.plot relies on matplotlib being available
    wubFD.plot(20)
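# A minimal sketch (not part of the original function) showing that nltk's
# FreqDist already performs the manual counting above: it behaves like
# collections.Counter, so the hand-built dictionary and FreqDist(tokens) hold
# the same counts. The token list here is a hypothetical example.
from nltk.probability import FreqDist

example_tokens = ["the", "wub", "said", "the", "wub"]
manual = {}
for word in example_tokens:
    manual[word] = manual.get(word, 0) + 1

fd = FreqDist(example_tokens)
assert dict(fd) == manual             # same word -> count mapping
assert fd.N() == len(example_tokens)  # N() is the total token count
assert fd.B() == len(set(example_tokens))  # B() is the number of distinct tokens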
def get_frequency(demo_file_path):
    with open(demo_file_path, 'r') as f:
        content_lst = f.readlines()
    # for sentence in content_lst:
    #     sentence = sentence.strip()
    content_str = " ".join(content_lst)
    words_lst = content_str.split()
    fdist = FreqDist(words_lst)
    # Debugging breakpoint left in from development; drop it for normal runs.
    import pytest
    pytest.set_trace()
    fdist.tabulate(40, cumulative=True)   # tabulate() prints directly and returns None
# Fragment of the loop body that labels tagged reviews as good/bad locations.
tokens = tagger.tag(tokens2)
if len(tokens) == 0:
    print("empty text found after preprocessing...discard")
    continue
if i[1] == 5 and pos < 500:
    documents.append((tokens, "good location"))
    pos += 1
if i[1] < 3 and neg < 500:
    documents.append((tokens, "bad location"))
    neg += 1

# After the loop: summarize the class distribution and pickle the documents.
labels = [doc[1] for doc in documents]
print(len(documents))
label_dist = FreqDist(labels)
label_dist.tabulate()
for label in label_dist:
    print(label, label_dist[label])

f1.close()
fout = open("../classification/location/location.dat", "wb")
pickle.dump(documents, fout, protocol=0)
fout.close()
print("Finish\n")
A = set(allwords)
longwords = [w for w in A if len(w) > 12]   # all words longer than 12 characters
print(sorted(longwords))

from nltk.probability import FreqDist, ConditionalFreqDist
"""
FreqDist: build a frequency distribution over the given data
B(): number of distinct words
N(): total number of words
tabulate(20): display the top 20 entries as a table
fd2.plot(20, cumulative=True): the cumulative flag plots cumulative counts
"""
fd2 = FreqDist([sx.lower() for sx in allwords if sx.isalpha()])
print("Number of distinct words: %d" % fd2.B())
print("Total number of words: %d" % fd2.N())
fd2.tabulate(20)   # display the top 20 entries as a table
fd2.plot(20)
fd2.plot(20, cumulative=True)
"""
freq('the'): relative frequency of the word 'the'
ConditionalFreqDist(): conditional frequency distribution, used to study
systematic differences between categories
"""
from nltk.corpus import inaugural
print(fd2.freq('the'))   # relative frequency of the word 'the'
cfd = ConditionalFreqDist((fileid, len(w))
                          for fileid in inaugural.fileids()
                          for w in inaugural.words(fileid)
                          if fileid > '1980' and fileid < '2010')
print(cfd.items())
cfd.plot()
def removePeriod(words):
    # Split tokens containing a '.' into the parts before and after it.
    # Note: the list is modified while it is being iterated, as in the original.
    count = 0
    for comm in words:
        if '.' in comm:
            ind = comm.index('.')
            fi = comm[:ind]
            la = comm[ind + 1:]
            words.append(fi)
            words.append(la)
            del words[count]
        count += 1


with open(library + post_id + '.txt', 'r') as file:
    comments = file.read()   # the with-block closes the file automatically

words = word_tokenize(comments)
# filterwords = [w for w in words if w not in stop_words]
filterwords = []
for w in words:
    if w not in stop_words:
        filterwords.append(w.lower())

fdist = FreqDist(filterwords)
fdist.tabulate(50)   # tabulate() prints directly and returns None
print(stop_words)
import fileinput
import json
import re
import string

import nltk
from nltk import word_tokenize
from nltk.probability import FreqDist

for line in fileinput.input(files='-'):
    data = json.loads(line)
    toolfdist = FreqDist()
    nontoolfdist = FreqDist()
    stopwords = nltk.corpus.stopwords.words('english')
    for i in range(len(data)):
        text = word_tokenize(data[i]['abstract'])
        if data[i]['is_tool']:
            for word in text:
                word = word.lower()
                # Skip stopwords, punctuation, and tokens made only of digits/punctuation.
                if (word not in stopwords and word not in string.punctuation
                        and re.fullmatch(r'[0-9\!\"\#\$\%\&\'\(\)\*\+\,\-.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~≥]*', word) is None):
                    toolfdist[word] += 1
        else:
            for word in text:
                word = word.lower()
                if (word not in stopwords and word not in string.punctuation
                        and re.fullmatch(r'[0-9\!\"\#\$\%\&\'\(\)\*\+\,\-.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~≥]*', word) is None):
                    nontoolfdist[word] += 1

    # Down-weight words that also appear in non-tool abstracts.
    for word in toolfdist:
        if word in nontoolfdist:
            toolfdist[word] -= 10 * nontoolfdist[word]
            # toolfdist[word] = 0
    toolfdist.tabulate(200)
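# The stopword/punctuation/number filter above is duplicated across the tool and
# non-tool branches. A small illustrative sketch (helper name and constants are
# hypothetical, not from the original script) of how it could be factored out so
# both loops stay in sync:
import re
import string

import nltk

STOPWORDS = set(nltk.corpus.stopwords.words('english'))
# Same pattern as above: tokens made up only of digits and punctuation.
NON_WORD = re.compile(r'[0-9\!\"\#\$\%\&\'\(\)\*\+\,\-.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~≥]*')

def keep_token(word):
    """Return True if a token should be counted (hypothetical helper)."""
    word = word.lower()
    return (word not in STOPWORDS
            and word not in string.punctuation
            and NON_WORD.fullmatch(word) is None)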
def wordFreqAnalysis(dict_books):
    #%% Retrieving data
    list_sentences = listof_sentences(dict_books)

    #%% NLP
    # Only words - no digits or special characters
    wordonly_tokenizer = RegexpTokenizer(r'\w+')
    words_sentences = [wordonly_tokenizer.tokenize(sentence) for sentence in list_sentences]
    # List of list of words
    words_list = [word for element in words_sentences for word in element]
    # All lowercase
    words_lower = [wlo.lower() for wlo in words_list]
    # Removing stopwords
    manual_SW = add_stopwords()
    fullset_SW = stopwords.words('english') + manual_SW
    stopWords = set(fullset_SW)
    words_filtered = [w for w in words_lower if w not in stopWords]
    # Removing single letter words
    words_letter = [wle for wle in words_filtered if len(wle) > 1]
    # Removing a2, c45 etc
    words_alpha = [wa for wa in words_letter if not any(c.isdigit() for c in wa)]

    # Lemmatizations
    lemmatizer = WordNetLemmatizer()
    # verb lemmatization
    words_lemmatized_verb = [lemmatizer.lemmatize(wv, 'v') for wv in words_alpha]
    # adjective lemmatization
    words_lemmatized_adj = [lemmatizer.lemmatize(wadj, 'a') for wadj in words_lemmatized_verb]
    # adverb lemmatization
    words_lemmatized_adv = [lemmatizer.lemmatize(wadv, 'r') for wadv in words_lemmatized_adj]
    # noun lemmatization
    words_lemmatized = [lemmatizer.lemmatize(wl) for wl in words_lemmatized_adv]

    #%% Dictionary of word count
    wordcount = FreqDist(wc for wc in words_lemmatized)
    # print("something")
    # # Suppress console temporarily
    # sys.stdout = open(os.devnull, "w")
    wordcount.tabulate(10)
    # sys.stdout = sys.__stdout__
    # print("nothing")

    #%% Write to table (csv file)
    file_freqtable = "word_freq.csv"
    try:
        with open(file_freqtable, "x") as fp:
            writer = csv.writer(fp)
            writer.writerows(wordcount.items())
        print(file_freqtable, "file is created")
        print("\n -----------------------------------------------\n")
    except IOError:
        print(file_freqtable, "file already exists")
        print("\n -----------------------------------------------\n")

    #%% Word count plot
    print("Plotting Word Count")
    print("\n -----------------------------------------------\n")
    ttl = "Word Frequency for Top 50 Words"
    plt.figure(figsize=(40, 20))
    wordcount.plot(50)
    plt.title(ttl, fontsize=80)
    # plt.xlabel("Words", fontsize=26, style="oblique")
    plt.ylabel("Frequency of Words", fontsize=38, style="oblique")
    ax = plt.gca()
    ax.set_xticklabels(ax.get_xticklabels(), fontdict={'fontsize': 30})
    # yticks = [400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
    ax.set_yticklabels(ax.get_yticks(), fontdict={'fontsize': 30})
    plt.show()

    #%% Word cloud
    WC_height = 800
    WC_width = 1600
    WC_max_words = 500
    wordCloud = WordCloud(max_words=WC_max_words, height=WC_height, width=WC_width)

    # Plotting Word cloud with most frequently occurring words (unigrams)
    print("Plotting Word Cloud")
    print("\n -----------------------------------------------\n")
    wordCloud.generate_from_frequencies(wordcount)
    plt.figure(figsize=(36, 18))
    plt.title('Most frequently occurring words (unigrams)', fontsize=60)
    plt.imshow(wordCloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()