# Frequency-distribution demo over the token list `allwords` (defined earlier
# in the file — assumed to be an iterable of word strings; TODO confirm).
A = set(allwords)
longwords = [w for w in A if len(w) > 12]  # all distinct words longer than 12 chars
print(sorted(longwords))

from nltk.probability import FreqDist, ConditionalFreqDist
"""
FreqDist: build a frequency distribution from the given data.
B(): number of distinct samples (vocabulary size).
N(): total number of samples (token count).
tabulate(20): print the 20 most common samples as a table.
fd2.plot(20, cumulative=True): cumulative=True plots running totals.
"""
fd2 = FreqDist([sx.lower() for sx in allwords if sx.isalpha()])
print("Number of distinct words: %d" % fd2.B())
print("Total number of words: %d" % fd2.N())
fd2.tabulate(20)  # show the 20 most frequent words as a table
fd2.plot(20)
fd2.plot(20, cumulative=True)

"""
freq('the'): relative frequency of the word 'the'.
ConditionalFreqDist(): conditional frequency counts, used to study
systematic differences between categories.
"""
from nltk.corpus import inaugural
print(fd2.freq('the'))  # relative frequency of 'the'

# BUGFIX: the corpus reader method is `words(fileid)`, not `word(fileid)`
# (the original raised AttributeError).
# File ids look like '1981-Reagan.txt', so the string comparison below
# selects addresses between 1980 and 2010 lexicographically.
cfd = ConditionalFreqDist(
    (fileid, len(w))
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    if '1980' < fileid < '2010'
)
print(cfd.items())
cfd.plot()
from nltk.probability import ConditionalFreqDist
import matplotlib
# NOTE: '%matplotlib inline' is an IPython magic, not Python syntax — it is a
# syntax error in a plain .py file, so it is kept only as a comment here.
# %matplotlib inline


def _dated_metadata(path):
    """Yield the parsed metadata dict for each file in *path* that carries
    an exact date (entries whose Date starts with 'c' — circa — are skipped).

    Relies on `parse_metadata`, defined earlier in this file, and on the
    corpus convention that each file's metadata section ends with the
    '<!--end metadata-->' marker.
    """
    for filename in os.listdir(path):
        # `with` guarantees the file handle is closed (the original leaked it).
        with open(os.path.join(path, filename)) as f:
            text = f.read()
        # Everything before the marker is the metadata header.
        header = text.split("<!--end metadata-->")[0]
        metadata = parse_metadata(header)
        # Skip speeches with only an approximate ("circa") date.
        if metadata['Date'][0] == 'c':
            continue
        yield metadata


# Frequency-distribution graph by year: the year is the final component of
# the 'Date' string after the last '/'.
cfdist = ConditionalFreqDist()
for metadata in _dated_metadata(corpus_path):
    cfdist['count'][metadata['Date'].split('/')[-1]] += 1
cfdist.plot()

# <markdowncell>
# Now let's build another graph, but this time by the 'Description' field:

# <codecell>
cfdist2 = ConditionalFreqDist()
for metadata in _dated_metadata(corpus_path):
    cfdist2['count'][metadata['Description']] += 1
cfdist2.plot()
# Horizontal bar chart of the top-20 tri-grams. `x` (labels) and the plotted
# counts are prepared earlier in the file — presumably a list of tri-gram
# strings; TODO confirm against the preceding cell.
y_pos = np.arange(len(x))
plt.yticks(y_pos, x)
plt.title('Frequency Count of Top 20 Tri-Grams')
plt.ylabel('Frequent Words')
plt.xlabel('Count')
# BUGFIX: savefig must come BEFORE show() — show() finalizes/clears the
# current figure, so the original order wrote out a blank image file.
plt.savefig('Most Frequent 20 Tri-Grams.jpeg')
plt.show()

# ## 4. Word Length Distribution Plot
# #### Word length on the x-axis vs number of words of that length on the
# y-axis; visualises the composition of different word lengths in the corpus.
from nltk.probability import ConditionalFreqDist

cfdist = ConditionalFreqDist((len(word), word) for word in allwords)
cfdist.plot()

# ## t-SNE Corpus Visualization
# Visualize document similarity via t-distributed stochastic neighbor
# embedding over TF-IDF vectors.
from yellowbrick.text import TSNEVisualizer
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
# NOTE(review): `allwords` looks like a flat token list, but TfidfVectorizer
# expects one string per *document* — verify this is the intended input.
docs = tfidf.fit_transform(allwords)

# Create the visualizer and draw the vectors
tsne = TSNEVisualizer()
tsne.fit(docs)