Example #1
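# The snippets below assume a token list `allwords` already exists. A minimal
# setup sketch so they can run end-to-end (the corpus choice is illustrative,
# not from the original):
from nltk.corpus import gutenberg
allwords = gutenberg.words('austen-emma.txt')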
A = set(allwords)
longwords = [w for w in A if len(w) > 12]  # all words longer than 12 characters
print(sorted(longwords))

from nltk.probability import FreqDist, ConditionalFreqDist
"""
FreqDist: 创建一个所给数据的频率分布
B(): 不同单词的个数
N(): 所有单词的个数
tabulate(20): 把前20组数据以表格的形式显示出来
fd2.plot(20,cumulative=True): 参数cumulative 对数据进行累计 
"""
fd2 = FreqDist([sx.lower() for sx in allwords if sx.isalpha()])
print("不同单词的个数:%d" % fd2.B())
print("所有单词的个数:%d" % fd2.N())
fd2.tabulate(20)  #把前20组数据 以表格的形式显示出来
fd2.plot(20)
fd2.plot(20, cumulative=True)
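# To see what B(), N(), and most_common() return, a tiny self-contained
# sketch on invented data:
from nltk.probability import FreqDist

sample = "the cat sat on the mat with the hat".split()
fd = FreqDist(sample)
print(fd.B())             # 7 distinct words
print(fd.N())             # 9 tokens in total
print(fd.most_common(2))  # [('the', 3), ('cat', 1)]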
"""
freq('the')  #单词the出现的频率
ConditionalFreqDist( ): 条件频率统计的函数,研究类别之间的系统性的差异
"""
from nltk.corpus import inaugural
print(fd2.freq('the'))  # relative frequency of the word 'the'
cfd = ConditionalFreqDist((fileid, len(w))
                          for fileid in inaugural.fileids()
                          for w in inaugural.words(fileid)
                          if '1980' < fileid < '2010')
print(cfd.items())
cfd.plot()
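# For a tabular view instead of the plot, tabulate() accepts explicit
# conditions and samples; these fileids are real inaugural corpus entries
# inside the 1980-2010 window used above:
cfd.tabulate(conditions=['1981-Reagan.txt', '1993-Clinton.txt', '2009-Obama.txt'],
             samples=range(1, 10))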
Example #2
File: session-3.py  Project: datakid/nltk
import os

from nltk.probability import ConditionalFreqDist
import matplotlib
%matplotlib inline

# corpus_path is assumed to be defined earlier in session-3.py
cfdist = ConditionalFreqDist()
for filename in os.listdir(corpus_path):
    text = open(os.path.join(corpus_path, filename)).read()
    # split the file's text on 'end metadata'
    text = text.split("<!--end metadata-->")
    # parse metadata using the previously defined function "parse_metadata"
    metadata = parse_metadata(text[0])
    # skip all speeches for which there is no exact date
    if metadata['Date'][0] == 'c':
        continue
    # build a frequency distribution by year, i.e. the final part of the
    # 'Date' string after '/'
    cfdist['count'][metadata['Date'].split('/')[-1]] += 1
cfdist.plot()
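# <markdowncell>
# The loop above relies on parse_metadata, which is defined earlier in
# session-3.py but not shown in this excerpt. A minimal sketch of what such a
# helper might look like, assuming header lines of the form "Key: value":

# <codecell>
# Hypothetical sketch only; the real parse_metadata may differ
def parse_metadata(header_text):
    metadata = {}
    for line in header_text.splitlines():
        if ':' in line:
            key, _, value = line.partition(':')
            metadata[key.strip()] = value.strip()
    return metadata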

# <markdowncell>
# Now let's build another graph, but this time by the 'Description' field:

# <codecell>
cfdist2 = ConditionalFreqDist()
for filename in os.listdir(corpus_path):
    text = open(os.path.join(corpus_path, filename)).read()
    text = text.split("<!--end metadata-->")
    metadata = parse_metadata(text[0])
    if metadata['Date'][0] == 'c':
        continue
    cfdist2['count'][metadata['Description']] += 1
cfdist2.plot()
import numpy as np
import matplotlib.pyplot as plt

# Assumes `x` holds the top-20 tri-gram labels and `counts` their frequencies,
# e.g. from FreqDist(nltk.trigrams(allwords)).most_common(20)
y_pos = np.arange(len(x))
plt.barh(y_pos, counts)  # horizontal bars: one per tri-gram
plt.yticks(y_pos, x)
plt.title('Frequency Count of Top 20 Tri-Grams')
plt.ylabel('Frequent Words')
plt.xlabel('Count')
plt.savefig('Most Frequent 20 Tri-Grams.jpeg')  # save before show(), which clears the figure
plt.show()

# ## 4. Word Length Distribution Plot

# #### This plot shows word length on the x-axis against the number of words of that length on the y-axis. It helps visualise the mix of word lengths in the text corpus.

from nltk.probability import ConditionalFreqDist

# Use a single constant condition with word lengths as samples, so the plot
# shows word length on the x-axis and the number of words of that length on
# the y-axis
cfdist = ConditionalFreqDist(('word length', len(word)) for word in allwords)
cfdist.plot()
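# To read exact counts alongside the plot, the FreqDist under the single
# condition can be tabulated:
cfdist['word length'].tabulate()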

# ## t-SNE Corpus Visualization

# One way to visualise document similarity is t-distributed stochastic neighbor embedding (t-SNE)

from yellowbrick.text import TSNEVisualizer
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

# Note: `allwords` is a list of tokens here, so each word is treated as its
# own "document"; for a real corpus, pass a list of document strings instead
docs = tfidf.fit_transform(allwords)

# Create the visualizer, draw the vectors, and render the plot
tsne = TSNEVisualizer()
tsne.fit(docs)
tsne.show()
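# TSNEVisualizer can also colour the points by class. A hedged sketch,
# assuming a hypothetical `labels` list with one category string per document:
tsne = TSNEVisualizer()
tsne.fit(docs, labels)  # `labels` is illustrative, not defined above
tsne.show()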