Example #1
def test_plot(self):
    empty = ConditionalFreqDist()
    self.assertEqual(empty.conditions(), [])
    try:
        empty.plot(conditions="BUG")  # nonexistent keys shouldn't be added
    except Exception:
        pass
    self.assertEqual(empty.conditions(), [])
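
# Why this test matters: ConditionalFreqDist inherits from defaultdict, so
# merely indexing a missing condition silently creates it. A minimal sketch
# of the behaviour plot() must avoid (illustrative, not part of the test):
from nltk import ConditionalFreqDist

cfd = ConditionalFreqDist()
print(cfd.conditions())  # []
cfd['oops']              # indexing a missing key adds a new, empty condition
print(cfd.conditions())  # ['oops']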
Example #2
#%%
from nltk.corpus import inaugural
from nltk import ConditionalFreqDist
from nltk.probability import FreqDist

fd3 = FreqDist(inaugural.words())
print(fd3.freq('freedom'))

# Distribution of word lengths in each address from the 1980s through the 2000s
cfd = ConditionalFreqDist((fileid, len(w))
                          for fileid in inaugural.fileids()
                          for w in inaugural.words(fileid)
                          if '1980' < fileid < '2010')  # fileids begin with the year, e.g. '1981-Reagan.txt'

print(cfd.items())
cfd.plot()
# %%
import nltk
wnl = nltk.WordNetLemmatizer()

from nltk.corpus import PlaintextCorpusReader
from nltk import ConditionalFreqDist

corpus = PlaintextCorpusReader('C:/Data/Candidate_tweets/Processing_tweets/By_week_tweets/Cleaned_by_week/', '.*')
print(corpus.fileids()[0:3])
print(len(corpus.words()))

cfd = ConditionalFreqDist(
    (target, fileid)
    for fileid in corpus.fileids()
    for w in corpus.words(fileid)
    for target in ['obama', 'romney', 'opponent']
    if w == target)

cfd.plot()


cfd = ConditionalFreqDist(
    (target, fileid)
    for fileid in corpus.fileids()
    for w in corpus.words(fileid)
    for target in ['democrat', 'republican', 'independent']
    if w == target)

cfd.plot()
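
# Raw counts depend on how many tweets each weekly file contains, so trends
# can mislead. A hedged sketch (reusing the cfd built above): print relative
# frequencies per condition instead of absolute counts.
for target in cfd.conditions():
    total = cfd[target].N()  # total matches for this target
    for fileid in sorted(cfd[target]):
        print(target, fileid, cfd[target][fileid] / total)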


Example #4
#!/usr/bin/python
# coding: utf-8

# 2013/03/20

from nltk import ConditionalFreqDist

cfdist = ConditionalFreqDist(pairs)  # build frequency distributions from the (condition, sample) pairs in pairs
cfdist.conditions()                  # list of the conditions, sorted alphabetically
cfdist[condition]                    # the frequency distribution for the given condition
cfdist[condition][sample]            # frequency of the given sample under that condition
cfdist.tabulate()                    # tabulate the conditional frequency distribution
cfdist.tabulate(samples, conditions) # tabulate, limited to the given samples and conditions
cfdist.plot()                        # plot the conditional frequency distribution
cfdist.plot(samples, conditions)     # plot, limited to the given samples and conditions
cfdist1 < cfdist2                    # test if samples in cfdist1 occur less frequently than in cfdist2
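
# 'pairs' above is only a placeholder. A minimal runnable version of the same
# cheat sheet, with made-up toy data (the word/tag pairs are illustrative):
from nltk import ConditionalFreqDist

pairs = [('noun', 'dog'), ('noun', 'cat'), ('verb', 'run'), ('noun', 'dog')]
cfdist = ConditionalFreqDist(pairs)
print(cfdist.conditions())    # ['noun', 'verb'] (ordering may vary by NLTK version)
print(cfdist['noun']['dog'])  # 2
cfdist.tabulate()             # prints a condition-by-sample table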


Example #5
import itertools
import nltk

def how_many_words(letter_list, obligatory_letter, min_length=3):
    leng = min_length
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    full = []
    while leng != 0:
        # every combination of (leng - 1) letters, extended with the obligatory letter
        temp_let = [list(i) + [obligatory_letter] for i in itertools.combinations(letter_list, leng - 1)]
        temp_words = list(set([w for w in ["".join(j) for i in temp_let for j in itertools.permutations(i)] if w in english_vocab]))
        full = full + temp_words
        leng = leng + 1 if len(temp_words) != 0 else 0  # stop once no valid words of this length exist
    return full

wrds = how_many_words(['e', 'g', 'i', 'v', 'v', 'o', 'n', 'l'], 'r', min_length=3)

names = nltk.corpus.names
names.fileids()
[w for w in names.words('male.txt') if w in names.words('female.txt')]  # names used for both genders
cfd = nltk.ConditionalFreqDist([(fileid, name[-1]) for fileid in names.fileids() for name in names.words(fileid)])
cfd.plot()

entries = nltk.corpus.cmudict.entries()
len(entries)
for entry in entries[100:110]:
    print(entry)

for word, pron in entries:
    if len(pron) == 3:
        p1, p2, p3 = pron
        if p1 == 'P' and p3 == 'T':
            print(word, pron)
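
# A small sketch (not in the original) tying cmudict back to conditional
# frequency distributions: relate spelling length to pronunciation length.
cfd_len = nltk.ConditionalFreqDist((len(word), len(pron)) for word, pron in entries)
print(cfd_len[7].most_common(3))  # the most common phoneme counts for 7-letter words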

from nltk.corpus import swadesh
swadesh.fileids()
swadesh.words('en')
from nltk.corpus import toolbox
from nltk.corpus import udhr
##################################################################
## A simple ConditionalFreqDist application: text sentiment analysis
word = ['实惠', '快', '也好', '快', '也好']
anls = ['1', '1', '1', '-1', '1']
tmp_Con = ConditionalFreqDist(zip(word, anls))
print(tmp_Con)  # <ConditionalFreqDist with 3 conditions>; identical words are merged into one condition
tmp_Con.tabulate()  # tabulate() prints the table itself and returns None, so don't wrap it in print()
print(tmp_Con.conditions())  # ['实惠', '快', '也好']
print(tmp_Con['快'].most_common())  # [('1', 1), ('-1', 1)]
print(tmp_Con['快'].keys())  # dict_keys(['1', '-1'])
print(len(tmp_Con['快'].keys()))  # 2; how many distinct labels this word received
print(len(tmp_Con['也好'].keys()))  # 1; duplicate labels are already deduplicated, set()-style
print([condition for condition in tmp_Con.conditions() if len(tmp_Con[condition].keys()) > 1])  # ['快'] -- words that received more than one label
tmp_Con.plot()
tmp_Con_1 = ConditionalFreqDist(zip(anls, word))
print(tmp_Con_1.conditions())  # ['1', '-1'] -- with the pair order flipped, the labels become the conditions
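
# A natural next step (a sketch, not in the original): collapse each word's
# label counts into a single polarity score in [-1, 1].
for w in tmp_Con.conditions():
    fd = tmp_Con[w]
    score = (fd['1'] - fd['-1']) / fd.N()  # missing labels count as 0
    print(w, score)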
##################################################################
## Grouping Brown corpus words by genre
from nltk.corpus import brown

print(brown.categories())  # ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
cfd = nltk.ConditionalFreqDist((genre, word) for genre in brown.categories() for word in brown.words(categories=genre))  # the categories=genre keyword cannot be omitted here
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']  # picked from brown.categories()
modals = ['can', 'could', 'may', 'might', 'must', 'will']  # a handful of modal verbs
cfd.tabulate(conditions=genres, samples=modals)  # observe that the most frequent modal in the news genre is 'will', while in the romance genre it is 'could'
#                  can could  may might must will  # count of each word in each genre
#            news   93   86   66   38   50  389
#        religion   82   59   78   12   54   71
#         hobbies  268   58  131   22   83  264
# science_fiction   16   49    4   12    8   16
#         romance   74  193   11   51   45   43
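
# Genres differ in total size, so raw counts can mislead. A hedged sketch
# comparing proportions instead, via FreqDist.freq():
for genre in genres:
    print(genre, ['%s=%.5f' % (m, cfd[genre].freq(m)) for m in modals])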
Example #8
import nltk

A = nltk.corpus.brown.categories()
print(A)

B = nltk.corpus.brown.words(categories='adventure')
print(B)

# Use the state_union corpus reader to access the text of the State of the
# Union addresses. Count the occurrences of 'men', 'women', and 'people' in
# each document. How has the usage of these words changed over time?

from nltk.corpus import state_union
from nltk import ConditionalFreqDist
cfd = ConditionalFreqDist((target, fileid[:4])
                          for fileid in state_union.fileids()
                          for w in state_union.words(fileid)
                          for target in ['men', 'women', 'people']
                          if w.lower() == target)  # without this test, every word would be counted once per target
cfd.plot()
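
# Before plotting, it can help to spot-check a few raw counts. A sketch (the
# sampled years are arbitrary picks, assuming those addresses are in the corpus):
cfd.tabulate(conditions=['men', 'women', 'people'],
             samples=['1945', '1965', '1985', '2005'])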

# Define a conditional frequency distribution over the names corpus that shows
# which initial letters are more common for male names than for female names.

from nltk.corpus import names
cfd1 = ConditionalFreqDist((fileid, w.lower()[0])
                           for fileid in names.fileids()
                           for w in names.words(fileid))

cfd1.plot()

## Write a program to find all words that occur at least 3 times in the Brown corpus.
from nltk.corpus import brown
from nltk import FreqDist
words = brown.words()
fd = FreqDist(w.lower() for w in words)
frequent = [w for w in fd if fd[w] >= 3]  # every word with frequency >= 3
print(len(frequent))
Example #9
from nltk.corpus import names
from nltk import ConditionalFreqDist as CondFreqDist

# gender file -> distribution of name-initial letters
g2n = CondFreqDist([(gender, name[0]) for gender in names.fileids() for name in names.words(gender)])
# name-initial letter -> distribution of gender files
n2g = CondFreqDist([(name[0], gender) for gender in names.fileids() for name in names.words(gender)])
g2n.plot()
n2g.plot()
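
# The plots are dense; a quick numeric probe (a sketch) answers pointed
# questions directly, e.g. which file contributes more names starting with 'A':
print(n2g['A'].most_common())  # e.g. [('female.txt', ...), ('male.txt', ...)]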

Example #10
def test_plot(self):
    empty = ConditionalFreqDist()
    self.assertEqual(empty.conditions(), [])
    empty.plot(conditions=["BUG"])  # nonexistent keys shouldn't be added
    self.assertEqual(empty.conditions(), [])