Example #1
def word_fdist(inaug_list):
	
	fixedspeech=map(str.lower, inaugural.words(inaug_list))		#Applies lower to every element of the list
	fixedspeech=filter(checkwords, fixedspeech)			#Filters false words using helper function checkwords	
	fdist = FreqDist(fixedspeech)					#Builds a frequency distribution
	
	return fdist							#Returns it
Example #2
def main():
    # gutenberg
    gu_words = gutenberg.words()
    gu_words_exclude_stops = exclude_stopwords(gu_words)
    gu_fd1 = get_frequency_distribution(gu_words)
    gu_fd2 = get_frequency_distribution(gu_words_exclude_stops)

    pylab.plot(gu_fd1, color='red')
    pylab.plot(gu_fd2, color='orange')

    # inaugural
    in_words = inaugural.words()
    in_words_exclude_stops = exclude_stopwords(in_words)
    in_fd1 = get_frequency_distribution(in_words)
    in_fd2 = get_frequency_distribution(in_words_exclude_stops)

    pylab.plot(in_fd1, color='black')
    pylab.plot(in_fd2, color='gray')

    # reuters
    yen_words = reuters.words(categories='yen')
    yen_words_exclude_stops = exclude_stopwords(yen_words)
    yen_fd1 = get_frequency_distribution(yen_words)
    yen_fd2 = get_frequency_distribution(yen_words_exclude_stops)

    pylab.plot(yen_fd1, color='blue')
    pylab.plot(yen_fd2, color='green')

    pylab.xscale('log')
    pylab.yscale('log')
    pylab.show()
Example #3
def main():

	cfd = nltk.ConditionalFreqDist(
			 (target, fileid[:4])
			 for fileid in inaugural.fileids()
			 for w in inaugural.words(fileid)
			 for target in ['democracy', 'republic']
			 if w.lower().startswith(target))
	cfd.plot()
Example #4
def print_inaugural():
    from nltk.corpus import inaugural
    cfd=nltk.ConditionalFreqDist(
        (target, fileid[:4])
        for fileid in inaugural.fileids()
        for w in inaugural.words(fileid)
        for target in ['america','citizen']
        if w.lower().startswith(target)
    )
    cfd.plot()
Example #5
def build_inaugural_corpus():
    """
    Get a word token list for each doc in the inaugural address corpus
    :return: word_lists
    """
    word_lists = []
    for fileid in inaugural.fileids():
        words = [w for w in inaugural.words(fileid)]
        word_lists.append(words)
    return word_lists
Example #6
def inaugural_cfd():  # renamed so the function does not shadow the imported inaugural corpus

    inaugural.fileids()
    [fileid[:4] for fileid in inaugural.fileids()]

    cfd = nltk.ConditionalFreqDist(
            (target, fileid[:4])
            for fileid in inaugural.fileids()
            for w in inaugural.words(fileid)
            for target in ['america', 'citizen']
            if w.lower().startswith(target))
    cfd.plot()
Example #7
def sent_length_fdist(inaug_list):
	fixedspeech=filter(elimpunct, inaugural.words(inaug_list))
	count = 0
	listcount = []
	for x in range(len(fixedspeech)):
		if fixedspeech[x]=='.':
			listcount.append(count-1)
			count=0
		if fixedspeech[x]=='!':
			listcount.append(count-1)
			count=0
		if fixedspeech[x]=='?':
			listcount.append(count-1)
			count=0
		count+=1
	fdlist = FreqDist( listcount)
	return fdlist
Example #8
def tabulate():

    cfd = nltk.ConditionalFreqDist(
            (target, fileid[:4])
            for fileid in inaugural.fileids()
            for w in inaugural.words(fileid)
            for target in ['america', 'citizen']
            if w.lower().startswith(target))

    languages = ['Chickasaw', 'English', 'German_Deutsch',
            'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']

    cfd = nltk.ConditionalFreqDist(
            (lang, len(word))
            for lang in languages
            for word in udhr.words(lang + '-Latin1'))

    cfd.tabulate(conditions=['English', 'German_Deutsch'],
            samples=range(10), cumulative=True)
Example #9
def build_cond_fdist():

	cfdist = ConditionalFreqDist()					#Create conditionalFrequency
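	# Note: FreqDist.inc() below is the old NLTK 2 API; under NLTK 3 the
	# equivalent update would be cfdist[(pronoun, period)][sample] += 1.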

	for inaug_list in inaug20():					#Go through inaug_list
		period = int(inaug_list[0][0:4])			#Set the period

		for fileid in inaug_list:    				#For each fileid in this group of addresses
			words = inaugural.words(fileid)			

			for i in range(len(words)):			#Check all of the words
				pronoun = words[i]			

				if pronoun in ['I', 'my']:		#Count the word that follows 'I' or 'my'
					cfdist[(pronoun, period)].inc(words[i+1])

				elif pronoun == 'me':                	#Count the word that precedes 'me'
					cfdist[(pronoun, period)].inc(words[i-1])    

	return cfdist
Example #10
def main():
  # store word lengths
  brown_word_lens = []
  web_word_lens = []
  inaugural_word_lens = []
  gutenberg_word_lens = []
  genesis_word_lens = []

  for file in gutenberg.fileids():
    for word in gutenberg.words(file):
      gutenberg_word_lens.append(len(word))

  for file in brown.fileids():
    for word in brown.words(file):
      brown_word_lens.append(len(word))

  for file in webtext.fileids():
    for word in webtext.words(file):
      web_word_lens.append(len(word))

  for file in inaugural.fileids():
    for word in inaugural.words(file):
      inaugural_word_lens.append(len(word))

  for file in genesis.fileids():
    for word in genesis.words(file):
      genesis_word_lens.append(len(word))
  with open("wordlens.txt", 'w') as f:
    sys.stdout = f
    f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
    for i in xrange(max(len(genesis_word_lens), len(inaugural_word_lens), 
        len(web_word_lens), len(brown_word_lens),
        len(gutenberg_word_lens))):
      for corpus in [genesis_word_lens, inaugural_word_lens,
          web_word_lens, brown_word_lens, gutenberg_word_lens]:
        if(i >= len(corpus)):
          f.write(",")
        else:
          f.write(str(corpus[i]) + ",")
      f.write("\n")
Example #11
def sent_length_fdist(inaug_list):

	fixedspeech=filter(elimpunct, inaugural.words(inaug_list))	#Eliminates the punctuation
	count = 0							#Counter to run alongside for loop
	listcount = []							#List of sentences lengths

	for x in range(len(fixedspeech)):

		if fixedspeech[x]=='.':				
			listcount.append(count-1)			#Appends the counter-1 (for the period) to the list for the sentence length
			count=0						#Resets counter for next sentence

		if fixedspeech[x]=='!':
			listcount.append(count-1)			#Appends the counter-1 (for the exclamation mark) to the list for the sentence length
			count=0		

		if fixedspeech[x]=='?':
			listcount.append(count-1)			#Appends the counter-1 (for the question mark) to the list for the sentence length
			count=0
		count+=1

	fdlist = FreqDist( listcount)					#Makes a distribution

	return fdlist
Example #12
print fd2.B()
print fd2.N()

fd2.tabulate(20) # tabulate the 20 most frequent words in the whole book

import matplotlib.pyplot as plt

#fd2.plot(20)
#fd2.plot(20, cumulative=True)

"""
美国总统就职演说预料库

"""
from nltk.corpus import inaugural
import nltk

# nltk.download()  # download the inaugural corpus first

fd3 = FreqDist([s for s in inaugural.words()])
print fd3.freq('freedom') # the relative frequency of 'freedom' in the whole corpus

# Word-usage habits over time
cfd = ConditionalFreqDist(  # conditional frequency distribution
	(fileid, len(w))
	for fileid in inaugural.fileids()
	for w in inaugural.words(fileid)
	if fileid > '1960'
	)
print cfd.items()[:40]
cfd.plot()
Example #13
def number_of_word_types(fileid):
    words = inaugural.words(fileid)
    unique_words = _vocabulary(words)
    num_word_types = len(unique_words)
    return num_word_types
Example #14
import nltk
from nltk.corpus import inaugural
from nltk.corpus import wordnet
import random
import re
import math

# Question 23, part (a)
# Load the data
inaugural_words = inaugural.words()
# Create a dictionary (an empty frequency distribution)
a = nltk.FreqDist()
# Iterate over the list, converting every word to lower case
fd = nltk.FreqDist([w.lower() for w in inaugural_words])
# Iterate over the distribution and record log10 word frequencies
for key in fd:
    t = math.log10(fd[key])
    a[key] = t

fd2 = dict(fd)
# Sort the dict by frequency and convert it to a list
voc = sorted(fd2.items(), key=lambda item: item[1], reverse=True)
# Compute the ratio between the 50th and 150th most frequent words
result = voc[49][1] / voc[149][1]
print(voc[49][0], ' ', voc[149][0])
print("r_a=" + str(result))
# The first entry and the 150th entry
print(voc[0], " ", voc[149])
# Plot the top 150 words
a.plot(150)
Example #15
# 	Run your file every time something new is added so you can see how it works.
# 	There is a compulsory exercise for Task 1 that needs to be completed at the bottom of your corpuses.py

# === Part 1: Importing Corpuses ===

import nltk
from nltk.corpus import inaugural

print inaugural.fileids()

# Run your file. You should see all the text files containing all the speeches of the US presidents that the
# NLTK has saved inside it.
# Now add the lines:

print "=============Words in Obama's Speech ======"
print inaugural.words("2009-Obama.txt")  # Returns a list of all the words in Obama's speech
print "=============Words in Bush's speech ======"
print inaugural.sents("2005-Bush.txt")  # Returns a list of all the sentences in Bush's speech

# As you can see, the words of Obama's speech are printed in a list, as are the sentences of Bush's speech.

# Try adding code to your program to find and print out the first 25 words of Obama's 2009 speech; one possible solution is sketched below.
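# A minimal sketch of that exercise (reusing the inaugural corpus imported above):
print inaugural.words("2009-Obama.txt")[:25]  # Slices out the first 25 tokens of Obama's 2009 speech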

# ===  Part 2: Analysing tokens (words) of a text ===

# The term 'token' means a word or a punctuation mark.
# After you've done that, add the following lines to your program

from nltk.book import *

# This may take a while to load. NLTK has many texts stored in it!
Example #16
        nCatgs[n] = len(reuters.categories(name))
        catgs[n] = ','.join(reuters.categories(name))
        texts[n] = ' '.join(reuters.words(name))

# trimming articles without categories
toTrim = np.invert(np.equal(catgs, None))
catgs = catgs[toTrim]
texts = texts[toTrim]
nCatgs = nCatgs[toTrim]
outNames = outNames[toTrim]

for n in range(len(outNames)):
    with open('./reuters/' + outNames[n] + '.txt', 'w') as f:
        f.writelines('\n'.join(textwrap.wrap(texts[n], 80)))

out = np.vstack((outNames, catgs)).T
out = out[np.argsort(out[:, 0])]
np.savetxt('reuters_catgs.csv', out, fmt='%s', delimiter=',')

## Save inaugural addresses
#nltk.download('inaugural')
from nltk.corpus import inaugural as inaug
adds = inaug.fileids()

texts = np.empty(len(adds), dtype=object)  # pre-allocate
for n, name in enumerate(adds):
    texts[n] = ' '.join(inaug.words(name))
    with open('./inaugural/' + name, 'w') as f:
        tmp = textwrap.wrap(texts[n], 80)
        f.writelines('\n'.join(tmp).encode('ascii', 'ignore'))
Example #17
from nltk.book import *
import nltk

print(text1.vocab())
print(type(text1))
print(len(text1))

from nltk.corpus import gutenberg
print(gutenberg.fileids())
print(nltk.corpus.gutenberg.fileids())
hamlet = gutenberg.words('shakespeare-hamlet.txt')

from nltk.corpus import inaugural
print(inaugural.fileids())
print(nltk.corpus.inaugural.fileids())
from nltk.text import Text
former_president = Text(inaugural.words(inaugural.fileids()[-1]))
print(' '.join(former_president.tokens[0:1000]))

Example #18
import nltk

from nltk.corpus import inaugural
from nltk.util import ngrams

obama_words = inaugural.words("2009-Obama.txt")
george_words = inaugural.words("1789-Washington.txt")
fd_george_words = nltk.FreqDist(w.lower() for w in george_words)
fd_obama_words = nltk.FreqDist(w.lower() for w in obama_words)

#fd_obama_words.plot(50)
print(fd_obama_words.most_common(50))
print(fd_george_words.most_common(50))

obama = [x[0] for x in fd_obama_words.most_common(50)]
george = [x[0] for x in fd_george_words.most_common(50)]

print(list(set(obama) & set(george)))
Example #19
from nltk.corpus import inaugural
from nltk import FreqDist
#nltk.download('stopwords')
from nltk.tokenize import regexp_tokenize
import re  # needed for re.finditer below
print("-------WARM UP---------")
print("------TASK 1---------")
#using inaugural fileids to list all the documents
documents = inaugural.fileids()
print(
    "Using the corpus reader class list all the documents in inaugural corpus :"
)
print(documents)
print("---------------------------------------------------------------------")
print("Find the total number of words in Clinton’s 1993 speech :")
#using the .words method to count the words in Clinton's speech
clintonwords = (inaugural.words('1993-Clinton.txt'))

print(len(clintonwords))
#.raw method will read the text in raw form
s = inaugural.raw('1789-Washington.txt')
w = set(m.group(0) for m in re.finditer(r"\w+", s))
#print (len(re.findall('\w+', s)))
print("Find the total number of distinct words in the same speech :")
#now we will find length of distinct words
print(len(w))


# average function to calculate average word length
def average(numbers):
    return sum(numbers) / len(numbers)
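
# A minimal usage sketch (reusing the clintonwords list defined above):
print("Find the average word length in Clinton's 1993 speech :")
clinton_word_lengths = [len(w) for w in clintonwords if w.isalpha()]
print(average(clinton_word_lengths))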
Example #20
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 20 08:22:33 2018

@author: jacobjohn

//https://www.jasondavies.com/wordcloud/

"""
import nltk
import re
from nltk.corpus import inaugural

Obama = inaugural.words(fileids='2009-Obama.txt')

#declare a dictionary
word_freq = {}
for tok in Obama:
    if tok in word_freq:
        word_freq[tok] += 1
    else:
        word_freq[tok] = 1

max_dict = {}

while len(max_dict) < 5:
    max_val = 0
    for key in word_freq:
        if max_val < word_freq[key] and re.match(r'[A-Za-z]+',
                                                 key) and key not in max_dict:
Example #21
"""
Created on Sun Dec 24 11:00:43 2017

@author: Mohnish_Devadiga
"""

import nltk
from nltk.corpus import inaugural
import pandas as pd
import matplotlib

inaugural.fileids()

#print(inaugural.fileids())

for speech in inaugural.fileids():
    word_count_total = len(inaugural.words(speech))
    print(speech, word_count_total)

#Go through all speech
speech_length = [(len(inaugural.words(speech)), speech)
                 for speech in inaugural.fileids()]

print(speech_length)

#Get the max and min speech
print("Max is : ", max(speech_length))
print("Min is : ", min(speech_length))

#Avg no of words per sentence for each speech
for speech in inaugural.fileids():
    word_total = len(inaugural.words(speech))
Example #22
def conlisttodic(lst):
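	# Converts a list of (key, value) pairs into a dict; the inner loop runs only once (j == 0),
	# so each pair contributes dct[lst[i][0]] = lst[i][1].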
	dct= dict()
	for i in range(0, len(lst)):
		for j in range(0,1):
			dct.update({lst[i][j]:lst[i][j+1]})
	return dct
    
def makeStopWords():
	sw = stopwords.words('english')
	for i in wordsStop :
		sw.append(i)
	return sw
stopWord = makeStopWords()
for fileID in inaugural.fileids()[-12:]:
	wordList=list()
	for word in inaugural.words(fileID):
		word = word.lower()
		if word.isalpha() and word not in stopWord:
			wordList.append(word)
	speech[fileID] = nltk.FreqDist(wordList)
# 			print(type(speech))

for i,k in speech.items() :
	print(i,k)
	nameyear.append(i)
	worddict[countloop] = k
	countloop = countloop+1
intersectionn= worddict[0] & worddict[1] & worddict[2] &worddict[3] & worddict[4] & worddict[5] &worddict [6] & worddict[7] & worddict[8] & worddict[9] & worddict[10] & worddict[11] 
intersectionnsort = sorted(intersectionn)
for i in range(len(nameyear)):
	for j in intersectionnsort:
# In[4]:

print(brown.categories())

# In[5]:

brown.words(categories='romance')

# In[6]:

from nltk.corpus import inaugural

# In[7]:

inaugural.fileids()

# In[8]:

inaugural.words(fileids='1989-Bush.txt')

# In[9]:

inaugural.words(fileids='1989-Bush.txt')[:50]

# In[10]:

from nltk.tokenize import TweetTokenizer
text = "Mexico is paying (indirectly) for the Wall through the new USMCA, the replacement for NAFTA! Far more money coming to the U.S. Because of the tremendous dangers at the Border, including large scale criminal and drug inflow, the United States Military will build the Wall!"
twt = TweetTokenizer()
print(twt.tokenize(text))
from nltk.corpus import inaugural, stopwords
# from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora, models, similarities
import re
import string

filenames = inaugural.fileids()
# lmtzr = WordNetLemmatizer()
filtered_speeches = []

def removeNonAscii(s): return "".join(i for i in s if ord(i)<128)

for filename in filenames:
	print filename
	print "Reading in raw words..."
	raw_words = inaugural.words(filename)
	print "Removing stop words..."
	filtered_words = [word for word in raw_words if not word in stopwords.words('english')]
	print "Removing punctuation..."
	filtered_words = [word.strip(string.punctuation) for word in filtered_words]
	filtered_words = [word.lower() for word in filtered_words if word != ""]
	tokens_once = set(word for word in set(filtered_words) if filtered_words.count(word) == 1)
	filtered_words = [removeNonAscii(word) for word in filtered_words if word not in tokens_once]
	print "Appending filtered words..."
	filtered_speeches.append(filtered_words)

print "making numbered corpus..."
dictionary = corpora.Dictionary(filtered_speeches)
corpus = [dictionary.doc2bow(text) for text in filtered_speeches]
tfidf = models.TfidfModel(corpus)
Example #25
def main():
  # store word lengths
  brown_common_freq = []
  web_common_freq = []
  inaugural_common_freq = []
  gutenberg_common_freq = []
  genesis_common_freq = []

  common = ["the", "be", "to", "of", "and", "a", "in", "that", "have",
            "i", "it", "for", "not", "on", "with", "he", "as", "you",
            "do", "at", "this", "but", "his", "by", "from", "they",
            "we", "say", "her", "she", "or", "an", "will", "my", "one",
            "all", "would", "there", "their", "what", "so", "up", "out",
            "if", "about", "who", "get", "which", "go", "me", "when",
            "make", "can", "like", "time", "no", "just", "him", "know",
            "take", "people", "into", "year", "your", "good", "some",
            "could", "them", "see", "other", "than", "then", "now", "look",
            "only", "come", "its", "over", "think", "also", "back", "after",
            "use", "two", "how", "our", "work", "first", "well", "way",
            "even", "new", "want", "because", "any", "these", "give", "day",
            "most", "us"]
  common.sort()

  for file in gutenberg.fileids():
    total_words = len(gutenberg.words(file))
    total_common = 0
    for word in gutenberg.words(file):
      if word.lower() in common:
        total_common += 1
    gutenberg_common_freq.append(float(total_common)/total_words)

  for file in brown.fileids():
    total_words = len(brown.words(file))
    total_common = 0
    for word in brown.words(file):
      if word.lower() in common:
        total_common += 1
    brown_common_freq.append(float(total_common)/total_words)

  for file in webtext.fileids():
    total_words = len(webtext.words(file))
    total_common = 0
    for word in webtext.words(file):
      if word.lower() in common:
        total_common += 1
    web_common_freq.append(float(total_common)/total_words)

  for file in inaugural.fileids():
    total_words = len(inaugural.words(file))
    total_common = 0
    for word in inaugural.words(file):
      if word.lower() in common:
        total_common += 1
    inaugural_common_freq.append(float(total_common)/total_words)

  for file in genesis.fileids():
    total_words = len(genesis.words(file))
    total_common = 0
    for word in genesis.words(file):
      if word.lower() in common:
        total_common += 1
    genesis_common_freq.append(float(total_common)/total_words)

  with open("common-words.txt", 'w') as f:
    sys.stdout = f
    f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
    for i in xrange(max(len(genesis_common_freq), len(inaugural_common_freq),
                        len(web_common_freq), len(brown_common_freq),
                        len(gutenberg_common_freq))):
      for corpus in [genesis_common_freq, inaugural_common_freq,
                     web_common_freq, brown_common_freq, gutenberg_common_freq]:
        if i >= len(corpus):
          f.write(",")
        else:
          f.write(str(round(corpus[i], 5)) + ",")
      f.write("\n")
Example #26
def main():
  # store one letter-frequency FreqDist per corpus
  samples = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

  brown_letters = FreqDist()
  web_letters = FreqDist()
  inaugural_letters = FreqDist()
  gutenberg_letters = FreqDist()
  genesis_letters = FreqDist()

  for file in gutenberg.fileids():
    for word in gutenberg.words(file):
      for character in word:
        if(character in string.letters):
            gutenberg_letters[character.upper()] += 1

  for file in brown.fileids():
    for word in brown.words(file):
      for character in word:
        if(character in string.letters):
            brown_letters[character.upper()] += 1

  for file in webtext.fileids():
    for word in webtext.words(file):
      for character in word:
        if(character in string.letters):
            web_letters[character.upper()] += 1

  for file in inaugural.fileids():
    for word in inaugural.words(file):
      for character in word:
        if(character in string.letters):
            inaugural_letters[character.upper()] += 1

  for file in genesis.fileids():
    for word in genesis.words(file):
      for character in word:
        if(character in string.letters):
            genesis_letters[character.upper()] += 1

  with open("genesis-letter-freq.txt",'w') as f:
    sys.stdout = f
    f.write("GENESIS\n")
    for let in samples:
        print(str(genesis_letters[let]))
  
  with open("gutenberg-letter-freq.txt", 'w') as f:
    sys.stdout = f
    f.write("GUTENBERG\n")
    for let in samples:
        print(str(gutenberg_letters[let]))
  with open("webtext-letter-freq.txt", 'w') as f:
    sys.stdout = f
    f.write("WEBTEXT\n")
    for let in samples:
        print(str(web_letters[let]))
  with open("inaugural-letter-freq.txt", 'w') as f:
    sys.stdout = f

    f.write("INAUGURAL\n")
    for let in samples:
        print(str(inaugural_letters[let]))
  with open("brown-letter-freq.txt", 'w') as f:
    sys.stdout = f

    f.write("BROWN\n")
    for let in samples:
        print(str(brown_letters[let]))
  
  with open("letter-freq.txt", 'w') as f:
    corpora = [gutenberg_letters, web_letters, inaugural_letters,
        brown_letters, genesis_letters]
    f.write("GUTENBERG,WEBTEXT,INAUGURAL,BROWN,GENESIS\n")
    for let in samples:
      for corpus in corpora:
        f.write(str(corpus[let]) + ",")
      f.write("\n")
Example #27
print(md[:8])
print("Length of book {}".format(len(md)))
print("Boat: {}".format(md.count('boat')))

md_set = set(md)
print("Unique by set: {}".format(len(md_set)))

print("Average by words: {}".format(len(md) / len(md_set)))

md_sents = nltk.corpus.gutenberg.sents("melville-moby_dick.txt")
print("Average by words per sentence: {}".format(len(md) / len(md_sents)))

print("inaugral Ids:\n{}".format(inaugural.fileids()))

for speech in inaugural.fileids():
    words_total = len(inaugural.words(speech))
    print("Speech: {0} has total words: {1}".format(speech, words_total))

speech_len = [(len(inaugural.words(speech)), speech)
              for speech in inaugural.fileids()]
print("Biggest Speech: {}".format(max(speech_len)))
print("shortest Speech: {}".format(min(speech_len)))

for speech in inaugural.fileids():
    words_total = len(inaugural.words(speech))
    sentence_total = len(inaugural.sents(speech))
    print("Sentence average: {}".format(words_total / sentence_total))

data = pd.DataFrame([
    int(speech[:4]),
    len(inaugural.words(speech)) / len(inaugural.sents(speech))
Example #28
# In[7]:


#INAUGURAL CORPUS


# In[8]:


from nltk.corpus import inaugural


# In[9]:


inaugural.fileids()


# In[10]:


inaugural.words(fileids = '2009-Obama.txt')


# In[11]:


inaugural.words(fileids = '2009-Obama.txt')[:23]

Example #29
def word_fdist(inaug_list):
	fixedspeech=map(str.lower, inaugural.words(inaug_list))
	fixedspeech=filter(checkwords, fixedspeech)
	fdist = FreqDist(fixedspeech)	
	return fdist
Example #30
from nltk.corpus import inaugural
import operator  # needed for operator.itemgetter below

import matplotlib.pyplot as plt
x=inaugural.words('2009-Obama.txt')
l={}
new=[]
k={}
z=set(x)

for word in z:
 
    l[word]=x.count(word)
#print(l)
from nltk.stem import PorterStemmer
ps=PorterStemmer()
for words in x:
    new.append(ps.stem(words))
p=set(new)
for w in p:
    k[w]=new.count(w)
plt.plot(k.values())
#plt.xlabel(k.keys())
k_sorted = sorted(k.items(), key=operator.itemgetter(1),reverse=True)
for word,count in k.items():
    if(count==max(k.values())):
        print(word)

print(k_sorted[0])
Example #31
def chi_square(word_one, word_two, corpus):
    word_list = []
    #Import the necessary corpus
    if corpus == "brown":
        from nltk.corpus import brown
        word_list = brown.words()
    elif corpus == "reuters":
        from nltk.corpus import reuters
        word_list = reuters.words()
    elif corpus == "gutenberg":
        from nltk.corpus import gutenberg
        word_list = gutenberg.words()
    elif corpus == "webtext":
        from nltk.corpus import webtext
        word_list = webtext.words()
    elif corpus == "inaugural":
        from nltk.corpus import inaugural
        word_list = inaugural.words()
    #Get the frequencies of each word
    w1 = word_list.count(word_one)
    w2 = word_list.count(word_two)
    #Get the frequencies of the word as a collocation
    bigrams = nltk.bigrams(word_list)
    freq_dist = nltk.FreqDist(bigrams)
    w1w2 = 0
    w1andnotw2 = 0
    notw1andw2 = 0
    notw1andnotw2 = 0
    total_words = len(word_list)
    for k, v in freq_dist:
        if k == word_one and v == word_two:
            w1w2 = w1w2 + 1
        elif k == word_one and v != word_two:
            w1andnotw2 = w1andnotw2 + 1
        elif k != word_one and v == word_two:
            notw1andw2 = notw1andw2 + 1
    notw1andnotw2 = notw1andw2 + w1andnotw2
    totalw1andw2 = w1w2 + w1andnotw2 + notw1andw2 + notw1andnotw2
    first_row = w1w2 + w1andnotw2
    second_row = notw1andw2 + notw1andnotw2
    first_col = w1w2 + notw1andw2
    second_col = w1andnotw2 + notw1andnotw2
    #Calculate chi-square value
    #Null hypothesis is that there is no collocation between the two words (no relationship)
    #Estimated value for each cell
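    # Background: under independence, the expected count for each cell is
    # E = (row total * column total) / grand total, and X^2 sums (observed - expected)^2 / expected over the cells.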
    value_one = (first_row * first_col) / totalw1andw2
    value_two = (second_row * first_col) / totalw1andw2
    value_three = (first_row * second_col) / totalw1andw2
    value_four = (second_row * second_col) / totalw1andw2
    x2 = ((w1w2 - value_one)**2) / value_one
    x2 = x2 + ((w1andnotw2 - value_three)**2) / value_three
    x2 = x2 + ((notw1andnotw2 - value_four)**2) / value_four
    x2 = x2 + ((notw1andw2 - value_two)**2) / value_two
    #Print out the results
    print("C(w1): ", w1)
    print("C(w2): ", w2)
    print("C(w1w2): ", w1w2)
    print("C(w1 && !w2)", w1andnotw2)
    print("C(!w1 && w2)", notw1andw2)
    print("C(!w1 && !w2)", notw1andnotw2)
    print("Total Words: ", total_words)
    print("")
    print("0.05% Baseline: 3.841")
    print("X^2:", x2)
    collocation = None
    degrees_of_freedom = 1
    if x2 <= 3.841:
        collocation = True
    else:
        collocation = False
    print("Do we have a collocation?", collocation)
Example #32
import nltk

#Each corpus is accessed by means of a "corpus reader" object from nltk.corpus
print(str(nltk.corpus.brown).replace('\\\\', '/'))
# The Penn Treebank Corpus:
print(str(nltk.corpus.treebank).replace('\\\\', '/'))
# The Name Genders Corpus:
print(str(nltk.corpus.names).replace('\\\\', '/'))
# The Inaugural Address Corpus:
print(str(nltk.corpus.inaugural).replace('\\\\', '/'))
print(str(nltk.corpus.treebank.fileids()))  # doctest: +ELLIPSIS
#print(str(nltk.corpus.inaugural.fileids()) # doctest: +ELLIPSIS
# Each corpus reader provides a variety of methods to read data from the corpus, depending on the format of the corpus.

from nltk.corpus import inaugural
print(inaugural.raw('1789-Washington.txt'))  # doctest: +ELLIPSIS
print(inaugural.words('1789-Washington.txt'))
print(inaugural.sents('1789-Washington.txt'))  # doctest: +ELLIPSIS
print(inaugural.paras(
    '1789-Washington.txt'))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE

#

l1 = len(inaugural.words('1789-Washington.txt'))
l2 = len(inaugural.words('1793-Washington.txt'))
l3 = len(inaugural.words(['1789-Washington.txt', '1793-Washington.txt']))
print('%s+%s == %s' % (l1, l2, l3))

print(len(inaugural.words()))

print(inaugural.readme())
Example #33
# In[13]:

from nltk.corpus import inaugural

# In[15]:

inaugural.fileids()

# In[16]:

len(inaugural.fileids())

# In[19]:

inaugural.words(fileids='1861-Lincoln.txt')[:20]

# In[20]:

print(len(inaugural.words(fileids='1861-Lincoln.txt')))

# In[24]:

inaugural.words(fileids='2009-Obama.txt')[:5]

# In[22]:

print(len(inaugural.words(fileids='2009-Obama.txt')))

# In[26]:
Example #34
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# import nltk
# nltk.download('inaugural')
import os


from nltk.corpus import inaugural


corpus_from_paragraphs = inaugural.paras(os.path.dirname(__file__) + '/dataset/paragraphs.txt')
corpus_from_sentences = inaugural.sents(os.path.dirname(__file__) + '/dataset/sentences.txt')
corpus_from_words = inaugural.words(os.path.dirname(__file__) + '/dataset/words.txt')

l1 = len(corpus_from_paragraphs)
l2 = len(corpus_from_sentences)
l3 = len(corpus_from_words)
# l2 = 0
# l3 = 0
print('paragraphs: %s, sentences: %s, words: %s' % (l1, l2, l3))

# print(inaugural.readme())
Example #35
# ## INAUGURAL CORPUS

# In[10]:

from nltk.corpus import inaugural

# In[11]:

inaugural.fileids()

# ### LINCOLN

# In[15]:

inaugural.words(fileids='1861-Lincoln.txt')

# In[16]:

inaugural.words(fileids='1861-Lincoln.txt')[:5]

# ### OBAMA

# In[18]:

inaugural.words(fileids='2009-Obama.txt')

# In[21]:

inaugural.words(fileids='2009-Obama.txt')[:20]
Example #36
import nltk
from nltk.corpus import reuters

reuters.fileids()
reuters.categories(['training/9865', 'training/8666'])
reuters.fileids(['barley', 'corn'])
reuters.words('training/9865')[:14]
reuters.words(categories=['corn', 'barley'])

from nltk.corpus import inaugural

inaugural.fileids()
inaugYears = [fileid[:4] for fileid in inaugural.fileids()]

cfd = nltk.ConditionalFreqDist((target, fileid[:4])
                               for fileid in inaugural.fileids()
                               for w in inaugural.words(fileid)
                               for target in ['america', 'citizen']
                               if w.lower().startswith(target))
cfd.plot()

from nltk.corpus import udhr

languages = [
    'English', 'Finnish_Suomi', 'Italian_Italiano', 'Greenlandic_Inuktikut'
]
cfd = nltk.ConditionalFreqDist((lang, len(word)) for lang in languages
                               for word in udhr.words(lang + '-Latin1'))
cfd.plot(cumulative=False, title='Declaration of Human Rights')

from nltk.corpus import PlaintextCorpusReader
Example #37
#You must work in your Dropbox folder so we can see your progress.
#Run your file every time something new is added so you can see how it works.
#There is a compulsory exercise for Task 1 that needs to be completed at the bottom of your corpuses.py

# === Part 1: Importing Corpuses ===

import nltk
from nltk.corpus import inaugural
print inaugural.fileids()

#Run your file. You should see all the text files containing all the speeches of the US presidents that the
#NLTK has saved inside it.
#Now add the lines:

print "=============Words in Obama's Speech ======"
print inaugural.words(
    '2009-Obama.txt')  #Returns a list of all the words in Obama's speech
print "=============Words in Bush's speech ======"
print inaugural.sents(
    '2005-Bush.txt')  #Returns a list of all the sentences in Bush's speech

#As you can see, the words of Obama's speech are printed in a list, as are the sentences of Bush's speech.

#Try adding code to your program to find and print out the first 25 words of Obama's 2009 speech; a sketch follows below.
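#A minimal sketch of that exercise (reusing the inaugural import above):
first_25 = inaugural.words('2009-Obama.txt')[:25]
print " ".join(first_25)  #Joins the first 25 tokens of Obama's 2009 speech into a single line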

# ===  Part 2: Analysing tokens (words) of a text ===

#The term 'token' means a word or a punctuation mark.
#After you've done that, add the following lines to your program

from nltk.book import *
Example #38
def cfd(text, tgt_list):
    from nltk.corpus import inaugural
    cfd = nltk.ConditionalFreqDist(
        (target, fileid[:4])
        for fileid in inaugural.fileids()
        for w in inaugural.words(fileid)
        for target in tgt_list
        if w.lower().startswith(target))
    #cfd.plot()
    return cfd

    
Example #39
#############
#Corpus data#
#############

# Inaugural Address Corpus

import nltk
from nltk.corpus import inaugural
inaugural.fileids()[:2]
[fileid[:4] for fileid in inaugural.fileids()]

#How the words America and war are used over time.

cfd = nltk.ConditionalFreqDist((target, fileid[:4])
                               for fileid in inaugural.fileids()
                               for w in inaugural.words(fileid)
                               for target in ['america', 'war']
                               if w.lower().startswith(target))
cfd.plot()
#cfd.tabulate()

from nltk.corpus import brown
news_words = brown.words(categories="news")
print(news_words)
freq = nltk.FreqDist(news_words)
freq.plot(30)

from nltk import FreqDist
verbs = ["should", "may", "can"]
genres = ["news", "government", "romance"]
for g in genres:
Example #40
import nltk
text = '''Donald John Trump (born June 14, 1946) is the 45th and
current President of the United States. Before entering politics,
he was a businessman and television personality. 
'''
grammer = 'Chunk:{<PRP><VB.+><DT>?<NN.?>}'
pos = nltk.pos_tag(nltk.word_tokenize(text))
parser = nltk.RegexpParser(grammer)
chunked = parser.parse(pos)  #input for parse is list of tuples

for i in chunked.subtrees():
    if i.label() == "Chunk":
        print(i.leaves())

from nltk.corpus import inaugural
text = inaugural.words()
pos = nltk.pos_tag(text)

# JJ + NN chunks, but keep only those whose NN is 'people'
# Important: always allow an optional DT in this kind of structure
jjlist = []
parser = nltk.RegexpParser('chunk:{<JJ.?>+<NN.*>}')
chunk = parser.parse(pos)
for i in chunk.subtrees():
    if i.label() == "chunk" and i.leaves()[-1][0] == 'people':
        # i.leaves()=[('american', 'JJ),('people', 'NN')]
        #i.leaves()[-1] = ('people','NN')
        jj = [x[0] for x in i.leaves()[:-1]]
        jjlist += jj

frequency = nltk.FreqDist(jjlist)
Example #41
print("*** Introductory Examples for the NLTK Book ***")
print("Loading text1, ..., text9 and sent1, ..., sent9")
print("Type the name of the text or sentence to view it.")
print("Type: 'texts()' or 'sents()' to list the materials.")

text1 = Text(gutenberg.words("melville-moby_dick.txt"))
print("text1:", text1.name)

text2 = Text(gutenberg.words("austen-sense.txt"))
print("text2:", text2.name)

text3 = Text(genesis.words("english-kjv.txt"), name="The Book of Genesis")
print("text3:", text3.name)

text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print("text4:", text4.name)

text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5:", text5.name)

text6 = Text(webtext.words("grail.txt"), name="Monty Python and the Holy Grail")
print("text6:", text6.name)

text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7:", text7.name)

text8 = Text(webtext.words("singles.txt"), name="Personals Corpus")
print("text8:", text8.name)

text9 = Text(gutenberg.words("chesterton-thursday.txt"))
Example #42
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 27 08:19:21 2018

@author: jacobjohn
"""
import nltk
from nltk.corpus import inaugural
import matplotlib.pyplot

cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])  #first four characters - years
    for fileid in inaugural.fileids() for w in inaugural.words(fileid)
    for target in ['america', 'citizen'] if w.lower().startswith(target))
cfd.plot()
Example #43
def answers():
    _rvals = []

    #### Question 1 ####
    print '##### Question 1 #####'
    print '(see code - lines 64-65)'
    print '(NB: the two variables are returned by this function)'
    _bush01 = inaugural.words('2001-Bush.txt')
    bush01_word_lengths = _lengths(_vocabulary(_bush01))
    fd_bush01_words = FreqDist(_nopunct(_bush01))
    _rvals.append(bush01_word_lengths)
    _rvals.append(fd_bush01_words)

    #### Question 2 ####
    print '\n##### Question 2 #####'
    bush01_top10_words = _firsts(fd_bush01_words.items()[:10])
    bush01_average_word_lengths = _avg(bush01_word_lengths)
    _obama09 = inaugural.words('2009-Obama.txt')
    _fd_obama09_words = FreqDist(_nopunct(_obama09))
    _obama09_word_lengths = _lengths(_vocabulary(_obama09))
    obama09_top10_words = _firsts(_fd_obama09_words.items()[:10])
    obama09_average_word_lengths = _avg(_obama09_word_lengths)
    print 'top10 words Bush (2001): ', _str(bush01_top10_words)
    print 'top10 words Obama (2009):', _str(obama09_top10_words)
    print 'average word length Bush (2001): ', bush01_average_word_lengths
    print 'average word length Obama (2009):', obama09_average_word_lengths

    #### Question 3 ####
    print '\n##### Question 3 #####'
    bush01_token_lengths = _avg(_lengths(_nopunct(_bush01)))
    obama09_token_lengths = _avg(_lengths(_nopunct(_obama09)))
    print 'average token length Bush (2001): ', bush01_token_lengths
    print 'average token length Obama (2009):', obama09_token_lengths

    #### Question 4 ####
    print '\n##### Question 4 #####'
    for _fileid in inaugural.fileids():
        _year = int(_fileid.split('-')[0])
        _vocab_size = number_of_word_types(_fileid)
        print 'year %d: %d word types' % (_year, _vocab_size)

    #### Question 5 ####
    print '\n##### Question 5 #####'
    fd_bush01_nostop = FreqDist(_nostops(_nopunct(_bush01)))
    fd_obama09_nostop = FreqDist(_nostops(_nopunct(_obama09)))
    bush01_top10_nostop = _firsts(fd_bush01_nostop.items()[:10])
    obama09_top10_nostop = _firsts(fd_obama09_nostop.items()[:10])
    print 'top10 non-stop-words Bush (2001): ', _str(bush01_top10_nostop)
    print 'top10 non-stop-words Obama (2009):', _str(obama09_top10_nostop)

    #### Question 6 ####
    print '\n##### Question 6 #####'
    _wash89 = inaugural.words('1789-Washington.txt')
    fd_wash89_nostop = FreqDist(_nostops(_nopunct(_wash89)))
    wash89_top10_nostop = _firsts(fd_wash89_nostop.items()[:10])
    print 'top10 non-stop-words Washington (1789):', _str(wash89_top10_nostop)

    #### Question 7 ####
    print '\n##### Question 7 #####'
    wash89_rank_country = rank(fd_wash89_nostop, 'country')
    obama09_rank_country = rank(fd_obama09_nostop, 'country')
    bush01_rank_country = rank(fd_bush01_nostop, 'country')
    print 'rank of "country" in Washington (1789):', wash89_rank_country
    print 'rank of "country" in Obama (2009):', obama09_rank_country
    print 'rank of "country" in Bush (2001):', bush01_rank_country

    #### Question 8 ####
    print '\n##### Question 8 #####'
    print '(see comments in "rank" function on lines 20-45)'
    
    #### Question 9 ####
    print '\n##### Question 9 #####'
    print '(see plot)'
    ff = inaugural.fileids()
    fdd = {}
    _years = []
    for _fileid in ff:
        fdd[_fileid] = FreqDist(_nostops(inaugural.words(_fileid)))
        _years.append(_fileid[0:4])
    pylab.plot([(lambda d: len(d) / float(d.N()))(fdd[f]) for f in ff])
    pylab.xticks(range(len(ff)), _years, rotation=90)
    pylab.xlim(0, len(ff) - 1)
    pylab.ylabel('ratio of word types to tokens (without stop-words)')
    pylab.xlabel('time')
    pylab.title('f(time) = #(word types) / #(word tokens)')
    pylab.show()

    #### Question 10 ####
    print '\n##### Question 10 #####'
    print '(see plot)'
    obama09top10_butnot_wash89top10 = [word for word in obama09_top10_nostop
        if word in fd_wash89_nostop and word not in wash89_top10_nostop]
    wash89top10_butnot_obama09top10 = [word for word in wash89_top10_nostop
        if word in fd_obama09_nostop and word not in obama09_top10_nostop]
    obama09_word = 'world'
    wash89_word = 'government'
    assert(wash89_word in wash89top10_butnot_obama09top10)
    assert(obama09_word in obama09top10_butnot_wash89top10)
    normalisation_justification = (\
    "We normalise for different vocabulary sizes by dividing the rank of "
    "some word by the size of the vocabulary in that speech. "
    "Since rank is relative to vocabulary size, this is similar to "
    "getting the maximum rank over all speeches and dividing each rank by that "
    "quantity.")
    print normalisation_justification
    _normalised_rank = lambda f, w: min(1, rank(fdd[f], w) / \
        float(len(_vocabulary(_nostops(fdd[f])))))
    pylab.plot([_normalised_rank(f, obama09_word) for f in ff],
        label=obama09_word, color='b')
    pylab.plot([_normalised_rank(f, wash89_word) for f in ff],
        label=wash89_word, color='r')
    pylab.xticks(range(len(ff)), _years, rotation=90)
    pylab.xlim(0, len(ff) - 1)
    pylab.ylabel('normalised word rank (lower is better)')
    pylab.xlabel('time')
    pylab.title('f(time) = word rank / vocabulary size')
    pylab.legend()
    pylab.show()

    #### Question 11 ####
    print '\n##### Question 11 #####'
    observations_on_plots = (\
    "We observe that the rank of 'world' is noisy when observed on the level "
    "of an individual year/inaugural speech. However, when looking at the "
    "larger picture, a trend emerges: the rank of 'world' is consistently getting "
    "higher over time - an indicator of an ever-globalising and shrinking "
    "world?"
    "\n"
    "We observe that 'government' is a consistently highly ranked word across "
    "time - except for a few inaugural speeches where it has a very low "
    "rank. Those speeches are around the early 1800s (abolishment of slavery),"
    " 1860s-70s (US civil war), the early 1900s (World War One), and 1937-1981 "
    "(World War Two + Cold War) - it would seem that presidents don't want to "
    "remind their subjects of the government during hard times. Outliers to "
    "this theory can be explained easily (e.g. the somewhat high rank of "
    "'government' in 1949 = a certain 'evil government' being defeated).")
    print observations_on_plots

    return _rvals
for word1, word2 in product(word_list, word_list):
    count = 0

    n_grams = ngrams(brown.words(), n)

    for grams in n_grams:
        if word1 in grams and word2 in grams:
            count += 1

    n_grams = ngrams(treebank.words(), n)

    for grams in n_grams:
        if word1 in grams and word2 in grams:
            count += 1

    n_grams = ngrams(inaugural.words(), n)

    for grams in n_grams:
        if word1 in grams and word2 in grams:
            count += 1

    n_grams = ngrams(names.words(), n)

    for grams in n_grams:
        if word1 in grams and word2 in grams:
            count += 1

    n_grams = ngrams(gutenberg.words(), n)

    for grams in n_grams:
        if word1 in grams and word2 in grams:
Example #45
from nltk.corpus import inaugural as inag
from nltk import ConditionalFreqDist as CondFreqDist
cfd = CondFreqDist([(target , fileid[:4])\
		for fileid in inag.fileids() \
			for word in inag.words(fileid) \
				for target in ["wealth" , "peace" , "harmony" , "prosperous"] if word.lower().startswith(target)
		])
cfd.plot()
Example #46
print("*** Introductory Examples for the NLTK Book ***")
print("Loading text1, ..., text9 and sent1, ..., sent9")
print("Type the name of the text or sentence to view it.")
print("Type: 'texts()' or 'sents()' to list the materials.")

text1 = Text(gutenberg.words('melville-moby_dick.txt'))
print("text1:", text1.name)

text2 = Text(gutenberg.words('austen-sense.txt'))
print("text2:", text2.name)

text3 = Text([str(w) for w in genesis.words('english-kjv.txt')], name="The Book of Genesis")
print("text3:", text3.name)

text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print("text4:", text4.name)

text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5:", text5.name)

text6 = Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail")
print("text6:", text6.name)

text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7:", text7.name)

text8 = Text(webtext.words('singles.txt'), name="Personals Corpus")
print("text8:", text8.name)

text9 = Text(gutenberg.words('chesterton-thursday.txt'))
Example #47
__author__ = 'auroua'
from nltk.corpus import inaugural
from nltk.corpus import stopwords
import numpy as np
import matplotlib.pyplot as plt
from lda_1 import LDA
import seaborn as sns

stops = set(stopwords.words("english"))

vocab = dict()
for fileid in inaugural.fileids():
    for word in inaugural.words(fileid):
        word = word.lower()
        if word not in stops and word.isalpha():
            if word not in vocab:
                vocab[word] = 0
            vocab[word] += 1

"""
Sort the vocab keep only words which occur more than 50 times
Then Create word to id and id to word dictionaries
"""
vocab_sorted = filter(lambda x: x[1] > 50, sorted(vocab.items(), key=lambda x: x[1], reverse=True))
wordids = {v[0]: i for i, v in enumerate(vocab_sorted)}
idwords = {i: v[0] for i, v in enumerate(vocab_sorted)}
vocab_size = len(wordids)
print vocab_size

# Generate corpus document vectors
data = []
Example #48
# from nltk.corpus import gutenberg
# # print(gutenberg.fileids())
# allwords = gutenberg.words('shakespeare-hamlet.txt')
# print(len(allwords))
# print(len(set(allwords)))
# print(allwords.count('Hamlet'))
# A = set(allwords)
# longwords = [w for w in A if len(w)>12]
# print(sorted(longwords))
#
#
from nltk.probability import *
# fd2 = FreqDist([sx.lower() for sx in allwords if sx.isalpha()])
# print(fd2.B())
# print(fd2.N())
# # fd2.tabulate(20)
# fd2.plot(20)
# # fd2.plot(20,cumulative = True)

from nltk.corpus import inaugural
fd3 = FreqDist([s for s in inaugural.words()])
print(fd3.freq('freedom'))

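# Note: the condition below relies on lexicographic comparison of fileids such as
# '1981-Reagan.txt', so fileid > '1980' and fileid < '2010' keeps the 1981-2009 addresses.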
cfd = ConditionalFreqDist(
    (fileid,len(w))
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    if fileid>'1980' and fileid<'2010'
)
print(cfd.items())
cfd.plot()
webtext_words = webtext.words()
print(webtext_words)

# Pick out the text from the nps_chat corpus and name it as nps_chat_raw
nps_chat_raw = nps_chat.raw()

# Pick out the text from brown corpus and name it as brown_raw
brown_raw = brown.raw()
print(brown_raw)

# Pick out the text from reuters corpus and name it as reuters_words
reuters_words = reuters.words()
print(reuters_words)

# Pick out the words from the inaugural corpus and name them inaugral_words
inaugral_words = inaugural.words()
print(inaugral_words)

# Creating a variable for tokenizing words
tokenizer = RegexpTokenizer(r'\w+')

# Tokenizing the words in gutenberg corpus and assigning it to a variable named tokens
tokens = tokenizer.tokenize(gutenberg_raw)


# Assigning the stopwords to a variable s
s=set(stopwords.words('english'))

# Removing the stopwords from gutenberg file
gutenberg_filtered = filter(lambda w: not w in s,tokens)
Example #50
import nltk
from nltk.corpus import reuters
reuters.fileids()
reuters.categories(['training/9865', 'training/8666'])
reuters.fileids(['barley','corn'])
reuters.words('training/9865')[:14]
reuters.words(categories = ['corn','barley'])

from nltk.corpus import inaugural
inaugural.fileids()
inaugYears = [fileid[:4] for fileid in inaugural.fileids()]

cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america','citizen']
    if w.lower().startswith(target))
cfd.plot()

from nltk.corpus import udhr
languages = ['English','Finnish_Suomi','Italian_Italiano', 'Greenlandic_Inuktikut']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1'))
cfd.plot(cumulative = False, title = 'Declaration of Human Rights')

from nltk.corpus import PlaintextCorpusReader
corpusRoot = '/home/mv/Dropbox/Computer/UbuntuInstall'
wordlists = PlaintextCorpusReader(corpusRoot,'.*')
Example #51
"""
Created on Sun Dec 24 11:00:43 2017

@author: Mohnish_Devadiga
"""

import nltk
from nltk.corpus import inaugural
import pandas as pd
import matplotlib 

inaugural.fileids()

#print(inaugural.fileids())

for speech in inaugural.fileids():
    word_count_total = len(inaugural.words(speech))
    print(speech , word_count_total)
    
#Go through all speech     
speech_length = [(len(inaugural.words(speech)), speech)for speech in inaugural.fileids()]

print(speech_length)

#Get the max and min speech
print("Max is : ",max(speech_length))
print("Min is : ",min(speech_length))

#Avg no of words per sentence for each speech
for speech in inaugural.fileids():
    word_total = len(inaugural.words(speech))
    Sents_total = len(inaugural.sents(speech))
#importing library
from nltk.corpus import inaugural


# In[6]:


inaugural.fileids()


# In[7]:


#printing inaugural words for some text
for i in inaugural.words('1933-Roosevelt.txt'):
    print(i, end = " ")


# In[8]:


'''
College is so hectic,I'm tired
'''


# In[9]:


#importing library from nltk.corpus
Example #53
def text4():
    text = Text(inaugural.words(), name="Inaugural Address Corpus")
    print("text4:", text.name)
    return text
Example #54
from itertools import groupby

from nltk import pos_tag
from nltk.chunk import ne_chunk
from nltk.corpus import inaugural
from nltk.tag import StanfordNERTagger
from nltk.tree import Tree

# Uncomment to check the required StanfordNERTagger environment variables.
# print os.environ.get("CLASSPATH")
# print os.environ.get("STANFORD_MODELS")

# Read the corpus and POS tag it.
POS_tagging = pos_tag(inaugural.words())

# Process the corpus with the NLTK named entity classifier.
ne_nltk = ne_chunk(POS_tagging)

# Filter out in a list only the organization entities. Join by space words that are part of the same organization entity (same Tree object).
nltk_organizations = [
    " ".join(w[0] for w in el) for el in ne_nltk
    if (type(el) == Tree and el.label() == "ORGANIZATION")
]

# Remove duplicates.
nltk_organizations = set(nltk_organizations)

# Filter out in a list only the person entities. Join by space words that are part of the same person entity (same Tree object).
nltk_persons = [
    " ".join(w[0] for w in el) for el in ne_nltk
    if (type(el) == Tree and el.label() == "PERSON")
Example #55
cfd.plot(cumulative = True)
cfd.tabulate(conditions=['English', 'German_Deutsch'],samples=range(10), cumulative=True)
# Conditional frequency distribution
genre_word = [(genre, word) for genre in ['news', 'romance'] 
for word in brown.words(categories=genre)]

cfd = nltk.ConditionalFreqDist(genre_word)
cfd.conditions()
list(cfd['romance'])
cfd['romance']['could']

from nltk.corpus import inaugural
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
# Generate text randomly from the conditional frequency distribution
def generate_model(cfdist, word, num=15):
    for i in range(num):
        print word,
        word = cfdist[word].max()
text = nltk.corpus.genesis.words('english-kjv.txt')
bigrams = nltk.bigrams(text)
cfd = nltk.ConditionalFreqDist(bigrams)
print cfd['living']
generate_model(cfd, 'living')


def unusual_words(text):
genre_word = [(genre, word) for genre in ['news', 'romance'] for word in brown.words(categories=genre)]
print(len(genre_word))  # 170576 (genre, word) pairs
print(genre_word[:4])  # [('news', 'The'), ('news', 'Fulton'), ('news', 'County'), ('news', 'Grand')] # [_start-genre]
print(genre_word[-4:])  # [('romance', 'afraid'), ('romance', 'not'), ('romance', "''"), ('romance', '.')] # [_end-genre]
cfd = ConditionalFreqDist(genre_word)
print(cfd)  # <ConditionalFreqDist with 2 conditions>
print(cfd.conditions())  # ['news', 'romance'] # [_conditions-cfd]
print(cfd['news'])  # <FreqDist with 14394 samples and 100554 outcomes>
print(cfd['romance'])  # <FreqDist with 8452 samples and 70022 outcomes>
print(cfd['romance'].most_common(2))  # [(',', 3899), ('.', 3736)]
print(cfd['romance']['could'])  # 193
print(cfd['romance'].max())  # the most frequent sample in 'romance'
print(cfd['romance'][','])  # 3899
##################################################################
## plot(): how the words America and citizen are used over time in U.S. presidential inaugural addresses
cfd = ConditionalFreqDist((target, fileid[:4]) for fileid in inaugural.fileids() for word in inaugural.words(fileid) for target in ['america', 'citizen'] if word.lower().startswith(target))
cfd.plot()  # Plot how many times America and citizen appear in the speeches
##################################################################
## tabulate(): extracting word pairs
# Next, let's combine regular expressions with conditional frequency distributions.
# Here we will extract all consonant-vowel sequences from the words of Rotokas, such as ka and si. Since each of these is a pair,
# it can be used to initialize a conditional frequency distribution. We then tabulate the frequency of each pair:
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
cvs = [cv for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]', w)]
print(cvs[:10])  # ['ka', 'ka', 'ka', 'ka', 'ka', 'ro', 'ka', 'ka', 'vi', 'ko']
cfd = ConditionalFreqDist(cvs)
cfd.tabulate()
#     a    e    i    o    u
# k  418  148   94  420  173
# p   83   31  105   34   51
# r  187   63   84   89   79
def process_speech(filename):
	text = inaugural.words(filename)
	text = remove_punctuation(text)
	text = remove_stopwords(text)
	text = clean(text)
	return text
Example #58
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 20 16:45:50 2019

@author: Ritwik Gupta
"""

#20/12/19

from nltk.corpus import brown
brown.categories()
print(brown.words(categories='hobbies')[0:5])

from nltk.corpus import inaugural
inaugural.fileids()
inaugural.words(fileids='1933-Roosevelt.txt')[0:10]

from nltk.corpus import webtext
d1 = {}
for i in webtext.fileids():
    d1[i] = webtext.words(fileids=i)[:20]

#Downloaded the MASC data
import nltk
with open('tweets1.txt', 'r') as f:
    text = f.read().strip()
    text1 = text.split()
    text2 = nltk.Text(text1)
    text2.concordance("good", 1)

#Project Gutenberg