def word_fdist(inaug_list):
    fixedspeech = map(str.lower, inaugural.words(inaug_list))  # Applies lower() to every word in the list
    fixedspeech = filter(checkwords, fixedspeech)              # Filters out unwanted words using the helper function checkwords
    fdist = FreqDist(fixedspeech)                              # Builds a frequency distribution
    return fdist                                               # Returns it
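# The helper checkwords is not defined in this snippet. A minimal, hypothetical sketch of a
# predicate consistent with its use above (keep alphabetic tokens that are not English stop words):
from nltk.corpus import stopwords

_STOPS = set(stopwords.words('english'))

def checkwords(word):
    # Hypothetical filter: keep alphabetic, non-stop-word tokens.
    return word.isalpha() and word not in _STOPS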
def main():
    # gutenberg
    gu_words = gutenberg.words()
    gu_words_exclude_stops = exclude_stopwords(gu_words)
    gu_fd1 = get_frequency_distribution(gu_words)
    gu_fd2 = get_frequency_distribution(gu_words_exclude_stops)
    pylab.plot(gu_fd1, color='red')
    pylab.plot(gu_fd2, color='orange')

    # inaugural
    in_words = inaugural.words()
    in_words_exclude_stops = exclude_stopwords(in_words)
    in_fd1 = get_frequency_distribution(in_words)
    in_fd2 = get_frequency_distribution(in_words_exclude_stops)
    pylab.plot(in_fd1, color='black')
    pylab.plot(in_fd2, color='gray')

    # reuters
    yen_words = reuters.words(categories='yen')
    yen_words_exclude_stops = exclude_stopwords(yen_words)
    yen_fd1 = get_frequency_distribution(yen_words)
    yen_fd2 = get_frequency_distribution(yen_words_exclude_stops)
    pylab.plot(yen_fd1, color='blue')
    pylab.plot(yen_fd2, color='green')

    pylab.xscale('log')
    pylab.yscale('log')
    pylab.show()
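# exclude_stopwords and get_frequency_distribution are not shown in this snippet. Hypothetical
# sketches consistent with how they are called above (sorted counts make the log-log plot a
# rank/frequency curve); these names and return shapes are assumptions, not the original code:
from nltk import FreqDist
from nltk.corpus import stopwords

def exclude_stopwords(words):
    # Hypothetical helper: drop English stop words, case-insensitively.
    stops = set(stopwords.words('english'))
    return [w for w in words if w.lower() not in stops]

def get_frequency_distribution(words):
    # Hypothetical helper: return word frequencies sorted in descending order for plotting.
    fd = FreqDist(w.lower() for w in words)
    return sorted(fd.values(), reverse=True)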
def main():
    cfd = nltk.ConditionalFreqDist(
        (target, fileid[:4])
        for fileid in inaugural.fileids()
        for w in inaugural.words(fileid)
        for target in ['democracy', 'republic']
        if w.lower().startswith(target))
    cfd.plot()
def print_inaugural():
    from nltk.corpus import inaugural
    cfd = nltk.ConditionalFreqDist(
        (target, fileid[:4])
        for fileid in inaugural.fileids()
        for w in inaugural.words(fileid)
        for target in ['america', 'citizen']
        if w.lower().startswith(target))
    cfd.plot()
def build_inaugural_corpus():
    """
    Get a word token list for each doc in the inaugural address corpus
    :return: word_lists
    """
    word_lists = []
    for fileid in inaugural.fileids():
        words = [w for w in inaugural.words(fileid)]
        word_lists.append(words)
    return word_lists
def inaugural_cfd():  # renamed: a function called inaugural() would shadow the imported corpus module
    inaugural.fileids()
    [fileid[:4] for fileid in inaugural.fileids()]
    cfd = nltk.ConditionalFreqDist(
        (target, fileid[:4])
        for fileid in inaugural.fileids()
        for w in inaugural.words(fileid)
        for target in ['america', 'citizen']
        if w.lower().startswith(target))
    cfd.plot()
def sent_length_fdist(inaug_list):
    fixedspeech = list(filter(elimpunct, inaugural.words(inaug_list)))
    count = 0
    listcount = []
    for x in range(len(fixedspeech)):
        if fixedspeech[x] in ('.', '!', '?'):
            listcount.append(count)  # sentence length = tokens seen since the last terminator
            count = 0
        else:
            count += 1
    fdlist = FreqDist(listcount)
    return fdlist
def tabulate():
    # Note: this first cfd is built but immediately overwritten below.
    cfd = nltk.ConditionalFreqDist(
        (target, fileid[:4])
        for fileid in inaugural.fileids()
        for w in inaugural.words(fileid)
        for target in ['america', 'citizen']
        if w.lower().startswith(target))
    languages = ['Chickasaw', 'English', 'German_Deutsch',
                 'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
    cfd = nltk.ConditionalFreqDist(
        (lang, len(word))
        for lang in languages
        for word in udhr.words(lang + '-Latin1'))
    cfd.tabulate(conditions=['English', 'German_Deutsch'],
                 samples=range(10), cumulative=True)
def build_cond_fdist():
    cfdist = ConditionalFreqDist()           # Create the conditional frequency distribution
    for inaug_list in inaug20():             # Go through each group of fileids
        period = int(inaug_list[0][0:4])     # Set the period from the first fileid's year
        for fileid in inaug_list:            # For every file in the group
            words = inaugural.words(fileid)
            for i in range(len(words)):      # Check all of the words
                pronoun = words[i]
                if pronoun in ['I', 'my'] and i + 1 < len(words):
                    # Count the next word (after I, my)
                    cfdist[(pronoun, period)][words[i + 1]] += 1
                elif pronoun == 'me' and i > 0:
                    # Count the previous word (before me)
                    cfdist[(pronoun, period)][words[i - 1]] += 1
    return cfdist
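# inaug20() is not defined in this snippet. Judging from its use above it groups the inaugural
# fileids into multi-year periods; the sketch below is a hypothetical version, not the original:
from nltk.corpus import inaugural

def inaug20():
    # Hypothetical helper: batches of five consecutive addresses, i.e. roughly 20-year periods
    # (one inaugural address every four years).
    fileids = sorted(inaugural.fileids())
    return [fileids[i:i + 5] for i in range(0, len(fileids), 5)]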
def main():
    # store word lengths
    brown_word_lens = []
    web_word_lens = []
    inaugural_word_lens = []
    gutenberg_word_lens = []
    genesis_word_lens = []
    for file in gutenberg.fileids():
        for word in gutenberg.words(file):
            gutenberg_word_lens.append(len(word))
    for file in brown.fileids():
        for word in brown.words(file):
            brown_word_lens.append(len(word))
    for file in webtext.fileids():
        for word in webtext.words(file):
            web_word_lens.append(len(word))
    for file in inaugural.fileids():
        for word in inaugural.words(file):
            inaugural_word_lens.append(len(word))
    for file in genesis.fileids():
        for word in genesis.words(file):
            genesis_word_lens.append(len(word))
    with open("wordlens.txt", 'w') as f:
        sys.stdout = f
        f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
        for i in xrange(max(len(genesis_word_lens), len(inaugural_word_lens),
                            len(web_word_lens), len(brown_word_lens),
                            len(gutenberg_word_lens))):
            for corpus in [genesis_word_lens, inaugural_word_lens, web_word_lens,
                           brown_word_lens, gutenberg_word_lens]:
                if i >= len(corpus):
                    f.write(",")
                else:
                    f.write(str(corpus[i]) + ",")
            f.write("\n")
def sent_length_fdist(inaug_list):
    fixedspeech = list(filter(elimpunct, inaugural.words(inaug_list)))  # Eliminates the punctuation
    count = 0       # Counter that runs alongside the for loop
    listcount = []  # List of sentence lengths
    for x in range(len(fixedspeech)):
        if fixedspeech[x] in ('.', '!', '?'):
            listcount.append(count)  # Appends the number of tokens since the last sentence terminator
            count = 0                # Resets the counter for the next sentence
        else:
            count += 1
    fdlist = FreqDist(listcount)  # Makes a distribution of sentence lengths
    return fdlist
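# elimpunct is not shown in this snippet. A plausible, hypothetical version keeps word tokens plus
# sentence-final punctuation so the loop above can still find the terminators:
def elimpunct(token):
    # Hypothetical filter: keep alphabetic tokens and sentence terminators, drop other punctuation.
    return token.isalpha() or token in ('.', '!', '?')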
print fd2.B()
print fd2.N()
fd2.tabulate(20)  # Tabulate the 20 most frequent words in the whole book
import matplotlib.pyplot as plt
#fd2.plot(20)
#fd2.plot(20, cumulative=True)

""" US presidential inaugural address corpus """
from nltk.corpus import inaugural
import nltk
# nltk.download()  # download the inaugural corpus first
fd3 = FreqDist([s for s in inaugural.words()])
print fd3.freq('freedom')  # relative frequency of 'freedom' across the whole corpus

# Word-use habits
cfd = ConditionalFreqDist(  # conditional frequency distribution
    (fileid, len(w))
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    if fileid > '1960'
)
print cfd.items()[:40]
cfd.plot()
def number_of_word_types(fileid):
    words = inaugural.words(fileid)
    unique_words = _vocabulary(words)
    num_word_types = len(unique_words)
    return num_word_types
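# _vocabulary is defined elsewhere in the original program. A minimal, hypothetical sketch that is
# consistent with this use (the set of distinct, case-folded word types):
def _vocabulary(words):
    # Hypothetical helper: distinct word types, case-folded.
    return set(w.lower() for w in words)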
import nltk
from nltk.corpus import inaugural
from nltk.corpus import wordnet
import random
import re
import math

# Exercise 23, part a
# Load the data
inaugural_words = inaugural.words()
# Create a FreqDist to hold the log frequencies
a = nltk.FreqDist()
# Walk the word list, lower-casing every word
fd = nltk.FreqDist([w.lower() for w in inaugural_words])
# Walk the distribution and record the log10 frequency of each word
for key in fd:
    t = math.log10(fd[key])
    a[key] = t
fd2 = dict(fd)
# Sort the dict by frequency (descending) into a list
voc = sorted(fd2.items(), key=lambda item: item[1], reverse=True)
# Compute the ratio between the 50th and 150th most frequent words
result = voc[49][1] / voc[149][1]
print(voc[49][0], ' ', voc[149][0])
print("r_a=" + str(result))
# The first entry and the 150th entry
print(voc[0], " ", voc[149])
# Plot the log frequencies of the top 150 words
a.plot(150)
# Run your file every time something new is added so you can see how it works.
# There is a compulsory exercise for Task 1 that needs to be completed at the bottom of your corpuses.py.

# === Part 1: Importing Corpuses ===
import nltk
from nltk.corpus import inaugural

print inaugural.fileids()
# Run your file. You should see all the text files containing the speeches of the US presidents
# that NLTK has saved inside it.

# Now add the lines:
print "=============Words in Obama's Speech ======"
print inaugural.words("2009-Obama.txt")  # Returns a list of all the words in Obama's speech
print "=============Words in Bush's speech ======"
print inaugural.sents("2005-Bush.txt")   # Returns a list of all the sentences in Bush's speech
# As you can see, the words of Obama's speech are printed in a list, as are the sentences of Bush's speech.
# Try adding code to your program to find and print out the first 25 words of Obama's 2009 speech
# (a sketch follows below).

# === Part 2: Analysing tokens (words) of a text ===
# The term 'token' means a word or a punctuation mark.
# After you've done that, add the following lines to your program:
from nltk.book import *  # This may take a while to load. NLTK has many texts stored in it!
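# One possible way to complete the exercise above (printing the first 25 words of Obama's 2009
# speech); the slice is the only step assumed here, nothing beyond the corpus call already shown:
print inaugural.words("2009-Obama.txt")[:25]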
# (fragment: the next three statements run inside a loop over the reuters fileids, not shown)
nCatgs[n] = len(reuters.categories(name))
catgs[n] = ','.join(reuters.categories(name))
texts[n] = ' '.join(reuters.words(name))

# trimming articles without categories
toTrim = np.invert(np.equal(catgs, None))
catgs = catgs[toTrim]
texts = texts[toTrim]
nCatgs = nCatgs[toTrim]
outNames = outNames[toTrim]

for n in range(len(outNames)):
    with open('./reuters/' + outNames[n] + '.txt', 'w') as f:
        f.writelines('\n'.join(textwrap.wrap(texts[n], 80)))

out = np.vstack((outNames, catgs)).T
out = out[np.argsort(out[:, 0])]
np.savetxt('reuters_catgs.csv', out, fmt='%s', delimiter=',')

## Save inaugural addresses
#nltk.download('inaugural')
from nltk.corpus import inaugural as inaug

adds = inaug.fileids()
texts = np.empty(len(adds), dtype=object)  # pre-allocate
for n, name in enumerate(adds):
    texts[n] = ' '.join(inaug.words(name))
    with open('./inaugural/' + name, 'w') as f:
        tmp = textwrap.wrap(texts[n], 80)
        f.writelines('\n'.join(tmp).encode('ascii', 'ignore'))
from nltk.book import *
import nltk

print(text1.vocab())
print(type(text1))
print(len(text1))

from nltk.corpus import gutenberg
print(gutenberg.fileids())
print(nltk.corpus.gutenberg.fileids())
hamlet = gutenberg.words('shakespeare-hamlet.txt')

from nltk.corpus import inaugural
print(inaugural.fileids())
print(nltk.corpus.inaugural.fileids())

from nltk.text import Text
former_president = Text(inaugural.words(inaugural.fileids()[-1]))
print(' '.join(former_president.tokens[0:1000]))
import nltk
from nltk.corpus import inaugural
from nltk.util import ngrams

obama_words = inaugural.words("2009-Obama.txt")
george_words = inaugural.words("1789-Washington.txt")

fd_george_words = nltk.FreqDist(w.lower() for w in george_words)
fd_obama_words = nltk.FreqDist(w.lower() for w in obama_words)

#fd_obama_words.plot(50)
print(fd_obama_words.most_common(50))
print(fd_george_words.most_common(50))

obama = [x[0] for x in fd_obama_words.most_common(50)]
george = [x[0] for x in fd_george_words.most_common(50)]
print(list(set(obama) & set(george)))
import re
from nltk.corpus import inaugural
from nltk import FreqDist
#nltk.download('stopwords')
from nltk.tokenize import regexp_tokenize

print("-------WARM UP---------")
print("------TASK 1---------")
# Using inaugural fileids to list all the documents
documents = inaugural.fileids()
print("Using the corpus reader class list all the documents in inaugural corpus :")
print(documents)
print("---------------------------------------------------------------------")

print("Find the total number of words in Clinton's 1993 speech :")
# Using the .words() method to count the words in Clinton's speech
clintonwords = inaugural.words('1993-Clinton.txt')
print(len(clintonwords))

# The .raw() method reads the text in raw form
s = inaugural.raw('1789-Washington.txt')
w = set(m.group(0) for m in re.finditer(r"\w+", s))
#print(len(re.findall('\w+', s)))
print("Find the total number of distinct words in the same speech :")
# Now we find the number of distinct words
print(len(w))

# average function to calculate average word length
def average(numbers):
    return sum(numbers) / len(numbers)
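# A usage sketch of the average helper just defined, reusing the clintonwords list from above
# (nothing new is assumed beyond that list):
print(average([len(word) for word in clintonwords]))  # average token length, punctuation included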
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 20 08:22:33 2018

@author: jacobjohn

//https://www.jasondavies.com/wordcloud/
"""
import nltk
import re
from nltk.corpus import inaugural

Obama = inaugural.words(fileids='2009-Obama.txt')

# declare a dictionary
word_freq = {}
for tok in Obama:
    if tok in word_freq:
        word_freq[tok] += 1
    else:
        word_freq[tok] = 1

max_dict = {}
while len(max_dict) < 5:
    max_val = 0
    for key in word_freq:
        if max_val < word_freq[key] and re.match(r'[A-Za-z]+', key) and key not in max_dict:
Created on Sun Dec 24 11:00:43 2017

@author: Mohnish_Devadiga
"""
import nltk
from nltk.corpus import inaugural
import pandas as pd
import matplotlib

inaugural.fileids()
#print(inaugural.fileids())

for speech in inaugural.fileids():
    word_count_total = len(inaugural.words(speech))
    print(speech, word_count_total)

# Go through all speeches
speech_length = [(len(inaugural.words(speech)), speech) for speech in inaugural.fileids()]
print(speech_length)

# Get the max and min speech
print("Max is : ", max(speech_length))
print("Min is : ", min(speech_length))

# Avg no of words per sentence for each speech
for speech in inaugural.fileids():
    word_total = len(inaugural.words(speech))
def conlisttodic(lst):
    dct = dict()
    for i in range(0, len(lst)):
        for j in range(0, 1):
            dct.update({lst[i][j]: lst[i][j + 1]})
    return dct

def makeStopWords():
    sw = stopwords.words('english')
    for i in wordsStop:
        sw.append(i)
    return sw

stopWord = makeStopWords()
for fileID in inaugural.fileids()[-12:]:
    wordList = list()
    for word in inaugural.words(fileID):
        word = word.lower()
        if word.isalpha() and word not in stopWord:
            wordList.append(word)
    speech[fileID] = nltk.FreqDist(wordList)

# print(type(speech))
for i, k in speech.items():
    print(i, k)
    nameyear.append(i)
    worddict[countloop] = k
    countloop = countloop + 1

intersectionn = (worddict[0] & worddict[1] & worddict[2] & worddict[3] & worddict[4]
                 & worddict[5] & worddict[6] & worddict[7] & worddict[8] & worddict[9]
                 & worddict[10] & worddict[11])
intersectionnsort = sorted(intersectionn)
for i in range(len(nameyear)):
    for j in intersectionnsort:
# In[4]:
print(brown.categories())

# In[5]:
brown.words(categories='romance')

# In[6]:
from nltk.corpus import inaugural

# In[7]:
inaugural.fileids()

# In[8]:
inaugural.words(fileids='1989-Bush.txt')

# In[9]:
inaugural.words(fileids='1989-Bush.txt')[:50]

# In[10]:
from nltk.tokenize import TweetTokenizer
text = "Mexico is paying (indirectly) for the Wall through the new USMCA, the replacement for NAFTA! Far more money coming to the U.S. Because of the tremendous dangers at the Border, including large scale criminal and drug inflow, the United States Military will build the Wall!"
twt = TweetTokenizer()
print(twt.tokenize(text))
from nltk.corpus import inaugural, stopwords
# from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora, models, similarities
import re
import string

filenames = inaugural.fileids()
# lmtzr = WordNetLemmatizer()
filtered_speeches = []

def removeNonAscii(s):
    return "".join(i for i in s if ord(i) < 128)

for filename in filenames:
    print filename
    print "Reading in raw words..."
    raw_words = inaugural.words(filename)
    print "Removing stop words..."
    filtered_words = [word for word in raw_words if not word in stopwords.words('english')]
    print "Removing punctuation..."
    filtered_words = [word.strip(string.punctuation) for word in filtered_words]
    filtered_words = [word.lower() for word in filtered_words if word != ""]
    tokens_once = set(word for word in set(filtered_words) if filtered_words.count(word) == 1)
    filtered_words = [removeNonAscii(word) for word in filtered_words if word not in tokens_once]
    print "Appending filtered words..."
    filtered_speeches.append(filtered_words)

print "making numbered corpus..."
dictionary = corpora.Dictionary(filtered_speeches)
corpus = [dictionary.doc2bow(text) for text in filtered_speeches]
tfidf = models.TfidfModel(corpus)
def main():
    # store, for each corpus, the per-file frequency of the 100 most common English words
    brown_common_freq = []
    web_common_freq = []
    inaugural_common_freq = []
    gutenberg_common_freq = []
    genesis_common_freq = []
    common = ["the", "be", "to", "of", "and", "a", "in", "that", "have", "i",
              "it", "for", "not", "on", "with", "he", "as", "you", "do", "at",
              "this", "but", "his", "by", "from", "they", "we", "say", "her", "she",
              "or", "an", "will", "my", "one", "all", "would", "there", "their", "what",
              "so", "up", "out", "if", "about", "who", "get", "which", "go", "me",
              "when", "make", "can", "like", "time", "no", "just", "him", "know", "take",
              "people", "into", "year", "your", "good", "some", "could", "them", "see", "other",
              "than", "then", "now", "look", "only", "come", "its", "over", "think", "also",
              "back", "after", "use", "two", "how", "our", "work", "first", "well", "way",
              "even", "new", "want", "because", "any", "these", "give", "day", "most", "us"]
    common.sort()
    for file in gutenberg.fileids():
        total_words = len(gutenberg.words(file))
        total_common = 0
        for word in gutenberg.words(file):
            if word.lower() in common:
                total_common += 1
        gutenberg_common_freq.append(float(total_common) / total_words)
    for file in brown.fileids():
        total_words = len(brown.words(file))
        total_common = 0
        for word in brown.words(file):
            if word.lower() in common:
                total_common += 1
        brown_common_freq.append(float(total_common) / total_words)
    for file in webtext.fileids():
        total_words = len(webtext.words(file))
        total_common = 0
        for word in webtext.words(file):
            if word.lower() in common:
                total_common += 1
        web_common_freq.append(float(total_common) / total_words)
    for file in inaugural.fileids():
        total_words = len(inaugural.words(file))
        total_common = 0
        for word in inaugural.words(file):
            if word.lower() in common:
                total_common += 1
        inaugural_common_freq.append(float(total_common) / total_words)
    for file in genesis.fileids():
        total_words = len(genesis.words(file))
        total_common = 0
        for word in genesis.words(file):
            if word.lower() in common:
                total_common += 1
        genesis_common_freq.append(float(total_common) / total_words)
    with open("common-words.txt", 'w') as f:
        sys.stdout = f
        f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
        for i in xrange(max(len(genesis_common_freq), len(inaugural_common_freq),
                            len(web_common_freq), len(brown_common_freq),
                            len(gutenberg_common_freq))):
            for corpus in [genesis_common_freq, inaugural_common_freq, web_common_freq,
                           brown_common_freq, gutenberg_common_freq]:
                if i >= len(corpus):
                    f.write(",")
                else:
                    f.write(str(round(corpus[i], 5)) + ",")
            f.write("\n")
def main():
    # store a FreqDist of letter counts for each corpus
    samples = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    brown_letters = FreqDist()
    web_letters = FreqDist()
    inaugural_letters = FreqDist()
    gutenberg_letters = FreqDist()
    genesis_letters = FreqDist()
    for file in gutenberg.fileids():
        for word in gutenberg.words(file):
            for character in word:
                if character in string.letters:
                    gutenberg_letters[character.upper()] += 1
    for file in brown.fileids():
        for word in brown.words(file):
            for character in word:
                if character in string.letters:
                    brown_letters[character.upper()] += 1
    for file in webtext.fileids():
        for word in webtext.words(file):
            for character in word:
                if character in string.letters:
                    web_letters[character.upper()] += 1
    for file in inaugural.fileids():
        for word in inaugural.words(file):
            for character in word:
                if character in string.letters:
                    inaugural_letters[character.upper()] += 1
    for file in genesis.fileids():
        for word in genesis.words(file):
            for character in word:
                if character in string.letters:
                    genesis_letters[character.upper()] += 1
    with open("genesis-letter-freq.txt", 'w') as f:
        sys.stdout = f
        f.write("GENESIS\n")
        for let in samples:
            print(str(genesis_letters[let]))
    with open("gutenberg-letter-freq.txt", 'w') as f:
        sys.stdout = f
        f.write("GUTENBERG\n")
        for let in samples:
            print(str(gutenberg_letters[let]))
    with open("webtext-letter-freq.txt", 'w') as f:
        sys.stdout = f
        f.write("WEBTEXT\n")
        for let in samples:
            print(str(web_letters[let]))
    with open("inaugural-letter-freq.txt", 'w') as f:
        sys.stdout = f
        f.write("INAUGURAL\n")
        for let in samples:
            print(str(inaugural_letters[let]))
    with open("brown-letter-freq.txt", 'w') as f:
        sys.stdout = f
        f.write("BROWN\n")
        for let in samples:
            print(str(brown_letters[let]))
    with open("letter-freq.txt", 'w') as f:
        corpora = [gutenberg_letters, web_letters, inaugural_letters, brown_letters, genesis_letters]
        f.write("GUTENBERG,WEBTEXT,INAUGURAL,BROWN,GENESIS\n")
        for let in samples:
            for corpus in corpora:
                f.write(str(corpus[let]) + ",")
            f.write("\n")
print(md[:8])
print("Length of book {}".format(len(md)))
print("Boat: {}".format(md.count('boat')))

md_set = set(md)
print("Unique by set: {}".format(len(md_set)))
print("Average by words: {}".format(len(md) / len(md_set)))

md_sents = nltk.corpus.gutenberg.sents("melville-moby_dick.txt")
print("Average by words per sentence: {}".format(len(md) / len(md_sents)))

print("Inaugural Ids:\n{}".format(inaugural.fileids()))
for speech in inaugural.fileids():
    words_total = len(inaugural.words(speech))
    print("Speech: {0} has total words: {1}".format(speech, words_total))

speech_len = [(len(inaugural.words(speech)), speech) for speech in inaugural.fileids()]
print("Biggest Speech: {}".format(max(speech_len)))
print("Shortest Speech: {}".format(min(speech_len)))

for speech in inaugural.fileids():
    words_total = len(inaugural.words(speech))
    sentence_total = len(inaugural.sents(speech))
    print("Sentence average: {}".format(words_total / sentence_total))

data = pd.DataFrame([
    int(speech[:4]),
    len(inaugural.words(speech)) / len(inaugural.sents(speech))
# In[7]:
# INAUGURAL CORPUS

# In[8]:
from nltk.corpus import inaugural

# In[9]:
inaugural.fileids()

# In[10]:
inaugural.words(fileids='2009-Obama.txt')

# In[11]:
inaugural.words(fileids='2009-Obama.txt')[:23]
def word_fdist(inaug_list):
    fixedspeech = map(str.lower, inaugural.words(inaug_list))
    fixedspeech = filter(checkwords, fixedspeech)
    fdist = FreqDist(fixedspeech)
    return fdist
from nltk.corpus import inaugural
import matplotlib.pyplot as plt
import operator

x = inaugural.words('2009-Obama.txt')
l = {}
new = []
k = {}
z = set(x)
for word in z:
    l[word] = x.count(word)
#print(l)

from nltk.stem import PorterStemmer
ps = PorterStemmer()
for words in x:
    new.append(ps.stem(words))

p = set(new)
for w in p:
    k[w] = new.count(w)

plt.plot(list(k.values()))
#plt.xlabel(k.keys())

k_sorted = sorted(k.items(), key=operator.itemgetter(1), reverse=True)
for word, count in k.items():
    if count == max(k.values()):
        print(word)
print(k_sorted[0])
import nltk

def chi_square(word_one, word_two, corpus):
    word_list = []
    # Import the requested corpus
    if corpus == "brown":
        from nltk.corpus import brown
        word_list = brown.words()
    elif corpus == "reuters":
        from nltk.corpus import reuters
        word_list = reuters.words()
    elif corpus == "gutenberg":
        from nltk.corpus import gutenberg
        word_list = gutenberg.words()
    elif corpus == "webtext":
        from nltk.corpus import webtext
        word_list = webtext.words()
    elif corpus == "inaugural":
        from nltk.corpus import inaugural
        word_list = inaugural.words()

    # Get the frequency of each word
    w1 = word_list.count(word_one)
    w2 = word_list.count(word_two)

    # Get the frequencies of the word pair as a bigram
    bigrams = nltk.bigrams(word_list)
    freq_dist = nltk.FreqDist(bigrams)
    w1w2 = 0
    w1andnotw2 = 0
    notw1andw2 = 0
    notw1andnotw2 = 0
    total_words = len(word_list)
    # Fill the 2x2 contingency table from the bigram counts
    for (first, second), count in freq_dist.items():
        if first == word_one and second == word_two:
            w1w2 += count
        elif first == word_one:
            w1andnotw2 += count
        elif second == word_two:
            notw1andw2 += count
        else:
            notw1andnotw2 += count
    totalw1andw2 = w1w2 + w1andnotw2 + notw1andw2 + notw1andnotw2
    first_row = w1w2 + w1andnotw2
    second_row = notw1andw2 + notw1andnotw2
    first_col = w1w2 + notw1andw2
    second_col = w1andnotw2 + notw1andnotw2

    # Calculate the chi-square value
    # Null hypothesis: there is no collocation between the two words (no association)
    # Expected value for each cell
    value_one = (first_row * first_col) / totalw1andw2
    value_two = (second_row * first_col) / totalw1andw2
    value_three = (first_row * second_col) / totalw1andw2
    value_four = (second_row * second_col) / totalw1andw2
    x2 = ((w1w2 - value_one) ** 2) / value_one
    x2 = x2 + ((w1andnotw2 - value_three) ** 2) / value_three
    x2 = x2 + ((notw1andnotw2 - value_four) ** 2) / value_four
    x2 = x2 + ((notw1andw2 - value_two) ** 2) / value_two

    # Print out the results
    print("C(w1): ", w1)
    print("C(w2): ", w2)
    print("C(w1w2): ", w1w2)
    print("C(w1 && !w2)", w1andnotw2)
    print("C(!w1 && w2)", notw1andw2)
    print("C(!w1 && !w2)", notw1andnotw2)
    print("Total Words: ", total_words)
    print("")
    print("Critical value at the 0.05 level (1 degree of freedom): 3.841")
    print("X^2:", x2)
    degrees_of_freedom = 1
    # Reject the null hypothesis (i.e. call it a collocation) when X^2 exceeds the critical value
    collocation = x2 > 3.841
    print("Do we have a collocation?", collocation)
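# Usage sketch for the function above; the word pair and corpus name are only illustrative, and
# counting bigrams over a full corpus this way is slow:
chi_square("United", "States", "inaugural")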
# Each corpus is accessed by means of a "corpus reader" object from nltk.corpus
print(str(nltk.corpus.brown).replace('\\\\', '/'))
# The Penn Treebank Corpus:
print(str(nltk.corpus.treebank).replace('\\\\', '/'))
# The Name Genders Corpus:
print(str(nltk.corpus.names).replace('\\\\', '/'))
# The Inaugural Address Corpus:
print(str(nltk.corpus.inaugural).replace('\\\\', '/'))

print(str(nltk.corpus.treebank.fileids()))  # doctest: +ELLIPSIS
#print(str(nltk.corpus.inaugural.fileids()))  # doctest: +ELLIPSIS

# Each corpus reader provides a variety of methods to read data from the corpus,
# depending on the format of the corpus.
from nltk.corpus import inaugural
print(inaugural.raw('1789-Washington.txt'))    # doctest: +ELLIPSIS
print(inaugural.words('1789-Washington.txt'))
print(inaugural.sents('1789-Washington.txt'))  # doctest: +ELLIPSIS
print(inaugural.paras('1789-Washington.txt'))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE

l1 = len(inaugural.words('1789-Washington.txt'))
l2 = len(inaugural.words('1793-Washington.txt'))
l3 = len(inaugural.words(['1789-Washington.txt', '1793-Washington.txt']))
print('%s+%s == %s' % (l1, l2, l3))
print(len(inaugural.words()))
print(inaugural.readme())
# In[13]:
from nltk.corpus import inaugural

# In[15]:
inaugural.fileids()

# In[16]:
len(inaugural.fileids())

# In[19]:
inaugural.words(fileids='1861-Lincoln.txt')[:20]

# In[20]:
print(len(inaugural.words(fileids='1861-Lincoln.txt')))

# In[24]:
inaugural.words(fileids='2009-Obama.txt')[:5]

# In[22]:
print(len(inaugural.words(fileids='2009-Obama.txt')))

# In[26]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# import nltk
# nltk.download('inaugural')
import os
from nltk.corpus import inaugural

corpus_from_paragraphs = inaugural.paras(os.path.dirname(__file__) + '/dataset/paragraphs.txt')
corpus_from_sentences = inaugural.sents(os.path.dirname(__file__) + '/dataset/sentences.txt')
corpus_from_words = inaugural.words(os.path.dirname(__file__) + '/dataset/words.txt')

l1 = len(corpus_from_paragraphs)
l2 = len(corpus_from_sentences)
l3 = len(corpus_from_words)
# l2 = 0
# l3 = 0
print('paragraphs: %s, sentences: %s, words: %s' % (l1, l2, l3))
# print(inaugural.readme())
# ## INAUGURAL CORPUS

# In[10]:
from nltk.corpus import inaugural

# In[11]:
inaugural.fileids()

# ### LINCOLN

# In[15]:
inaugural.words(fileids='1861-Lincoln.txt')

# In[16]:
inaugural.words(fileids='1861-Lincoln.txt')[:5]

# ### OBAMA

# In[18]:
inaugural.words(fileids='2009-Obama.txt')

# In[21]:
inaugural.words(fileids='2009-Obama.txt')[:20]
import nltk
from nltk.corpus import reuters

reuters.fileids()
reuters.categories(['training/9865', 'training/8666'])
reuters.fileids(['barley', 'corn'])
reuters.words('training/9865')[:14]
reuters.words(categories=['corn', 'barley'])

from nltk.corpus import inaugural
inaugural.fileids()
inaugYears = [fileid[:4] for fileid in inaugural.fileids()]
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
cfd.plot()

from nltk.corpus import udhr
languages = ['English', 'Finnish_Suomi', 'Italian_Italiano', 'Greenlandic_Inuktikut']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1'))
cfd.plot(cumulative=False, title='Declaration of Human Rights')

from nltk.corpus import PlaintextCorpusReader
# You must work in your Dropbox folder so we can see your progress.
# Run your file every time something new is added so you can see how it works.
# There is a compulsory exercise for Task 1 that needs to be completed at the bottom of your corpuses.py.

# === Part 1: Importing Corpuses ===
import nltk
from nltk.corpus import inaugural

print inaugural.fileids()
# Run your file. You should see all the text files containing the speeches of the US presidents
# that NLTK has saved inside it.

# Now add the lines:
print "=============Words in Obama's Speech ======"
print inaugural.words('2009-Obama.txt')  # Returns a list of all the words in Obama's speech
print "=============Words in Bush's speech ======"
print inaugural.sents('2005-Bush.txt')   # Returns a list of all the sentences in Bush's speech
# As you can see, the words of Obama's speech are printed in a list, as are the sentences of Bush's speech.
# Try adding code to your program to find and print out the first 25 words of Obama's 2009 speech.

# === Part 2: Analysing tokens (words) of a text ===
# The term 'token' means a word or a punctuation mark.
# After you've done that, add the following lines to your program:
from nltk.book import *
def cfd(text, tgt_list):
    from nltk.corpus import inaugural
    cfd = nltk.ConditionalFreqDist(
        (target, fileid[:4])
        for fileid in inaugural.fileids()
        for w in inaugural.words(fileid)
        for target in tgt_list
        if w.lower().startswith(target))
    #cfd.plot()
    return cfd
#############
#Corpus data#
#############

# Inaugural Address Corpus
from nltk.corpus import inaugural
inaugural.fileids()[:2]
[fileid[:4] for fileid in inaugural.fileids()]

# How the words "america" and "war" are used over time.
cfd = nltk.ConditionalFreqDist((target, fileid[:4])
                               for fileid in inaugural.fileids()
                               for w in inaugural.words(fileid)
                               for target in ['america', 'war']
                               if w.lower().startswith(target))
cfd.plot()
#cfd.tabulate()

from nltk.corpus import brown
news_words = brown.words(categories="news")
print(news_words)
freq = nltk.FreqDist(news_words)
freq.plot(30)

from nltk import FreqDist
verbs = ["should", "may", "can"]
genres = ["news", "government", "romance"]
for g in genres:
import nltk

text = '''Donald John Trump (born June 14, 1946) is the 45th and current President of the United States.
Before entering politics, he was a businessman and television personality. '''
grammer = 'Chunk:{<PRP><VB.+><DT>?<NN.?>}'
pos = nltk.pos_tag(nltk.word_tokenize(text))
parser = nltk.RegexpParser(grammer)
chunked = parser.parse(pos)  # input for parse is a list of tuples
for i in chunked.subtrees():
    if i.label() == "Chunk":
        print(i.leaves())

from nltk.corpus import inaugural
text = inaugural.words()
pos = nltk.pos_tag(text)
# JJ + NN, but the NN will be 'people'
# Important: always allow DT in this kind of structure
jjlist = []
parser = nltk.RegexpParser('chunk:{<JJ.?>+<NN.*>}')
chunk = parser.parse(pos)
for i in chunk.subtrees():
    if i.label() == "chunk" and i.leaves()[-1][0] == 'people':
        # i.leaves() = [('american', 'JJ'), ('people', 'NN')]
        # i.leaves()[-1] = ('people', 'NN')
        jj = [x[0] for x in i.leaves()[:-1]]
        jjlist += jj
frequency = nltk.FreqDist(jjlist)
print("*** Introductory Examples for the NLTK Book ***") print("Loading text1, ..., text9 and sent1, ..., sent9") print("Type the name of the text or sentence to view it.") print("Type: 'texts()' or 'sents()' to list the materials.") text1 = Text(gutenberg.words("melville-moby_dick.txt")) print("text1:", text1.name) text2 = Text(gutenberg.words("austen-sense.txt")) print("text2:", text2.name) text3 = Text(genesis.words("english-kjv.txt"), name="The Book of Genesis") print("text3:", text3.name) text4 = Text(inaugural.words(), name="Inaugural Address Corpus") print("text4:", text4.name) text5 = Text(nps_chat.words(), name="Chat Corpus") print("text5:", text5.name) text6 = Text(webtext.words("grail.txt"), name="Monty Python and the Holy Grail") print("text6:", text6.name) text7 = Text(treebank.words(), name="Wall Street Journal") print("text7:", text7.name) text8 = Text(webtext.words("singles.txt"), name="Personals Corpus") print("text8:", text8.name) text9 = Text(gutenberg.words("chesterton-thursday.txt"))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 27 08:19:21 2018

@author: jacobjohn
"""
import nltk
from nltk.corpus import inaugural
import matplotlib.pyplot

cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])  # first four characters of the fileid are the year
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
cfd.plot()
def answers():
    _rvals = []

    #### Question 1 ####
    print '##### Question 1 #####'
    print '(see code - lines 64-65)'
    print '(NB: the two variables are returned by this function)'
    _bush01 = inaugural.words('2001-Bush.txt')
    bush01_word_lengths = _lengths(_vocabulary(_bush01))
    fd_bush01_words = FreqDist(_nopunct(_bush01))
    _rvals.append(bush01_word_lengths)
    _rvals.append(fd_bush01_words)

    #### Question 2 ####
    print '\n##### Question 2 #####'
    bush01_top10_words = _firsts(fd_bush01_words.items()[:10])
    bush01_average_word_lengths = _avg(bush01_word_lengths)
    _obama09 = inaugural.words('2009-Obama.txt')
    _fd_obama09_words = FreqDist(_nopunct(_obama09))
    _obama09_word_lengths = _lengths(_vocabulary(_obama09))
    obama09_top10_words = _firsts(_fd_obama09_words.items()[:10])
    obama09_average_word_lengths = _avg(_obama09_word_lengths)
    print 'top10 words Bush (2001): ', _str(bush01_top10_words)
    print 'top10 words Obama (2009):', _str(obama09_top10_words)
    print 'average word length Bush (2001): ', bush01_average_word_lengths
    print 'average word length Obama (2009):', obama09_average_word_lengths

    #### Question 3 ####
    print '\n##### Question 3 #####'
    bush01_token_lengths = _avg(_lengths(_nopunct(_bush01)))
    obama09_token_lengths = _avg(_lengths(_nopunct(_obama09)))
    print 'average token length Bush (2001): ', bush01_token_lengths
    print 'average token length Obama (2009):', obama09_token_lengths

    #### Question 4 ####
    print '\n##### Question 4 #####'
    for _fileid in inaugural.fileids():
        _year = int(_fileid.split('-')[0])
        _vocab_size = number_of_word_types(_fileid)
        print 'year %d: %d word types' % (_year, _vocab_size)

    #### Question 5 ####
    print '\n##### Question 5 #####'
    fd_bush01_nostop = FreqDist(_nostops(_nopunct(_bush01)))
    fd_obama09_nostop = FreqDist(_nostops(_nopunct(_obama09)))
    bush01_top10_nostop = _firsts(fd_bush01_nostop.items()[:10])
    obama09_top10_nostop = _firsts(fd_obama09_nostop.items()[:10])
    print 'top10 non-stop-words Bush (2001): ', _str(bush01_top10_nostop)
    print 'top10 non-stop-words Obama (2009):', _str(obama09_top10_nostop)

    #### Question 6 ####
    print '\n##### Question 6 #####'
    _wash89 = inaugural.words('1789-Washington.txt')
    fd_wash89_nostop = FreqDist(_nostops(_nopunct(_wash89)))
    wash89_top10_nostop = _firsts(fd_wash89_nostop.items()[:10])
    print 'top10 non-stop-words Washington (1789):', _str(wash89_top10_nostop)

    #### Question 7 ####
    print '\n##### Question 7 #####'
    wash89_rank_country = rank(fd_wash89_nostop, 'country')
    obama09_rank_country = rank(fd_obama09_nostop, 'country')
    bush01_rank_country = rank(fd_bush01_nostop, 'country')
    print 'rank of "country" in Washington (1789):', wash89_rank_country
    print 'rank of "country" in Obama (2009):', obama09_rank_country
    print 'rank of "country" in Bush (2001):', bush01_rank_country

    #### Question 8 ####
    print '\n##### Question 8 #####'
    print '(see comments in "rank" function on lines 20-45)'

    #### Question 9 ####
    print '\n##### Question 9 #####'
    print '(see plot)'
    ff = inaugural.fileids()
    fdd = {}
    _years = []
    for _fileid in ff:
        fdd[_fileid] = FreqDist(_nostops(inaugural.words(_fileid)))
        _years.append(_fileid[0:4])
    pylab.plot([(lambda d: len(d) / float(d.N()))(fdd[f]) for f in ff])
    pylab.xticks(range(len(ff)), _years, rotation=90)
    pylab.xlim(0, len(ff) - 1)
    pylab.ylabel('ratio of word types to tokens (without stop-words)')
    pylab.xlabel('time')
    pylab.title('f(time) = #(word types) / #(word tokens)')
    pylab.show()

    #### Question 10 ####
    print '\n##### Question 10 #####'
    print '(see plot)'
    obama09top10_butnot_wash89top10 = [word for word in obama09_top10_nostop
                                       if word in fd_wash89_nostop and word not in wash89_top10_nostop]
    wash89top10_butnot_obama09top10 = [word for word in wash89_top10_nostop
                                       if word in fd_obama09_nostop and word not in obama09_top10_nostop]
    obama09_word = 'world'
    wash89_word = 'government'
    assert(wash89_word in wash89top10_butnot_obama09top10)
    assert(obama09_word in obama09top10_butnot_wash89top10)
    normalisation_justification = (
        "We normalise for different sizes in vocabulary by dividing the rank of "
        "some word by the size of the vocabulary in that speech. "
        "Since rank is in relation with vocabulary size, this is similar to "
        "getting the maximum rank over all speeches and dividing each rank by that "
        "quantity.")
    print normalisation_justification
    _normalised_rank = lambda f, w: min(1, rank(fdd[f], w) /
                                        float(len(_vocabulary(_nostops(fdd[f])))))
    pylab.plot([_normalised_rank(f, obama09_word) for f in ff], label=obama09_word, color='b')
    pylab.plot([_normalised_rank(f, wash89_word) for f in ff], label=wash89_word, color='r')
    pylab.xticks(range(len(ff)), _years, rotation=90)
    pylab.xlim(0, len(ff) - 1)
    pylab.ylabel('normalised word rank (lower is better)')
    pylab.xlabel('time')
    pylab.title('f(time) = word rank / vocabulary size')
    pylab.legend()
    pylab.show()

    #### Question 11 ####
    print '\n##### Question 11 #####'
    observations_on_plots = (
        "We observe that the rank of 'world' is noisy when observed on the level "
        "of some individual year/inaugural speech. However, when looking at the "
        "larger picture, a trend emerges: 'world''s rank is consistently getting "
        "higher over time - an indicator for an ever-globalising and shrinking "
        "world?"
        "\n"
        "We observe that 'government' is a consistently highly ranked word across "
        "time - except for a few inaugural speeches where it has a very low "
        "rank. Those speeches are around the early 1800s (abolishment of slavery), "
        "the 1860s-70s (US civil war), the early 1900s (World War One), and 1937-1981 "
        "(World War Two + Cold War) - it would seem that presidents don't want to "
        "remind their subjects of the government during hard times. Outliers to "
        "this theory can be explained easily (e.g. the somewhat high rank of "
        "'government' in 1949 = a certain 'evil government' being defeated).")
    print observations_on_plots
    return _rvals
for word1, word2 in product(word_list, word_list):
    count = 0
    n_grams = ngrams(brown.words(), n)
    for grams in n_grams:
        if word1 in grams and word2 in grams:
            count += 1
    n_grams = ngrams(treebank.words(), n)
    for grams in n_grams:
        if word1 in grams and word2 in grams:
            count += 1
    n_grams = ngrams(inaugural.words(), n)
    for grams in n_grams:
        if word1 in grams and word2 in grams:
            count += 1
    n_grams = ngrams(names.words(), n)
    for grams in n_grams:
        if word1 in grams and word2 in grams:
            count += 1
    n_grams = ngrams(gutenberg.words(), n)
    for grams in n_grams:
        if word1 in grams and word2 in grams:
from nltk.corpus import inaugural as inag
from nltk import ConditionalFreqDist as CondFreqDist

cfd = CondFreqDist([(target, fileid[:4])
                    for fileid in inag.fileids()
                    for word in inag.words(fileid)
                    for target in ["wealth", "peace", "harmony", "prosperous"]
                    if word.lower().startswith(target)])
cfd.plot()
print("*** Introductory Examples for the NLTK Book ***") print("Loading text1, ..., text9 and sent1, ..., sent9") print("Type the name of the text or sentence to view it.") print("Type: 'texts()' or 'sents()' to list the materials.") text1 = Text(gutenberg.words('melville-moby_dick.txt')) print("text1:", text1.name) text2 = Text(gutenberg.words('austen-sense.txt')) print("text2:", text2.name) text3 = Text([str(w) for w in genesis.words('english-kjv.txt')], name="The Book of Genesis") print("text3:", text3.name) text4 = Text(inaugural.words(), name="Inaugural Address Corpus") print("text4:", text4.name) text5 = Text(nps_chat.words(), name="Chat Corpus") print("text5:", text5.name) text6 = Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail") print("text6:", text6.name) text7 = Text(treebank.words(), name="Wall Street Journal") print("text7:", text7.name) text8 = Text(webtext.words('singles.txt'), name="Personals Corpus") print("text8:", text8.name) text9 = Text(gutenberg.words('chesterton-thursday.txt'))
__author__ = 'auroua'
from nltk.corpus import inaugural
from nltk.corpus import stopwords
import numpy as np
import matplotlib.pyplot as plt
from lda_1 import LDA
import seaborn as sns

stops = set(stopwords.words("english"))
vocab = dict()
for fileid in inaugural.fileids():
    for word in inaugural.words(fileid):
        word = word.lower()
        if word not in stops and word.isalpha():
            if word not in vocab:
                vocab[word] = 0
            vocab[word] += 1

"""
Sort the vocab
keep only words which occur more than 50 times
Then Create word to id and id to word dictionaries
"""
vocab_sorted = filter(lambda x: x[1] > 50,
                      sorted(vocab.items(), key=lambda x: x[1], reverse=True))
wordids = {v[0]: i for i, v in enumerate(vocab_sorted)}
idwords = {i: v[0] for i, v in enumerate(vocab_sorted)}
vocab_size = len(wordids)
print vocab_size

# Generate corpus document vectors
data = []
# from nltk.corpus import gutenberg
#
# print(gutenberg.fileids())
# allwords = gutenberg.words('shakespeare-hamlet.txt')
# print(len(allwords))
# print(len(set(allwords)))
# print(allwords.count('Hamlet'))
# A = set(allwords)
# longwords = [w for w in A if len(w) > 12]
# print(sorted(longwords))
#
# from nltk.probability import *
# fd2 = FreqDist([sx.lower() for sx in allwords if sx.isalpha()])
# print(fd2.B())
# print(fd2.N())
#
# fd2.tabulate(20)
# fd2.plot(20)
# fd2.plot(20, cumulative=True)

from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.corpus import inaugural

fd3 = FreqDist([s for s in inaugural.words()])
print(fd3.freq('freedom'))

cfd = ConditionalFreqDist(
    (fileid, len(w))
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    if fileid > '1980' and fileid < '2010'
)
print(cfd.items())
cfd.plot()
webtext_words = webtext.words()
print(webtext_words)

# Pick out the text from the nps_chat corpus and name it nps_chat_raw
nps_chat_raw = nps_chat.raw()

# Pick out the text from the brown corpus and name it brown_raw
brown_raw = brown.raw()
print(brown_raw)

# Pick out the text from the reuters corpus and name it reuters_words
reuters_words = reuters.words()
print(reuters_words)

# Pick out the text from the inaugural corpus and name it inaugral_words
inaugral_words = inaugural.words()
print(inaugral_words)

# Creating a variable for tokenizing words
tokenizer = RegexpTokenizer(r'\w+')

# Tokenizing the words in the gutenberg corpus and assigning them to a variable named tokens
tokens = tokenizer.tokenize(gutenberg_raw)

# Assigning the stopwords to a variable s
s = set(stopwords.words('english'))

# Removing the stopwords from the gutenberg text
gutenberg_filtered = filter(lambda w: not w in s, tokens)
import nltk
from nltk.corpus import reuters

reuters.fileids()
reuters.categories(['training/9865', 'training/8666'])
reuters.fileids(['barley', 'corn'])
reuters.words('training/9865')[:14]
reuters.words(categories=['corn', 'barley'])

from nltk.corpus import inaugural
inaugural.fileids()
inaugYears = [fileid[:4] for fileid in inaugural.fileids()]
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
cfd.plot()

from nltk.corpus import udhr
languages = ['English', 'Finnish_Suomi', 'Italian_Italiano', 'Greenlandic_Inuktikut']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1'))
cfd.plot(cumulative=False, title='Declaration of Human Rights')

from nltk.corpus import PlaintextCorpusReader
corpusRoot = '/home/mv/Dropbox/Computer/UbuntuInstall'
wordlists = PlaintextCorpusReader(corpusRoot, '.*')
Created on Sun Dec 24 11:00:43 2017

@author: Mohnish_Devadiga
"""
import nltk
from nltk.corpus import inaugural
import pandas as pd
import matplotlib

inaugural.fileids()
#print(inaugural.fileids())

for speech in inaugural.fileids():
    word_count_total = len(inaugural.words(speech))
    print(speech, word_count_total)

# Go through all speeches
speech_length = [(len(inaugural.words(speech)), speech) for speech in inaugural.fileids()]
print(speech_length)

# Get the max and min speech
print("Max is : ", max(speech_length))
print("Min is : ", min(speech_length))

# Avg no of words per sentence for each speech
for speech in inaugural.fileids():
    word_total = len(inaugural.words(speech))
    Sents_total = len(inaugural.sents(speech))
# importing library
from nltk.corpus import inaugural

# In[6]:
inaugural.fileids()

# In[7]:
# printing the inaugural words for one text
for i in inaugural.words('1933-Roosevelt.txt'):
    print(i, end=" ")

# In[8]:
'''
College is so hectic, I'm tired
'''

# In[9]:
# importing library
from nltk.corpus
def text4():
    text = Text(inaugural.words(), name="Inaugural Address Corpus")
    print("text4:", text.name)
    return text
from itertools import groupby
from nltk import pos_tag
from nltk.chunk import ne_chunk
from nltk.corpus import inaugural
from nltk.tag import StanfordNERTagger
from nltk.tree import Tree

# Uncomment to check the required StanfordNERTagger environment variables.
# print os.environ.get("CLASSPATH")
# print os.environ.get("STANFORD_MODELS")

# Read the corpus and POS tag it.
POS_tagging = pos_tag(inaugural.words())

# Process the corpus with the NLTK named entity classifier.
ne_nltk = ne_chunk(POS_tagging)

# Filter out in a list only the organization entities. Join by space words that are part of
# the same organization entity (same Tree object).
nltk_organizations = [
    " ".join(w[0] for w in el)
    for el in ne_nltk
    if (type(el) == Tree and el.label() == "ORGANIZATION")
]
# Remove duplicates.
nltk_organizations = set(nltk_organizations)

# Filter out in a list only the person entities. Join by space words that are part of
# the same person entity (same Tree object).
nltk_persons = [
    " ".join(w[0] for w in el)
    for el in ne_nltk
    if (type(el) == Tree and el.label() == "PERSON")
cfd.plot(cumulative=True)
cfd.tabulate(conditions=['English', 'German_Deutsch'], samples=range(10), cumulative=True)

# Conditional frequency distributions
genre_word = [(genre, word)
              for genre in ['news', 'romance']
              for word in brown.words(categories=genre)]
cfd = nltk.ConditionalFreqDist(genre_word)
cfd.conditions()
list(cfd['romance'])
cfd['romance']['could']

from nltk.corpus import inaugural
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))

# Generating random text
def generate_model(cfdist, word, num=15):
    for i in range(num):
        print word,
        word = cfdist[word].max()

text = nltk.corpus.genesis.words('english-kjv.txt')
bigrams = nltk.bigrams(text)
cfd = nltk.ConditionalFreqDist(bigrams)
print cfd['living']
generate_model(cfd, 'living')

def unusual_words(text):
genre_word = [(genre, word)
              for genre in ['news', 'romance']
              for word in brown.words(categories=genre)]
print(len(genre_word))  # 170576 (genre, word) pairs
print(genre_word[:4])   # [('news', 'The'), ('news', 'Fulton'), ('news', 'County'), ('news', 'Grand')]  # [_start-genre]
print(genre_word[-4:])  # [('romance', 'afraid'), ('romance', 'not'), ('romance', "''"), ('romance', '.')]  # [_end-genre]
cfd = ConditionalFreqDist(genre_word)
print(cfd)               # <ConditionalFreqDist with 2 conditions>
print(cfd.conditions())  # ['news', 'romance']  # [_conditions-cfd]
print(cfd['news'])       # <FreqDist with 14394 samples and 100554 outcomes>
print(cfd['romance'])    # <FreqDist with 8452 samples and 70022 outcomes>
print(cfd['romance'].most_common(2))  # [(',', 3899), ('.', 3736)]
print(cfd['romance']['could'])        # 193
print(cfd['romance'].max())           # the most frequent sample in the romance genre
print(cfd['romance'][','])            # 3899

##################################################################
## plot(): how the words America and citizen are used over time in the US presidential inaugural addresses
cfd = ConditionalFreqDist((target, fileid[:4])
                          for fileid in inaugural.fileids()
                          for word in inaugural.words(fileid)
                          for target in ['america', 'citizen']
                          if word.lower().startswith(target))
cfd.plot()  # plot the counts of America and citizen across the speeches

##################################################################
## tabulate(): extracting word pairs
# Next, let's combine regular expressions with conditional frequency distributions.
# Here we will extract all consonant-vowel sequences from the words of Rotokas, such as ka and si.
# Since each of these is a pair, it can be used to initialize a conditional frequency distribution.
# We then tabulate the frequency of each pair:
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
cvs = [cv for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]', w)]
print(cvs[:10])  # ['ka', 'ka', 'ka', 'ka', 'ka', 'ro', 'ka', 'ka', 'vi', 'ko']
cfd = ConditionalFreqDist(cvs)
cfd.tabulate()
#    a   e   i   o   u
# k 418 148  94 420 173
# p  83  31 105  34  51
# r 187  63  84  89  79
def process_speech(filename):
    text = inaugural.words(filename)
    text = remove_punctuation(text)
    text = remove_stopwords(text)
    text = clean(text)
    return text
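# remove_punctuation, remove_stopwords, and clean are defined elsewhere in the original program.
# Minimal, hypothetical sketches consistent with the pipeline above (these are assumptions, not
# the original implementations):
import string
from nltk.corpus import stopwords

def remove_punctuation(tokens):
    # Hypothetical helper: drop tokens that are only punctuation.
    return [t for t in tokens if t not in string.punctuation]

def remove_stopwords(tokens):
    # Hypothetical helper: drop English stop words, case-insensitively.
    stops = set(stopwords.words('english'))
    return [t for t in tokens if t.lower() not in stops]

def clean(tokens):
    # Hypothetical helper: lower-case and keep alphabetic tokens only.
    return [t.lower() for t in tokens if t.isalpha()]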
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 20 16:45:50 2019

@author: Ritwik Gupta
"""
# 20/12/19
from nltk.corpus import brown
brown.categories()
print(brown.words(categories='hobbies')[0:5])

from nltk.corpus import inaugural
inaugural.fileids()
inaugural.words(fileids='1933-Roosevelt.txt')[0:10]

from nltk.corpus import webtext
d1 = {}
for i in webtext.fileids():
    d1[i] = webtext.words(fileids=i)[:20]

# Downloaded the MASC data
import nltk
with open('tweets1.txt', 'r') as f:
    text = f.read().strip()
text1 = text.split()
text2 = nltk.Text(text1)
text2.concordance("good", 1)

# Project Gutenberg