def answers():
    ### Question 1
    print "*** Question 1 ***"
    print "Top 50 tokens for the inaugural corpus:"
    answer1a = q1(inaugural, inaugural.fileids(), 50)
    print answer1a
    print "Top 50 tokens for the twitter corpus:"
    answer1b = q1(xtwc, twitter_file_ids, 50)
    print answer1b
    ### Question 2
    print "*** Question 2 ***"
    corpus_tokens = get_corpus_tokens(inaugural, inaugural.fileids())
    answer2a = clean_tokens(corpus_tokens)
    print "Inaugural Speeches:"
    print "Number of tokens in original corpus: " + str(len(corpus_tokens))
    print "Number of tokens in cleaned corpus: " + str(len(answer2a))
    print "First 100 tokens in cleaned corpus:"
    print answer2a[:100]
    print "-----"
    corpus_tokens = get_corpus_tokens(xtwc, twitter_file_ids)
    answer2b = clean_tokens(corpus_tokens)
    print "Twitter:"
    print "Number of tokens in original corpus: " + str(len(corpus_tokens))
    print "Number of tokens in cleaned corpus: " + str(len(answer2b))
    print "First 100 tokens in cleaned corpus:"
    print answer2b[:100]
    print "Top 50 tokens for the cleaned inaugural corpus:"
    answer2c = q2(answer2a, 50)
    print answer2c
    print "Top 50 tokens for the cleaned twitter corpus:"
    answer2d = q2(answer2b, 50)
    print answer2d
    ### Question 3
    print "*** Question 3 ***"
    answer3 = q3()
    print answer3[:280]
    ### Question 4
    print "*** Question 4: building brown bigram letter model ***"
    brown_bigram_model = q4(brown)
    ### Question 5
    print "*** Question 5 ***"
    answer5 = q5("20100128.txt", brown_bigram_model)
    print "Top 10 entropies:"
    print answer5[:10]
    print "Bottom 10 entropies:"
    print answer5[-10:]
    ### Question 6
    print "*** Question 6 ***"
    answer6 = q6(answer5)
    print "Mean: " + str(answer6[0])
    print "Standard Deviation: " + str(answer6[1])
    print "ASCII tweets: Top 10 entropies:"
    print answer6[2][:10]
    print "ASCII tweets: Bottom 10 entropies:"
    print answer6[2][-10:]
    print "Probably not English tweets: Top 10 entropies:"
    print answer6[3][:10]
    print "Probably not English tweets: Bottom 10 entropies:"
    print answer6[3][-10:]
def answers(): ### Question 1 print "*** Question 1 ***" answer1a = q1(inaugural, inaugural.fileids()) print "Average token length for inagural corpus: " + str(answer1a) ''' For some reason it doesn't want to print anything for 1b, therefore I commented it out, it will print anything else answer1b = q1(xtwc,twitter_file_ids) print "Average token length for twitter corpus: " + str(answer1b) ''' ### Question 2 print "*** Question 2 ***" answer2 = q2() print answer2 ### Question 3 print "*** Question 3 ***" print "Top 50 tokens for the inagural corpus:" answer3a = q3(inaugural, inaugural.fileids(), 50) print answer3a print "Top 50 tokens for the twitter corpus:" answer3b = q3(xtwc, twitter_file_ids, 50) print answer3b ### Question 4 print "*** Question 4 ***" corpus_tokens = get_corpus_tokens(inaugural, inaugural.fileids()) answer4a = q4(corpus_tokens) print "Inaugural Speeches:" print "Number of tokens in original corpus: " + str(len(corpus_tokens)) print "Number of tokens in cleaned corpus: " + str(len(answer4a)) print "First 100 tokens in cleaned corpus:" print answer4a[:100] print "-----" corpus_tokens = get_corpus_tokens(xtwc, twitter_file_ids) answer4b = q4(corpus_tokens) print "Twitter:" print "Number of tokens in original corpus: " + str(len(corpus_tokens)) print "Number of tokens in cleaned corpus: " + str(len(answer4b)) print "First 100 tokens in cleaned corpus:" print answer4b[:100] ### Question 5 print "*** Question 5 ***" print "Top 50 tokens for the cleaned inagural corpus:" answer5a = q5(answer4a, 50) print answer5a print "Top 50 tokens for the cleaned twitter corpus:" answer5b = q5(answer4b, 50) print answer5b ### Question 6 print "*** Question 6 ***" answer6 = q6() print answer6 ### Question 7 print "*** Question 7: building brown bigram letter model ***" brown_bigram_model = q7(brown) '''
def fun11():
    """inaugural address corpus"""
    print inaugural.fileids()
    print [fileid[:4] for fileid in inaugural.fileids()]
    cfd = nltk.ConditionalFreqDist((target, fileid[:4])
                                   for fileid in inaugural.fileids()
                                   for w in inaugural.words(fileid)
                                   for target in ['america', 'citizen']
                                   if w.lower().startswith(target))
    cfd.plot()
def fun8():
    from nltk.corpus import inaugural
    print inaugural.fileids()
    print [w[:4] for w in inaugural.fileids()]
    cfd = nltk.ConditionalFreqDist((target, fileid[:4])
                                   for fileid in inaugural.fileids()
                                   for w in inaugural.words(fileid)
                                   for target in ['america', 'citizen']
                                   if w.lower().startswith(target))
    cfd.plot()  # plot the conditional frequency distribution
def inaugural_demo():
    # NB: naming this function "inaugural" would rebind the imported corpus
    # module and make inaugural.fileids() fail, so it gets a distinct name.
    inaugural.fileids()
    [fileid[:4] for fileid in inaugural.fileids()]
    cfd = nltk.ConditionalFreqDist(
        (target, fileid[:4])
        for fileid in inaugural.fileids()
        for w in inaugural.words(fileid)
        for target in ['america', 'citizen']
        if w.lower().startswith(target))
    cfd.plot()
def exercise_inaugural():
    print(inaugural.fileids())
    # Extract the year from each address's file name
    print([file_id[:4] for file_id in inaugural.fileids()])
    # Track how often the words "america" and "citizen" appear in the addresses over the years
    cfd = nltk.ConditionalFreqDist((target, file_id[:4])
                                   for file_id in inaugural.fileids()
                                   for w in inaugural.words(file_id)
                                   for target in ['america', 'citizen']
                                   if w.lower().startswith(target))
    cfd.plot()
def compare(word, word2):
    cfd = nltk.ConditionalFreqDist((target, fileid[:4])
                                   for fileid in inaugural.fileids()
                                   for w in inaugural.words(fileid)
                                   for target in [word, word2]
                                   if w.lower().startswith(target))
    cfd.plot()
def build_word_count():
    if os.path.isfile('pickled/wcount.pickle'):
        return read_pickle('pickled/wcount.pickle')
    wcount = Counter()
    # The same counting logic applies to every corpus, so loop over them
    for corpus in (words, gutenberg, brown, reuters, inaugural):
        for fid in corpus.fileids():
            for word in corpus.words(fid):
                word = word.lower()
                if only_words.match(word) is not None:
                    wcount[word] += 1
    dump_pickle(wcount, 'pickled/wcount.pickle')
    return wcount
def cfd(text, tgt_list):
    # NB: the `text` parameter is unused; the distribution is always built
    # over the inaugural corpus.
    from nltk.corpus import inaugural
    cfd = nltk.ConditionalFreqDist(
        (target, fileid[:4])
        for fileid in inaugural.fileids()
        for w in inaugural.words(fileid)
        for target in tgt_list
        if w.lower().startswith(target))
    # cfd.plot()
    return cfd
def sent_length():
    text_file = str(input("Enter the name of a text file : \n"))
    txt_fl = inaugural.sents(text_file)
    print(len(txt_fl))
    file_name = inaugural.fileids()
    print(len(inaugural.sents(file_name)))
def main():
    # Part 1: load inaugural addresses, tokenize, and serialize to pickle file
    # =============================================
    # get file names from nltk library
    file_ids = inaugural.fileids()
    # list to hold all tokenized addresses
    tokenized_addresses = []
    # loop through all inaugural addresses
    for address in file_ids:
        # read the address into a string of newline-separated sentences
        string = read_address(address)
        # tokenize each address into a list of lowercase words
        words = tokenize(string)
        # add address title to beginning of address
        words.insert(0, address)
        # append the tokenized address to the master list
        tokenized_addresses.append(words)
    # serialize list of addresses to pickle file
    with open('proj3.pkl', 'wb') as fout:
        pickle.dump(tokenized_addresses, fout)
def main():
    s1 = pre(inaugural.raw('2009-Obama.txt'))
    sx = inaugural.fileids()
    for file in sx:
        s2 = pre(inaugural.raw(file))
        # inter = set(s1) & set(s2)
        similarity1 = similarity(s1, s2)
        print(similarity1, file)
def main():
    cfd = nltk.ConditionalFreqDist(
        (target, fileid[:4])  # was file[:4], which is undefined here
        for fileid in inaugural.fileids()
        for w in inaugural.words(fileid)
        for target in ['democracy', 'republic']
        if w.lower().startswith(target))
    cfd.plot()
def get_sentences():
    """Collect the sentences of the corpus and return them as one list.
    Call this function to obtain the corpus sentences."""
    articles = inaugural.fileids()
    sentences = []
    for i in articles:
        article = inaugural.sents(i)
        sentences = sentences + list(article)
    return sentences
def print_inaugural():
    from nltk.corpus import inaugural
    cfd = nltk.ConditionalFreqDist(
        (target, fileid[:4])  # was file[:4], which is undefined here
        for fileid in inaugural.fileids()
        for w in inaugural.words(fileid)
        for target in ['america', 'citizen']
        if w.lower().startswith(target)
    )
    cfd.plot()
def build_inaugural_corpus():
    """
    Get a word token list for each doc in the inaugural address corpus
    :return: word_lists
    """
    word_lists = []
    for fileid in inaugural.fileids():
        words = list(inaugural.words(fileid))
        word_lists.append(words)
    return word_lists
def get_inaugural_docs(download=False) -> List[List[List[str]]]:
    """
    Get the inaugural documents as a list (documents) of lists (sentences)
    of lists (sentence) of strings (words)
    :param download: If True, the corpus will be downloaded. Default=False
    :return:
    """
    if download:
        nltk.download('inaugural')
    return [[[w.lower() for w in sent] for sent in inaugural.sents(fileid)]
            for fileid in inaugural.fileids()]
def getFileids():
    index = 0
    for id in inaugural.fileids():
        index += 1
        if index == 2:
            print id                        # print the president name
            print len(inaugural.words(id))  # print the # of words
        if index == 14:
            print id                        # print the president name
            print len(inaugural.words(id))  # print the # of words
def main():
    # @BEGIN normalize_list
    # @IN inaugural @URI file:data/inaugural/{year}-{president}.txt
    # @OUT normalized_addresses
    file_ids = inaugural.fileids()
    print(file_ids)
    normalized_addresses = []
    for address in file_ids:
        normalized_words = [address.split("-")[0]]
        for sent in inaugural.sents(address):
            prev_word = ""
            for word in sent:
                if prev_word == "'":
                    continue
                normalized = re.sub("[^a-z0-9]", "", word.lower())
                if normalized != "":
                    normalized_words.append(normalized)
                prev_word = word
        normalized_addresses.append(normalized_words)
    # @END normalize_list

    # @BEGIN pickleize
    # @IN normalized_addresses
    # @OUT pkl @URI file:data/norm_addresses.pkl
    fout = open("norm_addresses.pkl", "wb")
    pickle.dump(normalized_addresses, fout)
    fout.close()
    # @END pickleize

    # deserialize pkl file
    # @BEGIN depickleize
    # @IN pkl @URI file:data/norm_addresses.pkl
    # @OUT address_word_list
    fin = open("norm_addresses.pkl", "rb")
    address_word_list = pickle.load(fin)
    fin.close()
    # @END depickleize

    # @BEGIN frequency
    # @IN address_word_list
    # @IN search_word
    # @OUT frequency_maps
    search_word = input("Input word to find frequency: ")
    frequency_maps = {}
    for word_list in address_word_list:
        frequency_maps[word_list[0]] = calculate_frequency_map(word_list[1:])
    # @END frequency

    generate_plot(search_word, frequency_maps)
def main():
    list_of_addresses = []
    for fileid in inaugural.fileids():
        list_of_words = inaugural.words(fileid)
        string_of_words = ' '.join(list_of_words)
        alphabetic_words = re.findall(r"\w+", string_of_words)
        list_of_addresses.append(alphabetic_words)
    # print(list_of_addresses)
    fout = open('proj3.pkl', 'wb')
    pickle.dump(list_of_addresses, fout)
    fout.close()
def inaug20():
    # Variables
    myinaug = inaugural.fileids()
    myaug20 = []
    # Function
    for x in range(len(myinaug) - 4):  # goes through all ids; -4 for Obama
        myaug20.append(myinaug[x:(x + 5)])  # create list from one president to five more
    # Return
    return myaug20
def tabulateWordsInPeriods(self, theWords):
    """
    Find the distribution of words within the years, based on the inaugural corpus
    @param theWords: the word/list of words to find info about
    """
    cdf = ConditionalFreqDist((textid[:4], target)
                              for textid in inaugural.fileids()
                              for word in inaugural.words(textid)
                              for target in theWords
                              if word.lower().startswith(target)
                              or word.lower().endswith(target))
    cdf.tabulate()
def Get_Corpus(debug):
    print("Get_Corpus")
    # Inaugural is a list of lists.
    # Each row is one of 56 inaugural addresses.
    # Each row is a sequential list of the words and punctuation marks
    # in the speech.
    Inaugural = []
    i = 0
    for fileid in inaugural.fileids():
        Inaugural.append(inaugural.words(fileid))
        # for fileid in genesis.fileids():
        #     Inaugural.append(genesis.words(fileid))
        print(i, fileid)
        debug.write("%d %s\n" % (i, fileid))
        i += 1
    Words = []
    for speech in Inaugural:
        words = list(set(speech))
        Words = list(set(words + Words))
    Frequency = []
    for word in Words:
        Frequency.append([0, 0, word])
    for speech in Inaugural:
        for word in speech:
            i = Words.index(word)
            Frequency[i][1] += 1
        S = list(set(speech))
        for word in S:
            i = Words.index(word)
            Frequency[i][0] += 1
    # Frequency = sorted(Frequency, key=lambda x: x[2], reverse=True)
    Frequency = sorted(Frequency, key=lambda x: x[1], reverse=True)
    Frequency = sorted(Frequency, key=lambda x: x[0], reverse=True)
    debug.write("\n\nSpeeches\n\n")
    for speech in Inaugural:
        for word in speech:
            debug.write("%s " % (word))
        debug.write("\n\n")
    debug.write("\n\n")
    debug.write("\n\nFrequency\n\n")
    for row in Frequency:
        debug.write("%d %d %s\n" % (row[0], row[1], row[2]))
    debug.write("\n\n")
    return (Inaugural, Frequency)
def lexDiv():
    y4 = []
    x4 = []
    for fileid in inaugural.fileids():
        # Compute the type/token ratio over the words of the address,
        # not over the characters of the file name (which the original did).
        words = inaugural.words(fileid)
        div = len(set(words)) / len(words)
        print(fileid[:4], "-", div)
        y4.append(fileid[:4])
        x4.append(div)
    plt.title('Vocabulary diversity')
    plt.xticks(rotation=90)
    plt.plot(y4, x4)
    plt.show()
def avgWord():
    x1 = []
    y1 = []
    for fileid in inaugural.fileids():
        words = inaugural.raw(fileids=fileid)
        words = words.split()
        average = sum(len(word) for word in words) / len(words)
        print(fileid[:4], "-", average)
        y1.append(fileid[:4])
        x1.append(average)
    plt.title('Average word length:')
    plt.xticks(rotation=90)
    plt.plot(y1, x1)
    plt.show()
def avgSent():
    x2 = []
    y2 = []
    for fileid in inaugural.fileids():
        sents = inaugural.sents(fileids=[fileid])
        average = sum(len(sent) for sent in sents) / len(sents)
        print(fileid[:4], "-", average)
        y2.append(fileid[:4])
        x2.append(average)
    plt.title('Average sentence length:')
    plt.xticks(rotation=90)
    plt.plot(y2, x2)
    plt.show()
def main():
    # nltk.download('reuters')
    nltk.download('inaugural')
    nltk.download('punkt')
    docinaug = inaugural.fileids()
    documents = reuters.fileids()
    print(str(len(documents)))
    print(reuters.raw("test/15556"))
    forwardDict, backwardsDict, probMatrix, probUniMatrix, totalProb = tokenize(reuters.raw("test/15556"))
    # print(documents[1])
    # print(docinaug[1])
    # forwardDict, backwardDict, probMtrx = tokenize("the man. the man. the man")
    sent_token = word_tokenize("hello my friend how are you")
    print("a")
    print(sentence_perplex(inaugural.raw(docinaug[1]), probMatrix, forwardDict, probUniMatrix))
def getGraphs():
    index = 0
    for id in inaugural.fileids():  # prob(-14)
        index += 1
        ww = inaugural.raw(id).lower()
        num_war = ww.count('war')
        num_america = ww.count('america')
        num_economy = ww.count('economy')
        num_world = ww.count('world')
        plot(index, num_war, 'mo')      # war
        plot(index, num_america, 'go')  # america (increasing)
        plot(index, num_economy, 'ro')  # economy
        plot(index, num_world, 'bo')    # world (increasing)
    xlabel('index; purple-war, green-america, red-economy, blue-world')
    ylabel('the frequency of the words used')
    show()
def CountWords(words):
    x = []
    y = []
    for fileid in inaugural.fileids():
        count = 0
        for w in inaugural.words(fileid):
            if w.lower() in words:
                count += 1
        per = (count / len(inaugural.words(fileid))) * 100
        y.append(fileid[:4])
        x.append(per)
    plt.title('Number of occurrences:')
    plt.xticks(rotation=90)
    plt.plot(y, x)
    plt.show()
def fun2():
    # Plot and tabulate a conditional frequency distribution.
    # The condition is the word "america" or "citizen"; the count in the plot
    # is the number of times the word occurs in a particular address.
    # This exploits the fact that the first 4 characters of each file name,
    # e.g. 1865-Lincoln.txt, carry the year. For every word in
    # 1865-Lincoln.txt whose lowercase form starts with "america"
    # (e.g. "Americans"), this code produces a pair ('america', '1865').
    from nltk.corpus import inaugural
    cfd = nltk.ConditionalFreqDist(
        (target, fileid[:4])  # row, column
        for fileid in inaugural.fileids()
        for w in inaugural.words(fileid)
        for target in ['america', 'citizen']
        if w.lower().startswith(target)  # count matching words per column
    )
    cfd.tabulate()
    cfd.plot()
def tabulate():
    # NB: the second assignment below rebinds cfd, so this first inaugural
    # distribution is never used.
    cfd = nltk.ConditionalFreqDist((target, fileid[:4])
                                   for fileid in inaugural.fileids()
                                   for w in inaugural.words(fileid)
                                   for target in ['america', 'citizen']
                                   if w.lower().startswith(target))
    languages = ['Chickasaw', 'English', 'German_Deutsch',
                 'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
    cfd = nltk.ConditionalFreqDist((lang, len(word))
                                   for lang in languages
                                   for word in udhr.words(lang + '-Latin1'))
    cfd.tabulate(conditions=['English', 'German_Deutsch'],
                 samples=range(10), cumulative=True)
def senti():
    x3 = []
    y3 = []
    x31 = []
    for fileid in inaugural.fileids():
        text = inaugural.raw(fileids=fileid)
        senti = TextBlob(text)
        print(fileid[:4], "-", senti.sentiment)
        y3.append(fileid[:4])
        x3.append(senti.sentiment[0])
        x31.append(senti.sentiment[1])
    plt.title('Polarity')
    plt.xticks(rotation=90)
    plt.plot(y3, x3)
    plt.show()
    plt.title('Subjectivity')
    plt.xticks(rotation=90)
    plt.plot(y3, x31)
    plt.show()
def graphWords():
    index = 0
    for id in inaugural.fileids():
        index += 1
        nchar = len(inaugural.raw(id)) * 1.0
        nword = len(inaugural.words(id)) * 1.0
        nsent = len(inaugural.sents(id)) * 1.0
        nvoc = len(set(w.lower() for w in inaugural.words(id))) * 1.0
        a = nchar / nword
        b = nword / nsent
        c = nword / nvoc
        plot(index, a, 'mo')  # purple color
        plot(index, b, 'go')  # green color
        plot(index, c, 'ro')  # red color
    xlabel('index, from Washington to Obama (purple - character/word), (red - word/vocab)')
    ylabel('Average numbers (green - word/sentence)')
    show()
def main():
    # store word lengths
    brown_word_lens = []
    web_word_lens = []
    inaugural_word_lens = []
    gutenberg_word_lens = []
    genesis_word_lens = []
    for file in gutenberg.fileids():
        for word in gutenberg.words(file):
            gutenberg_word_lens.append(len(word))
    for file in brown.fileids():
        for word in brown.words(file):
            brown_word_lens.append(len(word))
    for file in webtext.fileids():
        for word in webtext.words(file):
            web_word_lens.append(len(word))
    for file in inaugural.fileids():
        for word in inaugural.words(file):
            inaugural_word_lens.append(len(word))
    for file in genesis.fileids():
        for word in genesis.words(file):
            genesis_word_lens.append(len(word))
    with open("wordlens.txt", 'w') as f:
        sys.stdout = f
        f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
        for i in xrange(max(len(genesis_word_lens), len(inaugural_word_lens),
                            len(web_word_lens), len(brown_word_lens),
                            len(gutenberg_word_lens))):
            for corpus in [genesis_word_lens, inaugural_word_lens,
                           web_word_lens, brown_word_lens, gutenberg_word_lens]:
                if i >= len(corpus):
                    f.write(",")
                else:
                    f.write(str(corpus[i]) + ",")
            f.write("\n")
def __append_corpus_data(self):
    """
    Appends data to the questions and statements files from the
    inaugural address corpus
    """
    sentences = []
    # Use the Presidential inaugural addresses corpus
    for fileid in inaugural.fileids():
        raw_text = inaugural.raw(fileid)
        sentence_tokens = nltk.sent_tokenize(raw_text)
        sentences += sentence_tokens
    random.shuffle(sentences)
    random.shuffle(sentences)
    random.shuffle(sentences)
    # Write sentences to the sentences and questions files
    for sentence in sentences:
        if sentence and 10 < len(sentence) < 75:
            if sentence.endswith('?'):
                self.q_out.write(self.__strip_sentence(sentence) + '\n')
            else:
                self.s_out.write(self.__strip_sentence(sentence) + '\n')
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 24 11:00:43 2017

@author: Mohnish_Devadiga
"""

import nltk
from nltk.corpus import inaugural
import pandas as pd
import matplotlib

inaugural.fileids()
# print(inaugural.fileids())

# Go through all speeches
for speech in inaugural.fileids():
    word_count_total = len(inaugural.words(speech))
    print(speech, word_count_total)

speech_length = [(len(inaugural.words(speech)), speech) for speech in inaugural.fileids()]
print(speech_length)

# Get the max and min speech
print("Max is : ", max(speech_length))
print("Min is : ", min(speech_length))

# Avg no of words per sentence for each speech (see sketch below)
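# A minimal sketch completing the step named in the comment above (an
# addition, not the original author's code): average number of words per
# sentence for each speech, using the same inaugural corpus API.
for speech in inaugural.fileids():
    avg_words_per_sent = len(inaugural.words(speech)) / len(inaugural.sents(speech))
    print(speech, avg_words_per_sent)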
def main():
    # store FreqDists
    # index is the length of the word, 0 is for all words
    samples = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    brown_letters = FreqDist()
    web_letters = FreqDist()
    inaugural_letters = FreqDist()
    gutenberg_letters = FreqDist()
    genesis_letters = FreqDist()
    for file in gutenberg.fileids():
        for word in gutenberg.words(file):
            for character in word:
                if character in string.letters:
                    gutenberg_letters[character.upper()] += 1
    for file in brown.fileids():
        for word in brown.words(file):
            for character in word:
                if character in string.letters:
                    brown_letters[character.upper()] += 1
    for file in webtext.fileids():
        for word in webtext.words(file):
            for character in word:
                if character in string.letters:
                    web_letters[character.upper()] += 1
    for file in inaugural.fileids():
        for word in inaugural.words(file):
            for character in word:
                if character in string.letters:
                    inaugural_letters[character.upper()] += 1
    for file in genesis.fileids():
        for word in genesis.words(file):
            for character in word:
                if character in string.letters:
                    genesis_letters[character.upper()] += 1
    with open("genesis-letter-freq.txt", 'w') as f:
        sys.stdout = f
        f.write("GENESIS\n")
        for let in samples:
            print(str(genesis_letters[let]))
    with open("gutenberg-letter-freq.txt", 'w') as f:
        sys.stdout = f
        f.write("GUTENBERG\n")
        for let in samples:
            print(str(gutenberg_letters[let]))
    with open("webtext-letter-freq.txt", 'w') as f:
        sys.stdout = f
        f.write("WEBTEXT\n")
        for let in samples:
            print(str(web_letters[let]))
    with open("inaugural-letter-freq.txt", 'w') as f:
        sys.stdout = f
        f.write("INAUGURAL\n")
        for let in samples:
            print(str(inaugural_letters[let]))
    with open("brown-letter-freq.txt", 'w') as f:
        sys.stdout = f
        f.write("BROWN\n")
        for let in samples:
            print(str(brown_letters[let]))
    with open("letter-freq.txt", 'w') as f:
        corpora = [gutenberg_letters, web_letters, inaugural_letters,
                   brown_letters, genesis_letters]
        f.write("GUTENBERG,WEBTEXT,INAUGURAL,BROWN,GENESIS\n")
        for let in samples:
            for corpus in corpora:
                f.write(str(corpus[let]) + ",")
            f.write("\n")
print(reuters.words('training/9865')[:14])
print(reuters.words(['training/9865', 'training/9880']))
print(reuters.words(categories='barley'))
print(reuters.words(categories=['barley', 'corn']))
print("-" * 40)
print("""
----------------------------------------------------------------------
  1.5 Inaugural Address Corpus  (any key to continue)""")
raw_input()
print("-" * 40)

from nltk.corpus import inaugural
print(inaugural.fileids())
print([fileid[:4] for fileid in inaugural.fileids()])
print("-" * 40)
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
cfd.plot()
print("-" * 40)
print("""
----------------------------------------------------------------------""")
def answers():
    _rvals = []

    #### Question 1 ####
    print '##### Question 1 #####'
    print '(see code - lines 64-65)'
    print '(NB: the two variables are returned by this function)'
    _bush01 = inaugural.words('2001-Bush.txt')
    bush01_word_lengths = _lengths(_vocabulary(_bush01))
    fd_bush01_words = FreqDist(_nopunct(_bush01))
    _rvals.append(bush01_word_lengths)
    _rvals.append(fd_bush01_words)

    #### Question 2 ####
    print '\n##### Question 2 #####'
    bush01_top10_words = _firsts(fd_bush01_words.items()[:10])
    bush01_average_word_lengths = _avg(bush01_word_lengths)
    _obama09 = inaugural.words('2009-Obama.txt')
    _fd_obama09_words = FreqDist(_nopunct(_obama09))
    _obama09_word_lengths = _lengths(_vocabulary(_obama09))
    obama09_top10_words = _firsts(_fd_obama09_words.items()[:10])
    obama09_average_word_lengths = _avg(_obama09_word_lengths)
    print 'top10 words Bush (2001): ', _str(bush01_top10_words)
    print 'top10 words Obama (2009):', _str(obama09_top10_words)
    print 'average word length Bush (2001): ', bush01_average_word_lengths
    print 'average word length Obama (2009):', obama09_average_word_lengths

    #### Question 3 ####
    print '\n##### Question 3 #####'
    bush01_token_lengths = _avg(_lengths(_nopunct(_bush01)))
    obama09_token_lengths = _avg(_lengths(_nopunct(_obama09)))
    print 'average token length Bush (2001): ', bush01_token_lengths
    print 'average token length Obama (2009):', obama09_token_lengths

    #### Question 4 ####
    print '\n##### Question 4 #####'
    for _fileid in inaugural.fileids():
        _year = int(_fileid.split('-')[0])
        _vocab_size = number_of_word_types(_fileid)
        print 'year %d: %d word types' % (_year, _vocab_size)

    #### Question 5 ####
    print '\n##### Question 5 #####'
    fd_bush01_nostop = FreqDist(_nostops(_nopunct(_bush01)))
    fd_obama09_nostop = FreqDist(_nostops(_nopunct(_obama09)))
    bush01_top10_nostop = _firsts(fd_bush01_nostop.items()[:10])
    obama09_top10_nostop = _firsts(fd_obama09_nostop.items()[:10])
    print 'top10 non-stop-words Bush (2001): ', _str(bush01_top10_nostop)
    print 'top10 non-stop-words Obama (2009):', _str(obama09_top10_nostop)

    #### Question 6 ####
    print '\n##### Question 6 #####'
    _wash89 = inaugural.words('1789-Washington.txt')
    fd_wash89_nostop = FreqDist(_nostops(_nopunct(_wash89)))
    wash89_top10_nostop = _firsts(fd_wash89_nostop.items()[:10])
    print 'top10 non-stop-words Washington (1789):', _str(wash89_top10_nostop)

    #### Question 7 ####
    print '\n##### Question 7 #####'
    wash89_rank_country = rank(fd_wash89_nostop, 'country')
    obama09_rank_country = rank(fd_obama09_nostop, 'country')
    bush01_rank_country = rank(fd_bush01_nostop, 'country')
    print 'rank of "country" in Washington (1789):', wash89_rank_country
    print 'rank of "country" in Obama (2009):', obama09_rank_country
    print 'rank of "country" in Bush (2001):', bush01_rank_country

    #### Question 8 ####
    print '\n##### Question 8 #####'
    print '(see comments in "rank" function on lines 20-45)'

    #### Question 9 ####
    print '\n##### Question 9 #####'
    print '(see plot)'
    ff = inaugural.fileids()
    fdd = {}
    _years = []
    for _fileid in ff:
        fdd[_fileid] = FreqDist(_nostops(inaugural.words(_fileid)))
        _years.append(_fileid[0:4])
    pylab.plot([(lambda d: len(d) / float(d.N()))(fdd[f]) for f in ff])
    pylab.xticks(range(len(ff)), _years, rotation=90)
    pylab.xlim(0, len(ff) - 1)
    pylab.ylabel('ratio of word types to tokens (without stop-words)')
    pylab.xlabel('time')
    pylab.title('f(time) = #(word types) / #(word tokens)')
    pylab.show()

    #### Question 10 ####
    print '\n##### Question 10 #####'
    print '(see plot)'
    obama09top10_butnot_wash89top10 = [word for word in obama09_top10_nostop
                                       if word in fd_wash89_nostop
                                       and word not in wash89_top10_nostop]
    wash89top10_butnot_obama09top10 = [word for word in wash89_top10_nostop
                                       if word in fd_obama09_nostop
                                       and word not in obama09_top10_nostop]
    obama09_word = 'world'
    wash89_word = 'government'
    assert(wash89_word in wash89top10_butnot_obama09top10)
    assert(obama09_word in obama09top10_butnot_wash89top10)
    normalisation_justification = (
        "We normalise for different sizes in vocabulary by dividing the rank "
        "of some word by the size of the vocabulary in that speech. "
        "Since rank is in relation with vocabulary size, this is similar to "
        "getting the maximum rank over all speeches and dividing each rank "
        "by that quantity.")
    print normalisation_justification
    _normalised_rank = lambda f, w: min(1, rank(fdd[f], w) /
                                        float(len(_vocabulary(_nostops(fdd[f])))))
    pylab.plot([_normalised_rank(f, obama09_word) for f in ff],
               label=obama09_word, color='b')
    pylab.plot([_normalised_rank(f, wash89_word) for f in ff],
               label=wash89_word, color='r')
    pylab.xticks(range(len(ff)), _years, rotation=90)
    pylab.xlim(0, len(ff) - 1)
    pylab.ylabel('normalised word rank (lower is better)')
    pylab.xlabel('time')
    pylab.title('f(time) = word rank / vocabulary size')
    pylab.legend()
    pylab.show()

    #### Question 11 ####
    print '\n##### Question 11 #####'
    observations_on_plots = (
        "We observe that the rank of 'world' is noisy when observed on the "
        "level of some individual year/inaugural speech. However, when "
        "looking at the larger picture, a trend emerges: 'world's rank is "
        "consistently getting higher over time - an indicator for an "
        "ever-globalising and shrinking world?"
        "\n"
        "We observe that 'government' is a consistently highly ranked word "
        "across time - except for some few inaugural speeches where it has a "
        "very low rank. Those speeches are around the early 1800s "
        "(abolishment of slavery), 1860s-70s (US civil war), the early 1900s "
        "(World War One), and 1937-1981 (World War Two + Cold War) - it "
        "would seem that presidents don't want to remind their subjects of "
        "the government during hard times. Outliers to this theory can be "
        "explained easily (e.g. somewhat high rank of 'government' in 1949 "
        "= a certain 'evil government' being defeated).")
    print observations_on_plots
    return _rvals
reuters.words('training/9865')[:14]
# [u'FRENCH', u'FREE', u'MARKET', u'CEREAL', u'EXPORT', u'BIDS',
#  u'DETAILED', u'French', u'operators', u'have', u'requested', u'licences',
#  u'to', u'export']
reuters.words(['training/9865', 'training/9880'])
# out: [u'FRENCH', u'FREE', u'MARKET', u'CEREAL', u'EXPORT', ...]
reuters.words(categories='barley')
# [u'FRENCH', u'FREE', u'MARKET', u'CEREAL', u'EXPORT', ...]
reuters.words(categories=['barley', 'corn'])
# [u'THAI', u'TRADE', u'DEFICIT', u'WIDENS', u'IN', ...]

# INAUGURAL ADDRESS corpus
from nltk.corpus import inaugural
inaugural.fileids()
# out: [u'1789-Washington.txt', u'1793-Washington.txt', u'1797-Adams.txt',
#       u'1801-Jefferson.txt', u'1805-Jefferson.txt', u'1809-Madison.txt', ...

# grab the first 4 chars of the fileids to grab the years
[fileid[:4] for fileid in inaugural.fileids()]
# out: [u'1789', u'1793', u'1797', u'1801', u'1805', ...

cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
# lowercase each corpus word, then check whether it starts with either of
# the targets "america" or "citizen"

# requires matplotlib
cfd.plot()

# ANNOTATED TEXT CORPORA
# Loading your own corpus (see sketch below)
# see Pathology project: need to add pathology report text to txt files
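# A minimal sketch of loading your own corpus, since the note above stops at
# the idea. The directory "pathology_reports/" is a hypothetical placeholder;
# PlaintextCorpusReader then exposes the same fileids()/words()/sents() API
# as the built-in corpora used throughout these examples.
from nltk.corpus import PlaintextCorpusReader
corpus_root = 'pathology_reports/'  # hypothetical path to your .txt files
my_corpus = PlaintextCorpusReader(corpus_root, r'.*\.txt')
my_corpus.fileids()
my_corpus.words(my_corpus.fileids()[0])[:10]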
from nltk.corpus import reuters
reuters.fileids()
reuters.categories()
reuters.categories('training/9865')
reuters.categories(['training/9865', 'training/9880'])
reuters.fileids('barley')
reuters.fileids(['barley', 'corn'])
reuters.words('training/9865')[:14]
reuters.words(['training/9865', 'training/9880'])
reuters.words(categories='barley')
reuters.words(categories=['barley', 'corn'])

from nltk.corpus import inaugural
inaugural.fileids()
[fileid[:4] for fileid in inaugural.fileids()]
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
cfd.plot()

nltk.corpus.cess_esp.words()
nltk.corpus.floresta.words()  # Error
nltk.corpus.indian.words('hindi.pos')
nltk.corpus.udhr.fileids()
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals)

from nltk.corpus import reuters
reuters.fileids()
reuters.categories(['training/9865', 'training/8666'])
reuters.fileids(['barley', 'corn'])
reuters.words('training/9865')[:14]
reuters.words(categories=['corn', 'barley'])

from nltk.corpus import inaugural
inaugural.fileids()
inaugYears = [fileid[:4] for fileid in inaugural.fileids()]
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
cfd.plot()

from nltk.corpus import udhr
languages = ['English', 'Finnish_Suomi', 'Italian_Italiano', 'Greenlandic_Inuktikut']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1'))
from nltk.corpus import inaugural as inag
from nltk import ConditionalFreqDist as CondFreqDist

cfd = CondFreqDist([(target, fileid[:4])
                    for fileid in inag.fileids()
                    for word in inag.words(fileid)
                    for target in ["wealth", "peace", "harmony", "prosperous"]
                    if word.lower().startswith(target)])
cfd.plot()
sorted([w for w in set(text4) if len(w) > 7 and fdist[w] > 7])

# Collocations and Bigrams.
# A collocation is a sequence of words that occur together unusually often.
# Built-in collocations function
text4.collocations()

#############
#Corpus data#
#############

# Inaugural Address Corpus
from nltk.corpus import inaugural
inaugural.fileids()[:2]
[fileid[:4] for fileid in inaugural.fileids()]

# How the words "america" and "war" are used over time.
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'war']
    if w.lower().startswith(target))
cfd.plot()
# cfd.tabulate()

from nltk.corpus import brown
news_words = brown.words(categories="news")
genre_word = [(genre, word)
              for genre in ['news', 'romance']
              for word in brown.words(categories=genre)]
print(len(genre_word))  # 170576 (genre, word) pairs
print(genre_word[:4])
# [('news', 'The'), ('news', 'Fulton'), ('news', 'County'), ('news', 'Grand')]  # [_start-genre]
print(genre_word[-4:])
# [('romance', 'afraid'), ('romance', 'not'), ('romance', "''"), ('romance', '.')]  # [_end-genre]

cfd = ConditionalFreqDist(genre_word)
print(cfd)  # <ConditionalFreqDist with 2 conditions>
print(cfd.conditions())  # ['news', 'romance']  # [_conditions-cfd]
print(cfd['news'])  # <FreqDist with 14394 samples and 100554 outcomes>
print(cfd['romance'])  # <FreqDist with 8452 samples and 70022 outcomes>
print(cfd['romance'].most_common(2))  # [(',', 3899), ('.', 3736)]
print(cfd['romance']['could'])  # 193
print(cfd['romance'].max())  # most frequent sample in 'romance'
print(cfd['romance'][','])  # 3899

##################################################################
## plot(): how the words America and citizen are used over time in the
## US presidential inaugural addresses
cfd = ConditionalFreqDist((target, fileid[:4])
                          for fileid in inaugural.fileids()
                          for word in inaugural.words(fileid)
                          for target in ['america', 'citizen']
                          if word.lower().startswith(target))
cfd.plot()  # plot the counts of America and citizen across the addresses

##################################################################
## tabulate(): extracting word pairs
# Next, let's combine regular expressions with conditional frequency distributions.
# Here we will extract all consonant-vowel sequences from the words of Rotokas,
# such as ka and si. Since each of these is a pair, it can be used to initialize
# a conditional frequency distribution. We then tabulate the frequency of each pair:
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
cvs = [cv for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]', w)]
print(cvs[:10])  # ['ka', 'ka', 'ka', 'ka', 'ka', 'ro', 'ka', 'ka', 'vi', 'ko']
cfd = ConditionalFreqDist(cvs)
cfd.tabulate()
#    a   e   i   o   u
# k 418 148  94 420 173
# p  83  31 105  34  51
# r 187  63  84  89  79
__author__ = 'auroua'
from nltk.corpus import inaugural
from nltk.corpus import stopwords
import numpy as np
import matplotlib.pyplot as plt
from lda_1 import LDA
import seaborn as sns

stops = set(stopwords.words("english"))
vocab = dict()
for fileid in inaugural.fileids():
    for word in inaugural.words(fileid):
        word = word.lower()
        if word not in stops and word.isalpha():
            if word not in vocab:
                vocab[word] = 0
            vocab[word] += 1

"""
Sort the vocab, keep only words which occur more than 50 times,
then create word-to-id and id-to-word dictionaries.
"""
vocab_sorted = filter(lambda x: x[1] > 50,
                      sorted(vocab.items(), key=lambda x: x[1], reverse=True))
wordids = {v[0]: i for i, v in enumerate(vocab_sorted)}
idwords = {i: v[0] for i, v in enumerate(vocab_sorted)}
vocab_size = len(wordids)
print vocab_size

# Generate corpus document vectors
data = []
import os
import numpy as np
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize.regexp import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

os.chdir("C:/Users/Charles/Desktop/Inaugural/")

# LOAD DATASETS
from nltk.corpus import inaugural

titles = inaugural.fileids()
addresses = []
for title in titles:
    f = inaugural.open(title)
    text = f.read().encode('UTF-8')
    addresses.append(text)

Pstem = PorterStemmer()
WNL = WordNetLemmatizer()

def stem_tokens(tokens, stemmer):
    # NB: despite the parameter name, this calls .lemmatize(), so it expects
    # the WordNetLemmatizer (WNL), not the PorterStemmer.
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.lemmatize(item))
    return stemmed
#!/usr/bin/python3
# coding: utf-8
from nltk.corpus import inaugural

##################################################################
## Quick look at the corpus
print(type(inaugural))  # <class 'nltk.corpus.reader.plaintext.PlaintextCorpusReader'>
print(len(inaugural.fileids()))  # 56; 56 addresses up to Obama
print(inaugural.fileids()[:3])  # ['1789-Washington.txt', '1793-Washington.txt', '1797-Adams.txt']

##################################################################
## Print the inauguration year of each president
print([fileid[:4] for fileid in inaugural.fileids()])
# ['1789', '1793', '1797', '1801', '1805', '1809', '1813', '1817', '1821', ...]
# Reuters corpus
from nltk.corpus import reuters
reuters.fileids()
reuters.categories()
reuters.categories(['training/9865', 'training/9880'])
reuters.fileids(['barley', 'corn'])
reuters.words('training/9865')[:14]
reuters.words(['training/9865', 'training/9880'])
reuters.words(categories=['barley', 'corn'])

# Inaugural address corpus
from nltk.corpus import inaugural
inaugural.fileids()

# Universal Declaration of Human Rights in multiple languages
from nltk.corpus import udhr
languages = ['Chickasaw', 'English', 'German_Deutsch',
             'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1'))
cfd.plot(cumulative=True)
cfd.tabulate(conditions=['English', 'German_Deutsch'],
             samples=range(10), cumulative=True)

# Conditional frequency distributions
genre_word = [(genre, word)
              for genre in ['news', 'romance']
              for word in brown.words(categories=genre)]
cfd = nltk.ConditionalFreqDist(genre_word)
# Note that there are no optional tasks in this section, because familiarizing yourself with the NLTK
# is important even if you are an experienced programmer. Please complete the tasks in the boxes below.

# ======================= Learning how to use NLTK - Task 1 ============================

# Write all the Python code below in a new file called corpuses.py and make sure you understand it!
# You must work in your Dropbox folder so we can see your progress.
# Run your file every time something new is added so you can see how it works.
# There is a compulsory exercise for Task 1 that needs to be completed at the bottom of your corpuses.py

# === Part 1: Importing Corpuses ===

import nltk
from nltk.corpus import inaugural

print inaugural.fileids()

# Run your file. You should see all the text files containing all the speeches of the US presidents that the
# NLTK has saved inside it.

# Now add the lines:

print "=============Words in Obama's Speech ======"
print inaugural.words("2009-Obama.txt")  # Returns a list of all the words in Obama's speech

print "=============Sentences in Bush's speech ======"
print inaugural.sents("2005-Bush.txt")  # Returns a list of all the sentences in Bush's speech

# As you can see, the words of Obama's speech are printed in a list, as are the sentences of Bush's speech.
# Try adding code to your program to find and print out the first 25 words of Obama's 2009 speech
# (a sketch of one possible answer follows below).

# === Part 2: Analysing tokens (words) of a text ===
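# --- Task 1 exercise sketch (referenced above; an addition, not part of the
# --- original tutorial text). One possible answer, assuming the inaugural
# --- import from Part 1: slicing the word list gives the first 25 tokens.
obama_words = inaugural.words("2009-Obama.txt")
print obama_words[:25]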
from nltk.probability import FreqDist
from nltk.corpus import inaugural, stopwords
import string
import json
from pprint import pprint
import math
import networkx as nx

filenames = inaugural.fileids()

def dump_content(filename, content):
    j = json.dumps(content, indent=4)
    f = open(filename + '.json', 'w')
    print >> f, j
    f.close()

def read_content(filename):
    json_data = open(filename + '.json')
    content = json.load(json_data)
    json_data.close()
    return content

def remove_punctuation(text):
    content = [w.strip(string.punctuation) for w in text]
    return content

def remove_stopwords(text):
    content = [w for w in text if w.lower() not in stopwords.words('english')]
    return content

def clean(text):
def main():
    # store the proportion of common words per file, per corpus
    brown_common_freq = []
    web_common_freq = []
    inaugural_common_freq = []
    gutenberg_common_freq = []
    genesis_common_freq = []
    common = ["the", "be", "to", "of", "and", "a", "in", "that", "have", "i",
              "it", "for", "not", "on", "with", "he", "as", "you", "do", "at",
              "this", "but", "his", "by", "from", "they", "we", "say", "her",
              "she", "or", "an", "will", "my", "one", "all", "would", "there",
              "their", "what", "so", "up", "out", "if", "about", "who", "get",
              "which", "go", "me", "when", "make", "can", "like", "time",
              "no", "just", "him", "know", "take", "people", "into", "year",
              "your", "good", "some", "could", "them", "see", "other", "than",
              "then", "now", "look", "only", "come", "its", "over", "think",
              "also", "back", "after", "use", "two", "how", "our", "work",
              "first", "well", "way", "even", "new", "want", "because",
              "any", "these", "give", "day", "most", "us"]
    common.sort()
    for file in gutenberg.fileids():
        total_words = len(gutenberg.words(file))
        total_common = 0
        for word in gutenberg.words(file):
            if word.lower() in common:
                total_common += 1
        gutenberg_common_freq.append(float(total_common) / total_words)
    for file in brown.fileids():
        total_words = len(brown.words(file))
        total_common = 0
        for word in brown.words(file):
            if word.lower() in common:
                total_common += 1
        brown_common_freq.append(float(total_common) / total_words)
    for file in webtext.fileids():
        total_words = len(webtext.words(file))
        total_common = 0
        for word in webtext.words(file):
            if word.lower() in common:
                total_common += 1
        web_common_freq.append(float(total_common) / total_words)
    for file in inaugural.fileids():
        total_words = len(inaugural.words(file))
        total_common = 0
        for word in inaugural.words(file):
            if word.lower() in common:
                total_common += 1
        inaugural_common_freq.append(float(total_common) / total_words)
    for file in genesis.fileids():
        total_words = len(genesis.words(file))
        total_common = 0
        for word in genesis.words(file):
            if word.lower() in common:
                total_common += 1
        genesis_common_freq.append(float(total_common) / total_words)
    with open("common-words.txt", 'w') as f:
        sys.stdout = f
        f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
        for i in xrange(max(len(genesis_common_freq), len(inaugural_common_freq),
                            len(web_common_freq), len(brown_common_freq),
                            len(gutenberg_common_freq))):
            for corpus in [genesis_common_freq, inaugural_common_freq,
                           web_common_freq, brown_common_freq,
                           gutenberg_common_freq]:
                if i >= len(corpus):
                    f.write(",")
                else:
                    f.write(str(round(corpus[i], 5)) + ",")
            f.write("\n")