import nltk
import matplotlib.pyplot as plt
from nltk.text import Text
from nltk.draw.dispersion import dispersion_plot


def main():
    hound = text_to_string('hound.txt')
    tokens = nltk.word_tokenize(hound)
    texts = Text(tokens)
    plt.ion()
    plt.figure(figsize=(12, 9))
    targets = ['Holmes', 'Watson', 'Mortimer', 'Henry', 'Barrymore',
               'Stapleton', 'Seldon', 'hound']
    dispersion_plot(texts, targets, ignore_case=True,
                    title='Lexical Dispersion Plot')
    plt.show(block=True)
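# text_to_string() is not defined in this snippet; a minimal sketch, assuming
# it simply reads a plain-text file into one string:
def text_to_string(filename):
    """Return the contents of a text file as a single string."""
    with open(filename, encoding='utf-8') as infile:
        return infile.read()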
def plot():
    import nltk
    import matplotlib.pyplot as plt
    from nltk.draw.dispersion import dispersion_plot

    f = open("egghunt.txt")
    raw = f.read()
    tokens = nltk.word_tokenize(raw)
    text = nltk.Text(tokens)
    plt.figure(figsize=(20, 3))
    targets = ['creative', 'egg', 'hunt', 'happy', 'easter', 'yall']
    dispersion_plot(text, targets, ignore_case=True,
                    title='Lexical Dispersion Plot')
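# A hypothetical driver for plot(): on recent NLTK versions dispersion_plot
# no longer calls plt.show() itself, so the figure is shown explicitly here.
import matplotlib.pyplot as plt

if __name__ == '__main__':
    plot()
    plt.show()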
from nltk.book import *
from nltk.draw.dispersion import dispersion_plot

# lb1 and lb2 are separator strings defined elsewhere in the original script;
# placeholder definitions so the snippet runs on its own:
lb1 = '-' * 40
lb2 = '=' * 40

# explore other concordances in different texts
# (concordance, similar and common_contexts print their results themselves)
text6.concordance('rabbit')
print(lb1)
text3.concordance('lived')
print(lb2)
text5.concordance('f**k')

# print similarities to monstrous in Moby Dick and S & S
text1.similar('monstrous')
print(lb1)
text2.similar('monstrous')
print(lb2)

# print common contexts for very and monstrous in S & S
text2.common_contexts(['monstrous', 'very'])
print(lb1)

# explore similar and common_contexts
text6.similar('king')
print(lb2)
text3.similar('king')
print(lb1)
text5.common_contexts(['love', 'f**k'])

# print dispersion plot
dispersion_plot(text4, ['citizens', 'democracy', 'freedom', 'duties', 'America'])

# explore dispersion_plot
dispersion_plot(text6, ['spam', 'king', 'grail', 'rabbit', 'ni'])
def wordhomogenity():
    # data2 is assumed to be a DataFrame of reviews; note that dispersion_plot
    # treats each element of the sequence it is given as a single token
    dispersion_plot(data2.Translated_Review,
                    ["good", "awesome", "usefull", "love",
                     "brilliant", "great", "amazing", "best"])
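# A minimal sketch of the setup this function assumes: data2 as a pandas
# DataFrame of app reviews (the CSV filename here is hypothetical). Since
# dispersion_plot treats each element as one token, the reviews would need
# to be flattened into words for per-word dispersion:
import pandas as pd
import nltk

data2 = pd.read_csv('user_reviews.csv').dropna(subset=['Translated_Review'])
review_tokens = [tok.lower()
                 for review in data2.Translated_Review
                 for tok in nltk.word_tokenize(review)]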
import nltk
from nltk.draw.dispersion import dispersion_plot
import matplotlib

ww = open("Moana.txt", "r")
raw = ww.read()

# process
pattern = r'''(?x)        # set flag to allow verbose regexps
    ([A-Z]\.)+            # abbreviations, e.g. U.S.A.
  | \w+(-\w+)*            # words with optional internal hyphens
  | \$?\d+(\.\d+)?%?      # currency and percentages, e.g. $12.40, 82%
  | \.\.\.                # ellipsis
  | [][.,;"'?():-_`]      # these are separate tokens; includes ], [
'''
tokens = nltk.regexp_tokenize(raw, pattern)
# squash capitalization so the lowercase targets below match
lctokens = [w.lower() for w in tokens]

# ordered by frequency
from collections import Counter
c = Counter(tokens)
print(c.most_common(100))

# plot
dispersion_plot(lctokens, ['maui', 'hook', 'heart', 'te', 'fiti', 'ka',
                           'ocean', 'island', 'reef', 'moana', 'boat',
                           'gramma', 'tattoo', 'cheeeehoooo', 'crab', 'shiny'])
import nltk
from nltk.draw.dispersion import dispersion_plot

# ww = open("Wonder.Woman.2017.txt", "r")
ww = open("Black.Panther.dialogue.txt", "r")
raw = ww.read()

# process
pattern = r'''(?x)        # set flag to allow verbose regexps
    ([A-Z]\.)+            # abbreviations, e.g. U.S.A.
  | \w+(-\w+)*            # words with optional internal hyphens
  | \$?\d+(\.\d+)?%?      # currency and percentages, e.g. $12.40, 82%
  | \.\.\.                # ellipsis
  | [][.,;"'?():-_`]      # these are separate tokens; includes ], [
'''
# the regexp tokenization is superseded by word_tokenize on the next line
# tokens = nltk.regexp_tokenize(raw, pattern)
tokens = nltk.word_tokenize(raw)
lctokens = [w.lower() for w in tokens]

# ordered by frequency
from collections import Counter
c = Counter(lctokens)
for (i, j) in c.most_common(300):
    if len(i) > 4:        # skip short, mostly function words
        print(i, j)

dispersion_plot(lctokens, ['wakanda', 'oakland', 'korea', "t'challa", 'black',
                           'panther', 'nakia', 'shuri', 'okoye', 'klaue',
                           'jabari', "n'jobu", 'killmonger', 'ancestors',
                           'lab', 'challenge', 'herb', 'vibranium', 'weapons',
                           'freeze', 'beads', 'suit', 'heal', 'technology'])
# 1.3 Searching Text
text1.concordance('monstrous')
text1.concordance('live')

# Similar words: these are like synonyms, but derived from the context.
text1.similar('monstrous')
text2.similar('monstrous')
text1.similar('live')

# We can also obtain the shared context between multiple phrases.
text2.common_contexts(["monstrous", "very"])

# Dispersion plot: this plot displays the location of each occurrence of a
# word within the text.
from nltk.draw.dispersion import dispersion_plot
dispersion_plot(text4, ["citizens", "democracy", "freedom", "duties", "America"])

# We can also generate text based on the article.
#
# NOTE(Michael): This feature does not work in NLTK 3.0.
# text3.generate()

# 1.4 Counting Vocabulary
len(text3)

# We can also obtain the unique words (or tokens) in the text.
#
# Essentially, what we are doing is converting the text to a set, which
# contains only unique entries.
sorted(set(text3))
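# Follow-up to the NOTE above: Text.generate() was reinstated in later NLTK
# releases (around 3.4, rebuilt on an n-gram language model), so on a recent
# install the call works again:
# text3.generate()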
ww = open("Labyrinth.txt", "r") raw = ww.read() # process pattern = r'''(?x) # set flag to allow verbose regexps ([A-Z]\.)+ # abbreviations, e.g. U.S.A. | \w+(-\w+)* # words with optional internal hyphens | \$?\d+(\.\d+)?%? # currency and percentages, e.g. $12.40, 82% | \.\.\. # ellipsis | [][.,;"'?():-_`] # these are separate tokens; includes ], [ ''' tokens = nltk.regexp_tokenize(raw, pattern) #tokens = nltk.word_tokenize(raw) # squash capitalization to combine words at beginning of sentences with others lctokens = [w.lower() for w in tokens] # ordered by frequency from collections import Counter c = Counter(tokens) c.most_common(100) #plot dispersion_plot(lctokens, [ 'ludo', 'hoggle', 'labyrinth', 'sarah', 'ambrosius', 'castle', 'toby', 'baby', 'friend', 'magic', 'solve', 'goblin', 'fair' ])
'''
tokens = nltk.regexp_tokenize(raw, pattern)
# the regexp tokenization is superseded by word_tokenize on the next line
tokens = nltk.word_tokenize(raw)
lctokens = [w.lower() for w in tokens]

# ordered by frequency
from collections import Counter
c = Counter(lctokens)
print(c.most_common(50))

# plot
dispersion_plot(lctokens, ['diana', 'wonder', 'woman', 'hippolyta', 'zeus',
                           'steve', 'ares', 'war', 'men', 'evil',
                           'ludendorff', 'german', 'gas', 'love', 'ice',
                           'cream', 'sleep', 'dance'])

# Who is WW really about?
dispersion_plot(lctokens, ['diana', 'wonder', 'woman', 'hippolyta', 'zeus',
                           'steve', 'ares', 'ludendorff', 'german',
plt.axis('off')
plt.show()

import nltk
from nltk.draw.dispersion import dispersion_plot
from nltk.corpus import stopwords

stopwords = stopwords.words('english')
wordlist = nltk.word_tokenize(text)
topics = ['government', 'country', 'states', 'citizen', 'power']
dispersion_plot(wordlist, topics)

# drop stopwords and non-alphabetic tokens before counting frequencies
wordlist = [x.lower() for x in nltk.word_tokenize(text)
            if x.lower() not in stopwords and x.isalpha()]
freq = nltk.FreqDist(wordlist)
plt.figure(figsize=(12, 12))
freq.plot(50)


# lexical diversity: text is a list of words
def ld(text):
    return len(set(text)) / len(text)
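# A quick check of ld() against one of the NLTK book texts: in Genesis only
# about 6% of the tokens are distinct words (figure from the NLTK book).
from nltk.book import text3
print(ld(text3))   # -> ~0.06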
# -*- coding: UTF-8 -*-
import codecs
import nltk
# import numpy, matplotlib
from nltk.draw.dispersion import dispersion_plot

text = codecs.open('quranic.txt', 'r', 'utf-8')
text4 = text.read()
text.close()

# text4 = nltk.Text(text4)
text4 = text4.split()
# text4 = u"بسم الله الرحمن الرحيم"  # "In the name of God, the Most Gracious, the Most Merciful"
dispersion_plot(text4, ["LEM:{ll~ah"])
length = 0
print('Total usage of words: ', total, '\n')
print('Usage of the word men: ', men, '\n')
print('Usage of the word women: ', women, '\n')
print('Usage of the word people: ', people, '\n')

# Extra: Producing a dispersion plot to see the usage of the words over time
# (su is the corpus reader defined earlier in the assignment)
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in su.fileids()
    for word in su.words(genre)
    if word.lower() in ('men', 'women', 'people'))

from nltk.draw.dispersion import dispersion_plot
words_watch = ['men', 'women', 'people']
dispersion_plot(su.words(), words_watch, ignore_case=True)

# Number 2 (2.13) in HW3
print('################ Number 2 ################')
from nltk.corpus import wordnet as wn
nnw = wn.all_synsets('n')
nouns = list(nnw)
print('Length of synsets', len(nouns))
yes = []
for w in nouns:
    if len(w.hyponyms()) != 0:
        yes.append(w)
print('Length of synsets with hyponyms', len(yes))
print('Percentage of noun synsets with no hyponyms',
      100 * (len(nouns) - len(yes)) / len(nouns))
# Show tf-idf feature matrix
tfidf.get_feature_names()

# Create data frame
# pd.DataFrame(feature_matrix.toarray(), columns=tfidf.get_feature_names())

# ## 2. Lexical dispersion plot
# This plots each word against its offset in the text corpus. The y-axis
# lists the words; each word has a horizontal strip representing the entire
# text in terms of offset (the x-axis), and a mark on the strip indicates an
# occurrence of the word at that offset. The positional information can
# indicate the focus of discussion in the text.
topics = ['projection', 'federal', 'percent', 'tealbook', 'economic']
from nltk.draw.dispersion import dispersion_plot
dispersion_plot(allwords, topics)

# ## 3. Frequency distribution plot
import nltk
from nltk.probability import FreqDist
freqdist = nltk.FreqDist(allwords)
plt.figure(figsize=(16, 5))
freqdist.plot(50)

# Most frequent 10 words in the whole text
freqdist.most_common(10)
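# The `allwords` list used above is built in an earlier notebook cell; a
# minimal reconstruction, assuming the corpus text has already been read into
# a string (the name `raw_corpus` is hypothetical):
import nltk
allwords = [w.lower() for w in nltk.word_tokenize(raw_corpus) if w.isalpha()]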
from nltk.book import *
import nltk

# problem 1
from nltk.draw.dispersion import dispersion_plot
p1_words = ['Elinor', 'Marianne', 'Edward', 'Willoughby']
dispersion_plot(text2, p1_words)

# problem 2
V = set(text5)
wordsBeginWithT = [w for w in V
                   if len(w) == 5 and (w[0] == 't' or w[0] == 'T')]
print(sorted(wordsBeginWithT))

from nltk import FreqDist
fdist = FreqDist(w for w in text5 if len(w) == 5)
print(fdist.most_common())

# problem 3
lista = sorted(w for w in set(text2) if w.endswith('er'))
listb = sorted(w for w in set(text2) if 'm' in w)
listc = sorted(w for w in set(text2) if 'ph' in w)
listd = sorted(w for w in set(text2) if w.istitle())
listall = lista + listb + listc + listd
print(listall)
text1.concordance("monstrous") # What other words appear in a similar range of contexts? text1.similar("monstrous") # Examine contexts shared by two or more words text2.common_contexts(["monstrous", "very"]) # Determine location of a word in the context, or how many words from the beginning it appears # Note: As of November 13, 2016, the book's example code does not work. # http://stackoverflow.com/questions/25182140/dispersion-plot-not-working-inspite-of-installing-matplotlib from nltk.draw.dispersion import dispersion_plot words = ["citizens", "democracy", "freedom", "duties", "America"] dispersion_plot(text4, words) # Find out the length of a text, in terms of words and punctuation len(text3) # len() returns a number of tokens, or a sequence of characters to be treated as a group # Obtain vocabulary of a text, or the set of tokens used sorted(set(text3)) len(set(text3)) # Compute lexical richness defined as percentage of unique words len(set(text3)) / len(text3) # Count how often a word occurs