Example #1
import nltk
import matplotlib.pyplot as plt
from nltk.draw.dispersion import dispersion_plot
from nltk.text import Text


def main():
    hound = text_to_string('hound.txt')
    tokens = nltk.word_tokenize(hound)
    texts = Text(tokens)
    plt.ion()
    plt.figure(figsize=(12, 9))
    targets = [
        'Holmes', 'Watson', 'Mortimer', 'Henry', 'Barrymore', 'Stapleton',
        'Seldon', 'hound'
    ]
    dispersion_plot(texts,
                    targets,
                    ignore_case=True,
                    title='Lexical Dispersion Plot')
    plt.show(block=True)
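
text_to_string is not part of NLTK; it is presumably this project's own file-reading helper. A minimal sketch, assuming it just reads a UTF-8 text file into a single string:

def text_to_string(filename):
    # Hypothetical helper: return the file's full contents as one string.
    with open(filename, encoding='utf-8') as infile:
        return infile.read()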
Example #2
def plot():
    import nltk
    import matplotlib.pyplot as plt
    f = open("egghunt.txt")
    raw = f.read()
    tokens = nltk.word_tokenize(raw)
    text = nltk.Text(tokens)

    from nltk.draw.dispersion import dispersion_plot
    plt.figure(figsize=(20, 3))
    targets = ['creative', "egg", "hunt", 'happy', 'easter', 'yall']

    dispersion_plot(text,
                    targets,
                    ignore_case=True,
                    title='Lexical Dispersion Plot')
    # Newer NLTK versions no longer call show() inside dispersion_plot.
    plt.show()
Example #3
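This snippet assumes the texts from nltk.book plus two separator strings, lb1 and lb2, defined earlier in the project. A minimal sketch of the assumed setup:

from nltk.book import *
from nltk.draw.dispersion import dispersion_plot

# Hypothetical separators; the originals are defined elsewhere in the script.
lb1 = '-' * 40
lb2 = '=' * 40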
# explore other concordances in different texts
# (concordance/similar/common_contexts print their results and return None,
# so they are called directly rather than wrapped in print())
text6.concordance('rabbit')
print(lb1)
text3.concordance('lived')
print(lb2)
text5.concordance('f**k')

# print similarities to monstrous in Moby Dick and S & S
text1.similar('monstrous')
print(lb1)
text2.similar('monstrous')
print(lb2)

# print common contexts for very and monstrous in S & S
text2.common_contexts(['monstrous', 'very'])
print(lb1)

# explore similar and common_contexts
text6.similar('king')
print(lb2)
text3.similar('king')
print(lb1)
text5.common_contexts(['love', 'f**k'])

# print dispersion plot
dispersion_plot(text4,
                ['citizens', 'democracy', 'freedom', 'duties', 'America'])

# explore dispersion_plot
dispersion_plot(text6, ['spam', 'king', 'grail', 'rabbit', 'ni'])
Example #4
def wordhomogenity():
    dispersion_plot(data2.Translated_Review, ["good", "awesome", "usefull", "love",
                                              "brilliant", "great", "amazing", "best"])
Example #5
import nltk
from nltk.draw.dispersion import dispersion_plot
import matplotlib

ww = open("Moana.txt", "r")
raw = ww.read()

# process
pattern = r'''(?x)    # set flag to allow verbose regexps
    ([A-Z]\.)+        # abbreviations, e.g. U.S.A.
  | \w+(-\w+)*        # words with optional internal hyphens
  | \$?\d+(\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
  | \.\.\.            # ellipsis
  | [][.,;"'?():-_`]  # these are separate tokens; includes ], [
  '''

tokens = nltk.regexp_tokenize(raw, pattern)

lctokens = [w.lower() for w in tokens]

# ordered by frequency
from collections import Counter
c = Counter(tokens)
c.most_common(100)

#plot
dispersion_plot(lctokens, [
    'maui', 'hook', 'heart', 'te', 'fiti', 'ka', 'ocean', 'island', 'reef',
    'moana', 'boat', 'gramma', 'tattoo', 'cheeeehoooo', 'crab', 'shiny'
])
Example #6
import nltk
from nltk.draw.dispersion import dispersion_plot

#ww = open("Wonder.Woman.2017.txt","r")
ww = open("Black.Panther.dialogue.txt","r")
raw = ww.read()

# process
pattern = r'''(?x)    # set flag to allow verbose regexps
    ([A-Z]\.)+        # abbreviations, e.g. U.S.A.
  | \w+(-\w+)*        # words with optional internal hyphens
  | \$?\d+(\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
  | \.\.\.            # ellipsis
  | [][.,;"'?():-_`]  # these are separate tokens; includes ], [
  '''

# tokens = nltk.regexp_tokenize(raw, pattern)  # alternative tokenizer, superseded below
tokens = nltk.word_tokenize(raw)
lctokens = [w.lower() for w in tokens]

# ordered by frequency
from collections import Counter
c = Counter(lctokens)
for (i,j) in c.most_common(300):
  if len(i) > 4:
    print (i,j)

dispersion_plot(lctokens,['wakanda','oakland','korea','t\'challa','black','panther','nakia', 'shuri', 'okoye', 'klaue', 
  'jabari','n\'jobu','killmonger','ancestors','lab','challenge','herb',
  'vibranium','weapons','freeze','beads','suit','heal','technology'])
Example #7
# 1.3 Searching Text
text1.concordance('monstrous')
text1.concordance('live')

# Similar words: these are like synonyms, but derived from context.
text1.similar('monstrous')
text2.similar('monstrous')
text1.similar('live')

# We can also obtain the shared context between multiple phrases.
text2.common_contexts(["monstrous", "very"])

# Dispersion plot: displays the locations of occurrences within a text.
from nltk.draw.dispersion import dispersion_plot

dispersion_plot(text4,
                ["citizens", "democracy", "freedom", "duties", "America"])

# We can also generate text based on the article.
#
# NOTE(Michael): This feature does not work in NLTK 3.0.
# text3.generate()
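
For reference, generate() was reinstated in later NLTK releases (around 3.4), so on a current install the call below should work:

text3.generate()  # random text modeled on text3; works on NLTK 3.4+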

# 1.4 Counting Vocabulary
len(text3)

# We can also obtain the unique words (or tokens) in the text.
#
# Essentially, we convert the text to a set, which contains only
# unique entries.
sorted(set(text3))
Example #8
ww = open("Labyrinth.txt", "r")
raw = ww.read()

# process
pattern = r'''(?x)    # set flag to allow verbose regexps
    ([A-Z]\.)+        # abbreviations, e.g. U.S.A.
  | \w+(-\w+)*        # words with optional internal hyphens
  | \$?\d+(\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
  | \.\.\.            # ellipsis
  | [][.,;"'?():-_`]  # these are separate tokens; includes ], [
  '''

tokens = nltk.regexp_tokenize(raw, pattern)
#tokens = nltk.word_tokenize(raw)

# squash capitalization to combine words at beginning of sentences with others
lctokens = [w.lower() for w in tokens]

# ordered by frequency
from collections import Counter

c = Counter(tokens)
c.most_common(100)

#plot
dispersion_plot(lctokens, [
    'ludo', 'hoggle', 'labyrinth', 'sarah', 'ambrosius', 'castle', 'toby',
    'baby', 'friend', 'magic', 'solve', 'goblin', 'fair'
])
Example #9
import nltk
from nltk.draw.dispersion import dispersion_plot

ww = open("Wonder.Woman.2017.txt", "r")
raw = ww.read()

# process
pattern = r'''(?x)    # set flag to allow verbose regexps
    ([A-Z]\.)+        # abbreviations, e.g. U.S.A.
  | \w+(-\w+)*        # words with optional internal hyphens
  | \$?\d+(\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
  | \.\.\.            # ellipsis
  | [][.,;"'?():-_`]  # these are separate tokens; includes ], [
  '''

# tokens = nltk.regexp_tokenize(raw, pattern)  # alternative tokenizer, superseded below
tokens = nltk.word_tokenize(raw)
lctokens = [w.lower() for w in tokens]

# ordered by frequency
from collections import Counter
c = Counter(lctokens)
c.most_common(50)

#plot
dispersion_plot(lctokens, [
    'diana', 'wonder', 'woman', 'hippolyta', 'zeus', 'steve', 'ares', 'war',
    'men', 'evil', 'ludendorff', 'german', 'gas', 'love', 'ice', 'cream',
    'sleep', 'dance'
])

# Who is WW really about?

dispersion_plot(lctokens, [
    'diana',
    'wonder',
    'woman',
    'hippolyta',
    'zeus',
    'steve',
    'ares',
    'ludendorff',
    'german',
])
Example #10
import nltk
import matplotlib.pyplot as plt
from nltk.draw.dispersion import dispersion_plot
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

wordlist = nltk.word_tokenize(text)

topics = ['government','country','states','citizen','power']
dispersion_plot(wordlist,topics)


wordlist = [x.lower() for x in nltk.word_tokenize(text)
            if x.lower() not in stop_words and x.isalpha()]


freq = nltk.FreqDist(wordlist)
plt.figure(figsize=(12,12))
freq.plot(50)


# lexical diversity 
# text is a list of words
def ld(text):  
    return len(set(text))/len(text)
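
A quick usage sketch, applying ld to the filtered wordlist built above:

# Fraction of distinct tokens; closer to 1.0 means a more varied vocabulary.
print(ld(wordlist))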
Example #11
# -*-coding:UTF-8-*-
import codecs
import nltk
#import numpy, matplotlib
from nltk.draw.dispersion import dispersion_plot

text = codecs.open('quranic.txt', 'r', 'utf-8')
text4 = text.read()
text.close()
#text4=nltk.Text(text4)
text4 = text4.split()
#text4=u"بسم الله الرحمن الرحيم"
dispersion_plot(text4, ["LEM:{ll~ah"])
Example #12
length = 0
print('Total usage of words: ', total, '\n')
print('Usage of the word men: ', men, '\n')
print('Usage of the word women: ', women, '\n')
print('Usage of the word people: ', people, '\n')

#Extra: Producing a dispersion plot to see the usages of the words over time

cfd = nltk.ConditionalFreqDist(
    (genre, word) for genre in su.fileids() for word in su.words(genre)
    if word.lower() == 'women' or word.lower() == 'men'
    or word.lower() == 'people')

from nltk.draw.dispersion import dispersion_plot
words_watch = ['men', 'women', 'people']
dispersion_plot(su.words(), words_watch, ignore_case=True)
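
The ConditionalFreqDist built above is computed but never displayed; one way to inspect it (assuming su is a dated corpus reader such as nltk.corpus.state_union):

# Counts of each tracked word per file (per year for state_union).
cfd.tabulate()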

#Number 2 (2.13) in HW3

print('################ Number 2 ################')

from nltk.corpus import wordnet as wn
nnw = wn.all_synsets('n')
nouns = list(nnw)
print('Length of synsets', len(nouns))
yes = []
for w in nouns:
    if len(w.hyponyms()) != 0:
        yes.append(w)
print('Length of synsets with hyponyms', len(yes))
print('Percentage of noun synsets with no hyponyms',
      100 * (len(nouns) - len(yes)) / len(nouns))
Example #13
# Show tf-idf feature matrix
tfidf.get_feature_names_out()  # get_feature_names() on older scikit-learn

# Create data frame
#pd.DataFrame(feature_matrix.toarray(), columns=tfidf.get_feature_names())
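
tfidf, feature_matrix, and allwords carry over from earlier cells of the notebook; a minimal sketch of the assumed setup, where docs is a hypothetical list of document strings:

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

# Hypothetical reconstruction of the earlier cells.
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(docs)
allwords = [w.lower() for doc in docs for w in nltk.word_tokenize(doc)]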

# ## 2. Lexical dispersion plot

# This plots each word against its offsets in the text corpus. The y-axis lists the words; each word has a horizontal strip representing the entire text in terms of offset, and a mark on the strip indicates an occurrence of the word at that offset. The positional information can indicate the focus of discussion in the text.

topics = ['projection', 'federal', 'percent', 'tealbook', 'economic']

from nltk.draw.dispersion import dispersion_plot

dispersion_plot(allwords, topics)

# ## 3. Frequency distribution plot

import nltk
import matplotlib.pyplot as plt
from nltk.probability import FreqDist

freqdist = FreqDist(allwords)
plt.figure(figsize=(16, 5))
freqdist.plot(50)

# Most Frequent 10 words in all Text

freqdist.most_common(10)
Example #14
from nltk.book import *
import nltk



# problem 1

from nltk.draw.dispersion import dispersion_plot
p1_words = ['Elinor', 'Marianne', 'Edward', 'Willoughby']
dispersion_plot(text2, p1_words)

# problem 2
V = set(text5)
wordsBeginWithT = [w for w in V if (len(w) == 5) and ((w[0] == 't') or (w[0] == 'T'))]
print(sorted(wordsBeginWithT))

from nltk import FreqDist

fdist = FreqDist(w for w in text5 if len(w) == 5)
print(fdist.most_common())

# problem 3
lista = sorted(w for w in set(text2) if w.endswith('er'))
listb = sorted(w for w in set(text2) if 'm' in w)
listc = sorted(w for w in set(text2) if 'ph' in w)
listd = sorted(w for w in set(text2) if w.istitle())
listall = lista + listb + listc + listd
print(listall)
Example #15
File: nltk.py  Project: yanagik/NLTK
text1.concordance("monstrous")

# What other words appear in a similar range of contexts?
text1.similar("monstrous")

# Examine contexts shared by two or more words
text2.common_contexts(["monstrous", "very"])

# Determine the locations of a word in the text: how far from the beginning it appears
# Note: As of November 13, 2016, the book's example code does not work.
# http://stackoverflow.com/questions/25182140/dispersion-plot-not-working-inspite-of-installing-matplotlib

from nltk.draw.dispersion import dispersion_plot

words = ["citizens", "democracy", "freedom", "duties", "America"]
dispersion_plot(text4, words)

# Find out the length of a text, in terms of words and punctuation
len(text3)

# len() returns the number of tokens; a token is a sequence of characters treated as a group

# Obtain vocabulary of a text, or the set of tokens used
sorted(set(text3))

len(set(text3))

# Compute lexical richness, defined as the proportion of unique words
len(set(text3)) / len(text3)

# Count how often a word occurs
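
The snippet is cut off here; in the NLTK book, counting looks like the calls below (a token's count, then its percentage of the text):

text3.count("smote")

100 * text4.count('a') / len(text4)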