# Example 1
# Counting the number of characters in each word in a text
[len(w) for w in text1]

# bigrams()/trigrams() build the adjacent word pairs/triples of a token list.
# NOTE(review): in NLTK 3 these return lazy generators, not lists — wrap in
# list() before indexing or iterating twice; NLTK 2 returned lists. Confirm
# which NLTK version this snippet targets.
from nltk import bigrams, trigrams

bigrams(myText2)

trigrams(myText2)

bigramsText1 = bigrams(
    text1)  # NOTE(review): indexing like bigramsText1[0] only works if bigrams() returns a list (NLTK 2); under NLTK 3 use list(bigrams(text1)) first

# Collocations are frequent bigrams from words that are not so common as unigrams.
# This function returns nothing, just prints the collocations to screen
text1.collocations()

# Frequency distribution of word lengths. FreqDist is a dict-like object
# mapping each word length to its number of occurrences.
fdistWordLength = FreqDist([len(w) for w in text1])

fdistWordLength.keys()  # The different word lengths
fdistWordLength.values()  # The frequency of each word length
fdistWordLength.items()  # Shows both keys and values at the same time

fdist1['the']  # raw count of 'the' (fdist1 is a FreqDist built elsewhere)
fdist1.freq('the')  # Relative frequency (count / total tokens) of 'the'
fdist1.max()  # The single most frequent token

# String methods

s = "MatTias"
# Example 2
# Concatenating two token lists yields one combined list (plain list +)
myText1 + myText2

# Adding a word to a list (appending a word)
myText.append("LOL")

# We can find the FIRST position of given word:
myText.index('about')


# Counting the number of characters in each word in a text
[len(w) for w in text1]

# Collocations are frequent bigrams from words that are not so common as unigrams.
# This function returns nothing, just prints the collocations to screen
text1.collocations()

# Frequency distribution of word lengths. FreqDist is a dict-like object
# mapping each word length to its number of occurrences.
fdistWordLength = FreqDist([len(w) for w in text1])

fdistWordLength.keys() # The different word lengths
fdistWordLength.values() # The frequency of each word length
fdistWordLength.items() # Shows both keys and values at the same time

fdist1['the'] # raw count of 'the'
fdist1.freq('the') # Relative frequency (count / total tokens) of 'the'
fdist1.max() # The single most frequent token



#### MOVIE REVIEWS ####
# Example 3
# nltk.download()

print '===============查找关键词=================='
t1.concordance("america")

print '===============查找相似上下文==============='
t1.similar("america")

print '=============共同的语法结构================='
t1.common_contexts(['in', 'of'])

print '=================词汇分布图================='
t4.dispersion_plot(['citizens', 'democaracy', 'freedom', 'america'])

print '=================统计最常出现的词================'
freList = nk.FreqDist(t1)
freList.plot(50, cumulative=False)

print '=================统计长度超过15的词==============='
v = set(t1)
long_words = filter(lambda x: len(x) > 15, v)[:10]
print long_words

print '=================常用双连词搭配==============='
tuple = nk.bigrams(['all', 'in', 'of', 'take', 'like'])
for x in tuple:
    print x

print '=================基于语料的双连词搭配==============='
t1.collocations()
# Example 4
from nltk.book import text1
from nltk.book import text4
from nltk.book import text6

# concordance()/similar()/collocations() print their results themselves and
# return None, so they must not be wrapped in print() — doing so emitted a
# spurious "None" line after each one.
text1.concordance("monstrous")
text1.similar("monstrous")
text1.collocations()
# Opens a matplotlib window showing where each word occurs through the text.
text4.dispersion_plot(
    ["citizens", "democracy", "freedom", "duties", "America"])

print(text6.count("Very"))  # raw count of the token "Very"
print(text6.count('the') / float(len(text6)) * 100)  # 'the' as % of all tokens
print(text4.count("bless"))
print(text4[100])  # token at position 100
print(text4.index('the'))  # FIRST position of 'the'
print(text4[524])
print(text4.index('men'))
print(text4[0:len(text4)])  # the full token list (a copy)
# Example 5
import nltk
from nltk.book import text1

# Bigrams - pairs of sequential words. Single-argument print(...) behaves
# identically under Python 2 and Python 3, so use the function form.
print(list(nltk.bigrams('Hello world! How are you?'.split(' '))))

# Collocations - word pairs that appear together unusually often.
# collocations() prints its results itself and returns None, so wrapping it
# in print (as the original did) emitted a spurious "None" line.
text1.collocations()