# Counting the number of characters in each word in a text
from nltk.book import text1
[len(w) for w in text1]

# In NLTK 3, the bigram/trigram functions return generators, so wrap them
# in list() if you want to index the result
# (myText2 is assumed to be a token list defined earlier)
from nltk import bigrams, trigrams
bigrams(myText2)
trigrams(myText2)
bigramsText1 = list(bigrams(text1))
# bigramsText1[0] is the tuple containing the first bigram

# Collocations are frequent bigrams made of words that are not so common as unigrams.
# This function returns nothing, it just prints the collocations to the screen
text1.collocations()

# Computing the frequency distribution of word lengths. Returns a dict-like FreqDist.
from nltk import FreqDist
fdistWordLength = FreqDist([len(w) for w in text1])
fdistWordLength.keys()    # The different word lengths
fdistWordLength.values()  # The frequency of each word length
fdistWordLength.items()   # Shows both keys and values at the same time

fdist1 = FreqDist(text1)  # Frequency distribution over the words themselves
fdist1['the']             # Raw count of the word 'the'
fdist1.freq('the')        # Relative frequency of the word 'the'
fdist1.max()              # The most frequent word

# String methods
s = "MatTias"
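# A minimal self-contained sketch (the sample sentence below is invented,
# not taken from text1) showing that bigrams() yields a generator and how
# the FreqDist lookups above behave:
from nltk import bigrams, FreqDist

tokens = 'the cat sat on the mat'.split()
print(list(bigrams(tokens)))
# -> [('the', 'cat'), ('cat', 'sat'), ('sat', 'on'), ('on', 'the'), ('the', 'mat')]

fd = FreqDist(tokens)
print(fd['the'])       # -> 2, raw count
print(fd.freq('the'))  # -> 0.333..., relative frequency (2 of 6 tokens)
print(fd.max())        # -> 'the', the most frequent token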
# Concatenating two token lists
# (myText1 and myText2 are assumed to be token lists defined earlier)
myText1 + myText2

# Adding a word to a list (appending a word)
myText1.append("LOL")

# We can find the FIRST position of a given word:
myText1.index('about')

#### MOVIE REVIEWS ####
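# A quick, self-contained illustration of the list operations in the snippet
# above, with small stand-in token lists (the values for myText1/myText2 are
# our own samples, not from the original):
myText1 = ['we', 'talked', 'about', 'nltk', 'today']
myText2 = ['it', 'was', 'fun']
print(myText1 + myText2)       # concatenation -> one combined list
myText1.append("LOL")          # in-place append
print(myText1.index('about'))  # -> 2, the FIRST position of 'about'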
# nltk.download()
import nltk as nk
from nltk.book import text1 as t1, text4 as t4

print('=============== Keyword in context (concordance) ===============')
t1.concordance("america")

print('=============== Words appearing in similar contexts ===============')
t1.similar("america")

print('=============== Contexts shared by several words ===============')
t1.common_contexts(['in', 'of'])

print('=============== Lexical dispersion plot ===============')
t4.dispersion_plot(['citizens', 'democracy', 'freedom', 'America'])

print('=============== Most frequent words ===============')
freqList = nk.FreqDist(t1)
freqList.plot(50, cumulative=False)

print('=============== Words longer than 15 characters ===============')
v = set(t1)
long_words = [w for w in v if len(w) > 15][:10]
print(long_words)

print('=============== Bigrams of a word list ===============')
pairs = nk.bigrams(['all', 'in', 'of', 'take', 'like'])
for x in pairs:
    print(x)

print('=============== Corpus-based collocations ===============')
t1.collocations()
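# The Text methods above are not limited to the nltk.book texts; any token
# list can be wrapped in nltk.Text. A minimal sketch (the sample sentence
# is invented):
import nltk

my_text = nltk.Text('the quick brown fox jumps over the lazy dog'.split())
my_text.concordance('the')  # prints each occurrence of 'the' in context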
from nltk.book import text1, text4, text6

# concordance(), similar(), and collocations() print their results directly
# and return None, so they do not need to be wrapped in print()
text1.concordance("monstrous")
text1.similar("monstrous")
text1.collocations()

text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])

print(text6.count("Very"))
print(text6.count('the') / len(text6) * 100)  # 'the' as a percentage of all tokens
print(text4.count("bless"))
print(text4[100])            # the token at position 100
print(text4.index('the'))    # first position of 'the'
print(text4[524])
print(text4.index('men'))
print(text4[0:len(text4)])   # slicing: here, the whole text as a token list
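# The inline relative-frequency computation above can be wrapped in a small
# helper, in the spirit of the percentage() function from the NLTK book
# (adding it here is our choice, not part of the original snippet):
def percentage(count, total):
    """Return count as a percentage of total."""
    return 100 * count / total

print(percentage(text6.count('the'), len(text6)))  # same value as the inline version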
import nltk
from nltk.book import text1

# Bigrams - pairs of sequential words
print(list(nltk.bigrams('Hello world! How are you?'.split(' '))))

# Collocations - pairs of words that appear together more often than chance;
# collocations() prints its results rather than returning them
text1.collocations()
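# Text.collocations() only works on nltk.Text objects; for a plain token list
# you can use a collocation finder instead. A minimal sketch scoring bigrams
# by pointwise mutual information (the Genesis corpus is just an example and
# needs nltk.download('genesis') first):
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.corpus import genesis

bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(genesis.words('english-web.txt'))
print(finder.nbest(bigram_measures.pmi, 5))  # the five highest-scoring bigrams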