Example #1
from nltk.token import *
from nltk.tokenizer import WhitespaceTokenizer
from nltk.probability import FreqDist
from nltk.draw.plot import Plot

freq_dist = FreqDist()
corpus = Token(TEXT=open('dados/may2001_pdf.torto').read())
print corpus
# Split the corpus text on whitespace; the resulting subtokens are stored
# on the corpus token under the SUBTOKENS property.
WhitespaceTokenizer().tokenize(corpus)
print corpus

# Count how often each word type occurs in the corpus.
for token in corpus['SUBTOKENS']:
    freq_dist.inc(token['TEXT'])

# How many times does the word "form" appear in the corpus?
freq_dist.count('form')
# What is the frequency of the word "form"?
freq_dist.freq('form')
# How many word tokens were counted?
freq_dist.N()
# What word types were encountered?
freq_dist.samples()
# What is the most common word?
freq_dist.max()
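
The queries above use the old NLTK 1.x API (Token, WhitespaceTokenizer, FreqDist.inc). For comparison, here is a minimal sketch of the same queries with the current NLTK, assuming NLTK 3.x and Python 3, where FreqDist behaves like a Counter; plain whitespace splitting stands in for WhitespaceTokenizer:

# A sketch assuming NLTK 3.x / Python 3; not part of the original example.
from nltk.probability import FreqDist

text = open('dados/may2001_pdf.torto').read()
tokens = text.split()            # whitespace tokenization, as above
freq_dist = FreqDist(tokens)

print(freq_dist['form'])         # how many times 'form' occurs
print(freq_dist.freq('form'))    # relative frequency of 'form'
print(freq_dist.N())             # total number of tokens counted
print(list(freq_dist))           # the word types encountered
print(freq_dist.max())           # the most common word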

# What is the distribution of word lengths in a corpus?
freq_dist = FreqDist()
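
A minimal completion sketch for this question, assuming the corpus token from Example #1 and the same old FreqDist API as above, counting the length of each subtoken instead of its text:

# Count how often each word length occurs (sketch; not in the original).
for token in corpus['SUBTOKENS']:
    freq_dist.inc(len(token['TEXT']))

# For example: how many 3-character tokens were seen, and which length is most common?
freq_dist.count(3)
freq_dist.max()
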
Example #3
# An example of Zipf's law

from nltk.token import *
from nltk.tokenizer import WhitespaceTokenizer
from nltk.probability import FreqDist
from nltk.draw.plot import Plot
freq_dist = FreqDist()

corpus = Token(TEXT=open('dados/may2001_pdf.torto').read())
WhitespaceTokenizer().tokenize(corpus)

for token in corpus['SUBTOKENS']:
    freq_dist.inc(token['TEXT'])

# Collect the count of each word type, then sort the counts in
# descending order so that a word's position in the list is its
# frequency rank.
counts = [freq_dist.count(word) for word in freq_dist.samples()]
counts.sort()
counts.reverse()

# Pair each rank (starting at 1) with its count; this is the
# rank/frequency curve that Zipf's law describes.
points = []
rank = 1
for count in counts:
    points.append((rank, count))
    rank = rank + 1

print points
Plot(points)
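
Zipf's law says that a word's count is roughly proportional to 1 / its frequency rank, so the rank/count points above should fall on an approximately straight line when both axes are logarithmic. A small follow-up sketch, assuming Plot accepts a list of (x, y) pairs as in the call above:

import math

# Plot log(rank) against log(count); under Zipf's law this is roughly linear.
log_points = [(math.log(rank), math.log(count)) for (rank, count) in points]
Plot(log_points)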