from collections import Counter
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()

from nlpia.data.loaders import kite_text
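# tokenize the lowercased kite article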
tokens = tokenizer.tokenize(kite_text.lower())

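# count how often each token occurs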
token_counts = Counter(tokens)
token_counts

# remove common stopwords
import nltk
nltk.download('stopwords')

stopwords = nltk.corpus.stopwords.words('english')
tokens = [x for x in tokens if x not in stopwords]
kite_counts = Counter(tokens)

kite_counts

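# the ten most frequent tokens after stopword removal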
kite_counts.most_common(10)

# Example 2
from nlpia.data.loaders import kite_text, kite_history
from nltk.tokenize import TreebankWordTokenizer
from collections import Counter

tokenizer = TreebankWordTokenizer()
# build a corpus of two documents about kites
kite_intro = kite_text.lower()
kite_history = kite_history.lower()

# tokenize both documents in the corpus
kite_intro_tokens = tokenizer.tokenize(kite_intro)
kite_history_tokens = tokenizer.tokenize(kite_history)

# count the occurrences of each token
intro_counts = Counter(kite_intro_tokens)
history_counts = Counter(kite_history_tokens)

# document lengths (total number of tokens in each document)
intro_tokens_total = len(kite_intro_tokens)
history_tokens_total = len(kite_history_tokens)

intro_tf = {}
history_tf = {}

# compute the term frequency (TF) of 'kite' in each document
intro_tf['kite'] = intro_counts['kite'] / intro_tokens_total
history_tf['kite'] = history_counts['kite'] / history_tokens_total

# compute the TF of 'and' in each document
intro_tf['and'] = intro_counts['and'] / intro_tokens_total
history_tf['and'] = history_counts['and'] / history_tokens_total
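
# A minimal follow-up sketch (not part of the original listing): print the
# computed TF values side by side so the two documents can be compared.
# The label strings below are only an assumption about how you might want
# to inspect the results.
print('TF of "kite" in intro  :', intro_tf['kite'])
print('TF of "kite" in history:', history_tf['kite'])
print('TF of "and" in intro   :', intro_tf['and'])
print('TF of "and" in history :', history_tf['and'])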