Example #1
# These helpers come from the companion modules used in Example #2
# (assumption: `tokenize` also lives in `nlp02_onehot_word`).
from nlp02_onehot_word import build_vocab, tokenize
from nlp02_bow_hand import build_dcount


def build_vocab_df(docs,
                   min_len=2,
                   stopwords=None,
                   tokenizer=tokenize,
                   min_dc=2,
                   max_df=0.5):
    vocab = build_vocab(docs, min_len, stopwords,
                        tokenizer)  # Build the initial vocabulary
    dcount = build_dcount(docs, vocab)  # Count documents containing each word
    max_dc = max_df * len(docs)  # Convert the maximum DF ratio to a count
    # Keep words whose document count lies within [min_dc, max_dc]
    selected = [word for word, idx in vocab.items()
                if min_dc <= dcount[idx] <= max_dc]
    # Re-build the vocabulary with contiguous indices
    vocab = {word: idx for idx, word in enumerate(selected)}
    return vocab
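
A minimal usage sketch (the toy corpus and the `min_dc`/`max_df` values are illustrative, not from the original project):

docs = ['the cat sat on the mat',
        'the dog sat on the mat',
        'dogs and cats play outside']
# Keep words appearing in at least 2 documents but at most 70% of them
vocab = build_vocab_df(docs, min_dc=2, max_df=0.7)
print(vocab)  # word -> index for the words that fall inside the DF range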
Example #2
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from nlp02_onehot_word import build_vocab
from nlp02_onehot_doc import inverse_vocab
from nlp02_bow_hand import build_dcount

# Load the 20 Newsgroups dataset, stripping headers, footers, and quotes
remove = ('headers', 'footers', 'quotes')
train = datasets.fetch_20newsgroups(subset='train', remove=remove)

# Build a vocabulary and its per-word document counts
vocab = build_vocab(train.data)
vocab_inv = inverse_vocab(vocab)
dcount = build_dcount(train.data, vocab)
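# Illustrative check (not in the original snippet): vocab_inv is assumed to
# invert the word -> index mapping, i.e. it maps index -> word.
print(f'Word with index 0: {vocab_inv[0]}')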

# Print statistics of the vocabulary
print('### Statistics of the vocabulary')
print(f'* The number of documents: {len(train.data)}')
print(f'* The size of vocabulary: {len(vocab)}')
print(f'* The average number of new words per document: '
      f'{len(vocab) / len(train.data):.3f}')
print(f'* The range of document counts: ({dcount.min()}, {dcount.max()})')
print(f'* The average of document counts: {dcount.mean():.3f}')

# Plot the histogram of rare (low document count) words
plt.figure()
dcount10 = dcount[dcount < 10]
plt.hist(dcount10, bins=9, range=(1, 10), align='left')
plt.ylim(0, len(vocab))
plt.xlabel('Document count')
plt.ylabel('Number of words')
plt.show()
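
As a follow-up, the DF filter from Example #1 can be applied to the same corpus to see how much trimming rare and overly common words shrinks the vocabulary (an illustrative sketch; it assumes `build_vocab_df` from Example #1 is available in scope):

vocab_df = build_vocab_df(train.data, min_dc=2, max_df=0.5)
print(f'* The size of the DF-filtered vocabulary: {len(vocab_df)}')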