Exemplo n.º 1
0
# Read corpus and extract key phrases.
def worker(text):
    """Parse one document and return its phrase strings and its length.

    Runs the module-level ``nlp`` callable on ``text``, then returns a
    2-tuple: the ``.text`` of every entry in ``doc._.phrases`` (in the
    order given there) and ``len(doc)`` for the parsed document.
    """
    parsed = nlp(text)
    phrase_texts = [phrase.text for phrase in parsed._.phrases]
    return phrase_texts, len(parsed)


# Run `worker` over every document in `corpus` using a pool of
# `options.nproc` processes, accumulating the total document length and a
# corpus-wide count of each extracted phrase that is not a single word.
total_words = 0
vocab = collections.Counter()

# The pool is bound to `pool` (not `p`) so that the phrase loop below cannot
# rebind it, and it is used as a context manager so the worker processes are
# shut down once every result has been consumed.
with Pool(options.nproc) as pool:
    for phrases, num_words in tqdm(pool.imap(worker, corpus)):
        # Note: This count includes punctuation as well as words.
        total_words += num_words

        # Examine the top-ranked phrases in the document.
        seen = 0
        for phrase in phrases:
            # Single-word phrases are not counted.
            if len(phrase.split()) == 1:
                continue
            vocab[phrase] += 1
            seen += 1
            # A non-positive maxphrases_per_doc disables the per-document cap.
            if options.maxphrases_per_doc > 0 and seen == options.maxphrases_per_doc:
                break

# Report each phrase with its count in sorted phrase order, then a summary.
for k in sorted(vocab):
    print('{} {}'.format(k, vocab[k]))
print('corpus-size={} total-words={} total-vocab={}'.format(
    len(corpus), total_words, len(vocab)))