Example #1
import nltk


def install():
    REQUIRED_CORPORA = [
        'brown',  # Required for FastNPExtractor
        'punkt',  # Required for WordTokenizer
        'wordnet',  # Required for lemmatization and Wordnet
        'maxent_ne_chunker',
        'stopwords',
        'words'
    ]

    for each in REQUIRED_CORPORA:
        print('[+] Downloading corpus: "{0}"'.format(each))
        nltk.download(each)

    from metadoc.extract.pos import do_train
    print('[+] Training tagger now.')
    do_train()
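
nltk.download() already skips packages that are up to date, but it still touches the network; a minimal variant of the loop above that checks for each resource locally first, using the standard nltk.data.find API (the paths follow NLTK's on-disk layout: punkt under tokenizers/, maxent_ne_chunker under chunkers/, the rest under corpora/):

import nltk

# NLTK's on-disk layout for the resources that do not live under corpora/.
RESOURCE_PATHS = {
    'punkt': 'tokenizers/punkt',
    'maxent_ne_chunker': 'chunkers/maxent_ne_chunker',
}

for each in ['brown', 'punkt', 'wordnet', 'maxent_ne_chunker', 'stopwords', 'words']:
    try:
        # nltk.data.find raises LookupError when the resource is absent.
        nltk.data.find(RESOURCE_PATHS.get(each, 'corpora/' + each))
        print('[+] Corpus already present: "{0}"'.format(each))
    except LookupError:
        print('[+] Downloading corpus: "{0}"'.format(each))
        nltk.download(each)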
Example #2
import os

import nltk


def install_nltk_sets():
    DATA_DIR = os.path.join(os.path.dirname(__file__), "extract/data")
    REQUIRED_CORPORA = [
        'brown',  # Required for FastNPExtractor
        'punkt',  # Required for WordTokenizer
        'wordnet',  # Required for lemmatization and Wordnet
        'maxent_ne_chunker',
        'stopwords',
        'words'
    ]

    for each in REQUIRED_CORPORA:
        print('[+] Downloading corpus: "{0}"'.format(each))
        nltk.download(each, download_dir=DATA_DIR)

    from metadoc.extract.pos import do_train
    print('[+] Training tagger now.')
    do_train()
    remove_zips(DATA_DIR)  # helper defined elsewhere in the module; by its name, it deletes the leftover corpus .zip archives
    return
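
Note that NLTK does not search a custom download_dir automatically. A minimal sketch of wiring it up with the standard nltk.data API (DATA_DIR mirrors the example above):

import os

import nltk

# Mirrors the DATA_DIR used in the example above; adjust to your layout.
DATA_DIR = os.path.join(os.path.dirname(__file__), "extract/data")

# Prepend the custom directory so NLTK checks it before its defaults.
nltk.data.path.insert(0, DATA_DIR)

# Sanity check: raises LookupError if the corpus cannot be resolved.
nltk.data.find("corpora/wordnet")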
Example #3
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from metadoc.extract.pos import do_train
do_train()  # trains metadoc's averaged perceptron tagger (see Example #5)
Example #4
def test_get_all_local(self):
    do_train()
    self.extractor.get_all()
    assert self.extractor.contenthash == "2b374ca41d42bd582e500e6cdbc936ef"
    assert self.extractor.title == "Some Fake News Publishers Just Happen to Be Donald Trump’s Cronies"
Example #5
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import nltk
from metadoc.extract.pos import do_train

REQUIRED_CORPORA = [
  'brown', # Required for FastNPExtractor
  'punkt', # Required for WordTokenizer
  'wordnet', # Required for lemmatization and Wordnet
  'maxent_ne_chunker',
  'stopwords',
  'words'
]

for each in REQUIRED_CORPORA:
  print('Downloading "{0}"'.format(each))
  nltk.download(each)

do_train() # Averaged Perceptron Tagger
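
For context, a rough sketch of what the downloaded resources enable, using plain NLTK calls rather than metadoc's own pipeline (nltk.pos_tag relies on NLTK's stock 'averaged_perceptron_tagger' package, the off-the-shelf counterpart of the tagger do_train() builds):

import nltk

nltk.download('averaged_perceptron_tagger')  # stock tagger; metadoc trains its own

sentence = "Some fake news publishers are cronies."

# 'punkt' backs nltk.word_tokenize.
tokens = nltk.word_tokenize(sentence)

# Part-of-speech tags, e.g. [('Some', 'DT'), ('fake', 'JJ'), ...]
print(nltk.pos_tag(tokens))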