import nltk


def install():
    REQUIRED_CORPORA = [
        'brown',              # Required for FastNPExtractor
        'punkt',              # Required for WordTokenizer
        'wordnet',            # Required for lemmatization and Wordnet
        'maxent_ne_chunker',
        'stopwords',
        'words'
    ]
    for each in REQUIRED_CORPORA:
        print('[+] Downloading corpus: "{0}"'.format(each))
        nltk.download(each)
    from metadoc.extract.pos import do_train
    print('[+] Training tagger now.')
    do_train()
import os

import nltk


def install_nltk_sets():
    DATA_DIR = os.path.join(os.path.dirname(__file__), "extract/data")
    REQUIRED_CORPORA = [
        'brown',              # Required for FastNPExtractor
        'punkt',              # Required for WordTokenizer
        'wordnet',            # Required for lemmatization and Wordnet
        'maxent_ne_chunker',
        'stopwords',
        'words'
    ]
    for each in REQUIRED_CORPORA:
        print('[+] Downloading corpus: "{0}"'.format(each))
        nltk.download(each, download_dir=DATA_DIR)
    from metadoc.extract.pos import do_train
    print('[+] Training tagger now.')
    do_train()
    remove_zips(DATA_DIR)
    return
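# Note: remove_zips() is called above but not defined in this snippet. A minimal
# sketch of what it might do, assuming it simply deletes the corpus .zip archives
# that nltk.download() leaves next to the extracted data (the function name is
# taken from the call above; the body here is illustrative only):
import glob
import os


def remove_zips(data_dir):
    # Walk the download directory and delete every leftover archive,
    # keeping only the unpacked corpora.
    for zip_path in glob.glob(os.path.join(data_dir, "**", "*.zip"), recursive=True):
        os.remove(zip_path)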
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from metadoc.extract.pos import do_train

do_train()
def test_get_all_local(self):
    do_train()
    self.extractor.get_all()
    assert self.extractor.contenthash == "2b374ca41d42bd582e500e6cdbc936ef"
    assert self.extractor.title == "Some Fake News Publishers Just Happen to Be Donald Trump’s Cronies"
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import nltk

from metadoc.extract.pos import do_train

REQUIRED_CORPORA = [
    'brown',              # Required for FastNPExtractor
    'punkt',              # Required for WordTokenizer
    'wordnet',            # Required for lemmatization and Wordnet
    'maxent_ne_chunker',
    'stopwords',
    'words'
]

for each in REQUIRED_CORPORA:
    print('Downloading "{0}"'.format(each))
    nltk.download(each)

do_train()  # Averaged Perceptron Tagger
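# When the corpora are downloaded into a package-local directory instead of the
# default NLTK data path (as in install_nltk_sets() above), NLTK has to be told
# where to find them at runtime. A minimal sketch, assuming the same
# "extract/data" layout used there:
import os

import nltk

DATA_DIR = os.path.join(os.path.dirname(__file__), "extract/data")
nltk.data.path.append(DATA_DIR)  # nltk.corpus loaders will now also search this directory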