def setUp(self):
    """Prepare a scratch directory and a ``WikiReader`` for the test.

    A fresh temporary directory (prefix ``test_corpora``) is created next
    to this test module, the ``WIKITEXT`` fixture is written into it as
    ``wikitext.xml.bz2`` in binary mode, and a ``WikiReader`` is opened on
    that file.  The directory path is kept on ``self.tempdir`` and the
    reader on ``self.wikireader``.
    """
    # Keep the temp dir beside the test file rather than in the system tmp.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    self.tempdir = tempfile.mkdtemp(prefix='test_corpora', dir=module_dir)

    dump_path = os.path.join(self.tempdir, 'wikitext.xml.bz2')
    write_file(WIKITEXT, dump_path, mode='wb', auto_make_dirs=True)

    self.wikireader = WikiReader(dump_path)
def parse_and_save():
    """Parse the Wikipedia dump with spaCy and write packed sentences to disk.

    Section texts from ``WikiReader(wikidump)`` are streamed through the
    spaCy ``'en'`` pipeline.  Documents with seven or fewer tokens are
    skipped; every sentence of the remaining documents is packed with
    ``Preprocessor`` and written through ``FilePoolWriter``.  The vocabulary
    is saved whenever a processed document's index is a multiple of 10000
    and once more after the stream is exhausted.  Finally an interactive
    IPython shell is opened so the results can be inspected.
    """
    en = spacy.load('en')
    reader = WikiReader(wikidump)
    records = reader.records()

    def section_texts_flat(records):
        """Yield the text of every section of every record in ``records``.

        ``OSError`` raised while fetching a record is reported and skipped
        (best effort).  Exhaustion of ``records`` ends the generator.
        """
        while True:
            try:
                record = next(records)
            except StopIteration:
                # Must be handled explicitly: under PEP 479 a StopIteration
                # escaping a generator body is turned into RuntimeError,
                # which would abort the run at the very end of the dump.
                return
            except OSError as e:
                print('error: %s' % e)
            else:
                for section in record['sections']:
                    yield section['text']

    pipe = en.pipe(section_texts_flat(records),
                   n_threads=cpu_count(), batch_size=1000)
    # Non-batched alternative, kept for debugging:
    # pipe = (en(txt) for txt in section_texts_flat(records))
    preproc = Preprocessor(en.vocab)

    with FilePoolWriter(wikidoc_dir, wikidoc_fn_template) as f:
        for i, doc in enumerate(tqdm.tqdm(pipe)):
            if len(doc._py_tokens) <= 7:
                # Very short documents are not worth keeping.
                continue
            for sent in doc.sents:
                packed = preproc.pack(sent)
                f.write(packed)
            # Periodic checkpoint of the vocabulary (skipped documents do
            # not reach this point, matching the original control flow).
            if i % 10000 == 0:
                print('i=%s, saving vocab' % i)
                save_vocab(en.vocab)

    save_vocab(en.vocab)

    # Drop into an interactive shell for inspection after the run.
    import IPython
    IPython.embed()