Exemplo n.º 1
0
def parse_and_save():
    """Run the Wikipedia dump through spaCy and write packed sentences to disk.

    Section texts from ``WikiReader(wikidump).records()`` are streamed through
    ``en.pipe``. Each sentence of every sufficiently long doc is packed with
    ``Preprocessor.pack`` and written through a ``FilePoolWriter``. The
    vocabulary is saved periodically during the run and once more at the end.
    Finally an interactive IPython shell is opened.
    """
    en = spacy.load('en')
    reader = WikiReader(wikidump)
    records = reader.records()

    def section_texts_flat(records):
        """Yield ``section['text']`` for every section of every record."""
        while True:
            try:
                record = next(records)
            except StopIteration:
                # End of input. This must be handled explicitly: a
                # StopIteration escaping a generator body is re-raised as
                # RuntimeError (PEP 479), which would crash the pipeline
                # after the last record.
                return
            except OSError as e:
                # Best effort: report the read error and try the next record.
                print('error: %s' % e)
            else:
                for section in record['sections']:
                    yield section['text']

    pipe = en.pipe(section_texts_flat(records),
                   n_threads=cpu_count(),
                   batch_size=1000)
    # pipe = (en(txt) for txt in section_texts_flat(records))
    preproc = Preprocessor(en.vocab)
    with FilePoolWriter(wikidoc_dir, wikidoc_fn_template) as f:
        for i, doc in enumerate(tqdm.tqdm(pipe)):
            # NOTE(review): `_py_tokens` is a private attribute, presumably the
            # doc's token list -- confirm against the spaCy version in use.
            if len(doc._py_tokens) <= 7:
                # Too short to be worth keeping. The `continue` also skips
                # the periodic vocab save below for this index.
                continue
            for sent in doc.sents:
                packed = preproc.pack(sent)
                f.write(packed)
            if i % 10000 == 0:
                print('i=%s, saving vocab' % i)
                save_vocab(en.vocab)
    # Final save so entries added since the last checkpoint are not lost.
    save_vocab(en.vocab)
    # Drop into an interactive shell for inspection once processing is done.
    import IPython
    IPython.embed()
class WikiReaderTestCase(unittest.TestCase):
    """Tests for ``WikiReader.texts()`` and ``WikiReader.records()``.

    Each test runs against the ``WIKITEXT`` fixture written to a fresh
    temporary directory next to this file.
    """

    def setUp(self):
        """Write the fixture to a new temp dir and open a ``WikiReader`` on it."""
        self.tempdir = tempfile.mkdtemp(prefix='test_corpora',
                                        dir=os.path.dirname(
                                            os.path.abspath(__file__)))
        # Register removal immediately: unittest skips tearDown() when setUp()
        # raises, but still runs cleanups, so the directory cannot leak if
        # writing the fixture or constructing the reader fails.
        self.addCleanup(shutil.rmtree, self.tempdir)
        wiki_fname = os.path.join(self.tempdir, 'wikitext.xml.bz2')
        write_file(WIKITEXT, wiki_fname, mode='wb', auto_make_dirs=True)
        self.wikireader = WikiReader(wiki_fname)

    def test_texts(self):
        """Every item from ``texts()`` is a ``unicode_type`` instance."""
        texts = list(self.wikireader.texts())
        for text in texts:
            self.assertIsInstance(text, unicode_type)

    def test_texts_min_len(self):
        """``min_len=300`` leaves exactly one text from the fixture."""
        texts = list(self.wikireader.texts(min_len=300))
        self.assertEqual(len(texts), 1)

    def test_texts_limit(self):
        """``limit=1`` yields exactly one text."""
        texts = list(self.wikireader.texts(limit=1))
        self.assertEqual(len(texts), 1)

    def test_records(self):
        """Every item from ``records()`` is a dict."""
        records = list(self.wikireader.records())
        for record in records:
            self.assertIsInstance(record, dict)

    def test_records_min_len(self):
        """``min_len=300`` leaves exactly one record from the fixture."""
        records = list(self.wikireader.records(min_len=300))
        self.assertEqual(len(records), 1)

    def test_records_limit(self):
        """``limit=1`` yields exactly one record."""
        records = list(self.wikireader.records(limit=1))
        self.assertEqual(len(records), 1)
class WikiReaderTestCase(unittest.TestCase):
    """Check the ``texts()`` and ``records()`` iterators of ``WikiReader``.

    The reader is pointed at the ``WIKITEXT`` fixture, written into a
    per-test temporary directory created beside this file.
    """

    def setUp(self):
        """Create the temp dir, write the fixture file, and build the reader."""
        self.tempdir = tempfile.mkdtemp(
            prefix='test_corpora', dir=os.path.dirname(os.path.abspath(__file__)))
        # Cleanups run even when setUp() fails part-way, whereas tearDown()
        # does not; registering here guarantees the directory is removed.
        self.addCleanup(shutil.rmtree, self.tempdir)
        wiki_fname = os.path.join(self.tempdir, 'wikitext.xml.bz2')
        write_file(WIKITEXT, wiki_fname, mode='wb', auto_make_dirs=True)
        self.wikireader = WikiReader(wiki_fname)

    def test_texts(self):
        """``texts()`` yields only ``unicode_type`` instances."""
        texts = list(self.wikireader.texts())
        for text in texts:
            self.assertIsInstance(text, unicode_type)

    def test_texts_min_len(self):
        """With ``min_len=300`` the fixture produces a single text."""
        texts = list(self.wikireader.texts(min_len=300))
        self.assertEqual(len(texts), 1)

    def test_texts_limit(self):
        """With ``limit=1`` a single text is produced."""
        texts = list(self.wikireader.texts(limit=1))
        self.assertEqual(len(texts), 1)

    def test_records(self):
        """``records()`` yields only dicts."""
        records = list(self.wikireader.records())
        for record in records:
            self.assertIsInstance(record, dict)

    def test_records_min_len(self):
        """With ``min_len=300`` the fixture produces a single record."""
        records = list(self.wikireader.records(min_len=300))
        self.assertEqual(len(records), 1)

    def test_records_limit(self):
        """With ``limit=1`` a single record is produced."""
        records = list(self.wikireader.records(limit=1))
        self.assertEqual(len(records), 1)