def setUp(self):
    """Prepare a scratch directory holding the wiki fixture and a reader for it.

    A temporary directory (prefix ``test_corpora``) is created in the same
    directory as this file, ``WIKITEXT`` is written there as
    ``wikitext.xml.bz2`` via ``write_file`` in ``'wb'`` mode, and a
    ``WikiReader`` pointed at that file is stored on ``self.wikireader``.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    self.tempdir = tempfile.mkdtemp(prefix='test_corpora', dir=here)
    dump_path = os.path.join(self.tempdir, 'wikitext.xml.bz2')
    write_file(WIKITEXT, dump_path, mode='wb', auto_make_dirs=True)
    self.wikireader = WikiReader(dump_path)
Пример #2
0
def parse_and_save():
    """Parse every section of the wiki dump with spaCy and store packed sentences.

    Records are read from ``WikiReader(wikidump)``; the text of each section
    is fed through the spaCy English pipeline. Documents with seven or fewer
    tokens are skipped; every sentence of the remaining documents is packed
    with ``Preprocessor`` and written through a ``FilePoolWriter``. The
    vocabulary is saved every 10000 documents and once more when the dump is
    exhausted, after which an interactive IPython shell is opened.
    """
    en = spacy.load('en')
    reader = WikiReader(wikidump)
    records = reader.records()

    def section_texts_flat(records):
        """Yield the text of each section of each record, in order.

        ``OSError`` raised while fetching a record is reported and the loop
        moves on to the next ``next()`` call.
        """
        while True:
            try:
                record = next(records)
            except StopIteration:
                # Input exhausted. A StopIteration that escapes a generator
                # body is turned into RuntimeError (PEP 479), so finish the
                # generator explicitly instead of letting it propagate.
                return
            except OSError as e:
                print('error: %s' % e)
            else:
                for section in record['sections']:
                    yield section['text']

    pipe = en.pipe(section_texts_flat(records),
                   n_threads=cpu_count(),
                   batch_size=1000)
    # Non-batched alternative:
    # pipe = (en(txt) for txt in section_texts_flat(records))
    preproc = Preprocessor(en.vocab)
    with FilePoolWriter(wikidoc_dir, wikidoc_fn_template) as f:
        for i, doc in enumerate(tqdm.tqdm(pipe)):
            if len(doc._py_tokens) <= 7:
                # Too short to be worth storing.
                continue
            for sent in doc.sents:
                packed = preproc.pack(sent)
                f.write(packed)
            if i % 10000 == 0:
                # Periodic checkpoint of the vocabulary.
                print('i=%s, saving vocab' % i)
                save_vocab(en.vocab)
    save_vocab(en.vocab)
    # NOTE(review): drops into an interactive shell after processing --
    # presumably a debugging aid; confirm it is still wanted.
    import IPython
    IPython.embed()
Пример #3
0
 def setUp(self):
     """Prepare a scratch directory holding the wiki fixture and a reader for it.

     A temporary directory (prefix ``test_corpora``) is created in the same
     directory as this file, ``WIKITEXT`` is written to ``wikitext.xml.bz2``
     inside it through ``bzip_open`` in ``'w'`` mode, and a ``WikiReader``
     for that file is stored on ``self.wikireader``.
     """
     here = os.path.dirname(os.path.abspath(__file__))
     self.tempdir = tempfile.mkdtemp(prefix='test_corpora', dir=here)
     dump_path = os.path.join(self.tempdir, 'wikitext.xml.bz2')
     with bzip_open(dump_path, mode='w') as fixture:
         fixture.write(WIKITEXT)
     self.wikireader = WikiReader(dump_path)
Пример #4
0
class WikiReaderTestCase(unittest.TestCase):
    """Checks ``WikiReader.texts`` and ``WikiReader.pages`` on a small fixture file."""

    def setUp(self):
        """Write ``WIKITEXT`` into a fresh temp directory and open a reader on it."""
        here = os.path.dirname(os.path.abspath(__file__))
        self.tempdir = tempfile.mkdtemp(prefix='test_corpora', dir=here)
        dump_path = os.path.join(self.tempdir, 'wikitext.xml.bz2')
        with bzip_open(dump_path, mode='w') as fixture:
            fixture.write(WIKITEXT)
        self.wikireader = WikiReader(dump_path)

    def test_texts(self):
        """Each item produced by ``texts()`` is a ``str``."""
        produced = list(self.wikireader.texts())
        for item in produced:
            self.assertIsInstance(item, str)

    def test_texts_min_len(self):
        """``texts(min_len=300)`` yields exactly one item for the fixture."""
        long_enough = list(self.wikireader.texts(min_len=300))
        self.assertEqual(len(long_enough), 1)

    def test_texts_limit(self):
        """``texts(limit=1)`` yields exactly one item."""
        capped = list(self.wikireader.texts(limit=1))
        self.assertEqual(len(capped), 1)

    def test_pages(self):
        """Each item produced by ``pages()`` is a ``dict``."""
        produced = list(self.wikireader.pages())
        for item in produced:
            self.assertIsInstance(item, dict)

    def test_pages_min_len(self):
        """``pages(min_len=300)`` yields exactly one item for the fixture."""
        long_enough = list(self.wikireader.pages(min_len=300))
        self.assertEqual(len(long_enough), 1)

    def test_pages_limit(self):
        """``pages(limit=1)`` yields exactly one item."""
        capped = list(self.wikireader.pages(limit=1))
        self.assertEqual(len(capped), 1)

    def tearDown(self):
        """Delete every file in the temp directory, then the directory itself."""
        scratch = self.tempdir
        for entry in os.listdir(scratch):
            os.remove(os.path.join(scratch, entry))
        os.rmdir(scratch)
class WikiReaderTestCase(unittest.TestCase):
    """Checks ``WikiReader.texts`` and ``WikiReader.pages`` on a small fixture file."""

    def setUp(self):
        """Write ``WIKITEXT`` into a fresh temp directory and open a reader on it."""
        here = os.path.dirname(os.path.abspath(__file__))
        self.tempdir = tempfile.mkdtemp(prefix='test_corpora', dir=here)
        dump_path = os.path.join(self.tempdir, 'wikitext.xml.bz2')
        write_file(WIKITEXT, dump_path, mode='wb', auto_make_dirs=True)
        self.wikireader = WikiReader(dump_path)

    def test_texts(self):
        """Each item produced by ``texts()`` is an instance of ``unicode_type``."""
        produced = list(self.wikireader.texts())
        for item in produced:
            self.assertIsInstance(item, unicode_type)

    def test_texts_min_len(self):
        """``texts(min_len=300)`` yields exactly one item for the fixture."""
        long_enough = list(self.wikireader.texts(min_len=300))
        self.assertEqual(len(long_enough), 1)

    def test_texts_limit(self):
        """``texts(limit=1)`` yields exactly one item."""
        capped = list(self.wikireader.texts(limit=1))
        self.assertEqual(len(capped), 1)

    def test_pages(self):
        """Each item produced by ``pages()`` is a ``dict``."""
        produced = list(self.wikireader.pages())
        for item in produced:
            self.assertIsInstance(item, dict)

    def test_pages_min_len(self):
        """``pages(min_len=300)`` yields exactly one item for the fixture."""
        long_enough = list(self.wikireader.pages(min_len=300))
        self.assertEqual(len(long_enough), 1)

    def test_pages_limit(self):
        """``pages(limit=1)`` yields exactly one item."""
        capped = list(self.wikireader.pages(limit=1))
        self.assertEqual(len(capped), 1)

    def tearDown(self):
        """Remove the temp directory created in ``setUp`` and everything in it."""
        scratch = self.tempdir
        shutil.rmtree(scratch)
class WikiReaderTestCase(unittest.TestCase):
    """Checks ``WikiReader.texts`` and ``WikiReader.records`` on a small fixture file."""

    def setUp(self):
        """Write ``WIKITEXT`` into a fresh temp directory and open a reader on it."""
        here = os.path.dirname(os.path.abspath(__file__))
        self.tempdir = tempfile.mkdtemp(prefix='test_corpora', dir=here)
        dump_path = os.path.join(self.tempdir, 'wikitext.xml.bz2')
        write_file(WIKITEXT, dump_path, mode='wb', auto_make_dirs=True)
        self.wikireader = WikiReader(dump_path)

    def test_texts(self):
        """Each item produced by ``texts()`` is an instance of ``unicode_type``."""
        produced = list(self.wikireader.texts())
        for item in produced:
            self.assertIsInstance(item, unicode_type)

    def test_texts_min_len(self):
        """``texts(min_len=300)`` yields exactly one item for the fixture."""
        long_enough = list(self.wikireader.texts(min_len=300))
        self.assertEqual(len(long_enough), 1)

    def test_texts_limit(self):
        """``texts(limit=1)`` yields exactly one item."""
        capped = list(self.wikireader.texts(limit=1))
        self.assertEqual(len(capped), 1)

    def test_records(self):
        """Each item produced by ``records()`` is a ``dict``."""
        produced = list(self.wikireader.records())
        for item in produced:
            self.assertIsInstance(item, dict)

    def test_records_min_len(self):
        """``records(min_len=300)`` yields exactly one item for the fixture."""
        long_enough = list(self.wikireader.records(min_len=300))
        self.assertEqual(len(long_enough), 1)

    def test_records_limit(self):
        """``records(limit=1)`` yields exactly one item."""
        capped = list(self.wikireader.records(limit=1))
        self.assertEqual(len(capped), 1)

    def tearDown(self):
        """Remove the temp directory created in ``setUp`` and everything in it."""
        scratch = self.tempdir
        shutil.rmtree(scratch)
 def setUp(self):
     """Prepare a scratch directory holding the wiki fixture and a reader for it.

     A temporary directory (prefix ``test_corpora``) is created in the same
     directory as this file, ``WIKITEXT`` is written there as
     ``wikitext.xml.bz2`` via ``write_file`` in ``'wb'`` mode, and a
     ``WikiReader`` pointed at that file is stored on ``self.wikireader``.
     """
     here = os.path.dirname(os.path.abspath(__file__))
     self.tempdir = tempfile.mkdtemp(prefix='test_corpora', dir=here)
     dump_path = os.path.join(self.tempdir, 'wikitext.xml.bz2')
     write_file(WIKITEXT, dump_path, mode='wb', auto_make_dirs=True)
     self.wikireader = WikiReader(dump_path)