Example #1
from multiprocessing import Pool

import spacy
from textacy.datasets import Wikipedia  # assumed import path (older textacy releases)

# The imports above, the spaCy pipeline, and the two function heads below are
# assumed reconstructions: the original excerpt begins inside the lemmatization
# loop, and these pieces are inferred from how process_mini_batch is used below.
nlp = spacy.load('en')  # assumed model name; newer spaCy uses e.g. 'en_core_web_sm'


def lemmatize_nouns(doc):
    # Keep the lemmas of the ASCII nouns and proper nouns in one document.
    words = []
    for token in doc:
        if not token.is_ascii:
            continue
        if token.pos_ in {u'NOUN', u'PROPN'}:
            words.append(token.lemma_)

    return words


def process_mini_batch(texts):
    # One list of lemmas per input text, in the shape the writer loop expects.
    return [lemmatize_nouns(doc) for doc in nlp.pipe(texts)]


pool_size = 32

p = Pool(pool_size)

wp = Wikipedia(lang='en', version='latest')

with open("lemmatized_nouns/output.txt", "w+") as f:
    batch, batch_max = [], 2**14

    for text in wp.texts(min_len=300):
        batch.append(text)
        if len(batch) >= batch_max:
            # Split the batch round-robin into pool_size slices (see the
            # illustration after this example); each worker returns one list
            # of lemmas per document it processed.
            results = p.map(process_mini_batch,
                            (batch[i::pool_size] for i in range(pool_size)))
            for result in results:
                for entry in result:
                    # Write each document on its own line, lemmas separated by spaces
                    f.write(' '.join(entry) + "\n")

            batch = []

    # Flush any texts left over after the last full batch
    if batch:
        results = p.map(process_mini_batch,
                        (batch[i::pool_size] for i in range(pool_size)))
        for result in results:
            for entry in result:
                f.write(' '.join(entry) + "\n")
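
The generator passed to Pool.map splits each full batch round-robin into pool_size slices, so every worker receives roughly batch_max / pool_size texts. A quick self-contained illustration of that slicing, using small toy values rather than the script's own:

pool_size = 4
batch = list(range(10))

# batch[i::pool_size] takes every pool_size-th element, starting at offset i
slices = [batch[i::pool_size] for i in range(pool_size)]
print(slices)  # -> [[0, 4, 8], [1, 5, 9], [2, 6], [3, 7]]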
Example #2
import pytest
from textacy.datasets.wikipedia import Wikipedia  # assumed import path


# unittest-style: a method of a TestCase whose setUp() creates self.tempdir.
def test_ioerror(self):
    dataset = Wikipedia(data_dir=self.tempdir)
    with self.assertRaises(IOError):
        _ = list(dataset.texts())


# pytest-style equivalent, using the built-in tmpdir fixture.
def test_ioerror(tmpdir):
    dataset = Wikipedia(data_dir=str(tmpdir))
    with pytest.raises(IOError):
        _ = list(dataset.texts())
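
Both variants check that streaming from an empty data directory raises IOError because no Wikipedia dump is present there. A minimal sketch of the non-failing path, assuming the import path and the download() method of older textacy releases:

from itertools import islice

from textacy.datasets.wikipedia import Wikipedia  # assumed import path

wp = Wikipedia(lang='en', version='latest')
wp.download()  # assumed API; without a local dump, texts() raises IOError
for text in islice(wp.texts(min_len=300), 5):
    print(text[:80])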