from multiprocessing import Pool

import spacy
from textacy.datasets import Wikipedia

nlp = spacy.load('en')

def process_document(doc):
    # Collect the lemmas of the ASCII nouns and proper nouns in one parsed doc
    words = []
    for token in doc:
        if not token.is_ascii:
            continue
        if token.pos_ in {u'NOUN', u'PROPN'}:
            words.append(token.lemma_)
    return words

def process_mini_batch(texts):
    return [process_document(doc) for doc in nlp.pipe(texts)]

def flush_batch(p, batch, f):
    # Returns pool_size arrays of roughly (batch_max / pool_size) processed
    # documents, each document represented as a list of lemmas
    results = p.map(process_mini_batch,
                    (batch[i::pool_size] for i in range(pool_size)))
    for result in results:
        for entry in result:
            # Write each document on its own line
            f.write(' '.join([word.encode('utf-8') for word in entry]) + "\n")

pool_size = 32
p = Pool(pool_size)
wp = Wikipedia(lang='en', version='latest')
with open("lemmatized_nouns/output.txt", "w+") as f:
    batch, batch_max = [], 2**14
    for text in wp.texts(min_len=300):
        batch.append(text)
        if len(batch) >= batch_max:
            flush_batch(p, batch, f)
            batch = []
    if batch:
        # Process the final, partially filled batch so no documents are dropped
        flush_batch(p, batch, f)
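A quick aside on the (batch[i::pool_size] for i in range(pool_size)) generator above: it deals the queued batch out into pool_size interleaved slices, one per worker. A minimal sketch of the same idea on a toy list (the names here are illustrative, not part of the pipeline):

# Toy illustration of the batch[i::pool_size] striding used above
batch = list(range(10))   # stand-in for 10 queued documents
pool_size = 4
slices = [batch[i::pool_size] for i in range(pool_size)]
# slices == [[0, 4, 8], [1, 5, 9], [2, 6], [3, 7]]
# Every document lands in exactly one slice, and slice sizes differ by
# at most one, so the work is spread evenly across the workers.
assert sorted(sum(slices, [])) == batch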
def test_ioerror(self):
    dataset = Wikipedia(data_dir=self.tempdir)
    with self.assertRaises(IOError):
        _ = list(dataset.texts())
def test_ioerror(tmpdir):
    dataset = Wikipedia(data_dir=str(tmpdir))
    with pytest.raises(IOError):
        _ = list(dataset.texts())
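Both variants point the dataset at an empty temporary directory, so iterating over texts() raises IOError because no Wikipedia dump exists there yet. For contrast, a minimal happy-path sketch, assuming textacy's usual dataset interface (the download() call and the data_dir path here are assumptions, not taken from the tests above):

from textacy.datasets import Wikipedia

# Assumed usage: once a dump has been downloaded into data_dir,
# texts() yields article texts instead of raising IOError
wp = Wikipedia(data_dir="/tmp/wikipedia_data", lang='en', version='latest')
wp.download()  # assumed textacy dataset method; fetches the compressed dump
first_article = next(wp.texts(min_len=300))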