def setUp(self):
    """Build two snapshot corpora at fixed refs and a combiner over both.

    Stores the individual corpora (``corpus1``, ``corpus2``), the combined
    corpus (``corpus``), and the fully iterated document lists for each
    (``docs1``, ``docs2``, ``docs``) on the test instance.
    """
    super(TestCorpusCombiner, self).setUp()

    # Preprocessing keywords passed unchanged to both snapshot corpora.
    corpus_options = dict(
        remove_stops=False,
        lower=True,
        split=True,
        min_len=0,
    )

    # 3 documents
    first_project = self.Project(
        ref=u'2aeb2e7c78259833e1218b69f99dab3acd00970c',
        level='file',
        src_path=self.basepath,
    )
    self.corpus1 = SnapshotCorpus(
        repo=self.repo,
        project=first_project,
        **corpus_options
    )
    self.docs1 = list(self.corpus1)

    # 3 old documents + 2 new documents
    second_project = self.Project(
        ref=u'3587d37e7d476ddc7b673c41762dc89c8ca63a6a',
        level='file',
        src_path=self.basepath,
    )
    self.corpus2 = SnapshotCorpus(
        repo=self.repo,
        project=second_project,
        **corpus_options
    )
    self.docs2 = list(self.corpus2)

    # The corpus under test wraps both snapshots, in this order.
    self.corpus = CorpusCombiner([self.corpus1, self.corpus2])
    self.docs = list(self.corpus)
def setUp(self):
    """Create a SnapshotCorpus over ``self.repo`` and iterate it once.

    No project is passed here; the corpus is built from the repository
    alone. The corpus is stored as ``self.corpus`` and its documents, fully
    materialised, as ``self.docs``.
    """
    super(TestSnapshotCorpus, self).setUp()

    # Preprocessing keywords for the corpus under test.
    corpus_options = dict(
        remove_stops=False,
        lower=True,
        split=True,
        min_len=0,
    )
    self.corpus = SnapshotCorpus(repo=self.repo, **corpus_options)
    self.docs = list(self.corpus)
def setUp(self):
    """Create a SnapshotCorpus for a project pinned to a fixed ref.

    The corpus is stored as ``self.corpus`` and its documents, fully
    materialised, as ``self.docs``.
    """
    super(TestSnapshotCorpusAtRef, self).setUp()

    # Project description handed to the corpus; the ref is hard-coded.
    pinned_project = self.Project(
        ref=u'f33a0fb070a34fc1b9105453b3ffb4edc49131d9',
        level='file',
        src_path=self.basepath,
    )

    # Preprocessing keywords for the corpus under test.
    corpus_options = dict(
        remove_stops=False,
        lower=True,
        split=True,
        min_len=0,
    )
    self.corpus = SnapshotCorpus(
        repo=self.repo,
        project=pinned_project,
        **corpus_options
    )
    self.docs = list(self.corpus)
def test_lazy(self):
    """Check that ``lazy_dict=True`` defers building ``id2word``.

    The dictionary must be empty right after construction and non-empty
    once the corpus has been iterated.
    """
    lazy_corpus = SnapshotCorpus(
        repo=self.repo,
        remove_stops=False,
        lower=True,
        split=True,
        min_len=0,
        lazy_dict=True,
    )
    self.assertEqual(len(lazy_corpus.id2word), 0)

    # if lazy, iterating over the corpus will now build the dict;
    # the documents themselves are not needed, only the full pass.
    list(lazy_corpus)

    self.assertGreater(len(lazy_corpus.id2word), 0)