def setUp(self):
        """Build two SnapshotCorpus fixtures at fixed refs and combine them.

        Stores the corpora as ``corpus1``/``corpus2``, their documents as
        ``docs1``/``docs2``, and the CorpusCombiner over both (plus its
        documents) as ``corpus``/``docs``.
        """
        super(TestCorpusCombiner, self).setUp()

        # Preprocessing switches shared by both snapshot corpora.
        corpus_options = dict(remove_stops=False,
                              lower=True,
                              split=True,
                              min_len=0)

        # 3 documents
        first_project = self.Project(
            ref=u'2aeb2e7c78259833e1218b69f99dab3acd00970c',
            level='file',
            src_path=self.basepath)
        self.corpus1 = SnapshotCorpus(repo=self.repo,
                                      project=first_project,
                                      **corpus_options)
        self.docs1 = list(self.corpus1)

        # 3 old documents + 2 new documents
        second_project = self.Project(
            ref=u'3587d37e7d476ddc7b673c41762dc89c8ca63a6a',
            level='file',
            src_path=self.basepath)
        self.corpus2 = SnapshotCorpus(repo=self.repo,
                                      project=second_project,
                                      **corpus_options)
        self.docs2 = list(self.corpus2)

        # The combined corpus is built only after both inputs were iterated.
        combined = CorpusCombiner([self.corpus1, self.corpus2])
        self.corpus = combined
        self.docs = list(combined)
 def setUp(self):
     """Create a SnapshotCorpus over ``self.repo`` and keep its documents.

     No ``project`` argument is passed; what that selects is decided
     inside SnapshotCorpus. The corpus is stored as ``self.corpus`` and
     the documents it yields as ``self.docs``.
     """
     super(TestSnapshotCorpus, self).setUp()

     # Preprocessing switches handed to the corpus unchanged.
     corpus_options = dict(remove_stops=False,
                           lower=True,
                           split=True,
                           min_len=0)
     snapshot = SnapshotCorpus(repo=self.repo, **corpus_options)

     self.corpus = snapshot
     # list() drains the corpus iterator once and keeps the result.
     self.docs = list(snapshot)
 def setUp(self):
     """Create a SnapshotCorpus pinned to a fixed ref and keep its documents.

     The project is built with ``level='file'`` and ``self.basepath`` as
     its source path; the corpus is stored as ``self.corpus`` and the
     documents it yields as ``self.docs``.
     """
     super(TestSnapshotCorpusAtRef, self).setUp()

     pinned_project = self.Project(
         ref=u'f33a0fb070a34fc1b9105453b3ffb4edc49131d9',
         level='file',
         src_path=self.basepath)

     # Preprocessing switches handed to the corpus unchanged.
     corpus_options = dict(remove_stops=False,
                           lower=True,
                           split=True,
                           min_len=0)
     snapshot = SnapshotCorpus(repo=self.repo,
                               project=pinned_project,
                               **corpus_options)

     self.corpus = snapshot
     # list() drains the corpus iterator once and keeps the result.
     self.docs = list(snapshot)
    def test_lazy(self):
        """With ``lazy_dict=True``, id2word stays empty until iteration.

        Checks that a freshly constructed corpus has an empty ``id2word``
        and that it is non-empty after the corpus has been iterated once.
        """
        corpus = SnapshotCorpus(repo=self.repo,
                                remove_stops=False,
                                lower=True,
                                split=True,
                                min_len=0,
                                lazy_dict=True)

        # Nothing has been iterated yet, so the dictionary must be empty.
        self.assertEqual(len(corpus.id2word), 0)

        # if lazy, iterating over the corpus will now build the dict.
        # Only that side effect matters here, so the documents themselves
        # are not bound to a name.
        list(corpus)

        self.assertGreater(len(corpus.id2word), 0)