def setUp(self): super(TestChangesetCorpus, self).setUp() self.corpus = ChangesetCorpus(repo=self.repo, remove_stops=False, lower=True, split=True, min_len=0) self.docs = list(self.corpus)
def setUp(self): self.basepath = datapath(u'multitext_git/') if not os.path.exists(self.basepath): extraction_path = datapath('') gz = datapath(u'multitext_git.tar.gz') import tarfile with tarfile.open(gz) as tar: tar.extractall(extraction_path) self.repo = dulwich.repo.Repo(self.basepath) self.corpus = ChangesetCorpus(self.repo, remove_stops=False, lower=True, split=True, min_len=0) self.docs = list(self.corpus)
class TestChangesetCorpus(TestGitCorpus): def setUp(self): super(TestChangesetCorpus, self).setUp() self.corpus = ChangesetCorpus(repo=self.repo, remove_stops=False, lower=True, split=True, min_len=0) self.docs = list(self.corpus) def test_length(self): self.assertEqual(len(self.corpus), 5) self.assertEqual(len(self.docs), 5) l = len(self.corpus) for _ in self.corpus: self.assertEqual(l, len(self.corpus)) def test_lazy(self): corpus = SnapshotCorpus(repo=self.repo, remove_stops=False, lower=True, split=True, min_len=0, lazy_dict=True) self.assertEqual(len(corpus.id2word), 0) # if lazy, iterating over the corpus will now build the dict docs = list(corpus) self.assertGreater(len(corpus.id2word), 0) def test_changeset_get_texts(self): documents = [ # systems [u'graph', u'minors', u'a', u'survey'] + [u'graph', u'minors', u'a', u'survey'] + [u'graph', u'minors', u'a', u'survey'], # a/ [ u'human', u'machine', u'interface', u'for', u'lab', u'abc', u'computer', u'applications' ] + [ u'a', u'survey', u'of', u'user', u'opinion', u'of', u'computer', u'system', u'response', u'time' ], # b/ [u'the', u'eps', u'user', u'interface', u'management', u'system'] + [ u'system', u'and', u'human', u'system', u'engineering', u'testing', u'of', u'eps' ], # c/ # TODO apparently file c/4.txt is fubar in the test repo # [u'relation', u'of', u'user', u'perceived', u'response', u'time', u'to', u'error', u'measurement'] + [ u'the', u'generation', u'of', u'random', u'binary', u'unordered', u'trees' ] + [ u'the', u'intersection', u'graph', u'of', u'paths', u'in', u'trees' ], # 7 [ u'graph', u'minors', u'iv', u'widths', u'of', u'trees', u'and', u'well', u'quasi', u'ordering' ], ] documents = [list(sorted(x)) for x in documents] for i, doc in enumerate(self.corpus.get_texts()): doc = list(sorted(doc)) # generators, woo? self.assertEqual(doc, documents[i]) def test_changeset_metadata_get_texts(self): self.corpus.metadata = True documents = [ # systems ([u'graph', u'minors', u'a', u'survey'] + [u'graph', u'minors', u'a', u'survey'] + [u'graph', u'minors', u'a', u'survey'], (u'2aeb2e7c78259833e1218b69f99dab3acd00970c', u'test_git')), # a/ ([ u'human', u'machine', u'interface', u'for', u'lab', u'abc', u'computer', u'applications' ] + [ u'a', u'survey', u'of', u'user', u'opinion', u'of', u'computer', u'system', u'response', u'time' ], (u'3587d37e7d476ddc7b673c41762dc89c8ca63a6a', u'test_git')), # b/ ([u'the', u'eps', u'user', u'interface', u'management', u'system'] + [ u'system', u'and', u'human', u'system', u'engineering', u'testing', u'of', u'eps' ], (u'f33a0fb070a34fc1b9105453b3ffb4edc49131d9', u'test_git')), # c/ # TODO apparently file c/4.txt is fubar in the test repo ( #[u'relation', u'of', u'user', u'perceived', u'response', u'time', u'to', u'error', u'measurement'] + [ u'the', u'generation', u'of', u'random', u'binary', u'unordered', u'trees' ] + [ u'the', u'intersection', u'graph', u'of', u'paths', u'in', u'trees' ], (u'899268bdd33aec225f6264a734dac2081f78ab54', u'test_git')), # 7 ([ u'graph', u'minors', u'iv', u'widths', u'of', u'trees', u'and', u'well', u'quasi', u'ordering' ], (u'f870a217765a268fe5c5315d58ef671050d17fb9', u'test_git')), ] documents = [(list(sorted(x[0])), x[1]) for x in documents] for docmeta in self.corpus.get_texts(): doc, meta = docmeta doc = list(sorted(doc)) # generators, woo? docmeta = doc, meta # get a non (generator, metadata) pair self.assertIn(docmeta, documents) def test_changeset_docs(self): documents = [ [ (u'and', 1), (u'graph', 1), (u'iv', 1), (u'minors', 1), (u'of', 1), (u'ordering', 1), (u'quasi', 1), (u'trees', 1), (u'well', 1), (u'widths', 1), ], [ (u'binary', 1), # (u'error', 1), (u'generation', 1), (u'graph', 1), (u'in', 1), (u'intersection', 1), # (u'measurement', 1), # (u'of', 3), (u'of', 2), (u'paths', 1), # (u'perceived', 1), (u'random', 1), # (u'relation', 1), # (u'response', 1), (u'the', 2), # (u'time', 1), # (u'to', 1), (u'trees', 2), (u'unordered', 1), # (u'user', 1), ], # [u'relation', u'of', u'user', u'perceived', u'response', u'time', u'to', u'error', u'measurement'] + [ (u'and', 1), (u'engineering', 1), (u'eps', 2), (u'human', 1), (u'interface', 1), (u'management', 1), (u'of', 1), (u'system', 3), (u'testing', 1), (u'the', 1), (u'user', 1), ], [ (u'a', 1), (u'abc', 1), (u'applications', 1), (u'computer', 2), (u'for', 1), (u'human', 1), (u'interface', 1), (u'lab', 1), (u'machine', 1), (u'of', 2), (u'opinion', 1), (u'response', 1), (u'survey', 1), (u'system', 1), (u'time', 1), (u'user', 1), ], [ (u'a', 3), (u'graph', 3), (u'minors', 3), (u'survey', 3), ], ] documents = [set(x) for x in documents] for doc in self.corpus: self.assertGreater(len(doc), 0) # convert the document to text freq since we don't know the # term ids ahead of time for testing. textdoc = set( (unicode(self.corpus.id2word[x[0]]), x[1]) for x in doc) self.assertIn(textdoc, documents)
class TestChangesetCorpus(TestGitCorpus): def setUp(self): super(TestChangesetCorpus, self).setUp() self.corpus = ChangesetCorpus(repo=self.repo, remove_stops=False, lower=True, split=True, min_len=0) self.docs = list(self.corpus) def test_length(self): self.assertEqual(len(self.corpus), 5) self.assertEqual(len(self.docs), 5) l = len(self.corpus) for _ in self.corpus: self.assertEqual(l, len(self.corpus)) def test_lazy(self): corpus = SnapshotCorpus(repo=self.repo, remove_stops=False, lower=True, split=True, min_len=0, lazy_dict=True) self.assertEqual(len(corpus.id2word), 0) # if lazy, iterating over the corpus will now build the dict docs = list(corpus) self.assertGreater(len(corpus.id2word), 0) def test_changeset_get_texts(self): documents = [ # systems [u'graph', u'minors', u'a', u'survey'] + [u'graph', u'minors', u'a', u'survey'] + [u'graph', u'minors', u'a', u'survey'], # a/ [u'human', u'machine', u'interface', u'for', u'lab', u'abc', u'computer', u'applications'] + [u'a', u'survey', u'of', u'user', u'opinion', u'of', u'computer', u'system', u'response', u'time'], # b/ [u'the', u'eps', u'user', u'interface', u'management', u'system'] + [u'system', u'and', u'human', u'system', u'engineering', u'testing', u'of', u'eps'], # c/ # TODO apparently file c/4.txt is fubar in the test repo # [u'relation', u'of', u'user', u'perceived', u'response', u'time', u'to', u'error', u'measurement'] + [u'the', u'generation', u'of', u'random', u'binary', u'unordered', u'trees'] + [u'the', u'intersection', u'graph', u'of', u'paths', u'in', u'trees'], # 7 [u'graph', u'minors', u'iv', u'widths', u'of', u'trees', u'and', u'well', u'quasi', u'ordering'], ] documents = [list(sorted(x)) for x in documents] for i, doc in enumerate(self.corpus.get_texts()): doc = list(sorted(doc)) # generators, woo? self.assertEqual(doc, documents[i]) def test_changeset_metadata_get_texts(self): self.corpus.metadata = True documents = [ # systems ( [u'graph', u'minors', u'a', u'survey'] + [u'graph', u'minors', u'a', u'survey'] + [u'graph', u'minors', u'a', u'survey'], (u'2aeb2e7c78259833e1218b69f99dab3acd00970c', u'test_git')), # a/ ( [u'human', u'machine', u'interface', u'for', u'lab', u'abc', u'computer', u'applications'] + [u'a', u'survey', u'of', u'user', u'opinion', u'of', u'computer', u'system', u'response', u'time'], (u'3587d37e7d476ddc7b673c41762dc89c8ca63a6a', u'test_git')), # b/ ( [u'the', u'eps', u'user', u'interface', u'management', u'system'] + [u'system', u'and', u'human', u'system', u'engineering', u'testing', u'of', u'eps'], (u'f33a0fb070a34fc1b9105453b3ffb4edc49131d9', u'test_git')), # c/ # TODO apparently file c/4.txt is fubar in the test repo ( #[u'relation', u'of', u'user', u'perceived', u'response', u'time', u'to', u'error', u'measurement'] + [u'the', u'generation', u'of', u'random', u'binary', u'unordered', u'trees'] + [u'the', u'intersection', u'graph', u'of', u'paths', u'in', u'trees'], (u'899268bdd33aec225f6264a734dac2081f78ab54', u'test_git')), # 7 ( [u'graph', u'minors', u'iv', u'widths', u'of', u'trees', u'and', u'well', u'quasi', u'ordering'], (u'f870a217765a268fe5c5315d58ef671050d17fb9', u'test_git')), ] documents = [(list(sorted(x[0])), x[1]) for x in documents] for docmeta in self.corpus.get_texts(): doc, meta = docmeta doc = list(sorted(doc)) # generators, woo? docmeta = doc, meta # get a non (generator, metadata) pair self.assertIn(docmeta, documents) def test_changeset_docs(self): documents = [ [ (u'and', 1), (u'graph', 1), (u'iv', 1), (u'minors', 1), (u'of', 1), (u'ordering', 1), (u'quasi', 1), (u'trees', 1), (u'well', 1), (u'widths', 1), ], [ (u'binary', 1), # (u'error', 1), (u'generation', 1), (u'graph', 1), (u'in', 1), (u'intersection', 1), # (u'measurement', 1), # (u'of', 3), (u'of', 2), (u'paths', 1), # (u'perceived', 1), (u'random', 1), # (u'relation', 1), # (u'response', 1), (u'the', 2), # (u'time', 1), # (u'to', 1), (u'trees', 2), (u'unordered', 1), # (u'user', 1), ], # [u'relation', u'of', u'user', u'perceived', u'response', u'time', u'to', u'error', u'measurement'] + [ (u'and', 1), (u'engineering', 1), (u'eps', 2), (u'human', 1), (u'interface', 1), (u'management', 1), (u'of', 1), (u'system', 3), (u'testing', 1), (u'the', 1), (u'user', 1), ], [ (u'a', 1), (u'abc', 1), (u'applications', 1), (u'computer', 2), (u'for', 1), (u'human', 1), (u'interface', 1), (u'lab', 1), (u'machine', 1), (u'of', 2), (u'opinion', 1), (u'response', 1), (u'survey', 1), (u'system', 1), (u'time', 1), (u'user', 1), ], [ (u'a', 3), (u'graph', 3), (u'minors', 3), (u'survey', 3), ], ] documents = [set(x) for x in documents] for doc in self.corpus: self.assertGreater(len(doc), 0) # convert the document to text freq since we don't know the # term ids ahead of time for testing. textdoc = set((unicode(self.corpus.id2word[x[0]]), x[1]) for x in doc) self.assertIn(textdoc, documents)