def test_getitem_dense2gensim(self):
    """Check item, slice and index-list retrieval with gensim-style output.

    The corpus is built with ``sparse_serialization=False`` and
    ``gensim=True``. A single retrieved item must be a list of tuples.
    Retrieval by slice and by list of indices must each produce such
    documents, be recognized by ``is_corpus``, and agree with each other
    element by element.
    """
    corpus = ShardedCorpus(self.tmp_fname, self.data, shardsize=100,
                           dim=self.dim, sparse_serialization=False,
                           gensim=True)

    item = corpus[3]
    self.assertIsInstance(item, list)
    self.assertIsInstance(item[0], tuple)

    # Slice retrieval. The first document is taken off with next(), so the
    # list built afterwards holds only the remaining documents.
    dslice = corpus[2:6]
    self.assertEqual(next(dslice), corpus[2])
    dslice = list(dslice)
    self.assertIsInstance(dslice, list)
    self.assertIsInstance(dslice[0], list)
    self.assertIsInstance(dslice[0][0], tuple)

    iscorp, _ = is_corpus(dslice)
    self.assertTrue(
        iscorp,
        "Is the object returned by slice notation "
        "a gensim corpus?")

    # Retrieval by an explicit list of indices, consumed the same way so
    # that both lists cover the same documents.
    ilist = corpus[[2, 3, 4, 5]]
    self.assertEqual(next(ilist), corpus[2])
    ilist = list(ilist)
    self.assertIsInstance(ilist, list)
    self.assertIsInstance(ilist[0], list)
    self.assertIsInstance(ilist[0][0], tuple)

    # From generators to lists
    # The length checks make pairwise iteration with zip() equivalent to
    # indexing both sequences with the same counter.
    self.assertEqual(len(ilist), len(dslice))
    for i, (irow, drow) in enumerate(zip(ilist, dslice)):
        self.assertEqual(
            len(irow), len(drow),
            "Row %d: dims %d/%d" % (i, len(irow), len(drow)))
        for j, (ival, dval) in enumerate(zip(irow, drow)):
            self.assertEqual(
                ival, dval,
                "ilist[%d][%d] = %s ,dslice[%d][%d] = %s" % (
                    i, j, str(ival), i, j, str(dval)))

    iscorp, _ = is_corpus(ilist)
    self.assertTrue(
        iscorp,
        "Is the object returned by list notation "
        "a gensim corpus?")
def test_getitem_dense2gensim(self):
    """Verify gensim-format retrieval by index, slice and index list.

    A corpus is created with ``sparse_serialization=False`` and
    ``gensim=True``. The test checks that one item is a list of tuples,
    that a slice and a list of indices both yield lists of such documents
    accepted by ``is_corpus``, and that the two retrieval styles return
    equal contents.
    """
    corpus = ShardedCorpus(
        self.tmp_fname, self.data, shardsize=100, dim=self.dim,
        sparse_serialization=False, gensim=True)

    # Single item.
    item = corpus[3]
    self.assertIsInstance(item, list)
    self.assertIsInstance(item[0], tuple)

    # Slice notation: next() removes the first document before the rest
    # is materialized with list().
    dslice = corpus[2:6]
    self.assertEqual(next(dslice), corpus[2])
    dslice = list(dslice)
    self.assertIsInstance(dslice, list)
    self.assertIsInstance(dslice[0], list)
    self.assertIsInstance(dslice[0][0], tuple)
    iscorp, _ = is_corpus(dslice)
    self.assertTrue(iscorp, "Is the object returned by slice notation "
                            "a gensim corpus?")

    # List-of-indices notation, handled exactly like the slice above.
    ilist = corpus[[2, 3, 4, 5]]
    self.assertEqual(next(ilist), corpus[2])
    ilist = list(ilist)
    self.assertIsInstance(ilist, list)
    self.assertIsInstance(ilist[0], list)
    self.assertIsInstance(ilist[0][0], tuple)

    # From generators to lists
    # Lengths are asserted equal before each zip(), so no trailing
    # element can be silently skipped by the pairwise iteration.
    self.assertEqual(len(ilist), len(dslice))
    for i, (list_doc, slice_doc) in enumerate(zip(ilist, dslice)):
        self.assertEqual(len(list_doc), len(slice_doc),
                         "Row %d: dims %d/%d" % (i, len(list_doc),
                                                 len(slice_doc)))
        for j, (list_val, slice_val) in enumerate(zip(list_doc, slice_doc)):
            self.assertEqual(list_val, slice_val,
                             "ilist[%d][%d] = %s ,dslice[%d][%d] = %s" % (
                                 i, j, str(list_val),
                                 i, j, str(slice_val)))

    iscorp, _ = is_corpus(ilist)
    self.assertTrue(iscorp, "Is the object returned by list notation "
                            "a gensim corpus?")
def setUp(self):
    """Create a scratch directory, mock data and a default corpus.

    Sets ``self.dim``, ``self.random_string``, ``self.tmp_dir``,
    ``self.tmp_fname``, ``self.data`` and ``self.corpus``. The directory
    ``test-temp-<8 random digits>`` is created with a relative path, i.e.
    under the current working directory.
    """
    self.dim = 1000

    # Eight random digits keep the directory and file names distinct
    # between runs.
    self.random_string = ''.join(
        [random.choice('1234567890') for _ in xrange(8)])

    self.tmp_dir = 'test-temp-' + self.random_string
    os.makedirs(self.tmp_dir)

    self.tmp_fname = os.path.join(
        self.tmp_dir, 'shcorp.' + self.random_string + '.tmp')

    # Generate the data with the same dimensionality that is passed to the
    # corpus, rather than repeating the literal value.
    self.data = mock_data(dim=self.dim)
    self.corpus = ShardedCorpus(self.tmp_fname, self.data,
                                dim=self.dim, shardsize=100)
def test_getitem(self):
    """Check shard switching on access and slice/item agreement.

    After reading item 130 the corpus must report shard 1 as current.
    After reading the slice 220:227 it must report shard 2, and the slice
    must have shape ``(7, dim)`` with every row equal to the item
    retrieved individually.
    """
    # Does retrieving the item load the correct shard?
    _ = self.corpus[130]
    self.assertEqual(self.corpus.current_shard_n, 1)

    start, stop = 220, 227
    block = self.corpus[start:stop]
    self.assertEqual((7, self.corpus.dim), block.shape)
    self.assertEqual(self.corpus.current_shard_n, 2)

    # Compare each row of the slice with the individually fetched item.
    for offset, idx in enumerate(xrange(start, stop)):
        rows_match = np.array_equal(self.corpus[idx], block[offset])
        self.assertTrue(rows_match)
def test_resize(self):
    """Check the shard count before and after ``resize_shards(250)``.

    A corpus created with ``shardsize=100`` must report 10 shards. After
    resizing to 250 it must report 4, and the name returned by
    ``_shard_name`` for each of them must be an existing file.
    """
    resized = ShardedCorpus(
        self.tmp_fname, self.data, shardsize=100, dim=self.dim)
    self.assertEqual(10, resized.n_shards)

    resized.resize_shards(250)
    self.assertEqual(4, resized.n_shards)

    # Every shard reported after the resize must be present on disk.
    for shard_idx in xrange(resized.n_shards):
        shard_path = resized._shard_name(shard_idx)
        shard_exists = os.path.isfile(shard_path)
        self.assertTrue(shard_exists)
def test_getitem(self):
    """Verify the current shard after item access and slice contents.

    Item 130 must leave shard 1 loaded; the slice 220:227 must leave
    shard 2 loaded, have shape ``(7, dim)``, and match the items fetched
    one at a time.
    """
    first_index = 220
    last_index = 227

    # Does retrieving the item load the correct shard?
    _ = self.corpus[130]
    self.assertEqual(self.corpus.current_shard_n, 1)

    rows = self.corpus[first_index:last_index]
    self.assertEqual((7, self.corpus.dim), rows.shape)
    self.assertEqual(self.corpus.current_shard_n, 2)

    # Row k of the slice corresponds to item first_index + k.
    for position in xrange(first_index, last_index):
        single = self.corpus[position]
        from_slice = rows[position - first_index]
        self.assertTrue(numpy.array_equal(single, from_slice))