def setUp(self):
    # Fixture: a 10x12 matrix of random samples stored in a 2-D DictTensor,
    # the result of calling normalized() on it, and a k=3 SVD of that result.
    random_matrix = numpy.random.random_sample((10, 12))
    self.tensor = DictTensor(2)
    self.tensor.update(nested_list_to_dict(random_matrix))

    self.normalized_tensor = self.tensor.normalized()
    results = self.normalized_tensor.svd(k=3)
    self.svd = results
    # Expose the three factors directly so the tests can name them briefly.
    self.u = results.u
    self.svals = results.svals
    self.v = results.v
def setUp(self):
    # Fixture: load the shared SVD test matrix, then decompose it twice --
    # once with svd() and once with incremental_svd() -- both with k=3.
    matrix_entries = nested_list_to_dict(svd_2d_test_matrix)
    self.tensor = DictTensor(2)
    # Note: this command actually puts 20 values in tensor!
    self.tensor.update(matrix_entries)

    exact = self.tensor.svd(k=3)
    self.svd = exact
    self.incremental = self.tensor.incremental_svd(k=3, niter=200)
    # The plain SVD's factors are what most tests look at.
    self.u = exact.u
    self.svals = exact.svals
    self.v = exact.v
def setUp(self):
    # Fixture: load the shared SVD test matrix and run a k=3 SVD on it,
    # passing the module-level row and column offsets through to svd().
    matrix_entries = nested_list_to_dict(svd_2d_test_matrix)
    self.tensor = DictTensor(2)
    # Note: this command actually puts 20 values in tensor!
    self.tensor.update(matrix_entries)

    results = self.tensor.svd(
        k=3,
        offset_for_row=offset_for_row,
        offset_for_col=offset_for_col)
    self.svd = results
    self.u = results.u
    self.svals = results.svals
    self.v = results.v
def testAdd(self):
    # Adding two sparse tensors: the entry both of them hold, (1, 0), is
    # summed; entries held by only one operand carry over unchanged.
    left = DictTensor(2)
    left[0, 0] = 1
    left[1, 1] = 1
    left[1, 0] = 2

    right = DictTensor(2)
    right[2, 1] = 4
    right[1, 0] = 5

    total = left + right
    expected = [[1, None],
                [7, 1],
                [None, 4]]
    assertTensorEqual(total, expected)
class SVD2DTest(unittest.TestCase):
    # Exercises a k=3 SVD of the shared 4x5 test matrix, computed with the
    # module-level row and column offsets.

    def setUp(self):
        matrix_entries = nested_list_to_dict(svd_2d_test_matrix)
        self.tensor = DictTensor(2)
        # Note: this command actually puts 20 values in tensor!
        self.tensor.update(matrix_entries)
        results = self.tensor.svd(
            k=3,
            offset_for_row=offset_for_row,
            offset_for_col=offset_for_col)
        self.svd = results
        self.u = results.u
        self.svals = results.svals
        self.v = results.v

    def test_decomposition(self):
        # The factor shapes have to line up with the tensor and with each
        # other before the values are worth comparing.
        n_rows, n_cols = self.tensor.shape[0], self.tensor.shape[1]
        self.assertEqual(self.u.shape[0], n_rows)
        self.assertEqual(len(self.svals), self.u.shape[1])
        self.assertEqual(len(self.svals), self.v.shape[1])
        self.assertEqual(self.v.shape[0], n_cols)

        # The factors are compared with abs=True, i.e. the sign of each
        # entry is not pinned down by these expectations.
        expected_u = [[0, 0, 1],
                      [0, -1, 0],
                      [0, 0, 0],
                      [-1, 0, 0]]
        expected_v = [[0, 0, sqrt(.2)],
                      [-1, 0, 0],
                      [0, -1, 0],
                      [0, 0, 0],
                      [0, 0, sqrt(.8)]]
        assertTensorEqual(self.u, expected_u, abs=True)
        assertTensorEqual(self.v, expected_v, abs=True)
        assertTensorEqual(self.svals, [4, 3, sqrt(5)])

    def test_reconstructed(self):
        expected = [[1, 0, 0, 0, 2],
                    [0, 0, 3, 0, 0],
                    [0, 0, 0, 0, 0],
                    [0, 4, 0, 0, 0]]
        assertTensorEqual(self.svd.reconstructed, expected)
        # A single row and a single column of the reconstruction must agree
        # with the corresponding slice of the expected matrix.
        assertTensorEqual(self.svd.reconstructed[1, :], expected[1])
        assertTensorEqual(self.svd.reconstructed[:, 2],
                          [row[2] for row in expected])

    def test_orthonormality(self):
        identity = [[1, 0, 0],
                    [0, 1, 0],
                    [0, 0, 1]]
        for factor in (self.u, self.v):
            assertTensorEqual(factor.T * factor, identity)
def load(cls, filebase):
    '''
    Load a tensor from `filebase`.

    The numeric data is read with DictTensor.load. If NormalizedView.load
    can also read `filebase`, its result wraps the data; an IOError from
    that step is ignored and the bare DictTensor is used. The parent
    class's load() then finishes the job.
    '''
    base = DictTensor.load(filebase)
    try:
        wrapped = NormalizedView.load(filebase, base)
    except IOError:
        # Best effort: fall back to the un-normalized data.
        wrapped = base
    return super(SparseLabeledTensor, cls).load(filebase, wrapped)
def test_transposed(self):
    '''Run the same testcase, but with the matrix transposed.'''
    total_docs = 10000000
    docs_with_cow = 1000
    doc = 0
    cow, moo = 1, 2

    counts = DictTensor(2)
    # Consider a document containing 100 words wherein the word cow
    # appears 3 times.
    # [specifically, let there be a document where 'cow' appears 3 times
    # and 'moo' appears 97 times]
    counts[doc, cow] = 3
    counts[doc, moo] = 97

    # Following the previously defined formulas, the term frequency (TF)
    # for cow is then 0.03 (3 / 100).
    # (the view can't be created earlier b/c it's read-only)
    single_doc_view = TfIdfView(counts, transposed=True)
    self.assertEqual(single_doc_view.counts_for_document[doc], 100)
    self.assertAlmostEqual(single_doc_view.tf(cow, doc), 0.03)

    # Now, assume we have 10 million documents and cow appears in one
    # thousand of these.
    # [specifically, let 'cow' appear in document 0 and in the 999
    # documents numbered 10,000,000-1000+1 up to 10,000,000-1; xrange
    # excludes its end point.]
    for other_doc in xrange(total_docs - docs_with_cow + 1, total_docs):
        counts[other_doc, cow] = 1

    # Then, the inverse document frequency is calculated as
    # ln(10 000 000 / 1 000) = 9.21.
    # (the view has to be rebuilt after adding the other docs)
    full_view = TfIdfView(counts, transposed=True)
    self.assertEqual(full_view.num_documents, total_docs)
    self.assertEqual(full_view.num_docs_that_contain_term[cow],
                     docs_with_cow)
    self.assertAlmostEqual(full_view.idf(cow), 9.21, 2)

    # The TF-IDF score is the product of these quantities:
    # 0.03 * 9.21 = 0.28.
    score = full_view[doc, cow]
    # The looked-up score must be a scalar: either it has no `shape`
    # attribute at all, or its shape is empty.
    self.assertEqual(len(getattr(score, 'shape', ())), 0)
    self.assertAlmostEqual(score, 0.28, 2)
def weight_feature_vector(vec, weight_dct, default_weight=0.0):
    '''
    Scale each entry of a feature vector by a per-relation weight.

    vec: a one-dimensional labeled feature vector (e.g., a slice of a
      reconstructed tensor).
    weight_dct: a mapping from (side, relation) tuples to weights, where
      side is 'left' or 'right'.
    default_weight: the weight used for entries whose (side, relation)
      is not in weight_dct.

    Returns a new labeled 1-D tensor that shares vec's label lists.
    Raises TypeError if vec is not one-dimensional.

    Example:
    >>> from csc.conceptnet4.analogyspace import conceptnet_2d_from_db
    >>> t = conceptnet_2d_from_db('en')
    >>> svd = t.svd()
    >>> baseball = svd.reconstructed['baseball',:]
    >>> weights = {}
    >>> weights['right', 'IsA'] = 1.0
    >>> weights['right', 'AtLocation'] = 0.8
    >>> weight_feature_vector(baseball, weights).top_items()
    '''
    if vec.ndim != 1:
        raise TypeError('Feature vectors can only have one dimension')

    weighted = LabeledView(DictTensor(ndim=1), label_lists=vec.label_lists())
    for key, value in vec.iteritems():
        # key is a 1-tuple holding the feature label; the label's first two
        # items are used as the (side, relation) lookup key.
        side_and_relation = key[0][:2]
        factor = weight_dct.get(side_and_relation, default_weight)
        weighted[key] = value * factor
    return weighted
def svd(self, k=50, normalized=True):
    '''
    Run an SVD on this unfolding and return an SVD2DResults.

    The unfolding is first compacted into a 2-D labeled tensor, optionally
    normalized along mode 0, and decomposed with `k` as the argument to
    its svd(). The u and v factors are then re-wrapped so their labels
    refer back to the original tensor.
    '''
    # Set up a LabeledView to map column indices from unfolded products
    # to unique indices.
    column_labels = OrderedSet()
    compacted = LabeledView(DictTensor(2), [IdentitySet(0), column_labels])
    self.compact_to(compacted)
    if normalized:
        compacted = compacted.normalized(mode=0)
    raw_svd = compacted.svd(k)

    # Wrap the output so that the labeling all works out.
    if hasattr(self.tensor, '_labels'):
        # Case for labeled view beneath
        # FIXME: try not to rely on private vars.
        # TODO: it would be nice to factor this in such a way that we
        # didn't have to worry about the labeling case here.
        u = LabeledView(raw_svd.u, [self.tensor._labels[self.dim], None])
        sets_to_unfold = self.tensor.label_sets()
    else:
        u = raw_svd.u
        sets_to_unfold = [IdentitySet(dim) for dim in self.tensor.shape]

    v_labels = UnfoldedSet.from_unfolding(self.dim, sets_to_unfold)
    v = LabeledView(raw_svd.v, [v_labels, None])

    from csc.divisi.svd import SVD2DResults
    return SVD2DResults(u, v, raw_svd.svals)
def load(cls, filebase):
    '''
    Load a tensor from `filebase`.

    Reads the numeric data with DictTensor.load, wraps it with
    NormalizedView.load when that succeeds (an IOError there is ignored),
    and hands the result to the parent class's load().
    '''
    stored = DictTensor.load(filebase)
    try:
        normalized = NormalizedView.load(filebase, stored)
    except IOError:
        # Best effort: keep the plain DictTensor.
        pass
    else:
        stored = normalized
    return super(SparseLabeledTensor, cls).load(filebase, stored)
def setUp(self):
    # Fixture: a 10x12 matrix of random samples stored in a 2-D DictTensor,
    # the result of calling normalized() on it, and a k=3 SVD of that result.
    samples = numpy.random.random_sample((10, 12))
    self.tensor = DictTensor(2)
    self.tensor.update(nested_list_to_dict(samples))

    self.normalized_tensor = self.tensor.normalized()
    decomposition = self.normalized_tensor.svd(k=3)
    self.svd = decomposition
    # Expose the three factors directly so the tests can name them briefly.
    self.u = decomposition.u
    self.svals = decomposition.svals
    self.v = decomposition.v
def __init__(self, *a, **kw):
    '''
    Initialize the labeled view, optionally building empty storage.

    When called with an `ndim` keyword, a new DictTensor of that many
    dimensions and one empty OrderedSet of labels per dimension are
    created and passed to LabeledView.__init__ ahead of any other
    arguments. Otherwise all arguments go to LabeledView.__init__
    unchanged. Either way an empty slice cache is set up afterwards.
    '''
    if 'ndim' not in kw:
        LabeledView.__init__(self, *a, **kw)
    else:
        # Remove 'ndim' so it is not forwarded as a keyword argument.
        ndim = kw.pop('ndim')
        storage = DictTensor(ndim)
        empty_labels = [OrderedSet() for _ in xrange(ndim)]
        LabeledView.__init__(self, storage, empty_labels, *a, **kw)
    self._slice_cache = {}
class UnfoldedSparseTensorTest(unittest.TestCase):
    # Tests unfolding a fully populated 2x3x4 tensor along each mode. Every
    # cell holds x1*100 + x2*10 + x3, so a value identifies its own index.

    def _cells(self):
        # Yield (x1, x2, x3, value) for every cell of the fixture.
        for x1 in range(2):
            for x2 in range(3):
                for x3 in range(4):
                    yield x1, x2, x3, x1 * 100 + x2 * 10 + x3

    def setUp(self):
        self.raw = DictTensor(3)
        for x1, x2, x3, value in self._cells():
            self.raw[x1, x2, x3] = value

    def test_unfold0(self):
        unfolded = self.raw.unfolded(0)
        self.assertEqual(unfolded.shape, (2, 3 * 4))
        self.assertEqual(len(unfolded), 2 * 3 * 4)
        # Rows are indexed by x1, columns by the remaining (x2, x3) pair.
        for x1, x2, x3, value in self._cells():
            self.assertEqual(unfolded[x1, (x2, x3)], value)

    def test_unfold1(self):
        unfolded = self.raw.unfolded(1)
        self.assertEqual(unfolded.shape, (3, 2 * 4))
        for x1, x2, x3, value in self._cells():
            self.assertEqual(unfolded[x2, (x1, x3)], value)

    def test_unfold2(self):
        unfolded = self.raw.unfolded(2)
        self.assertEqual(unfolded.shape, (4, 2 * 3))
        for x1, x2, x3, value in self._cells():
            self.assertEqual(unfolded[x3, (x1, x2)], value)

    def test_compact0(self):
        unfolded = self.raw.unfolded(0)
        target = DictTensor(2)
        unfolded.compact_to(target)
        self.assertEqual(len(target), 2 * 3 * 4)
        # The expected column for (x2, x3) is x2 * 4 + x3.
        for x1, x2, x3, value in self._cells():
            self.assertEqual(target[x1, x2 * 4 + x3], value)
def test_compact0(self):
    # Compact the mode-0 unfolding into a plain 2-D tensor and check that
    # every cell lands at column x2 * 4 + x3 of row x1.
    unfolding = self.raw.unfolded(0)
    target = DictTensor(2)
    unfolding.compact_to(target)
    self.assertEqual(len(target), 2 * 3 * 4)
    for x1 in range(2):
        for x2 in range(3):
            for x3 in range(4):
                column = x2 * 4 + x3
                expected = x1 * 100 + x2 * 10 + x3
                self.assertEqual(target[x1, column], expected)
class UnfoldedSparseTensorTest(unittest.TestCase):
    # Tests unfolding a fully populated 2x3x4 tensor along each mode. Every
    # cell holds x1*100 + x2*10 + x3, so a value identifies its own index.

    def _cells(self):
        # Yield (x1, x2, x3, value) for every cell of the fixture.
        for x1 in range(2):
            for x2 in range(3):
                for x3 in range(4):
                    yield x1, x2, x3, x1*100 + x2*10 + x3

    def setUp(self):
        self.raw = DictTensor(3)
        for x1, x2, x3, value in self._cells():
            self.raw[x1, x2, x3] = value

    def test_unfold0(self):
        uf = self.raw.unfolded(0)
        self.assertEqual(uf.shape, (2, 3*4))
        self.assertEqual(len(uf), 2*3*4)
        # Rows are indexed by x1, columns by the remaining (x2, x3) pair.
        for x1, x2, x3, value in self._cells():
            self.assertEqual(uf[x1, (x2, x3)], value)

    def test_unfold1(self):
        uf = self.raw.unfolded(1)
        self.assertEqual(uf.shape, (3, 2*4))
        for x1, x2, x3, value in self._cells():
            self.assertEqual(uf[x2, (x1, x3)], value)

    def test_unfold2(self):
        uf = self.raw.unfolded(2)
        self.assertEqual(uf.shape, (4, 2*3))
        for x1, x2, x3, value in self._cells():
            self.assertEqual(uf[x3, (x1, x2)], value)

    def test_compact0(self):
        uf = self.raw.unfolded(0)
        flat = DictTensor(2)
        uf.compact_to(flat)
        self.assertEqual(len(flat), 2*3*4)
        # The expected column for (x2, x3) is x2*4 + x3.
        for x1, x2, x3, value in self._cells():
            self.assertEqual(flat[x1, x2*4 + x3], value)
def test_combine_by_element(self):
    def plus_double(x, y):
        return x + (2*y)

    def plus(x, y):
        return x + y

    first = DictTensor(2)
    first[1, 1] = 1
    first[1, 0] = 2
    second = DictTensor(2)
    second[1, 1] = 4
    second[0, 1] = 5

    # Each result entry is op(first's value, second's value).
    combined = first.combine_by_element(second, plus_double)
    assertTensorEqual(combined, [[None, 10],
                                 [2, 9]])

    # Make sure errors are raised when the tensors don't have the
    # same shape or number of dimensions
    wrong_shape = DictTensor(2)
    wrong_shape[0, 2] = 3
    wrong_shape[1, 0] = 5
    self.assertRaises(IndexError,
                      first.combine_by_element, wrong_shape, plus)

    wrong_ndim = DictTensor(3)
    self.assertRaises(IndexError,
                      first.combine_by_element, wrong_ndim, plus)
def build_tensor(self, tensor=None):
    '''
    Build the combined tensor and store it on this object.

    This is a separate, explicit step because it is slow.

    tensor: optional numeric storage tensor to fill. It must have the
      same number of dimensions as the blend (checked with an assert).
      A new DictTensor is created when it is None.
    '''
    self.logger.info('building combined tensor.')
    labels = self._labels
    if tensor is None:
        tensor = DictTensor(ndim=self.ndim)
    assert tensor.ndim == self.ndim

    def to_indices(key):
        # Translate a labeled key into numeric indices, one per dimension,
        # by looking each label up in that dimension's label list.
        return tuple(label_list.index(label)
                     for label_list, label in izip(labels, key))

    if self._keys_never_overlap:
        # Each key is written once, so the values can be stored directly.
        self.logger.info('fast-merging.')
        tensor.update((to_indices(key), val)
                      for key, val in self._fast_iteritems())
    else:
        # Add each source tensor's values in, scaled by that tensor's weight.
        sources = zip(self._weights, self._tensors, self.names)
        for factor, cur_tensor, name in sources:
            self.logger.info('slow-merging %s' % name)
            for key, val in cur_tensor.iteritems():
                tensor.inc(to_indices(key), factor * val)

    self._tensor = tensor
    self.logger.info('done building tensor.')
def test_tensordot(self):
    if True:  # FIXME XXX: skip this test.
        return

    # Everything below is currently unreachable because of the guard above.

    # Test degenerate case of two 1-d vectors
    first = DictTensor(ndim=1)
    first[0] = 1
    first[2] = 2
    second = DictTensor(ndim=1)
    second[0] = 3
    second[1] = 4
    second[2] = 5
    # Every sparse/dense pairing must give the same dot product.
    for left in (first, first.to_dense()):
        for right in (second, second.to_dense()):
            self.assertEqual(13, left.tensordot(right, 0))

    for _ in range(5):
        # Make a random, randomly-shaped 3D tensor
        shape = random.sample(xrange(1, 30), 3)
        dense = DenseTensor(numpy.random.random(shape))

        # Pick a random one of those dimensions
        dim = random.randrange(3)

        # Make a random vector of that length
        vec = DenseTensor(numpy.random.random((shape[dim], )))

        # Try the dense result
        dense_result = dense.tensordot(vec, dim)
        remaining_shape = tuple(shape[:dim] + shape[dim + 1:])
        self.assertEqual(dense_result.shape, remaining_shape)

        # Try it with the tensor being sparse.
        sparse_result = dense.to_sparse().tensordot(vec, dim)
        self.assertEqual(sparse_result.shape, dense_result.shape)
        for key, val in dense_result.iteritems():
            self.assertAlmostEqual(val, sparse_result[key])
class NormalizedSVD2DTest(unittest.TestCase):
    # Exercises a k=3 SVD of the normalized form of a random 10x12 tensor.

    def setUp(self):
        self.tensor = DictTensor(2)
        self.tensor.update(
            nested_list_to_dict(numpy.random.random_sample((10, 12))))
        self.normalized_tensor = self.tensor.normalized()
        self.svd = self.normalized_tensor.svd(k=3)
        self.u, self.svals, self.v = self.svd.u, self.svd.svals, self.svd.v

    def test_decomposition(self):
        # The factor shapes have to line up with the tensor and each other.
        self.assertEqual(self.u.shape[0], self.tensor.shape[0])
        self.assertEqual(len(self.svals), self.u.shape[1])
        self.assertEqual(len(self.svals), self.v.shape[1])
        self.assertEqual(self.v.shape[0], self.tensor.shape[1])

        # Assert that the singular values are decreasing
        for i in range(1, len(self.svals)):
            self.assertTrue(self.svals[i] < self.svals[i - 1])

    def test_reconstructed(self):
        pass  # TODO

    def test_orthonormality(self):
        # Each factor is compared against an identity sized from its OWN
        # column count, so a wrongly shaped v cannot hide behind u's shape.
        assertTensorEqual(self.u.T * self.u, numpy.eye(self.u.shape[1]))
        assertTensorEqual(self.v.T * self.v, numpy.eye(self.v.shape[1]))

    def test_variance(self):
        return  # TODO

        # Everything below is deliberately disabled by the return above.

        # Assert that the SVD explained some of the variance.
        diff_k3 = self.tensor - self.svd.reconstructed
        tensor_mag = self.tensor.magnitude()
        diff_k3_mag = diff_k3.magnitude()
        self.assertTrue(tensor_mag > diff_k3_mag)

        # Check that a smaller SVD explains less of the variance, but
        # still some.
        svd_k1 = self.tensor.svd(k=1)
        diff_k1 = self.tensor - svd_k1.reconstructed
        diff_k1_mag = diff_k1.magnitude()
        self.assertTrue(tensor_mag > diff_k1_mag > diff_k3_mag)
class NormalizedSVD2DTest(unittest.TestCase):
    # Exercises a k=3 SVD of the normalized form of a random 10x12 tensor.

    def setUp(self):
        self.tensor = DictTensor(2)
        self.tensor.update(nested_list_to_dict(
            numpy.random.random_sample((10, 12))))
        self.normalized_tensor = self.tensor.normalized()
        self.svd = self.normalized_tensor.svd(k=3)
        self.u, self.svals, self.v = self.svd.u, self.svd.svals, self.svd.v

    def test_decomposition(self):
        # The factor shapes have to line up with the tensor and each other.
        self.assertEqual(self.u.shape[0], self.tensor.shape[0])
        self.assertEqual(len(self.svals), self.u.shape[1])
        self.assertEqual(len(self.svals), self.v.shape[1])
        self.assertEqual(self.v.shape[0], self.tensor.shape[1])

        # Assert that the singular values are decreasing
        for i in range(1, len(self.svals)):
            self.assertTrue(self.svals[i] < self.svals[i-1])

    def test_reconstructed(self):
        pass  # TODO

    def test_orthonormality(self):
        # Each factor is compared against an identity sized from its OWN
        # column count, so a wrongly shaped v cannot hide behind u's shape.
        assertTensorEqual(self.u.T * self.u, numpy.eye(self.u.shape[1]))
        assertTensorEqual(self.v.T * self.v, numpy.eye(self.v.shape[1]))

    def test_variance(self):
        return  # TODO

        # Everything below is deliberately disabled by the return above.

        # Assert that the SVD explained some of the variance.
        diff_k3 = self.tensor - self.svd.reconstructed
        tensor_mag = self.tensor.magnitude()
        diff_k3_mag = diff_k3.magnitude()
        self.assertTrue(tensor_mag > diff_k3_mag)

        # Check that a smaller SVD explains less of the variance, but
        # still some.
        svd_k1 = self.tensor.svd(k=1)
        diff_k1 = self.tensor - svd_k1.reconstructed
        diff_k1_mag = diff_k1.magnitude()
        self.assertTrue(tensor_mag > diff_k1_mag > diff_k3_mag)
class SVD2DTest(unittest.TestCase):
    # Exercises a k=3 SVD of the shared 4x5 test matrix, computed with the
    # module-level row and column offsets.

    def setUp(self):
        entries = nested_list_to_dict(svd_2d_test_matrix)
        self.tensor = DictTensor(2)
        # Note: this command actually puts 20 values in tensor!
        self.tensor.update(entries)
        decomposition = self.tensor.svd(
            k=3,
            offset_for_row=offset_for_row,
            offset_for_col=offset_for_col)
        self.svd = decomposition
        self.u = decomposition.u
        self.svals = decomposition.svals
        self.v = decomposition.v

    def test_decomposition(self):
        # The factor shapes have to line up with the tensor and with each
        # other before the values are worth comparing.
        rank = len(self.svals)
        self.assertEqual(self.u.shape[0], self.tensor.shape[0])
        self.assertEqual(rank, self.u.shape[1])
        self.assertEqual(rank, self.v.shape[1])
        self.assertEqual(self.v.shape[0], self.tensor.shape[1])

        # The factors are compared with abs=True, i.e. the sign of each
        # entry is not pinned down by these expectations.
        u_expected = [[0, 0, 1],
                      [0, -1, 0],
                      [0, 0, 0],
                      [-1, 0, 0]]
        v_expected = [[0, 0, sqrt(.2)],
                      [-1, 0, 0],
                      [0, -1, 0],
                      [0, 0, 0],
                      [0, 0, sqrt(.8)]]
        assertTensorEqual(self.u, u_expected, abs=True)
        assertTensorEqual(self.v, v_expected, abs=True)
        assertTensorEqual(self.svals, [4, 3, sqrt(5)])

    def test_reconstructed(self):
        original = [[1, 0, 0, 0, 2],
                    [0, 0, 3, 0, 0],
                    [0, 0, 0, 0, 0],
                    [0, 4, 0, 0, 0]]
        assertTensorEqual(self.svd.reconstructed, original)
        # A single row and a single column of the reconstruction must agree
        # with the corresponding slice of the original matrix.
        assertTensorEqual(self.svd.reconstructed[1, :], original[1])
        assertTensorEqual(self.svd.reconstructed[:, 2],
                          [row[2] for row in original])

    def test_orthonormality(self):
        identity = [[1, 0, 0],
                    [0, 1, 0],
                    [0, 0, 1]]
        for factor in (self.u, self.v):
            assertTensorEqual(factor.T * factor, identity)
def build_tensor(self, tensor=None):
    '''
    Build the combined tensor and store it on this object.

    This is a separate, explicit step because it is slow.

    tensor: optional numeric storage tensor to fill. It must have the
      same number of dimensions as the blend (checked with an assert).
      A new DictTensor is created when it is None.
    '''
    self.logger.info('building combined tensor.')
    labels = self._labels
    if tensor is None:
        tensor = DictTensor(ndim=self.ndim)
    assert tensor.ndim == self.ndim

    def numeric_key(key):
        # Look each label of `key` up in the label list of its dimension.
        return tuple(label_list.index(label)
                     for label_list, label in izip(labels, key))

    if not self._keys_never_overlap:
        # Add each source tensor's values in, scaled by that tensor's weight.
        for factor, cur_tensor, name in zip(self._weights, self._tensors,
                                            self.names):
            self.logger.info('slow-merging %s' % name)
            for key, val in cur_tensor.iteritems():
                tensor.inc(numeric_key(key), factor*val)
    else:
        # Each key is written once, so the values can be stored directly.
        self.logger.info('fast-merging.')
        tensor.update((numeric_key(key), val)
                      for key, val in self._fast_iteritems())

    self._tensor = tensor
    self.logger.info('done building tensor.')
def test_DictMatrixMatrixDot():
    # Numbers computed using numpy separately (and checked by hand)...
    a_entries = {
        (0, 0): 0.97878770132160475,
        (0, 1): 0.38968165255179188,
        (0, 2): 0.62726841877492023,
        (1, 0): 0.077757604769237876,
        (1, 1): 0.081345677776447523,
        (1, 2): 0.64136810022648949,
    }
    b_entries = {
        (0, 0): 0.062059208836173663,
        (0, 1): 0.67286767409459525,
        (0, 2): 0.55410453533854442,
        (0, 3): 0.74671274663041698,
        (1, 0): 0.11565332983247767,
        (1, 1): 0.48262692547766795,
        (1, 2): 0.76280138705455269,
        (1, 3): 0.50230554417370143,
        (2, 0): 0.67149114912362429,
        (2, 1): 0.7656884479264322,
        (2, 2): 0.69286881606948747,
        (2, 3): 0.82598232206483091,
    }
    expected = {
        (0, 0): 0.52701596238696313,
        (0, 1): 1.3269576439118278,
        (0, 2): 1.2742151361864653,
        (0, 3): 1.4447251324591062,
        (1, 0): 0.444906476567622,
        (1, 1): 0.58266833824233299,
        (1, 2): 0.54952039356712779,
        (1, 3): 0.62868169229370208,
    }

    # A is 2x3 and B is 3x4, so the product has a 2x4 set of entries.
    A = DictTensor(2)
    A.update(a_entries)
    B = DictTensor(2)
    B.update(b_entries)

    product = A * B
    for index, actual in product.iteritems():
        assert_almost_equal(actual, expected[index])
def test_combine_by_element(self):
    def add(x, y):
        return x + y

    def add_doubled(x, y):
        return x + (2 * y)

    lhs = DictTensor(2)
    lhs[1, 1] = 1
    lhs[1, 0] = 2
    rhs = DictTensor(2)
    rhs[1, 1] = 4
    rhs[0, 1] = 5

    # Each result entry is op(lhs's value, rhs's value).
    merged = lhs.combine_by_element(rhs, add_doubled)
    assertTensorEqual(merged, [[None, 10],
                               [2, 9]])

    # Make sure errors are raised when the tensors don't have the
    # same shape or number of dimensions
    mismatched_shape = DictTensor(2)
    mismatched_shape[0, 2] = 3
    mismatched_shape[1, 0] = 5
    self.assertRaises(IndexError,
                      lhs.combine_by_element, mismatched_shape, add)

    mismatched_ndim = DictTensor(3)
    self.assertRaises(IndexError,
                      lhs.combine_by_element, mismatched_ndim, add)
def testDictDotProduct():
    # Multiplying two sparse 1-D tensors should give their dot product.
    # The two vectors share entries only at indices 3, 6 and 7.
    first_entries = {
        1: 0.06198828,
        3: 0.24177249,
        6: 0.5256805,
        7: 0.46505895,
        8: 0.27791615,
        9: 0.02906779,
    }
    second_entries = {
        0: 0.2502674,
        2: 0.34907184,
        3: 0.2209139,
        5: 0.45788618,
        6: 0.37133328,
        7: 0.48278861,
    }

    tensor = DictTensor(1)
    tensor.update(first_entries)
    tensor2 = DictTensor(1)
    tensor2.update(second_entries)

    dot = tensor * tensor2
    assert_almost_equal(dot, 0.473138731464)
def test_tensordot(self):
    if True:  # FIXME XXX: skip this test.
        return

    # Everything below is currently unreachable because of the guard above.

    # Test degenerate case of two 1-d vectors
    vec_a = DictTensor(ndim=1)
    vec_a[0] = 1
    vec_a[2] = 2
    vec_b = DictTensor(ndim=1)
    vec_b[0] = 3
    vec_b[1] = 4
    vec_b[2] = 5
    # Every sparse/dense pairing must give the same dot product.
    for left in (vec_a, vec_a.to_dense()):
        for right in (vec_b, vec_b.to_dense()):
            self.assertEqual(13, left.tensordot(right, 0))

    for _ in range(5):
        # Make a random, randomly-shaped 3D tensor
        shape = random.sample(xrange(1,30), 3)
        tensor = DenseTensor(numpy.random.random(shape))

        # Pick a random one of those dimensions
        dim = random.randrange(3)

        # Make a random vector of that length
        vec = DenseTensor(numpy.random.random((shape[dim],)))

        # Try the dense result
        expected = tensor.tensordot(vec, dim)
        shape_without_dim = tuple(shape[:dim]+shape[dim+1:])
        self.assertEqual(expected.shape, shape_without_dim)

        # Try it with the tensor being sparse.
        from_sparse = tensor.to_sparse().tensordot(vec, dim)
        self.assertEqual(from_sparse.shape, expected.shape)
        for key, val in expected.iteritems():
            self.assertAlmostEqual(val, from_sparse[key])
def test_DictMatrixVectorDot():
    # Numbers computed using numpy separately (and checked by hand)...
    matrix_entries = {
        (0, 0): 0.18850744743616121,
        (0, 1): 0.64380371397047509,
        (1, 0): 0.40673500155569442,
        (1, 1): 0.77961381386745443,
        (2, 0): 0.38745898104117782,
        (2, 1): 0.39479530812173591,
    }
    vector_entries = {0: 0.95308634444417639, 1: 0.41483520394218798}
    # The result is looked up with 1-tuple keys, one per matrix row.
    expected = {
        (0, ): 0.44673631896111365,
        (1, ): 0.71106483126206554,
        (2, ): 0.53305685602270081,
    }

    A = DictTensor(2)
    A.update(matrix_entries)
    b = DictTensor(1)
    b.update(vector_entries)

    product = A * b
    for index, actual in product.iteritems():
        assert_almost_equal(actual, expected[index])
def make_sparse_labeled_tensor(ndim, labels=None, initial=None,
                               accumulate=None, normalize=False):
    '''
    Create a sparse labeled tensor.

    ndim: number of dimensions (usually 2)

    labels: if you already have label lists, pass them in here. (A None
      in this list means an unlabeled dimension. If you simply don't have
      labels yet, pass an OrderedSet().)

    initial / accumulate: sequences of (key, value) pairs to add to the
      tensor. ``initial`` is applied first by ``.update``, meaning that
      later values will override earlier ones. ``accumulate`` is applied
      afterwards, and all values add to anything already there.

    normalize:
      an int or tuple of ints: normalize along that dimension
      True: normalize along axis 0
      'tfidf': use tf-idf
      'tfidf.T': use tf-idf, transposed (matrix is documents by terms)
      a class: adds that class as a layer.

    Returns the LabeledView, or the result of its ``normalized(normalize)``
    when ``normalize`` is truthy.
    '''
    if labels is None:
        labels = [OrderedSet() for _ in xrange(ndim)]
    tensor = LabeledView(DictTensor(ndim), labels)

    # Pre-size the storage to the number of labels already known for each
    # dimension. An unlabeled dimension (None) has no label list to measure
    # -- len(None) would raise TypeError -- so it starts at size 0.
    # NOTE(review): 0 is assumed to match a fresh DictTensor's own initial
    # extent; confirm against DictTensor.
    tensor.tensor._shape[:] = [
        0 if label_list is None else len(label_list)
        for label_list in labels
    ]

    if initial is not None:
        tensor.update(initial)
    for k, v in accumulate or []:
        tensor.inc(k, v)

    if normalize:
        return tensor.normalized(normalize)
    else:
        return tensor
def test_DictMatrixVectorDot():
    # Numbers computed using numpy separately (and checked by hand)...
    A = DictTensor(2)
    b = DictTensor(1)

    a_values = {
        (0, 0): 0.18850744743616121,
        (0, 1): 0.64380371397047509,
        (1, 0): 0.40673500155569442,
        (1, 1): 0.77961381386745443,
        (2, 0): 0.38745898104117782,
        (2, 1): 0.39479530812173591,
    }
    b_values = {
        0: 0.95308634444417639,
        1: 0.41483520394218798,
    }
    A.update(a_values)
    b.update(b_values)

    # The result is looked up with 1-tuple keys, one per matrix row.
    wanted = {
        (0,): 0.44673631896111365,
        (1,): 0.71106483126206554,
        (2,): 0.53305685602270081,
    }
    result = A * b
    for row_key, got in result.iteritems():
        assert_almost_equal(got, wanted[row_key])
def testDictDotProduct():
    # Multiplying two sparse 1-D tensors should give their dot product.
    # The two vectors share entries only at indices 3, 6 and 7.
    tensor = DictTensor(1)
    tensor2 = DictTensor(1)

    values = {
        1: 0.06198828,
        3: 0.24177249,
        6: 0.5256805,
        7: 0.46505895,
        8: 0.27791615,
        9: 0.02906779,
    }
    values2 = {
        0: 0.2502674,
        2: 0.34907184,
        3: 0.2209139,
        5: 0.45788618,
        6: 0.37133328,
        7: 0.48278861,
    }
    tensor.update(values)
    tensor2.update(values2)

    product = tensor * tensor2
    assert_almost_equal(product, 0.473138731464)
def test_DictMatrixMatrixDot():
    # Numbers computed using numpy separately (and checked by hand)...
    A = DictTensor(2)
    B = DictTensor(2)

    a_values = {
        (0, 0): 0.97878770132160475,
        (0, 1): 0.38968165255179188,
        (0, 2): 0.62726841877492023,
        (1, 0): 0.077757604769237876,
        (1, 1): 0.081345677776447523,
        (1, 2): 0.64136810022648949,
    }
    b_values = {
        (0, 0): 0.062059208836173663,
        (0, 1): 0.67286767409459525,
        (0, 2): 0.55410453533854442,
        (0, 3): 0.74671274663041698,
        (1, 0): 0.11565332983247767,
        (1, 1): 0.48262692547766795,
        (1, 2): 0.76280138705455269,
        (1, 3): 0.50230554417370143,
        (2, 0): 0.67149114912362429,
        (2, 1): 0.7656884479264322,
        (2, 2): 0.69286881606948747,
        (2, 3): 0.82598232206483091,
    }
    A.update(a_values)
    B.update(b_values)

    # A is 2x3 and B is 3x4, so the product has a 2x4 set of entries.
    wanted = {
        (0, 0): 0.52701596238696313,
        (0, 1): 1.3269576439118278,
        (0, 2): 1.2742151361864653,
        (0, 3): 1.4447251324591062,
        (1, 0): 0.444906476567622,
        (1, 1): 0.58266833824233299,
        (1, 2): 0.54952039356712779,
        (1, 3): 0.62868169229370208,
    }
    result = A * B
    for cell, got in result.iteritems():
        assert_almost_equal(got, wanted[cell])
def setUp(self):
    # Every test starts from a freshly constructed two-dimensional
    # DictTensor.
    fresh = DictTensor(2)
    self.tensor = fresh
def __init__(self):
    '''
    Set up an empty 3-D tensor labeled (concept, relation, concept).

    The first and third dimensions share one and the same label set.
    '''
    # FIXME: yes this saves space, but it might make a row or column be zero.
    concepts = OrderedSet()
    relations = OrderedSet()
    storage = DictTensor(3)
    label_sets = [concepts, relations, concepts]
    super(ConceptRelationConceptTensor, self).__init__(storage, label_sets)
from csc.divisi.tensor import DictTensor from csc.divisi.normalized_view import NormalizedView from nose.tools import raises, assert_almost_equal from tensor_util import assertTensorEqual, nones_removed, nested_list_to_dict normalize_testcase = [[1, None], [3, 4]] normalize_expected_result = [[1, None], [3 / 5., 4 / 5.]] raw = DictTensor(2) raw.update(nones_removed(nested_list_to_dict(normalize_testcase))) tensor = NormalizedView(raw, 0) def test_result(): assertTensorEqual(tensor, normalize_expected_result) def test_contains(): assert (0, 0) in tensor assert tensor.has_key((0, 0)) assert (0, 1) not in tensor assert not tensor.has_key((0, 1)) def test_unnormalize(): assert_almost_equal(tensor[1, 0], 3 / 5.) assert_almost_equal(tensor.unnormalized()[1, 0], 3) def test_labeled_unnormalize():
import numpy as np import unittest from nose.tools import eq_, raises from math import sqrt from csc.divisi.tensor import DictTensor from csc.divisi.util import nested_list_to_dict from tensor_util import assertTensorEqual, zeros_removed data = np.array([[1, 2, 3, 4], [-1,2, 3, 4], [0, 1, -1,0]]) tensor = DictTensor(2) tensor.update(zeros_removed(nested_list_to_dict(data))) eq_(len(tensor), 10) # For NumPy, "along an axis" means something different. ms_data = data - data.mean(1)[:,np.newaxis] ms_tensor = DictTensor(2) ms_tensor.update(nested_list_to_dict(ms_data)) def test_means(): means = tensor.means() eq_(len(means), 2) assert np.allclose(means[0], [(1+2+3+4)/4., (-1+2+3+4)/4., (0+1+-1+0)/4.]) assert np.allclose(means[1], [0, (2+2+1)/3., (3+3-1)/3., (4+4+0)/3.]) def test_mean_subtracted(): mean_subtracted = tensor.mean_subtracted() m = np.zeros(data.shape) for (r, c), v in mean_subtracted.iteritems():
def test_1D(self):
    # Writing index 2 of a new 1-D tensor should compare equal to a
    # three-element list in which only the last slot is set.
    vector = DictTensor(1)
    vector[2] = 1
    expected = [None, None, 1]
    assertTensorEqual(vector, expected)
class DictTensorTest(unittest.TestCase):
    # Basic behaviour of DictTensor: storage, slicing, transposition,
    # deletion, membership, element-wise combination, arithmetic and norms.

    # 4x3 matrix shared by the slicing and deletion tests; None marks an
    # entry that is left unset.
    slice_testcase = [[1, None, None],
                      [None, 2, 3],
                      [4, None, None],
                      [None, 5, None]]

    def setUp(self):
        self.tensor = DictTensor(2)

    def test_initial(self):
        # A new tensor holds nothing and reports a (0, 0) shape ...
        self.assertEqual(len(self.tensor), 0)
        self.assertEqual(len(self.tensor.keys()), 0)
        assert_dims_consistent(self.tensor)
        self.assertEqual(self.tensor.shape, (0, 0))
        # ... yet reading any index gives a numeric zero.
        assert isinstance(self.tensor[4, 5], (float, int, long))
        for index in ((5, 5), (2, 7)):
            self.assertEqual(self.tensor[index], 0)

    def test_storage(self):
        self.tensor[5, 5] = 1
        self.tensor[2, 7] = 2
        # Expect a 6x8 grid that is unset except for the two written cells.
        expected = [[None] * 8 for _ in range(6)]
        expected[2][7] = 2
        expected[5][5] = 1
        assertTensorEqual(self.tensor, expected)

    def test_slice(self):
        entries = nones_removed(nested_list_to_dict(self.slice_testcase))
        self.tensor.update(entries)

        # Test end conditions: start index
        # is included in slice, end index is not
        window = self.tensor[1:3, 0:2]
        assertTensorEqual(window, [[None, 2],
                                   [4, None]])

        # Test that slicing on some dims correctly
        # reduces the dimensionality of the tensor
        last_row = self.tensor[3, :]
        assertTensorEqual(last_row, [None, 5, None])

        # Test the step parameter
        odd_rows = self.tensor[1:4:2, :]
        assertTensorEqual(odd_rows, [[None, 2, 3],
                                     [None, 5, None]])

    def test_transpose(self):
        self.tensor[0, 0] = 1
        self.tensor[1, 2] = 3
        self.tensor[2, 0] = 4
        self.tensor[3, 1] = 5
        transposed = self.tensor.transpose()
        assertTensorEqual(transposed, [[1, None, 4, None],
                                       [None, None, None, 5],
                                       [None, 3, None, None]])

    def test_delete(self):
        entries = nones_removed(nested_list_to_dict(self.slice_testcase))
        self.tensor.update(entries)
        assertTensorEqual(self.tensor, self.slice_testcase)

        del self.tensor[0, 0]
        # Same matrix as slice_testcase, with the (0, 0) entry now unset.
        assertTensorEqual(self.tensor, [[None, None, None],
                                        [None, 2, 3],
                                        [4, None, None],
                                        [None, 5, None]])

    def test_contains(self):
        self.tensor[1, 2] = 1
        self.tensor[4, 5] = 2
        self.assertTrue((1, 2) in self.tensor)
        self.assertTrue(self.tensor.has_key((1, 2)))
        # Indices that merely share a row or column with a stored entry
        # are not contained.
        self.assertFalse((4, 2) in self.tensor)
        self.assertFalse((1, 5) in self.tensor)

    def test_1D(self):
        vector = DictTensor(1)
        vector[2] = 1
        assertTensorEqual(vector, [None, None, 1])

    def test_combine_by_element(self):
        def plus(x, y):
            return x + y

        def plus_double(x, y):
            return x + (2 * y)

        first = DictTensor(2)
        first[1, 1] = 1
        first[1, 0] = 2
        second = DictTensor(2)
        second[1, 1] = 4
        second[0, 1] = 5

        combined = first.combine_by_element(second, plus_double)
        assertTensorEqual(combined, [[None, 10],
                                     [2, 9]])

        # Make sure errors are raised when the tensors don't have the
        # same shape or number of dimensions
        wrong_shape = DictTensor(2)
        wrong_shape[0, 2] = 3
        wrong_shape[1, 0] = 5
        self.assertRaises(IndexError,
                          first.combine_by_element, wrong_shape, plus)

        wrong_ndim = DictTensor(3)
        self.assertRaises(IndexError,
                          first.combine_by_element, wrong_ndim, plus)

    def testAdd(self):
        left = DictTensor(2)
        left[0, 0] = 1
        left[1, 1] = 1
        left[1, 0] = 2
        right = DictTensor(2)
        right[2, 1] = 4
        right[1, 0] = 5

        total = left + right
        assertTensorEqual(total, [[1, None],
                                  [7, 1],
                                  [None, 4]])

    def testICmul(self):
        # In-place multiplication by a scalar.
        matrix = tensor_from_nested_list([[1, 2], [3, 4]])
        assertTensorEqual(matrix, [[1, 2], [3, 4]])
        matrix *= 2
        assertTensorEqual(matrix, [[2, 4], [6, 8]])

    def testICdiv(self):
        # In-place division by a scalar.
        matrix = tensor_from_nested_list([[2, 4], [6, 8]])
        matrix /= 2
        assertTensorEqual(matrix, [[1, 2], [3, 4]])

    def testReprOfEmpty(self):
        # Neither call may raise on a tensor that holds nothing.
        repr(self.tensor)
        self.tensor.example_key()

    def testNorm(self):
        norm_test = [[0, 0, 0],
                     [0, 1, 0],
                     [0, 5.0, 0]]
        self.tensor.update(nested_list_to_dict(norm_test))
        # sqrt(1**2 + 5**2); norm() and magnitude() must agree.
        expected = sqrt(26.0)
        self.assertEqual(self.tensor.norm(), expected)
        self.assertEqual(self.tensor.magnitude(), expected)
from csc.divisi.tensor import DictTensor
from csc.divisi.normalized_view import NormalizedView
from nose.tools import raises, assert_almost_equal
from tensor_util import assertTensorEqual, nones_removed, nested_list_to_dict

# Input matrix (None marks an unset entry) and the values expected from the
# normalized view: row 0 is unchanged and row 1, (3, 4), is divided by 5.
normalize_testcase = [[1, None], [3, 4]]
normalize_expected_result = [[1, None], [3/5., 4/5.]]

# Module-level fixture shared by every test below: the raw sparse tensor and
# a NormalizedView over it, created with 0 as its second argument.
raw = DictTensor(2)
raw.update(nones_removed(nested_list_to_dict(normalize_testcase)))
tensor = NormalizedView(raw, 0)

# The view as a whole matches the expected normalized matrix.
def test_result():
    assertTensorEqual(tensor, normalize_expected_result)

# Membership agrees between `in` and has_key: the stored entry (0,0) is
# present and the unset entry (0,1) is not.
def test_contains():
    assert (0,0) in tensor
    assert tensor.has_key((0,0))
    assert (0,1) not in tensor
    assert not tensor.has_key((0,1))

# Indexing the view gives the normalized value; unnormalized() gives back
# the raw value.
def test_unnormalize():
    assert_almost_equal(tensor[1,0], 3/5.)
    assert_almost_equal(tensor.unnormalized()[1,0], 3)

# Normalized values can also be read through a labeled wrapper.
# NOTE(review): the name suggests an unnormalize check should follow; this
# function may be cut off in this view -- confirm against the full file.
def test_labeled_unnormalize():
    labeled = tensor.labeled([['a','b'],['A','B']])
    assert_almost_equal(labeled['b','A'], 3/5.)
class SVD2DTest(unittest.TestCase):
    # Exercises svd() and incremental_svd() (both k=3) on the shared 4x5
    # test matrix.

    def setUp(self):
        self.tensor = DictTensor(2)
        # Note: this command actually puts 20 values in tensor!
        self.tensor.update(nested_list_to_dict(svd_2d_test_matrix))
        self.svd = self.tensor.svd(k=3)
        self.incremental = self.tensor.incremental_svd(k=3, niter=200)
        self.u, self.svals, self.v = self.svd.u, self.svd.svals, self.svd.v

    def test_incremental(self):
        # The incremental factors must have shapes matching the tensor ...
        self.assertEqual(self.incremental.u.shape[0], self.tensor.shape[0])
        self.assertEqual(len(self.incremental.svals),
                         self.incremental.u.shape[1])
        self.assertEqual(len(self.incremental.svals),
                         self.incremental.v.shape[1])
        self.assertEqual(self.incremental.v.shape[0], self.tensor.shape[1])

        # ... and, unlike test_decomposition, are compared without abs=True.
        assertTensorEqual(self.incremental.u,
                          [[0, 0, 1],
                           [0, 1, 0],
                           [0, 0, 0],
                           [1, 0, 0]])
        assertTensorEqual(self.incremental.v,
                          [[0, 0, sqrt(.2)],
                           [1, 0, 0],
                           [0, 1, 0],
                           [0, 0, 0],
                           [0, 0, sqrt(.8)]])
        assertTensorEqual(self.incremental.svals, [4, 3, sqrt(5)])

    def test_decomposition(self):
        self.assertEqual(self.u.shape[0], self.tensor.shape[0])
        self.assertEqual(len(self.svals), self.u.shape[1])
        self.assertEqual(len(self.svals), self.v.shape[1])
        self.assertEqual(self.v.shape[0], self.tensor.shape[1])

        # The factors are compared with abs=True, i.e. the sign of each
        # entry is not pinned down by these expectations.
        assertTensorEqual(self.u,
                          [[0, 0, 1],
                           [0, -1, 0],
                           [0, 0, 0],
                           [-1, 0, 0]], abs=True)
        assertTensorEqual(self.v,
                          [[0, 0, sqrt(.2)],
                           [-1, 0, 0],
                           [0, -1, 0],
                           [0, 0, 0],
                           [0, 0, sqrt(.8)]], abs=True)
        assertTensorEqual(self.svals, [4, 3, sqrt(5)])

    def test_reconstructed(self):
        assertTensorEqual(self.svd.reconstructed,
                          [[1, 0, 0, 0, 2],
                           [0, 0, 3, 0, 0],
                           [0, 0, 0, 0, 0],
                           [0, 4, 0, 0, 0]])
        # Single rows and columns of the reconstruction can be sliced out.
        assertTensorEqual(self.svd.reconstructed[1, :], [0, 0, 3, 0, 0])
        assertTensorEqual(self.svd.reconstructed[:, 2], [0, 3, 0, 0])

    def test_orthonormality(self):
        identity = [[1, 0, 0],
                    [0, 1, 0],
                    [0, 0, 1]]
        assertTensorEqual(self.u.T * self.u, identity)
        assertTensorEqual(self.v.T * self.v, identity)

    def test_variance(self):
        # Assert that the SVD explained some of the variance.
        diff_k3 = self.tensor - self.svd.reconstructed
        tensor_mag = self.tensor.magnitude()
        diff_k3_mag = diff_k3.magnitude()
        self.assertTrue(tensor_mag > diff_k3_mag)

        # Check that a smaller SVD explains less of the variance, but
        # still some.
        svd_k1 = self.tensor.svd(k=1)
        diff_k1 = self.tensor - svd_k1.reconstructed
        diff_k1_mag = diff_k1.magnitude()
        self.assertTrue(tensor_mag > diff_k1_mag > diff_k3_mag)
import numpy as np
import unittest
from nose.tools import eq_, raises
from math import sqrt
from csc.divisi.tensor import DictTensor
from csc.divisi.util import nested_list_to_dict
from tensor_util import assertTensorEqual, zeros_removed

# Dense 3x4 fixture; two of its twelve cells are zero.
data = np.array([[1, 2, 3, 4],
                 [-1, 2, 3, 4],
                 [0, 1, -1, 0]])
# Sparse version of `data`, filtered through zeros_removed before loading.
tensor = DictTensor(2)
tensor.update(zeros_removed(nested_list_to_dict(data)))
# Import-time sanity check: only 10 of the 12 cells are stored
# (presumably the two zeros were dropped by zeros_removed).
eq_(len(tensor), 10)

# For NumPy, "along an axis" means something different.
# data.mean(1) is the mean of each row; it is subtracted from every cell of
# that row.
ms_data = data - data.mean(1)[:, np.newaxis]
ms_tensor = DictTensor(2)
ms_tensor.update(nested_list_to_dict(ms_data))


def test_means():
    """tensor.means() yields two sequences: row means, then column means.

    The expected values divide by the full row/column length (4 and 3), so
    cells that are absent from the sparse tensor still count as zeros.
    """
    means = tensor.means()
    eq_(len(means), 2)
    assert np.allclose(means[0], [(1 + 2 + 3 + 4) / 4.,
                                  (-1 + 2 + 3 + 4) / 4.,
                                  (0 + 1 + -1 + 0) / 4.])
    assert np.allclose(
        means[1],
        [0, (2 + 2 + 1) / 3., (3 + 3 - 1) / 3., (4 + 4 + 0) / 3.])


def test_mean_subtracted():
    """Call tensor.mean_subtracted() on the sparse fixture."""
    # NOTE(review): the result is never checked, so this only verifies that
    # the call does not raise. The body looks truncated -- confirm against
    # the full file (ms_tensor above is presumably the expected value).
    mean_subtracted = tensor.mean_subtracted()
def setUp(self):
    """Fill a 2x3x4 DictTensor whose cell values spell out their own index.

    Cell (i, j, k) holds i*100 + j*10 + k, so e.g. (1, 2, 3) stores 123.
    """
    self.raw = DictTensor(3)
    for i in range(2):
        hundreds = i * 100
        for j in range(3):
            tens = j * 10
            for k in range(4):
                self.raw[i, j, k] = hundreds + tens + k
def __init__(self):
    """Initialise the base class with an empty 2-D tensor and two label sets.

    One fresh OrderedSet is supplied per tensor dimension.
    """
    super(FeatureByConceptMatrix, self).__init__(
        DictTensor(2),
        [OrderedSet(), OrderedSet()])
class DictTensorTest(unittest.TestCase):
    """Exercise the basic DictTensor API on small hand-written tensors."""

    # 4x3 fixture; None marks a cell that is not stored in the tensor.
    slice_testcase = [[1, None, None],
                      [None, 2, 3],
                      [4, None, None],
                      [None, 5, None]]

    def setUp(self):
        """Give every test a fresh, empty two-dimensional tensor."""
        self.tensor = DictTensor(2)

    def test_initial(self):
        """An empty tensor has no keys, shape (0, 0) and reads back zeros."""
        empty = self.tensor
        self.assertEqual(len(empty), 0)
        self.assertEqual(len(empty.keys()), 0)
        assert_dims_consistent(empty)
        self.assertEqual(empty.shape, (0, 0))
        assert isinstance(empty[4, 5], (float, int, long))
        self.assertEqual(empty[5, 5], 0)
        self.assertEqual(empty[2, 7], 0)

    def test_storage(self):
        """Values written at scattered indices come back at those indices."""
        self.tensor[5, 5] = 1
        self.tensor[2, 7] = 2

        # A 6x8 grid of unset cells with the two stored values filled in.
        expected = [[None] * 8 for _ in range(6)]
        expected[2][7] = 2
        expected[5][5] = 1
        assertTensorEqual(self.tensor, expected)

    def test_slice(self):
        """Slicing honours start/stop/step and drops integer-indexed dims."""
        entries = nones_removed(nested_list_to_dict(self.slice_testcase))
        self.tensor.update(entries)

        # Test end conditions: start index
        # is included in slice, end index is not
        window = self.tensor[1:3, 0:2]
        assertTensorEqual(window, [[None, 2],
                                   [4, None]])

        # Test that slicing on some dims correctly
        # reduces the dimensionality of the tensor
        last_row = self.tensor[3, :]
        assertTensorEqual(last_row, [None, 5, None])

        # Test the step parameter
        odd_rows = self.tensor[1:4:2, :]
        assertTensorEqual(odd_rows, [[None, 2, 3],
                                     [None, 5, None]])

    def test_transpose(self):
        """transpose() swaps the row and column index of every entry."""
        cells = (((0, 0), 1),
                 ((1, 2), 3),
                 ((2, 0), 4),
                 ((3, 1), 5))
        for (row, col), value in cells:
            self.tensor[row, col] = value

        flipped = self.tensor.transpose()
        assertTensorEqual(flipped, [[1, None, 4, None],
                                    [None, None, None, 5],
                                    [None, 3, None, None]])

    def test_delete(self):
        """Deleting a key leaves that cell unset and the rest untouched."""
        entries = nones_removed(nested_list_to_dict(self.slice_testcase))
        self.tensor.update(entries)
        assertTensorEqual(self.tensor, self.slice_testcase)

        del self.tensor[0, 0]
        after_delete = [[None, None, None],
                        [None, 2, 3],
                        [4, None, None],
                        [None, 5, None]]
        assertTensorEqual(self.tensor, after_delete)

    def test_contains(self):
        """`in` and has_key report stored keys only."""
        self.tensor[1, 2] = 1
        self.tensor[4, 5] = 2

        stored_key = (1, 2)
        self.assertTrue(stored_key in self.tensor)
        self.assertTrue(self.tensor.has_key(stored_key))
        self.assertFalse((4, 2) in self.tensor)
        self.assertFalse((1, 5) in self.tensor)

    def test_1D(self):
        """A one-dimensional tensor stores a value at a scalar index."""
        vector = DictTensor(1)
        vector[2] = 1
        assertTensorEqual(vector, [None, None, 1])

    def test_combine_by_element(self):
        """combine_by_element applies a binary function cell by cell."""
        left = DictTensor(2)
        right = DictTensor(2)
        left[1, 1] = 1
        left[1, 0] = 2
        right[1, 1] = 4
        right[0, 1] = 5

        def left_plus_twice_right(x, y):
            return x + (2 * y)

        combined = left.combine_by_element(right, left_plus_twice_right)
        assertTensorEqual(combined, [[None, 10],
                                     [2, 9]])

        def add(x, y):
            return x + y

        # Make sure errors are raised when the tensors don't have the
        # same shape or number of dimensions
        other_shape = DictTensor(2)
        other_shape[0, 2] = 3
        other_shape[1, 0] = 5
        self.assertRaises(IndexError,
                          left.combine_by_element, other_shape, add)

        other_ndim = DictTensor(3)
        self.assertRaises(IndexError,
                          left.combine_by_element, other_ndim, add)

    def testAdd(self):
        """Adding two tensors sums overlapping cells and keeps the others."""
        first = DictTensor(2)
        second = DictTensor(2)
        first[0, 0] = 1
        first[1, 1] = 1
        first[1, 0] = 2
        second[2, 1] = 4
        second[1, 0] = 5

        total = first + second
        assertTensorEqual(total, [[1, None],
                                  [7, 1],
                                  [None, 4]])

    def testICmul(self):
        """In-place multiplication by a scalar scales every entry."""
        scaled = tensor_from_nested_list([[1, 2], [3, 4]])
        assertTensorEqual(scaled, [[1, 2], [3, 4]])
        scaled *= 2
        assertTensorEqual(scaled, [[2, 4], [6, 8]])

    def testICdiv(self):
        """In-place division by a scalar divides every entry."""
        halved = tensor_from_nested_list([[2, 4], [6, 8]])
        halved /= 2
        assertTensorEqual(halved, [[1, 2], [3, 4]])

    def testReprOfEmpty(self):
        """repr() and example_key() do not raise on an empty tensor."""
        repr(self.tensor)
        self.tensor.example_key()

    def testNorm(self):
        """norm() and magnitude() both give sqrt(1**2 + 5**2)."""
        norm_test = [[0, 0, 0],
                     [0, 1, 0],
                     [0, 5.0, 0]]
        self.tensor.update(nested_list_to_dict(norm_test))

        expected = sqrt(26.0)
        self.assertEqual(self.tensor.norm(), expected)
        self.assertEqual(self.tensor.magnitude(), expected)
class SVD2DTest(unittest.TestCase):
    """Check the rank-3 SVD (and incremental SVD) of a small 2-D DictTensor."""

    def setUp(self):
        """Build the fixture tensor and decompose it both ways with k=3."""
        self.tensor = DictTensor(2)
        # Note: this command actually puts 20 values in tensor!
        self.tensor.update(nested_list_to_dict(svd_2d_test_matrix))
        self.svd = self.tensor.svd(k=3)
        self.incremental = self.tensor.incremental_svd(k=3, niter=200)
        self.u, self.svals, self.v = self.svd.u, self.svd.svals, self.svd.v

    def test_incremental(self):
        """The incremental SVD has consistent shapes and the expected factors."""
        inc = self.incremental
        self.assertEqual(inc.u.shape[0], self.tensor.shape[0])
        self.assertEqual(len(inc.svals), inc.u.shape[1])
        self.assertEqual(len(inc.svals), inc.v.shape[1])
        self.assertEqual(inc.v.shape[0], self.tensor.shape[1])

        assertTensorEqual(inc.u,
                          [[0, 0, 1],
                           [0, 1, 0],
                           [0, 0, 0],
                           [1, 0, 0]])
        assertTensorEqual(inc.v,
                          [[0, 0, sqrt(.2)],
                           [1, 0, 0],
                           [0, 1, 0],
                           [0, 0, 0],
                           [0, 0, sqrt(.8)]])
        assertTensorEqual(inc.svals, [4, 3, sqrt(5)])

    def test_decomposition(self):
        """The plain SVD has consistent shapes and the expected factors."""
        self.assertEqual(self.u.shape[0], self.tensor.shape[0])
        self.assertEqual(len(self.svals), self.u.shape[1])
        self.assertEqual(len(self.svals), self.v.shape[1])
        self.assertEqual(self.v.shape[0], self.tensor.shape[1])

        # NOTE(review): abs=True presumably compares magnitudes only, since
        # the sign of a singular vector is arbitrary -- confirm in tensor_util.
        assertTensorEqual(self.u,
                          [[0, 0, 1],
                           [0, -1, 0],
                           [0, 0, 0],
                           [-1, 0, 0]], abs=True)
        assertTensorEqual(self.v,
                          [[0, 0, sqrt(.2)],
                           [-1, 0, 0],
                           [0, -1, 0],
                           [0, 0, 0],
                           [0, 0, sqrt(.8)]], abs=True)
        assertTensorEqual(self.svals, [4, 3, sqrt(5)])

    def test_reconstructed(self):
        """The reconstruction matches the expected matrix, whole and sliced."""
        assertTensorEqual(self.svd.reconstructed,
                          [[1, 0, 0, 0, 2],
                           [0, 0, 3, 0, 0],
                           [0, 0, 0, 0, 0],
                           [0, 4, 0, 0, 0]])
        assertTensorEqual(self.svd.reconstructed[1, :], [0, 0, 3, 0, 0])
        assertTensorEqual(self.svd.reconstructed[:, 2], [0, 3, 0, 0])

    def test_orthonormality(self):
        """u.T * u and v.T * v both equal the 3x3 identity."""
        identity = [[1, 0, 0],
                    [0, 1, 0],
                    [0, 0, 1]]
        assertTensorEqual(self.u.T * self.u, identity)
        assertTensorEqual(self.v.T * self.v, identity)

    def test_variance(self):
        """A k=3 SVD leaves a smaller residual than k=1, and both beat none."""
        # Assert that the SVD explained some of the variance.
        diff_k3 = self.tensor - self.svd.reconstructed
        tensor_mag = self.tensor.magnitude()
        diff_k3_mag = diff_k3.magnitude()
        # assertTrue replaces the deprecated assert_ alias (removed in 3.12).
        self.assertTrue(tensor_mag > diff_k3_mag)

        # Check that a smaller SVD explains less of the variance, but still
        # some.
        svd_k1 = self.tensor.svd(k=1)
        diff_k1 = self.tensor - svd_k1.reconstructed
        diff_k1_mag = diff_k1.magnitude()
        self.assertTrue(tensor_mag > diff_k1_mag > diff_k3_mag)
def test_oob(self):
    """Unfolding a 3-D tensor along mode 3 must raise IndexError."""
    def unfold_out_of_bounds():
        # Built inside the callable so any error surfaces within assertRaises.
        return DictTensor(3).unfolded(3)

    self.assertRaises(IndexError, unfold_out_of_bounds)
def setUp(self):
    """Fill a 2x3x4 DictTensor whose cell values spell out their own index.

    Cell (a, b, c) holds a*100 + b*10 + c, so e.g. (1, 2, 3) stores 123.
    """
    self.raw = DictTensor(3)
    for a in range(2):
        for b in range(3):
            prefix = a * 100 + b * 10
            for c in range(4):
                self.raw[a, b, c] = prefix + c