def testWhitening(self):
    """PCA with ``whiten=True`` must yield unit-variance, zero-mean scores."""
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 80
    n_components = 30
    rank = 50

    # Low-rank data with correlated features.
    low_rank = mt.dot(mt.diag(mt.linspace(10.0, 1.0, rank)),
                      rng.randn(rank, n_features))
    X = mt.dot(rng.randn(n_samples, rank), low_rank)
    # The component-wise variance of the first 50 features is 3 times the
    # mean component-wise variance of the remaining 30 features.
    X[:, :50] *= 3

    self.assertEqual(X.shape, (n_samples, n_features))

    # The component-wise variance is thus highly varying.
    self.assertGreater(X.std(axis=0).std().to_numpy(), 43.8)

    for solver, copy in product(self.solver_list, (True, False)):
        # Whiten the data while projecting to the lower-dim subspace;
        # keep an untouched original across iterations.
        data = X.copy()
        pca = PCA(n_components=n_components, whiten=True, copy=copy,
                  svd_solver=solver, random_state=0, iterated_power=7)

        # fit_transform and transform must agree.
        whitened = pca.fit_transform(data.copy())
        self.assertEqual(whitened.shape, (n_samples, n_components))
        whitened2 = pca.transform(data)
        assert_array_almost_equal(whitened.fetch(), whitened2.fetch())

        # Whitened scores: unit variance and zero mean per component.
        assert_almost_equal(whitened.std(ddof=1, axis=0).to_numpy(),
                            np.ones(n_components), decimal=6)
        assert_almost_equal(whitened.mean(axis=0).to_numpy(),
                            np.zeros(n_components))

        data = X.copy()
        pca = PCA(n_components=n_components, whiten=False, copy=copy,
                  svd_solver=solver).fit(data)
        unwhitened = pca.transform(data)
        self.assertEqual(unwhitened.shape, (n_samples, n_components))

        # Without whitening the output components keep varying variances.
        assert_almost_equal(unwhitened.std(axis=0).std().to_numpy(), 74.1, 1)
def testPCA(self):
    """Fit/transform consistency, covariance/precision inversion and
    explained-variance-ratio checks for PCA with the full SVD solver."""
    X = self.iris

    for n_comp in np.arange(X.shape[1]):
        pca = PCA(n_components=n_comp, svd_solver='full')

        X_r = pca.fit(X).transform(X).fetch()
        np.testing.assert_equal(X_r.shape[1], n_comp)

        # fit_transform must match fit().transform()
        X_r2 = pca.fit_transform(X).fetch()
        assert_array_almost_equal(X_r, X_r2)

        X_r = pca.transform(X).fetch()
        X_r2 = pca.fit_transform(X).fetch()
        assert_array_almost_equal(X_r, X_r2)

        # get_covariance and get_precision must be inverses of each other
        cov = pca.get_covariance()
        precision = pca.get_precision()
        assert_array_almost_equal(
            mt.dot(cov, precision).to_numpy(), np.eye(X.shape[1]), 12)

    # test explained_variance_ratio_ == 1 with all components
    pca = PCA(svd_solver='full')
    pca.fit(X)
    # BUG FIX: the third positional argument of assert_allclose is ``rtol``,
    # so passing 3 meant a 300% relative tolerance and the check was vacuous.
    # Use assert_almost_equal with decimal=3, the intended semantics.
    np.testing.assert_almost_equal(
        pca.explained_variance_ratio_.sum().to_numpy(), 1.0, 3)
def test_pca_randomized_solver(setup):
    """PCA with the randomized solver: transform consistency, covariance /
    precision inversion, and rejection of ``n_components=0``."""
    # PCA on dense arrays
    X = iris

    # Loop excluding the 0, invalid for randomized
    for n_comp in np.arange(1, X.shape[1]):
        pca = PCA(n_components=n_comp, svd_solver='randomized', random_state=0)

        X_r = pca.fit(X).transform(X)
        np.testing.assert_equal(X_r.shape[1], n_comp)

        # fit_transform must match fit().transform()
        X_r2 = pca.fit_transform(X)
        assert_array_almost_equal(X_r.fetch(), X_r2.fetch())

        X_r = pca.transform(X)
        assert_array_almost_equal(X_r.fetch(), X_r2.fetch())

        # Test get_covariance and get_precision
        cov = pca.get_covariance()
        precision = pca.get_precision()
        assert_array_almost_equal(mt.dot(cov, precision).to_numpy(),
                                  mt.eye(X.shape[1]).to_numpy(), 12)

    # n_components=0 is invalid for the randomized solver.
    # FIX: the original test contained this construct-and-fit block twice,
    # verbatim (copy-paste duplication); one copy suffices.
    pca = PCA(n_components=0, svd_solver='randomized', random_state=0)
    with pytest.raises(ValueError):
        pca.fit(X)

    # Check internal state
    assert pca.n_components == PCA(n_components=0, svd_solver='randomized',
                                   random_state=0).n_components
    assert pca.svd_solver == PCA(n_components=0, svd_solver='randomized',
                                 random_state=0).svd_solver
def testTensordot(self):
    """Shape and chunk bookkeeping for tensordot, dot and inner."""
    from mars.tensor.linalg import tensordot, dot, inner

    # Contract axes (0, 1) of the left operand with (1, 0) of the right.
    lhs = ones((3, 4, 6), chunk_size=2)
    rhs = ones((4, 3, 5), chunk_size=2)
    res = tensordot(lhs, rhs, axes=((0, 1), (1, 0)))
    self.assertEqual(res.shape, (6, 5))
    res.tiles()
    self.assertEqual(res.shape, (6, 5))
    self.assertEqual(len(res.chunks), 9)

    # Incompatible shapes under the default axes must be rejected.
    big_a = ones((10000, 20000), chunk_size=5000)
    big_b = ones((20000, 1000), chunk_size=5000)
    with self.assertRaises(ValueError):
        tensordot(big_a, big_b)

    # vector @ matrix
    vec = ones(10, chunk_size=2)
    mat = ones((10, 20), chunk_size=2)
    out = dot(vec, mat)
    self.assertEqual(out.shape, (20,))
    out.tiles()
    self.assertEqual(out.shape, tuple(sum(s) for s in out.nsplits))

    # matrix @ vector
    mat = ones((10, 20), chunk_size=2)
    vec = ones(20, chunk_size=2)
    out = dot(mat, vec)
    self.assertEqual(out.shape, (10,))
    out.tiles()
    self.assertEqual(out.shape, tuple(sum(s) for s in out.nsplits))

    # Square matrix product through the method form.
    square = ones((100, 100), chunk_size=10)
    prod = square.dot(square)
    self.assertEqual(prod.shape, (100, 100))
    prod.tiles()
    self.assertEqual(prod.shape, tuple(sum(s) for s in prod.nsplits))

    # inner contracts the last axes of both operands.
    left = ones((10, 20), chunk_size=2)
    right = ones((30, 20), chunk_size=2)
    res = inner(left, right)
    self.assertEqual(res.shape, (10, 30))
    res.tiles()
    self.assertEqual(res.shape, tuple(sum(s) for s in res.nsplits))
def test_tensordot():
    """Shape and chunk bookkeeping for tensordot, dot and inner (pytest style)."""
    from mars.tensor.linalg import tensordot, dot, inner

    # Contract axes (0, 1) of the left operand with (1, 0) of the right.
    lhs = ones((3, 4, 6), chunk_size=2)
    rhs = ones((4, 3, 5), chunk_size=2)
    res = tensordot(lhs, rhs, axes=((0, 1), (1, 0)))
    assert res.shape == (6, 5)
    res = tile(res)
    assert res.shape == (6, 5)
    assert len(res.chunks) == 9

    # Incompatible shapes under the default axes must be rejected.
    big_a = ones((10000, 20000), chunk_size=5000)
    big_b = ones((20000, 1000), chunk_size=5000)
    with pytest.raises(ValueError):
        tensordot(big_a, big_b)

    # vector @ matrix
    vec = ones(10, chunk_size=2)
    mat = ones((10, 20), chunk_size=2)
    out = dot(vec, mat)
    assert out.shape == (20,)
    out = tile(out)
    assert out.shape == tuple(sum(s) for s in out.nsplits)

    # matrix @ vector
    mat = ones((10, 20), chunk_size=2)
    vec = ones(20, chunk_size=2)
    out = dot(mat, vec)
    assert out.shape == (10,)
    out = tile(out)
    assert out.shape == tuple(sum(s) for s in out.nsplits)

    # Square matrix product through the method form.
    square = ones((100, 100), chunk_size=10)
    prod = square.dot(square)
    assert prod.shape == (100, 100)
    prod = tile(prod)
    assert prod.shape == tuple(sum(s) for s in prod.nsplits)

    # inner contracts the last axes of both operands.
    left = ones((10, 20), chunk_size=2)
    right = ones((30, 20), chunk_size=2)
    res = inner(left, right)
    assert res.shape == (10, 30)
    res = tile(res)
    assert res.shape == tuple(sum(s) for s in res.nsplits)
def test_singular_values(self):
    """PCA singular values: solver agreement, norm identities, round-trip."""
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 80
    X = mt.tensor(rng.randn(n_samples, n_features))

    pca = PCA(n_components=2, svd_solver='full', random_state=rng).fit(X)
    rpca = PCA(n_components=2, svd_solver='randomized',
               random_state=rng).fit(X)
    assert_array_almost_equal(pca.singular_values_.fetch(),
                              rpca.singular_values_.fetch(), 1)

    # Sum of squared singular values equals the squared Frobenius
    # norm of the transformed scores.
    X_pca = pca.transform(X)
    X_rpca = rpca.transform(X)
    assert_array_almost_equal(
        mt.sum(pca.singular_values_**2.0).to_numpy(),
        (mt.linalg.norm(X_pca, "fro")**2.0).to_numpy(), 12)
    assert_array_almost_equal(
        mt.sum(rpca.singular_values_**2.0).to_numpy(),
        (mt.linalg.norm(X_rpca, "fro")**2.0).to_numpy(), 0)

    # Each singular value equals the 2-norm of its score column.
    assert_array_almost_equal(
        pca.singular_values_.fetch(),
        mt.sqrt(mt.sum(X_pca**2.0, axis=0)).to_numpy(), 12)
    assert_array_almost_equal(
        rpca.singular_values_.fetch(),
        mt.sqrt(mt.sum(X_rpca**2.0, axis=0)).to_numpy(), 2)

    # Set the singular values explicitly and check we recover them.
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 110
    X = mt.tensor(rng.randn(n_samples, n_features))
    pca = PCA(n_components=3, svd_solver='full', random_state=rng)
    rpca = PCA(n_components=3, svd_solver='randomized', random_state=rng)
    X_pca = pca.fit_transform(X)

    X_pca /= mt.sqrt(mt.sum(X_pca**2.0, axis=0))
    X_pca[:, 0] *= 3.142
    X_pca[:, 1] *= 2.718

    X_hat = mt.dot(X_pca, pca.components_)
    pca.fit(X_hat)
    rpca.fit(X_hat)
    assert_array_almost_equal(pca.singular_values_.fetch(),
                              [3.142, 2.718, 1.0], 14)
    assert_array_almost_equal(rpca.singular_values_.fetch(),
                              [3.142, 2.718, 1.0], 14)
def testDot(self):
    """Sparseness propagation and ``out=`` validation for sparse dot."""
    lhs = tensor([[0, 1, 0], [1, 0, 0]], chunk_size=2).tosparse()
    rhs = lhs.T

    # sparse @ sparse stays sparse unless sparse=False is requested
    self.assertTrue(lhs.dot(rhs).issparse())
    self.assertIs(type(lhs.dot(rhs)), SparseTensor)
    self.assertFalse(lhs.dot(rhs, sparse=False).issparse())
    self.assertIs(type(lhs.dot(rhs, sparse=False)), Tensor)

    # non-tensor out is a TypeError
    with self.assertRaises(TypeError):
        dot(lhs, rhs, out=1)
    # mismatched shape, dtype or memory order are ValueErrors
    with self.assertRaises(ValueError):
        dot(lhs, rhs, empty((3, 6)))
    with self.assertRaises(ValueError):
        dot(lhs, rhs, empty((3, 3), dtype='i4'))
    with self.assertRaises(ValueError):
        dot(lhs, rhs, empty((3, 3), order='F'))

    # a correctly-shaped, correctly-typed out tensor is accepted
    lhs.dot(rhs, out=empty((2, 2), dtype=lhs.dtype))
def test_dot():
    """Sparseness propagation and ``out=`` validation for sparse dot (pytest)."""
    lhs = tensor([[0, 1, 0], [1, 0, 0]], chunk_size=2).tosparse()
    rhs = lhs.T

    # sparse @ sparse stays sparse unless sparse=False is requested
    assert lhs.dot(rhs).issparse() is True
    assert type(lhs.dot(rhs)) is SparseTensor
    assert lhs.dot(rhs, sparse=False).issparse() is False
    assert type(lhs.dot(rhs, sparse=False)) is Tensor

    # non-tensor out is a TypeError
    with pytest.raises(TypeError):
        dot(lhs, rhs, out=1)
    # mismatched shape, dtype or memory order are ValueErrors
    with pytest.raises(ValueError):
        dot(lhs, rhs, empty((3, 6)))
    with pytest.raises(ValueError):
        dot(lhs, rhs, empty((3, 3), dtype='i4'))
    with pytest.raises(ValueError):
        dot(lhs, rhs, empty((3, 3), order='F'))

    # a correctly-shaped, correctly-typed out tensor is accepted
    lhs.dot(rhs, out=empty((2, 2), dtype=lhs.dtype))
def test_singular_values(setup):
    """TruncatedSVD must recover singular values that were set explicitly."""
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 110
    X = rng.randn(n_samples, n_features)

    svd = TruncatedSVD(n_components=3, algorithm='randomized',
                       random_state=rng)
    scores = svd.fit_transform(X)

    # Normalize the score columns, then impose known singular values.
    scores /= mt.sqrt(mt.sum(scores**2.0, axis=0))
    scores[:, 0] *= 3.142
    scores[:, 1] *= 2.718

    # Reconstruct the data and check that refitting recovers the values.
    reconstructed = mt.dot(scores, svd.components_)
    svd.fit(reconstructed)
    assert_array_almost_equal(svd.singular_values_.to_numpy(),
                              [3.142, 2.718, 1.0], 14)
def testChunkSerialize(self):
    """Round-trip chunks through both protobuf and json serialization."""
    t = ones((10, 3), chunk_size=(5, 2)).tiles()

    # protobuf round-trip of a ones chunk
    chunk = t.chunks[0]
    serials = self._pb_serial(chunk)
    op, pb = serials[chunk.op, chunk.data]
    self.assertEqual(tuple(pb.index), chunk.index)
    self.assertEqual(pb.key, chunk.key)
    self.assertEqual(tuple(pb.shape), chunk.shape)
    self.assertEqual(int(op.type.split('.', 1)[1]), OperandDef.TENSOR_ONES)

    restored = self._pb_deserial(serials)[chunk.data]
    self.assertEqual(chunk.index, restored.index)
    self.assertEqual(chunk.key, restored.key)
    self.assertEqual(chunk.shape, restored.shape)
    self.assertEqual(chunk.op.dtype, restored.op.dtype)

    # json round-trip of the same chunk
    chunk = t.chunks[0]
    serials = self._json_serial(chunk)
    restored = self._json_deserial(serials)[chunk.data]
    self.assertEqual(chunk.index, restored.index)
    self.assertEqual(chunk.key, restored.key)
    self.assertEqual(chunk.shape, restored.shape)
    self.assertEqual(chunk.op.dtype, restored.op.dtype)

    t = tensor(np.random.random((10, 3)), chunk_size=(5, 2)).tiles()

    # protobuf round-trip of a data-source chunk (payload must survive)
    chunk = t.chunks[0]
    serials = self._pb_serial(chunk)
    op, pb = serials[chunk.op, chunk.data]
    self.assertEqual(tuple(pb.index), chunk.index)
    self.assertEqual(pb.key, chunk.key)
    self.assertEqual(tuple(pb.shape), chunk.shape)
    self.assertEqual(int(op.type.split('.', 1)[1]),
                     OperandDef.TENSOR_DATA_SOURCE)

    restored = self._pb_deserial(serials)[chunk.data]
    self.assertEqual(chunk.index, restored.index)
    self.assertEqual(chunk.key, restored.key)
    self.assertEqual(chunk.shape, restored.shape)
    self.assertTrue(np.array_equal(chunk.op.data, restored.op.data))

    # json round-trip of the data-source chunk
    chunk = t.chunks[0]
    serials = self._json_serial(chunk)
    restored = self._json_deserial(serials)[chunk.data]
    self.assertEqual(chunk.index, restored.index)
    self.assertEqual(chunk.key, restored.key)
    self.assertEqual(chunk.shape, restored.shape)
    self.assertTrue(np.array_equal(chunk.op.data, restored.op.data))

    t = (tensor(np.random.random((10, 3)), chunk_size=(5, 2)) + 1).tiles()

    # protobuf round-trip of a composed (fused) chunk
    chunk1, chunk2 = t.chunks[0], t.chunks[1]
    fuse_op = TensorFuseChunk()
    composed = fuse_op.new_chunk(
        chunk1.inputs, shape=chunk2.shape, _key=chunk2.key,
        _composed=[chunk1.data, chunk2.data])
    serials = self._pb_serial(composed)
    op, pb = serials[composed.op, composed.data]
    self.assertEqual(pb.key, composed.key)
    self.assertEqual(int(op.type.split('.', 1)[1]), OperandDef.FUSE)
    self.assertEqual(len(pb.composed), 2)

    restored = self._pb_deserial(serials)[composed.data]
    self.assertEqual(composed.key, restored.key)
    self.assertEqual(type(composed.op), type(restored.op))
    self.assertEqual(composed.composed[0].inputs[0].key,
                     restored.composed[0].inputs[0].key)
    self.assertEqual(composed.inputs[-1].key, restored.inputs[-1].key)

    # json round-trip of a freshly-built composed chunk
    chunk1, chunk2 = t.chunks[0], t.chunks[1]
    fuse_op = TensorFuseChunk()
    composed = fuse_op.new_chunk(
        chunk1.inputs, shape=chunk2.shape, _key=chunk2.key,
        _composed=[chunk1.data, chunk2.data])
    serials = self._json_serial(composed)
    restored = self._json_deserial(serials)[composed.data]
    self.assertEqual(composed.key, restored.key)
    self.assertEqual(type(composed.op), type(restored.op))
    self.assertEqual(composed.composed[0].inputs[0].key,
                     restored.composed[0].inputs[0].key)
    self.assertEqual(composed.inputs[-1].key, restored.inputs[-1].key)

    # round-trip an intermediate chunk produced by dot
    t1 = ones((10, 3), chunk_size=2)
    t2 = ones((3, 5), chunk_size=2)
    c = dot(t1, t2).tiles().chunks[0].inputs[0]

    serials = self._pb_serial(c)
    self.assertEqual(c.key, self._pb_deserial(serials)[c].key)

    serials = self._json_serial(c)
    self.assertEqual(c.key, self._json_deserial(serials)[c].key)
def testChunkSerialize(self):
    """Round-trip chunks through both protobuf and json serialization."""
    t = ones((10, 3), chunk_size=(5, 2)).tiles()

    # protobuf round-trip of a ones chunk
    chunk = t.chunks[0]
    serials = self._pb_serial(chunk)
    op, pb = serials[chunk.op, chunk.data]
    self.assertEqual(tuple(pb.index), chunk.index)
    self.assertEqual(pb.key, chunk.key)
    self.assertEqual(tuple(pb.shape), chunk.shape)
    self.assertEqual(int(op.type.split('.', 1)[1]), opcodes.TENSOR_ONES)

    restored = self._pb_deserial(serials)[chunk.data]
    self.assertEqual(chunk.index, restored.index)
    self.assertEqual(chunk.key, restored.key)
    self.assertEqual(chunk.shape, restored.shape)
    self.assertEqual(chunk.op.dtype, restored.op.dtype)

    # json round-trip of the same chunk
    chunk = t.chunks[0]
    serials = self._json_serial(chunk)
    restored = self._json_deserial(serials)[chunk.data]
    self.assertEqual(chunk.index, restored.index)
    self.assertEqual(chunk.key, restored.key)
    self.assertEqual(chunk.shape, restored.shape)
    self.assertEqual(chunk.op.dtype, restored.op.dtype)

    t = tensor(np.random.random((10, 3)), chunk_size=(5, 2)).tiles()

    # protobuf round-trip of a data-source chunk (payload must survive)
    chunk = t.chunks[0]
    serials = self._pb_serial(chunk)
    op, pb = serials[chunk.op, chunk.data]
    self.assertEqual(tuple(pb.index), chunk.index)
    self.assertEqual(pb.key, chunk.key)
    self.assertEqual(tuple(pb.shape), chunk.shape)
    self.assertEqual(int(op.type.split('.', 1)[1]),
                     opcodes.TENSOR_DATA_SOURCE)

    restored = self._pb_deserial(serials)[chunk.data]
    self.assertEqual(chunk.index, restored.index)
    self.assertEqual(chunk.key, restored.key)
    self.assertEqual(chunk.shape, restored.shape)
    self.assertTrue(np.array_equal(chunk.op.data, restored.op.data))

    # json round-trip of the data-source chunk
    chunk = t.chunks[0]
    serials = self._json_serial(chunk)
    restored = self._json_deserial(serials)[chunk.data]
    self.assertEqual(chunk.index, restored.index)
    self.assertEqual(chunk.key, restored.key)
    self.assertEqual(chunk.shape, restored.shape)
    self.assertTrue(np.array_equal(chunk.op.data, restored.op.data))

    t1 = tensor(np.random.random((10, 3)), chunk_size=(5, 2))
    t2 = (t1 + 1).tiles()

    # protobuf round-trip of a fused chunk
    chunk1 = get_tiled(t1).chunks[0]
    chunk2 = t2.chunks[0]
    composed = build_fuse_chunk([chunk1.data, chunk2.data], TensorFuseChunk)
    serials = self._pb_serial(composed)
    op, pb = serials[composed.op, composed.data]
    self.assertEqual(pb.key, composed.key)
    self.assertEqual(int(op.type.split('.', 1)[1]), opcodes.FUSE)

    restored = self._pb_deserial(serials)[composed.data]
    self.assertEqual(composed.key, restored.key)
    self.assertEqual(type(composed.op), type(restored.op))
    self.assertEqual(composed.composed[0].key, restored.composed[0].key)
    self.assertEqual(composed.composed[-1].key, restored.composed[-1].key)

    # json round-trip of a freshly-built fused chunk
    chunk1 = get_tiled(t1).chunks[0]
    chunk2 = t2.chunks[0]
    composed = build_fuse_chunk([chunk1.data, chunk2.data], TensorFuseChunk)
    serials = self._json_serial(composed)
    restored = self._json_deserial(serials)[composed.data]
    self.assertEqual(composed.key, restored.key)
    self.assertEqual(type(composed.op), type(restored.op))
    self.assertEqual(composed.composed[0].key, restored.composed[0].key)
    self.assertEqual(composed.composed[-1].key, restored.composed[-1].key)

    # round-trip an intermediate chunk produced by dot
    t1 = ones((10, 3), chunk_size=2)
    t2 = ones((3, 5), chunk_size=2)
    c = dot(t1, t2).tiles().chunks[0].inputs[0]

    serials = self._pb_serial(c)
    self.assertEqual(c.key, self._pb_deserial(serials)[c].key)

    serials = self._json_serial(c)
    self.assertEqual(c.key, self._json_deserial(serials)[c].key)