示例#1
0
    def testWhitening(self):
        # Whitened PCA scores are expected to have unit standard deviation
        # and zero mean per component, while non-whitened scores keep a
        # strongly varying spread.
        rng = np.random.RandomState(0)
        n_samples, n_features = 100, 80
        n_components = 30
        rank = 50

        # Low-rank data with correlated features.  The two random draws are
        # taken in the same order as the factors appear in the product.
        left_factor = rng.randn(n_samples, rank)
        spectrum = mt.diag(mt.linspace(10.0, 1.0, rank))
        right_factor = rng.randn(rank, n_features)
        X = mt.dot(left_factor, mt.dot(spectrum, right_factor))
        # Scale the first 50 feature columns by 3 so that their spread
        # differs from the remaining 30 columns.
        X[:, :50] *= 3

        self.assertEqual(X.shape, (n_samples, n_features))

        # The per-feature standard deviation therefore varies a lot.
        feature_spread = X.std(axis=0).std().to_numpy()
        self.assertGreater(feature_spread, 43.8)

        for solver in self.solver_list:
            for copy_flag in (True, False):
                # Work on a fresh copy so the original survives iterations.
                data = X.copy()
                whitening_pca = PCA(n_components=n_components,
                                    whiten=True,
                                    copy=copy_flag,
                                    svd_solver=solver,
                                    random_state=0,
                                    iterated_power=7)

                # fit_transform and a later transform must agree.
                whitened = whitening_pca.fit_transform(data.copy())
                self.assertEqual(whitened.shape, (n_samples, n_components))
                whitened_again = whitening_pca.transform(data)
                assert_array_almost_equal(whitened.fetch(),
                                          whitened_again.fetch())

                # Unit sample standard deviation and zero mean per component.
                component_std = whitened.std(ddof=1, axis=0).to_numpy()
                assert_almost_equal(component_std,
                                    np.ones(n_components),
                                    decimal=6)
                component_mean = whitened.mean(axis=0).to_numpy()
                assert_almost_equal(component_mean, np.zeros(n_components))

                # Same projection without whitening.
                data = X.copy()
                plain_pca = PCA(n_components=n_components,
                                whiten=False,
                                copy=copy_flag,
                                svd_solver=solver).fit(data)
                unwhitened = plain_pca.transform(data)
                self.assertEqual(unwhitened.shape,
                                 (n_samples, n_components))

                # Here the output components still have varying variances.
                unwhitened_spread = unwhitened.std(axis=0).std().to_numpy()
                assert_almost_equal(unwhitened_spread, 74.1, 1)
示例#2
0
    def testPCA(self):
        # Exercise the 'full' SVD solver on the iris data for every number of
        # components from 0 up to (but excluding) the number of features.
        X = self.iris
        n_features = X.shape[1]

        for n_comp in np.arange(n_features):
            pca = PCA(n_components=n_comp, svd_solver='full')

            # The projection keeps exactly n_comp columns.
            X_r = pca.fit(X).transform(X).fetch()
            np.testing.assert_equal(X_r.shape[1], n_comp)

            # fit().transform() and fit_transform() must agree.
            X_r2 = pca.fit_transform(X).fetch()
            assert_array_almost_equal(X_r, X_r2)

            # transform() on the fitted estimator matches a new fit_transform().
            X_r = pca.transform(X).fetch()
            X_r2 = pca.fit_transform(X).fetch()
            assert_array_almost_equal(X_r, X_r2)

            # Test get_covariance and get_precision: their product must be
            # the identity matrix.
            cov = pca.get_covariance()
            precision = pca.get_precision()
            assert_array_almost_equal(
                mt.dot(cov, precision).to_numpy(), np.eye(n_features), 12)

        # test explained_variance_ratio_ == 1 with all components
        pca = PCA(svd_solver='full')
        pca.fit(X)
        # The tolerance is passed by keyword: the third positional parameter
        # of assert_allclose is rtol, so a bare `3` would accept a relative
        # error of 300% and make this check vacuous.
        np.testing.assert_allclose(
            pca.explained_variance_ratio_.sum().to_numpy(), 1.0, rtol=1e-3)
示例#3
0
文件: test_pca.py 项目: haijohn/mars
def test_pca_randomized_solver(setup):
    # PCA with the randomized solver on the dense iris data.
    #
    # `setup` is never referenced in the body; presumably it is a pytest
    # fixture requested for its side effects -- confirm against conftest.
    X = iris

    # Loop excluding the 0, invalid for randomized
    for n_comp in np.arange(1, X.shape[1]):
        pca = PCA(n_components=n_comp, svd_solver='randomized', random_state=0)

        # The projection keeps exactly n_comp columns.
        X_r = pca.fit(X).transform(X)
        np.testing.assert_equal(X_r.shape[1], n_comp)

        # fit().transform() and fit_transform() must agree.
        X_r2 = pca.fit_transform(X)
        assert_array_almost_equal(X_r.fetch(), X_r2.fetch())

        # A further transform() on the fitted estimator agrees as well.
        X_r = pca.transform(X)
        assert_array_almost_equal(X_r.fetch(), X_r2.fetch())

        # Test get_covariance and get_precision: their product must be the
        # identity matrix.
        cov = pca.get_covariance()
        precision = pca.get_precision()
        assert_array_almost_equal(mt.dot(cov, precision).to_numpy(),
                                  mt.eye(X.shape[1]).to_numpy(), 12)

    # n_components=0 must be rejected by the randomized solver.
    pca = PCA(n_components=0, svd_solver='randomized', random_state=0)
    with pytest.raises(ValueError):
        pca.fit(X)

    # Check internal state: the failed fit must leave the constructor
    # parameters equal to those of an untouched estimator.
    reference = PCA(n_components=0, svd_solver='randomized', random_state=0)
    assert pca.n_components == reference.n_components
    assert pca.svd_solver == reference.svd_solver
示例#4
0
    def testTensordot(self):
        # Shape inference and tiling checks for tensordot, dot and inner.
        from mars.tensor.linalg import tensordot, dot, inner

        def check_shape_then_tile(result, expected_shape):
            # The shape must match before tiling; after calling tiles() on
            # the same object it must equal the per-axis sum of the splits.
            self.assertEqual(result.shape, expected_shape)
            result.tiles()
            self.assertEqual(result.shape,
                             tuple(sum(split) for split in result.nsplits))

        # Contract two axes of 3-d operands with explicit axes.
        lhs = ones((3, 4, 6), chunk_size=2)
        rhs = ones((4, 3, 5), chunk_size=2)
        contracted = tensordot(lhs, rhs, axes=((0, 1), (1, 0)))
        self.assertEqual(contracted.shape, (6, 5))
        contracted.tiles()
        self.assertEqual(contracted.shape, (6, 5))
        self.assertEqual(len(contracted.chunks), 9)

        # tensordot without explicit axes is expected to reject these shapes.
        wide = ones((10000, 20000), chunk_size=5000)
        tall = ones((20000, 1000), chunk_size=5000)
        with self.assertRaises(ValueError):
            tensordot(wide, tall)

        # vector . matrix
        check_shape_then_tile(
            dot(ones(10, chunk_size=2), ones((10, 20), chunk_size=2)),
            (20,))

        # matrix . vector
        check_shape_then_tile(
            dot(ones((10, 20), chunk_size=2), ones(20, chunk_size=2)),
            (10,))

        # method form on a square matrix
        square = ones((100, 100), chunk_size=10)
        check_shape_then_tile(square.dot(square), (100, 100))

        # inner product over the last axis of both operands
        check_shape_then_tile(
            inner(ones((10, 20), chunk_size=2), ones((30, 20), chunk_size=2)),
            (10, 30))
示例#5
0
def test_tensordot():
    # Shape inference and tiling checks for tensordot, dot and inner.
    from mars.tensor.linalg import tensordot, dot, inner

    def _verify(lazy_result, expected_shape):
        # Check the shape before tiling, then check that the tiled result's
        # shape equals the per-axis sum of its splits.
        assert lazy_result.shape == expected_shape
        tiled = tile(lazy_result)
        assert tiled.shape == tuple(sum(split) for split in tiled.nsplits)

    # Contract two axes of 3-d operands with explicit axes.
    left = ones((3, 4, 6), chunk_size=2)
    right = ones((4, 3, 5), chunk_size=2)
    contracted = tensordot(left, right, axes=((0, 1), (1, 0)))
    assert contracted.shape == (6, 5)

    contracted = tile(contracted)
    assert contracted.shape == (6, 5)
    assert len(contracted.chunks) == 9

    # tensordot without explicit axes is expected to reject these shapes.
    wide = ones((10000, 20000), chunk_size=5000)
    tall = ones((20000, 1000), chunk_size=5000)
    with pytest.raises(ValueError):
        tensordot(wide, tall)

    # vector . matrix
    vector = ones(10, chunk_size=2)
    matrix = ones((10, 20), chunk_size=2)
    _verify(dot(vector, matrix), (20, ))

    # matrix . vector
    matrix = ones((10, 20), chunk_size=2)
    vector = ones(20, chunk_size=2)
    _verify(dot(matrix, vector), (10, ))

    # method form on a square matrix
    square = ones((100, 100), chunk_size=10)
    _verify(square.dot(square), (100, 100))

    # inner product over the last axis of both operands
    first = ones((10, 20), chunk_size=2)
    second = ones((30, 20), chunk_size=2)
    _verify(inner(first, second), (10, 30))
示例#6
0
    def test_singular_values(self):
        # Check that the PCA output has the correct singular values, for both
        # the 'full' and the 'randomized' solver.

        rng = np.random.RandomState(0)
        n_samples = 100
        n_features = 80

        X = mt.tensor(rng.randn(n_samples, n_features))

        # NOTE(review): both estimators receive the same RandomState object,
        # so presumably the order of the fit calls influences what the
        # randomized solver draws -- keep the statement order unchanged.
        pca = PCA(n_components=2, svd_solver='full', random_state=rng).fit(X)
        rpca = PCA(n_components=2, svd_solver='randomized',
                   random_state=rng).fit(X)
        # The two solvers only have to agree to 1 decimal place.
        assert_array_almost_equal(pca.singular_values_.fetch(),
                                  rpca.singular_values_.fetch(), 1)

        # Compare to the Frobenius norm: the sum of squared singular values
        # must equal the squared Frobenius norm of the projected data.
        X_pca = pca.transform(X)
        X_rpca = rpca.transform(X)
        assert_array_almost_equal(
            mt.sum(pca.singular_values_**2.0).to_numpy(),
            (mt.linalg.norm(X_pca, "fro")**2.0).to_numpy(), 12)
        # The randomized solver is checked with a much looser tolerance
        # (0 decimals instead of 12).
        assert_array_almost_equal(
            mt.sum(rpca.singular_values_**2.0).to_numpy(),
            (mt.linalg.norm(X_rpca, "fro")**2.0).to_numpy(), 0)

        # Compare to the 2-norms of the score vectors
        assert_array_almost_equal(
            pca.singular_values_.fetch(),
            mt.sqrt(mt.sum(X_pca**2.0, axis=0)).to_numpy(), 12)
        assert_array_almost_equal(
            rpca.singular_values_.fetch(),
            mt.sqrt(mt.sum(X_rpca**2.0, axis=0)).to_numpy(), 2)

        # Set the singular values and see what we get back
        rng = np.random.RandomState(0)
        n_samples = 100
        n_features = 110

        X = mt.tensor(rng.randn(n_samples, n_features))

        pca = PCA(n_components=3, svd_solver='full', random_state=rng)
        rpca = PCA(n_components=3, svd_solver='randomized', random_state=rng)
        X_pca = pca.fit_transform(X)

        # Normalise every score column to unit 2-norm, then rescale the first
        # two columns; the third column keeps norm 1.0.
        X_pca /= mt.sqrt(mt.sum(X_pca**2.0, axis=0))
        X_pca[:, 0] *= 3.142
        X_pca[:, 1] *= 2.718

        # Rebuild data from the rescaled scores and refit both solvers; the
        # fitted singular values must be the scale factors applied above.
        X_hat = mt.dot(X_pca, pca.components_)
        pca.fit(X_hat)
        rpca.fit(X_hat)
        assert_array_almost_equal(pca.singular_values_.fetch(),
                                  [3.142, 2.718, 1.0], 14)
        assert_array_almost_equal(rpca.singular_values_.fetch(),
                                  [3.142, 2.718, 1.0], 14)
示例#7
0
    def testDot(self):
        # dot on sparse operands: sparsity of the result and validation of
        # the `out` argument.
        lhs = tensor([[0, 1, 0], [1, 0, 0]], chunk_size=2).tosparse()
        rhs = lhs.T

        # By default the product of two sparse tensors stays sparse.
        self.assertTrue(lhs.dot(rhs).issparse())
        self.assertIs(type(lhs.dot(rhs)), SparseTensor)
        # sparse=False yields a plain Tensor.
        self.assertFalse(lhs.dot(rhs, sparse=False).issparse())
        self.assertIs(type(lhs.dot(rhs, sparse=False)), Tensor)

        # A non-tensor `out` is a type error.
        with self.assertRaises(TypeError):
            dot(lhs, rhs, out=1)

        # Each of these `out` tensors is expected to be rejected with a
        # ValueError.  They are built inside the context manager, one at a
        # time, in the listed order.
        rejected_outputs = (
            ((3, 6), {}),
            ((3, 3), {'dtype': 'i4'}),
            ((3, 3), {'order': 'F'}),
        )
        for out_shape, out_options in rejected_outputs:
            with self.assertRaises(ValueError):
                dot(lhs, rhs, empty(out_shape, **out_options))

        # A (2, 2) `out` with the operand dtype is accepted.
        lhs.dot(rhs, out=empty((2, 2), dtype=lhs.dtype))
示例#8
0
def test_dot():
    # dot on sparse operands: sparsity of the result and validation of the
    # `out` argument.
    sparse_matrix = tensor([[0, 1, 0], [1, 0, 0]], chunk_size=2).tosparse()
    transposed = sparse_matrix.T

    # By default the product of two sparse tensors stays sparse.
    default_product = sparse_matrix.dot(transposed)
    assert default_product.issparse() is True
    assert type(sparse_matrix.dot(transposed)) is SparseTensor

    # sparse=False yields a plain Tensor.
    dense_product = sparse_matrix.dot(transposed, sparse=False)
    assert dense_product.issparse() is False
    assert type(sparse_matrix.dot(transposed, sparse=False)) is Tensor

    # A non-tensor `out` is a type error.
    with pytest.raises(TypeError):
        dot(sparse_matrix, transposed, out=1)

    # Each of the following `out` tensors is expected to be rejected.
    with pytest.raises(ValueError):
        dot(sparse_matrix, transposed, empty((3, 6)))

    with pytest.raises(ValueError):
        dot(sparse_matrix, transposed, empty((3, 3), dtype='i4'))

    with pytest.raises(ValueError):
        dot(sparse_matrix, transposed, empty((3, 3), order='F'))

    # A (2, 2) `out` with the operand dtype is accepted.
    accepted_out = empty((2, 2), dtype=sparse_matrix.dtype)
    sparse_matrix.dot(transposed, out=accepted_out)
示例#9
0
def test_singular_values(setup):
    # Check that the TruncatedSVD output has the correct singular values:
    # impose known singular values on the data and see whether a refit
    # recovers them.
    expected_values = [3.142, 2.718, 1.0]

    rng = np.random.RandomState(0)
    X = rng.randn(100, 110)

    svd = TruncatedSVD(n_components=3,
                       algorithm='randomized',
                       random_state=rng)
    scores = svd.fit_transform(X)

    # Normalise every score column to unit 2-norm, then rescale the first
    # two columns; the third column keeps norm 1.0.
    column_norms = mt.sqrt(mt.sum(scores**2.0, axis=0))
    scores /= column_norms
    for column, factor in enumerate(expected_values[:2]):
        scores[:, column] *= factor

    # Rebuild data from the rescaled scores and refit the same estimator.
    reconstructed = mt.dot(scores, svd.components_)
    svd.fit(reconstructed)
    assert_array_almost_equal(svd.singular_values_.to_numpy(),
                              expected_values, 14)
示例#10
0
    def testChunkSerialize(self):
        # Round-trip several kinds of chunks through the protobuf ("pb") and
        # JSON serializers and check that identifying attributes survive.

        def opcode_of(op_pb):
            # The type string is split once on '.'; the part after the dot
            # is parsed as the integer operand code.
            return int(op_pb.type.split('.', 1)[1])

        def assert_pb_matches(chunk_pb, source):
            # The serialized message carries the chunk's index, key, shape.
            self.assertEqual(tuple(chunk_pb.index), source.index)
            self.assertEqual(chunk_pb.key, source.key)
            self.assertEqual(tuple(chunk_pb.shape), source.shape)

        def assert_same_layout(source, restored):
            self.assertEqual(source.index, restored.index)
            self.assertEqual(source.key, restored.key)
            self.assertEqual(source.shape, restored.shape)

        def make_fused(tiled):
            # Compose the first two chunks of `tiled` into one fuse chunk.
            first = tiled.chunks[0]
            second = tiled.chunks[1]
            return TensorFuseChunk().new_chunk(
                first.inputs,
                shape=second.shape,
                _key=second.key,
                _composed=[first.data, second.data])

        def assert_same_fused(source, restored):
            self.assertEqual(source.key, restored.key)
            self.assertEqual(type(source.op), type(restored.op))
            self.assertEqual(source.composed[0].inputs[0].key,
                             restored.composed[0].inputs[0].key)
            self.assertEqual(source.inputs[-1].key,
                             restored.inputs[-1].key)

        # ---- chunk produced by `ones` ----
        ones_chunk = ones((10, 3), chunk_size=(5, 2)).tiles().chunks[0]

        # pb
        payload = self._pb_serial(ones_chunk)
        op_pb, chunk_pb = payload[ones_chunk.op, ones_chunk.data]
        assert_pb_matches(chunk_pb, ones_chunk)
        self.assertEqual(opcode_of(op_pb), OperandDef.TENSOR_ONES)

        restored = self._pb_deserial(payload)[ones_chunk.data]
        assert_same_layout(ones_chunk, restored)
        self.assertEqual(ones_chunk.op.dtype, restored.op.dtype)

        # json
        payload = self._json_serial(ones_chunk)
        restored = self._json_deserial(payload)[ones_chunk.data]
        assert_same_layout(ones_chunk, restored)
        self.assertEqual(ones_chunk.op.dtype, restored.op.dtype)

        # ---- chunk backed by in-memory data ----
        data_tensor = tensor(np.random.random((10, 3)), chunk_size=(5, 2))
        data_chunk = data_tensor.tiles().chunks[0]

        # pb
        payload = self._pb_serial(data_chunk)
        op_pb, chunk_pb = payload[data_chunk.op, data_chunk.data]
        assert_pb_matches(chunk_pb, data_chunk)
        self.assertEqual(opcode_of(op_pb),
                         OperandDef.TENSOR_DATA_SOURCE)

        restored = self._pb_deserial(payload)[data_chunk.data]
        assert_same_layout(data_chunk, restored)
        self.assertTrue(np.array_equal(data_chunk.op.data, restored.op.data))

        # json
        payload = self._json_serial(data_chunk)
        restored = self._json_deserial(payload)[data_chunk.data]
        assert_same_layout(data_chunk, restored)
        self.assertTrue(np.array_equal(data_chunk.op.data, restored.op.data))

        # ---- fused chunk ----
        summed = (tensor(np.random.random((10, 3)), chunk_size=(5, 2)) + 1)
        summed = summed.tiles()

        # pb
        fused = make_fused(summed)
        payload = self._pb_serial(fused)
        op_pb, chunk_pb = payload[fused.op, fused.data]

        self.assertEqual(chunk_pb.key, fused.key)
        self.assertEqual(opcode_of(op_pb), OperandDef.FUSE)
        self.assertEqual(len(chunk_pb.composed), 2)

        assert_same_fused(fused, self._pb_deserial(payload)[fused.data])

        # json (a new fuse chunk is built for this round trip)
        fused = make_fused(summed)
        payload = self._json_serial(fused)
        assert_same_fused(fused, self._json_deserial(payload)[fused.data])

        # ---- input of the first chunk of a tiled dot ----
        lhs = ones((10, 3), chunk_size=2)
        rhs = ones((3, 5), chunk_size=2)
        dot_input = dot(lhs, rhs).tiles().chunks[0].inputs[0]

        # pb
        payload = self._pb_serial(dot_input)
        restored = self._pb_deserial(payload)[dot_input]
        self.assertEqual(dot_input.key, restored.key)

        # json
        payload = self._json_serial(dot_input)
        restored = self._json_deserial(payload)[dot_input]
        self.assertEqual(dot_input.key, restored.key)
示例#11
0
    def testChunkSerialize(self):
        # Round-trip several kinds of chunks through the protobuf ("pb") and
        # JSON serializers and check that identifying attributes survive.

        def opcode_of(op_pb):
            # The type string is split once on '.'; the part after the dot
            # is parsed as the integer operand code.
            return int(op_pb.type.split('.', 1)[1])

        def assert_pb_matches(chunk_pb, source):
            # The serialized message carries the chunk's index, key, shape.
            self.assertEqual(tuple(chunk_pb.index), source.index)
            self.assertEqual(chunk_pb.key, source.key)
            self.assertEqual(tuple(chunk_pb.shape), source.shape)

        def assert_same_layout(source, restored):
            self.assertEqual(source.index, restored.index)
            self.assertEqual(source.key, restored.key)
            self.assertEqual(source.shape, restored.shape)

        def make_fused(source_tensor, derived_tiled):
            # Fuse the first chunk of the tiled source tensor with the first
            # chunk of the tensor derived from it.
            head = get_tiled(source_tensor).chunks[0]
            tail = derived_tiled.chunks[0]
            return build_fuse_chunk([head.data, tail.data],
                                    TensorFuseChunk)

        def assert_same_fused(source, restored):
            self.assertEqual(source.key, restored.key)
            self.assertEqual(type(source.op), type(restored.op))
            self.assertEqual(source.composed[0].key,
                             restored.composed[0].key)
            self.assertEqual(source.composed[-1].key,
                             restored.composed[-1].key)

        # ---- chunk produced by `ones` ----
        ones_chunk = ones((10, 3), chunk_size=(5, 2)).tiles().chunks[0]

        # pb
        payload = self._pb_serial(ones_chunk)
        op_pb, chunk_pb = payload[ones_chunk.op, ones_chunk.data]
        assert_pb_matches(chunk_pb, ones_chunk)
        self.assertEqual(opcode_of(op_pb), opcodes.TENSOR_ONES)

        restored = self._pb_deserial(payload)[ones_chunk.data]
        assert_same_layout(ones_chunk, restored)
        self.assertEqual(ones_chunk.op.dtype, restored.op.dtype)

        # json
        payload = self._json_serial(ones_chunk)
        restored = self._json_deserial(payload)[ones_chunk.data]
        assert_same_layout(ones_chunk, restored)
        self.assertEqual(ones_chunk.op.dtype, restored.op.dtype)

        # ---- chunk backed by in-memory data ----
        data_tensor = tensor(np.random.random((10, 3)), chunk_size=(5, 2))
        data_chunk = data_tensor.tiles().chunks[0]

        # pb
        payload = self._pb_serial(data_chunk)
        op_pb, chunk_pb = payload[data_chunk.op, data_chunk.data]
        assert_pb_matches(chunk_pb, data_chunk)
        self.assertEqual(opcode_of(op_pb),
                         opcodes.TENSOR_DATA_SOURCE)

        restored = self._pb_deserial(payload)[data_chunk.data]
        assert_same_layout(data_chunk, restored)
        self.assertTrue(np.array_equal(data_chunk.op.data, restored.op.data))

        # json
        payload = self._json_serial(data_chunk)
        restored = self._json_deserial(payload)[data_chunk.data]
        assert_same_layout(data_chunk, restored)
        self.assertTrue(np.array_equal(data_chunk.op.data, restored.op.data))

        # ---- fused chunk ----
        base = tensor(np.random.random((10, 3)), chunk_size=(5, 2))
        shifted = (base + 1).tiles()

        # pb
        fused = make_fused(base, shifted)
        payload = self._pb_serial(fused)
        op_pb, chunk_pb = payload[fused.op, fused.data]

        self.assertEqual(chunk_pb.key, fused.key)
        self.assertEqual(opcode_of(op_pb), opcodes.FUSE)

        assert_same_fused(fused, self._pb_deserial(payload)[fused.data])

        # json (a new fuse chunk is built for this round trip)
        fused = make_fused(base, shifted)
        payload = self._json_serial(fused)
        assert_same_fused(fused, self._json_deserial(payload)[fused.data])

        # ---- input of the first chunk of a tiled dot ----
        lhs = ones((10, 3), chunk_size=2)
        rhs = ones((3, 5), chunk_size=2)
        dot_input = dot(lhs, rhs).tiles().chunks[0].inputs[0]

        # pb
        payload = self._pb_serial(dot_input)
        restored = self._pb_deserial(payload)[dot_input]
        self.assertEqual(dot_input.key, restored.key)

        # json
        payload = self._json_serial(dot_input)
        restored = self._json_deserial(payload)[dot_input]
        self.assertEqual(dot_input.key, restored.key)