예제 #1
0
    def transform(self, data):
        """Project data into principal component space.

        The input is centered via ``data.center(0)`` and then multiplied by
        ``self.comps.T / self.latent``.

        Parameters
        ----------
        data : Series or a subclass (e.g. RowMatrix)
            Data to project onto the fitted components; must be a collection of
            key-value pairs where the keys are identifiers and the values are
            one-dimensional arrays

        Returns
        -------
        scores : RowMatrix, nrows, each of shape (k,)
            The scores (i.e. the representation of the data in PC space)

        Raises
        ------
        TypeError
            If ``data`` is not a Series or a subclass of it.
        """

        # TypeError is the precise built-in for a wrong argument type; it is
        # still an Exception, so existing ``except Exception`` handlers work.
        if not isinstance(data, Series):
            raise TypeError('Input must be Series or a subclass (e.g. RowMatrix)')

        # Kept as an exact-type check to preserve existing behaviour: any
        # Series that is not precisely a RowMatrix gets wrapped in one.
        if type(data) is not RowMatrix:
            data = RowMatrix(data)

        mat = data.center(0)
        scores = mat.times(self.comps.T / self.latent)
        return scores
예제 #2
0
파일: pca.py 프로젝트: Young-china/thunder
    def transform(self, data):
        """Project data into principal component space.

        The input is centered via ``data.center(0)`` and then multiplied by
        ``self.comps.T / self.latent``.

        Parameters
        ----------
        data : Series or a subclass (e.g. RowMatrix)
            Data to project onto the fitted components; must be a collection of
            key-value pairs where the keys are identifiers and the values are
            one-dimensional arrays

        Returns
        -------
        scores : RowMatrix, nrows, each of shape (k,)
            The scores (i.e. the representation of the data in PC space)

        Raises
        ------
        TypeError
            If ``data`` is not a Series or a subclass of it.
        """

        # A wrong argument type is reported with TypeError; it remains a
        # subclass of Exception, so callers catching Exception are unaffected.
        if not isinstance(data, Series):
            raise TypeError('Input must be Series or a subclass (e.g. RowMatrix)')

        # Exact-type check retained to preserve existing behaviour: every
        # Series whose type is not exactly RowMatrix is wrapped in one.
        if type(data) is not RowMatrix:
            data = RowMatrix(data)

        mat = data.center(0)
        scores = mat.times(self.comps.T / self.latent)
        return scores
예제 #3
0
 def test_times_array(self):
     """Multiply a two-row RowMatrix by a local 3x2 array and check rows and index."""
     records = [(1, array([1, 2, 3])), (2, array([4, 5, 6]))]
     left = RowMatrix(self.sc.parallelize(records))
     right = array([[7, 8], [9, 10], [11, 12]])
     expected_rows = [array([58, 64]), array([139, 154])]
     product = left.times(right)
     collected = product.rows().collect()
     assert array_equal(collected, expected_rows)
     # The product is expected to expose one index entry per output column.
     assert array_equal(product.index, range(0, 2))
예제 #4
0
 def test_outer(self):
     """Check that gramian() gives the same 3x3 result for every method argument."""
     records = [(1, array([1, 2, 3])), (2, array([4, 5, 6]))]
     mat = RowMatrix(self.sc.parallelize(records))
     # Default call first, then the two explicitly named methods.
     call_args = ((), ("accum",), ("aggregate",))
     results = [mat.gramian(*args) for args in call_args]
     expected = array([[17, 22, 27], [22, 29, 36], [27, 36, 45]])
     for gram in results:
         assert array_equal(gram, expected)
예제 #5
0
 def test_times_array(self):
     """Check RowMatrix.times() against a local array: row values and index."""
     row_a = (1, array([1, 2, 3]))
     row_b = (2, array([4, 5, 6]))
     distributed = RowMatrix(self.sc.parallelize([row_a, row_b]))
     local = array([[7, 8], [9, 10], [11, 12]])
     expected = [array([58, 64]), array([139, 154])]
     out = distributed.times(local)
     assert array_equal(out.rows().collect(), expected)
     assert array_equal(out.index, range(0, 2))
예제 #6
0
 def test_times_rdd(self):
     """Multiply two RowMatrix objects (two partitions each) and check the 3x3 result."""
     n_partitions = 2
     left_records = [(1, array([1, 2, 3])), (2, array([4, 5, 6]))]
     right_records = [(1, array([7, 8, 9])), (2, array([10, 11, 12]))]
     left = RowMatrix(self.sc.parallelize(left_records, n_partitions))
     right = RowMatrix(self.sc.parallelize(right_records, n_partitions))
     # Expected values equal transpose(left) multiplied by right.
     expected = array([[47, 52, 57], [64, 71, 78], [81, 90, 99]])
     assert array_equal(left.times(right), expected)
예제 #7
0
 def test_elementwise_rdd(self):
     """Combine two RowMatrix objects elementwise with ``add`` and check the rows."""
     n_partitions = 2
     left_records = [(1, array([1, 2, 3])), (2, array([4, 5, 6]))]
     right_records = [(1, array([7, 8, 9])), (2, array([10, 11, 12]))]
     left = RowMatrix(self.sc.parallelize(left_records, n_partitions))
     right = RowMatrix(self.sc.parallelize(right_records, n_partitions))
     summed = left.elementwise(right, add)
     collected = summed.rows().collect()
     expected = array([[8, 10, 12], [14, 16, 18]])
     assert array_equal(collected, expected)
예제 #8
0
 def test_outer(self):
     """Verify gramian() with the default, "accum" and "aggregate" methods."""
     row_a = (1, array([1, 2, 3]))
     row_b = (2, array([4, 5, 6]))
     mat = RowMatrix(self.sc.parallelize([row_a, row_b]))
     # Compute all three variants up front, then compare each to the same truth.
     by_default = mat.gramian()
     by_accum = mat.gramian("accum")
     by_aggregate = mat.gramian("aggregate")
     expected = array([[17, 22, 27], [22, 29, 36], [27, 36, 45]])
     for gram in (by_default, by_accum, by_aggregate):
         assert array_equal(gram, expected)
예제 #9
0
    def generate(self, q=1, p=3, nrows=50, npartitions=10, sigmas=None,
                 seed=None):
        """
        Generate data from a factor analysis model

        The observations are computed as ``F * w + epsilon`` where ``F`` is an
        (nrows x q) standard-normal matrix, ``w`` is a (q x p) standard-normal
        matrix and ``epsilon`` is (nrows x p) standard-normal noise multiplied
        by ``sigmas``.

        Parameters
        ----------
        q : int, optional, default = 1
          The number of factors generating this data

        p : int, optional, default = 3
          The number of observed factors (p >= q)

        nrows : int, optional, default = 50
          Number of observations we have

        npartitions : int, optional, default = 10
          Passed to ``self.sc.parallelize`` as its second argument

        sigmas : 1 x p ndarray, optional, default = None
          Scale of the noise to add, randomly generated
          from standard normal distribution if not given

        seed : optional, default = None
          Passed to ``random.seed`` before any values are drawn

        Returns
        -------
        data : RowMatrix
          The generated observations. When ``self.returnParams`` is True the
          tuple ``(data, F, w, epsilon)`` is returned instead.
        """
        random.seed(seed)

        # The order of the draws below determines the output for a given seed.
        factor_part = matrix(random.randn(nrows, q))    # nrows x q
        mixing_part = matrix(random.randn(q, p))        # q x p
        if sigmas is None:
            sigmas = random.randn(1, p)                 # 1 x p noise scales
        # Every row of the noise is scaled column-wise by sigmas.
        noise = random.randn(nrows, p) * sigmas         # nrows x p

        observed = (factor_part * mixing_part) + noise
        keyed = self.appendKeys(observed)
        data = RowMatrix(self.sc.parallelize(keyed, npartitions))

        if self.returnParams is True:
            return data, factor_part, mixing_part, noise
        return data
    def test_SvdEM(self):
        """Compare the rank-1 SVD from the "em" method with LinAlg.svd on the same data."""
        rows = [
            array([1.0, 2.0, 6.0]),
            array([1.0, 3.0, 0.0]),
            array([1.0, 4.0, 6.0]),
            array([5.0, 1.0, 4.0]),
        ]
        # Keys run from 1 to 4, one per row.
        keyed = list(enumerate(rows, 1))
        mat = RowMatrix(self.sc.parallelize(keyed))

        svd = SVD(k=1, method="em")
        svd.calc(mat)

        uRef, sRef, vRef = LinAlg.svd(array(rows))
        uGot = transpose(array(svd.u.rows().collect()))[0]
        vGot = svd.v[0]
        firstV = vRef[0, :]
        firstU = uRef[:, 0]

        tol = 1e-3  # allow small error for iterative method
        assert allclose(svd.s[0], sRef[0], atol=tol)
        # Either sign of a singular vector is accepted.
        assert allclose(vGot, firstV, atol=tol) or allclose(-vGot, firstV, atol=tol)
        assert allclose(uGot, firstU, atol=tol) or allclose(-uGot, firstU, atol=tol)
예제 #11
0
    def generate(self, nrows=50, ncols=50, npartitions=10, seed=None):
        """
        Generate a matrix where every element is i.i.d. and drawn from a
        standard normal distribution

        Parameters
        ----------
        nrows : int, optional, default = 50
          Number of rows in the generated matrix

        ncols : int, optional, default = 50
          Number of columns in the generated matrix

        npartitions : int, optional, default = 10
          Passed to ``self.sc.parallelize`` as its second argument

        seed : optional, default = None
          Passed to ``random.seed`` before the values are drawn

        Returns
        -------
        data : RowMatrix
          The generated values after ``self.appendKeys`` and parallelization
        """
        random.seed(seed)
        values = matrix(random.randn(nrows, ncols))
        keyed = self.appendKeys(values)
        return RowMatrix(self.sc.parallelize(keyed, npartitions))
    def test_pca(self):
        """Compare PCA (k=1, direct SVD) with scikit-learn, and fit scores with transform()."""
        rows = [
            array([1.0, 1.0, 1.0, 5.0]),
            array([2.0, 3.0, 4.0, 1.0]),
            array([6.0, 0.0, 6.0, 6.0]),
        ]
        # Keys run from 1 to 3, one per row.
        mat = RowMatrix(self.sc.parallelize(list(enumerate(rows, 1))))

        ours = PCA(k=1, svdMethod='direct')
        ours.fit(mat)
        ours_comps = ours.comps
        # Scores are multiplied by ``latent`` before being compared below.
        ours_scores = ours.scores.collectValuesAsArray() * ours.latent
        ours_projected = ours.transform(mat).collectValuesAsArray() * ours.latent

        from sklearn.decomposition import PCA as skPCA
        dense = array(rows)
        reference = skPCA(n_components=1)
        reference.fit(dense)
        ref_comps = reference.components_
        ref_scores = reference.transform(dense)

        # Components and scores are accepted with either sign.
        assert allclose(ours_comps, ref_comps) or allclose(ours_comps, -ref_comps)
        assert allclose(ours_scores, ref_scores) or allclose(ours_scores, -ref_scores)
        # transform() on the training data must reproduce the fitted scores.
        assert allclose(ours_scores, ours_projected)
예제 #13
0
 def test_times_rdd(self):
     """Check times() between two RowMatrix objects against the expected 3x3 array."""
     first = [(1, array([1, 2, 3])), (2, array([4, 5, 6]))]
     second = [(1, array([7, 8, 9])), (2, array([10, 11, 12]))]
     # Both inputs are parallelized into two partitions.
     left = RowMatrix(self.sc.parallelize(first, 2))
     right = RowMatrix(self.sc.parallelize(second, 2))
     expected = array([[47, 52, 57], [64, 71, 78], [81, 90, 99]])
     product = left.times(right)
     assert array_equal(product, expected)
예제 #14
0
 def test_elementwise_array(self):
     """Apply ``add`` with the value 2 to a single-row matrix and expect [3, 4, 5]."""
     single_row = [(1, array([1, 2, 3]))]
     mat = RowMatrix(self.sc.parallelize(single_row))
     shifted = mat.elementwise(2, add)
     first_row = shifted.rows().collect()[0]
     assert array_equal(first_row, array([3, 4, 5]))
예제 #15
0
 def test_elementwise_rdd(self):
     """Add two RowMatrix objects elementwise and compare the collected rows."""
     first = [(1, array([1, 2, 3])), (2, array([4, 5, 6]))]
     second = [(1, array([7, 8, 9])), (2, array([10, 11, 12]))]
     # Both inputs are parallelized into two partitions.
     left = RowMatrix(self.sc.parallelize(first, 2))
     right = RowMatrix(self.sc.parallelize(second, 2))
     collected = left.elementwise(right, add).rows().collect()
     expected = array([[8, 10, 12], [14, 16, 18]])
     assert array_equal(collected, expected)
예제 #16
0
 def test_elementwise_array(self):
     """Check elementwise(2, add) on the single row [1, 2, 3]."""
     records = [(1, array([1, 2, 3]))]
     mat = RowMatrix(self.sc.parallelize(records))
     collected = mat.elementwise(2, add).rows().collect()
     expected_first = array([3, 4, 5])
     assert array_equal(collected[0], expected_first)