예제 #1
0
 def test_times_array(self):
     """Multiply a two-row RowMatrix by a local 3x2 array and compare the
     collected rows with the hand-computed row-by-matrix products."""
     records = [(1, array([1, 2, 3])), (2, array([4, 5, 6]))]
     left = RowMatrix(self.sc.parallelize(records))
     right = array([[7, 8], [9, 10], [11, 12]])
     # [1, 2, 3] . right == [58, 64]; [4, 5, 6] . right == [139, 154]
     expected = [array([58, 64]), array([139, 154])]
     collected = left.times(right).collect()
     assert array_equal(collected, expected)
예제 #2
0
 def test_times_rdd(self):
     """Multiply two RowMatrix objects, each built from a 2-partition RDD,
     with both the default and the "accum" method; either result must equal
     the 3x3 product of the first matrix's transpose with the second."""
     left_rows = [(1, array([1, 2, 3])), (2, array([4, 5, 6]))]
     right_rows = [(1, array([7, 8, 9])), (2, array([10, 11, 12]))]
     left = RowMatrix(self.sc.parallelize(left_rows, 2))
     right = RowMatrix(self.sc.parallelize(right_rows, 2))
     expected = array([[47, 52, 57], [64, 71, 78], [81, 90, 99]])
     default_product = left.times(right)
     accum_product = left.times(right, "accum")
     assert array_equal(default_product, expected)
     assert array_equal(accum_product, expected)
예제 #3
0
 def test_elementwise_rdd(self):
     """Combine two RowMatrix objects element by element with ``add`` and
     compare the collected rows with the expected sums."""
     first_rows = [(1, array([1, 2, 3])), (2, array([4, 5, 6]))]
     second_rows = [(1, array([7, 8, 9])), (2, array([10, 11, 12]))]
     first = RowMatrix(self.sc.parallelize(first_rows, 2))
     second = RowMatrix(self.sc.parallelize(second_rows, 2))
     summed = first.elementwise(second, add).collect()
     expected = array([[8, 10, 12], [14, 16, 18]])
     assert array_equal(summed, expected)
예제 #4
0
    def test_outer(self):
        """Despite its name, this exercises gramian(): the default, "accum"
        and "aggregate" variants must all return the same 3x3 matrix."""
        rows = [(1, array([1, 2, 3])), (2, array([4, 5, 6]))]
        matrix = RowMatrix(self.sc.parallelize(rows))
        default_gram = matrix.gramian()
        accum_gram = matrix.gramian("accum")
        aggregate_gram = matrix.gramian("aggregate")
        # sum of the outer products of each row with itself
        expected = array([[17, 22, 27], [22, 29, 36], [27, 36, 45]])
        assert array_equal(default_gram, expected)
        assert array_equal(accum_gram, expected)
        assert array_equal(aggregate_gram, expected)

# TODO: TestCenter, TestZScore
예제 #5
0
 def test_times_rdd(self):
     """Check RowMatrix-by-RowMatrix multiplication, once with the default
     method and once with "accum", against the expected 3x3 array."""

     def build(first, second):
         # two keyed rows spread over two partitions
         pairs = [(1, array(first)), (2, array(second))]
         return RowMatrix(self.sc.parallelize(pairs, 2))

     lhs = build([1, 2, 3], [4, 5, 6])
     rhs = build([7, 8, 9], [10, 11, 12])
     expected = array([[47, 52, 57], [64, 71, 78], [81, 90, 99]])
     # compute both products first, then check them in the same order
     products = [lhs.times(rhs), lhs.times(rhs, "accum")]
     for product in products:
         assert array_equal(product, expected)
예제 #6
0
    def test_outer(self):
        """Check that gramian() returns the same 3x3 matrix for the default,
        "accum" and "aggregate" methods (the test name notwithstanding)."""
        pairs = [(1, array([1, 2, 3])), (2, array([4, 5, 6]))]
        mat = RowMatrix(self.sc.parallelize(pairs))
        # run every variant before asserting, default first
        grams = [mat.gramian(*args)
                 for args in ((), ("accum",), ("aggregate",))]
        expected = array([[17, 22, 27], [22, 29, 36], [27, 36, 45]])
        for gram in grams:
            assert array_equal(gram, expected)


# TODO: TestCenter, TestZScore
예제 #7
0
파일: pca.py 프로젝트: vpomponiu/thunder
    def fit(self, data):
        """Estimate principal components

        Parameters
        ----------
        data : RDD of (tuple, array) pairs, or RowMatrix
            Data to estimate principal components from. Anything that is not
            a RowMatrix (or a subclass of it) is wrapped in one.

        Returns
        -------
        self : returns an instance of self, with ``scores``, ``latent`` and
            ``comps`` set from the decomposition's ``u``, ``s`` and ``v``.
        """

        # isinstance instead of an exact type comparison, so that RowMatrix
        # subclasses are used as they are rather than being re-wrapped
        if not isinstance(data, RowMatrix):
            data = RowMatrix(data)

        # NOTE(review): the return value of center() is discarded, so this
        # relies on center() modifying `data` in place -- confirm
        data.center(0)
        svd = SVD(k=self.k, method=self.svdmethod)
        svd.calc(data)

        self.scores = svd.u
        self.latent = svd.s
        self.comps = svd.v

        return self
예제 #8
0
파일: pca.py 프로젝트: NEILKUANG/thunder
    def fit(self, data):
        """Estimate principal components

        Parameters
        ----------
        data : RDD of (tuple, array) pairs, or RowMatrix
            Data to estimate principal components from. Input that is not a
            RowMatrix (or a subclass of it) is wrapped in one first.

        Returns
        -------
        self : returns an instance of self, with ``scores``, ``latent`` and
            ``comps`` taken from the ``u``, ``s`` and ``v`` attributes of
            the SVD.
        """

        # a subclass-aware check: RowMatrix subclasses are not wrapped again
        if not isinstance(data, RowMatrix):
            data = RowMatrix(data)

        # NOTE(review): center()'s return value is not used here; presumably
        # it centers `data` in place -- verify against RowMatrix.center
        data.center(0)
        svd = SVD(k=self.k, method=self.svdmethod)
        svd.calc(data)

        self.scores = svd.u
        self.latent = svd.s
        self.comps = svd.v

        return self
예제 #9
0
 def test_times_array(self):
     """times() with a local 3x2 array: the collected rows must match the
     expected two-element products."""
     keyed_rows = [(1, array([1, 2, 3])), (2, array([4, 5, 6]))]
     row_matrix = RowMatrix(self.sc.parallelize(keyed_rows))
     local_matrix = array([[7, 8], [9, 10], [11, 12]])
     expected_rows = [array([58, 64]), array([139, 154])]
     actual_rows = row_matrix.times(local_matrix).collect()
     assert array_equal(actual_rows, expected_rows)
예제 #10
0
 def test_elementwise_array(self):
     """Apply ``add`` with the scalar 2 to a single-row RowMatrix and check
     the first collected row."""
     single_row = RowMatrix(self.sc.parallelize([(1, array([1, 2, 3]))]))
     first_row = single_row.elementwise(2, add).collect()[0]
     assert array_equal(first_row, array([3, 4, 5]))
예제 #11
0
 def test_elementwise_rdd(self):
     """elementwise() with another RowMatrix and ``add``: the collected
     result must hold the row-by-row sums."""

     def two_rows(top, bottom):
         # keys 1 and 2, distributed over two partitions
         return RowMatrix(
             self.sc.parallelize([(1, array(top)), (2, array(bottom))], 2))

     lhs = two_rows([1, 2, 3], [4, 5, 6])
     rhs = two_rows([7, 8, 9], [10, 11, 12])
     sums = lhs.elementwise(rhs, add).collect()
     expected = array([[8, 10, 12], [14, 16, 18]])
     assert array_equal(sums, expected)
예제 #12
0
 def test_elementwise_array(self):
     """Adding the scalar 2 element by element to the row [1, 2, 3] must
     give [3, 4, 5] as the first collected row."""
     source = self.sc.parallelize([(1, array([1, 2, 3]))])
     shifted = RowMatrix(source).elementwise(2, add).collect()
     expected = array([3, 4, 5])
     assert array_equal(shifted[0], expected)
예제 #13
0
    def calc(self, mat):
        """
        Calculate singular vectors

        Parameters
        ----------
        mat : RDD of (tuple, array) pairs, or RowMatrix
            Matrix to compute singular vectors from. Anything that is not a
            RowMatrix (or a subclass of it) is wrapped in one.

        Returns
        -------
        self : returns an instance of self. When ``self.method`` is "direct"
            or "em", the attributes ``u`` (an RDD), ``s`` and ``v`` are set;
            for any other method value nothing is computed.
        """
        # isinstance instead of an exact type comparison, so that RowMatrix
        # subclasses are used as they are rather than being re-wrapped
        if not isinstance(mat, RowMatrix):
            mat = RowMatrix(mat)

        if self.method == "direct":

            # get the normalized gramian matrix
            cov = mat.gramian() / mat.nrows

            # do a local eigendecomposition, keeping the k largest eigenvalues
            eigw, eigv = eigh(cov)
            inds = argsort(eigw)[::-1]
            s = sqrt(eigw[inds[0:self.k]]) * sqrt(mat.nrows)
            v = eigv[:, inds[0:self.k]].T

            # project back into data, normalize by singular values
            u = mat.times(v.T / s)

            self.u = u.rdd
            self.s = s
            self.v = v

        elif self.method == "em":

            # initialize random (k x ncols) matrix
            c = random.rand(self.k, mat.ncols)
            n_iter = 0
            error = 100

            # iterative update subspace using expectation maximization
            # e-step: x = (c'c)^-1 c' y
            # m-step: c = y x' (xx')^-1
            while n_iter < self.maxiter and error > self.tol:
                c_old = c
                # pre compute (c'c)^-1 c'
                c_inv = dot(c.T, inv(dot(c, c.T)))
                # compute (xx')^-1 through a map reduce
                xx = mat.times(c_inv).gramian()
                xx_inv = inv(xx)
                # pre compute (c'c)^-1 c' (xx')^-1
                premult2 = mat.rdd.context.broadcast(dot(c_inv, xx_inv))
                # compute the new c through a map reduce
                c = mat.rows().map(
                    lambda x: outer(x, dot(x, premult2.value))).sum()
                c = c.T

                # squared change in c since the previous iteration
                error = sum(sum((c - c_old)**2))
                n_iter += 1

            # project data into subspace spanned by columns of c
            # use standard eigendecomposition to recover an orthonormal basis
            c = orth(c.T)
            cov = mat.times(c).gramian() / mat.nrows
            eigw, eigv = eigh(cov)
            inds = argsort(eigw)[::-1]
            s = sqrt(eigw[inds[0:self.k]]) * sqrt(mat.nrows)
            # map the eigenvectors back from the subspace to data coordinates
            v = dot(eigv[:, inds[0:self.k]].T, c.T)
            u = mat.times(v.T / s)

            self.u = u.rdd
            self.s = s
            self.v = v

        # NOTE: an unrecognized self.method falls through to here without
        # setting u, s or v

        return self
예제 #14
0
파일: ica.py 프로젝트: vpomponiu/thunder
    def fit(self, data):
        """
        Fit independent components using an iterative fixed-point algorithm

        Parameters
        ----------
        data : RDD of (tuple, array) pairs, or RowMatrix
            Data to estimate independent components from

        Returns
        -------
        self : returns an instance of self, with the un-mixing matrix ``w``,
            the mixing matrix ``a`` and the extracted signals ``sigs`` (an
            RDD) set. ``self.k`` is filled in with the data dimensionality
            when it was None.

        Raises
        ------
        ValueError
            If ``self.c`` is greater than ``self.k``, or ``self.k`` is
            greater than the length of the first record's array.
        """

        # NOTE(review): first() is called before any RowMatrix wrapping, so a
        # RowMatrix argument must itself offer first() returning a
        # (key, array) pair -- confirm against RowMatrix
        d = len(data.first()[1])

        if self.k is None:
            self.k = d

        if self.c > self.k:
            raise ValueError("number of independent comps " + str(self.c) +
                             " must be less than the number of principal comps " + str(self.k))

        if self.k > d:
            raise ValueError("number of principal comps " + str(self.k) +
                             " must be less than the data dimensionality " + str(d))

        # isinstance instead of an exact type comparison, so that RowMatrix
        # subclasses are used as they are rather than being re-wrapped
        if not isinstance(data, RowMatrix):
            data = RowMatrix(data)

        # reduce dimensionality
        svd = SVD(k=self.k, method=self.svdmethod).calc(data)

        # whiten data; the diagonal scaling is shared by both matrices
        scale = diag(svd.s / sqrt(data.nrows))
        whtmat = real(dot(inv(scale), svd.v))
        unwhtmat = real(dot(transpose(svd.v), scale))
        wht = data.times(whtmat.T)

        # do multiple independent component extraction
        # a seed of 0 means "do not seed"; any other value seeds the RNG
        if self.seed != 0:
            random.seed(self.seed)
        b = orth(random.randn(self.k, self.c))
        b_old = zeros((self.k, self.c))
        n_iter = 0
        minabscos = 0

        # stop after maxiter updates, or once every column of b is within
        # tol (in 1 - |cosine|) of the same column in the previous iterate
        while n_iter < self.maxiter and (1 - minabscos) > self.tol:
            n_iter += 1
            # update rule for pow3 non-linearity (TODO: add others)
            b = wht.rows().map(lambda x: outer(x, dot(x, b) ** 3)).sum() / wht.nrows - 3 * b
            # make orthogonal
            b = dot(b, real(sqrtm(inv(dot(transpose(b), b)))))
            # evaluate error
            minabscos = min(abs(diag(dot(transpose(b), b_old))))
            # remember this iterate for the next comparison
            b_old = b

        # get un-mixing matrix
        w = dot(b.T, whtmat)

        # get mixing matrix
        a = dot(unwhtmat, b)

        # get components
        sigs = data.times(w.T).rdd

        self.w = w
        self.a = a
        self.sigs = sigs

        return self
예제 #15
0
파일: svd.py 프로젝트: NEILKUANG/thunder
    def calc(self, mat):
        """
        Calculate singular vectors

        Parameters
        ----------
        mat : RDD of (tuple, array) pairs, or RowMatrix
            Matrix to compute singular vectors from. Input that is not a
            RowMatrix (or a subclass of it) is wrapped in one first.

        Returns
        -------
        self : returns an instance of self. The attributes ``u`` (an RDD),
            ``s`` and ``v`` are set when ``self.method`` is "direct" or
            "em"; any other method value computes nothing.
        """
        # a subclass-aware check: RowMatrix subclasses are not wrapped again
        if not isinstance(mat, RowMatrix):
            mat = RowMatrix(mat)

        if self.method == "direct":

            # get the normalized gramian matrix
            cov = mat.gramian() / mat.nrows

            # do a local eigendecomposition, keeping the k largest eigenvalues
            eigw, eigv = eigh(cov)
            inds = argsort(eigw)[::-1]
            s = sqrt(eigw[inds[0:self.k]]) * sqrt(mat.nrows)
            v = eigv[:, inds[0:self.k]].T

            # project back into data, normalize by singular values
            u = mat.times(v.T / s)

            self.u = u.rdd
            self.s = s
            self.v = v

        elif self.method == "em":

            # initialize random (k x ncols) matrix
            c = random.rand(self.k, mat.ncols)
            n_iter = 0
            error = 100

            # iterative update subspace using expectation maximization
            # e-step: x = (c'c)^-1 c' y
            # m-step: c = y x' (xx')^-1
            while n_iter < self.maxiter and error > self.tol:
                c_old = c
                # pre compute (c'c)^-1 c'
                c_inv = dot(c.T, inv(dot(c, c.T)))
                # compute (xx')^-1 through a map reduce
                xx = mat.times(c_inv).gramian()
                xx_inv = inv(xx)
                # pre compute (c'c)^-1 c' (xx')^-1
                premult2 = mat.rdd.context.broadcast(dot(c_inv, xx_inv))
                # compute the new c through a map reduce
                c = mat.rows().map(lambda x: outer(x, dot(x, premult2.value))).sum()
                c = c.T

                # squared change in c since the previous iteration
                error = sum(sum((c - c_old) ** 2))
                n_iter += 1

            # project data into subspace spanned by columns of c
            # use standard eigendecomposition to recover an orthonormal basis
            c = orth(c.T)
            cov = mat.times(c).gramian() / mat.nrows
            eigw, eigv = eigh(cov)
            inds = argsort(eigw)[::-1]
            s = sqrt(eigw[inds[0:self.k]]) * sqrt(mat.nrows)
            # map the eigenvectors back from the subspace to data coordinates
            v = dot(eigv[:, inds[0:self.k]].T, c.T)
            u = mat.times(v.T / s)

            self.u = u.rdd
            self.s = s
            self.v = v

        # NOTE: an unrecognized self.method reaches this point without
        # setting u, s or v

        return self