def transform(self, data):
    """
    Project data into principal component space

    Parameters
    ----------
    data : Series or a subclass (e.g. RowMatrix)
        Data to project, must be a collection of key-value pairs
        where the keys are identifiers and the values are
        one-dimensional arrays

    Returns
    -------
    scores : RowMatrix, nrows, each of shape (k,)
        The scores (i.e. the representation of the data in PC space)

    Raises
    ------
    TypeError
        If data is not a Series or a subclass of it
    """
    if not isinstance(data, Series):
        raise TypeError('Input must be Series or a subclass (e.g. RowMatrix)')

    # Exact type check on purpose: anything that is not precisely a
    # RowMatrix (including other Series subclasses) gets re-wrapped.
    if type(data) is not RowMatrix:
        data = RowMatrix(data)

    # NOTE(review): the centering is computed from `data` itself, nothing
    # saved at fit time is used here -- confirm this is intended when
    # projecting data other than the training set.
    mat = data.center(0)

    # Multiply by the transposed components scaled by self.latent.
    scores = mat.times(self.comps.T / self.latent)
    return scores
def test_times_array(self):
    """Check RowMatrix.times against a local array: rows and index."""
    records = [(1, array([1, 2, 3])), (2, array([4, 5, 6]))]
    mat1 = RowMatrix(self.sc.parallelize(records))
    mat2 = array([[7, 8], [9, 10], [11, 12]])

    # Each expected row is the corresponding input row times mat2.
    truth = [array([58, 64]), array([139, 154])]

    product = mat1.times(mat2)
    collected = product.rows().collect()
    assert array_equal(collected, truth)
    assert array_equal(product.index, range(0, 2))
def test_outer(self):
    """Check that gramian() gives the same matrix for every method."""
    records = [(1, array([1, 2, 3])), (2, array([4, 5, 6]))]
    mat1 = RowMatrix(self.sc.parallelize(records))

    # Compute all three variants first, then compare each to the truth.
    default_result = mat1.gramian()
    accum_result = mat1.gramian("accum")
    aggregate_result = mat1.gramian("aggregate")

    truth = array([[17, 22, 27], [22, 29, 36], [27, 36, 45]])
    for outcome in (default_result, accum_result, aggregate_result):
        assert array_equal(outcome, truth)
def test_times_array(self):
    """Multiply a two-row RowMatrix by a 3 x 2 array and verify the output."""
    left_rows = self.sc.parallelize([
        (1, array([1, 2, 3])),
        (2, array([4, 5, 6])),
    ])
    mat1 = RowMatrix(left_rows)
    mat2 = array([[7, 8], [9, 10], [11, 12]])
    expected_rows = [array([58, 64]), array([139, 154])]

    rdd = mat1.times(mat2)

    # Row values first, then the index carried by the result.
    assert array_equal(rdd.rows().collect(), expected_rows)
    assert array_equal(rdd.index, range(0, 2))
def test_times_rdd(self):
    """Check RowMatrix.times when the other operand is also a RowMatrix."""
    left_records = [(1, array([1, 2, 3])), (2, array([4, 5, 6]))]
    right_records = [(1, array([7, 8, 9])), (2, array([10, 11, 12]))]

    # Both operands are spread over two partitions.
    mat1 = RowMatrix(self.sc.parallelize(left_records, 2))
    mat2 = RowMatrix(self.sc.parallelize(right_records, 2))

    expected = array([[47, 52, 57], [64, 71, 78], [81, 90, 99]])
    assert array_equal(mat1.times(mat2), expected)
def test_elementwise_rdd(self):
    """Check elementwise addition of two RowMatrix objects."""
    left_records = [(1, array([1, 2, 3])), (2, array([4, 5, 6]))]
    right_records = [(1, array([7, 8, 9])), (2, array([10, 11, 12]))]
    mat1 = RowMatrix(self.sc.parallelize(left_records, 2))
    mat2 = RowMatrix(self.sc.parallelize(right_records, 2))

    summed = mat1.elementwise(mat2, add)
    result = summed.rows().collect()

    truth = array([[8, 10, 12], [14, 16, 18]])
    assert array_equal(result, truth)
def test_outer(self):
    """Verify gramian() with the default, "accum" and "aggregate" methods."""
    source = self.sc.parallelize([
        (1, array([1, 2, 3])),
        (2, array([4, 5, 6])),
    ])
    mat1 = RowMatrix(source)

    # All variants are evaluated before any comparison is made.
    outcomes = [
        mat1.gramian(),
        mat1.gramian("accum"),
        mat1.gramian("aggregate"),
    ]

    expected = array([[17, 22, 27], [22, 29, 36], [27, 36, 45]])
    for outcome in outcomes:
        assert array_equal(outcome, expected)
def generate(self, q=1, p=3, nrows=50, npartitions=10, sigmas=None, seed=None):
    """
    Generate data from a factor analysis model

    Parameters
    ----------
    q : int, optional, default = 1
        The number of factors generating this data

    p : int, optional, default = 3
        The number of observed factors (p >= q)

    nrows : int, optional, default = 50
        Number of observations we have

    npartitions : int, optional, default = 10
        Number of partitions passed to parallelize when distributing
        the generated rows

    sigmas : 1 x p ndarray, optional, default = None
        Scale of the noise to add, randomly generated
        from standard normal distribution if not given

    seed : optional, default = None
        Value passed to random.seed before any draws are made

    Returns
    -------
    data : RowMatrix
        The generated observations (nrows x p). When self.returnParams
        is True the tuple (data, F, w, epsilon) is returned instead,
        where F is the nrows x q factor matrix, w the q x p weight
        matrix and epsilon the nrows x p noise term.
    """
    random.seed(seed)

    # The order of the draws below (F, w, sigmas if needed, epsilon)
    # determines the output for a given seed, so keep it fixed.

    # Draw the (nrows x q) factor matrix
    F = matrix(random.randn(nrows, q))

    # Draw the (q x p) weight matrix
    w = matrix(random.randn(q, p))

    # Draw the noise scales (1 x p) when none were supplied
    if sigmas is None:
        sigmas = random.randn(1, p)

    # Generate the error terms (nrows x p); broadcasting scales each
    # column by its entry in sigmas
    epsilon = random.randn(nrows, p) * sigmas

    # Combine this to get our actual data (nrows x p)
    # NOTE(review): F * w is a matrix product only if `matrix` is
    # numpy's matrix class -- assumed here, confirm against the imports.
    x = (F * w) + epsilon

    # Put the data in an RDD
    data = RowMatrix(self.sc.parallelize(self.appendKeys(x), npartitions))

    if self.returnParams is True:
        return data, F, w, epsilon
    else:
        return data
def test_SvdEM(self):
    """Compare the leading EM-based SVD component with a local SVD."""
    dataLocal = [
        array([1.0, 2.0, 6.0]),
        array([1.0, 3.0, 0.0]),
        array([1.0, 4.0, 6.0]),
        array([5.0, 1.0, 4.0])
    ]
    keyed = zip(range(1, 5), dataLocal)
    mat = RowMatrix(self.sc.parallelize(keyed))

    svd = SVD(k=1, method="em")
    svd.calc(mat)

    # Reference decomposition computed locally on the same data.
    uTrue, sTrue, vTrue = LinAlg.svd(array(dataLocal))
    uRef = uTrue[:, 0]
    vRef = vTrue[0, :]

    uTest = transpose(array(svd.u.rows().collect()))[0]
    vTest = svd.v[0]

    tol = 1e-3  # allow small error for iterative method
    assert allclose(svd.s[0], sTrue[0], atol=tol)

    # Singular vectors are only defined up to sign, so accept either one.
    vSame = allclose(vTest, vRef, atol=tol)
    vFlipped = allclose(-vTest, vRef, atol=tol)
    assert vSame or vFlipped

    uSame = allclose(uTest, uRef, atol=tol)
    uFlipped = allclose(-uTest, uRef, atol=tol)
    assert uSame or uFlipped
def generate(self, nrows=50, ncols=50, npartitions=10, seed=None):
    """
    Generate a matrix where every element is i.i.d. and drawn from a
    standard normal distribution

    Parameters
    ----------
    nrows : int, optional, default = 50
        Number of rows in the generated matrix

    ncols : int, optional, default = 50
        Number of columns in the generated matrix

    npartitions : int, optional, default = 10
        Number of partitions passed to parallelize when distributing
        the generated rows

    seed : optional, default = None
        Value passed to random.seed before the entries are drawn

    Returns
    -------
    data : RowMatrix
        The generated matrix (nrows x ncols)
    """
    random.seed(seed)

    # Generate the data
    x = matrix(random.randn(nrows, ncols))

    # Put the data into an RDD
    data = RowMatrix(self.sc.parallelize(self.appendKeys(x), npartitions))

    return data
def test_pca(self):
    """Compare PCA components and scores with scikit-learn's PCA."""
    dataLocal = [
        array([1.0, 1.0, 1.0, 5.0]),
        array([2.0, 3.0, 4.0, 1.0]),
        array([6.0, 0.0, 6.0, 6.0])
    ]
    keyed = zip(range(1, 4), dataLocal)
    mat = RowMatrix(self.sc.parallelize(keyed))

    pca1 = PCA(k=1, svdMethod='direct')
    pca1.fit(mat)

    out1_comps = pca1.comps
    # Scores are multiplied by latent before being compared.
    out1_scores = pca1.scores.collectValuesAsArray() * pca1.latent
    projected = pca1.transform(mat)
    out1_transform_scores = projected.collectValuesAsArray() * pca1.latent

    from sklearn.decomposition import PCA as skPCA
    localData = array(dataLocal)
    pca2 = skPCA(n_components=1)
    pca2.fit(localData)
    out2_comps = pca2.components_
    out2_scores = pca2.transform(array(dataLocal))

    # Components and scores may differ from the reference by a sign flip.
    compsSame = allclose(out1_comps, out2_comps)
    compsFlipped = allclose(out1_comps, -out2_comps)
    assert compsSame or compsFlipped

    scoresSame = allclose(out1_scores, out2_scores)
    scoresFlipped = allclose(out1_scores, -out2_scores)
    assert scoresSame or scoresFlipped

    # transform() on the training data must reproduce the fitted scores.
    assert allclose(out1_scores, out1_transform_scores)
def test_times_rdd(self):
    """Multiply two RowMatrix objects and compare with the known result."""
    left_source = self.sc.parallelize(
        [(1, array([1, 2, 3])), (2, array([4, 5, 6]))], 2)
    right_source = self.sc.parallelize(
        [(1, array([7, 8, 9])), (2, array([10, 11, 12]))], 2)
    mat1 = RowMatrix(left_source)
    mat2 = RowMatrix(right_source)

    truth = array([[47, 52, 57], [64, 71, 78], [81, 90, 99]])

    product = mat1.times(mat2)
    assert array_equal(product, truth)
def test_elementwise_array(self):
    """Check that elementwise(2, add) adds 2 to each entry of a row."""
    single_row = self.sc.parallelize([(1, array([1, 2, 3]))])
    mat = RowMatrix(single_row)

    shifted = mat.elementwise(2, add)
    first_row = shifted.rows().collect()[0]

    assert array_equal(first_row, array([3, 4, 5]))
def test_elementwise_rdd(self):
    """Add two RowMatrix objects entry by entry and verify the rows."""
    left_source = self.sc.parallelize(
        [(1, array([1, 2, 3])), (2, array([4, 5, 6]))], 2)
    right_source = self.sc.parallelize(
        [(1, array([7, 8, 9])), (2, array([10, 11, 12]))], 2)
    mat1 = RowMatrix(left_source)
    mat2 = RowMatrix(right_source)

    combined_rows = mat1.elementwise(mat2, add).rows().collect()

    expected = array([[8, 10, 12], [14, 16, 18]])
    assert array_equal(combined_rows, expected)
def test_elementwise_array(self):
    """Apply add with the constant 2 to a one-row RowMatrix."""
    mat = RowMatrix(self.sc.parallelize([(1, array([1, 2, 3]))]))
    expected = array([3, 4, 5])

    collected = mat.elementwise(2, add).rows().collect()

    # Only one row was supplied, so compare against the first result.
    assert array_equal(collected[0], expected)