class MatrixUDTTests(MLlibTestCase): dm1 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10]) dm2 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10], isTransposed=True) sm1 = SparseMatrix(1, 1, [0, 1], [0], [2.0]) sm2 = SparseMatrix(2, 1, [0, 0, 1], [0], [5.0], isTransposed=True) udt = MatrixUDT() def test_json_schema(self): self.assertEqual(MatrixUDT.fromJson(self.udt.jsonValue()), self.udt) def test_serialization(self): for m in [self.dm1, self.dm2, self.sm1, self.sm2]: self.assertEqual(m, self.udt.deserialize(self.udt.serialize(m))) def test_infer_schema(self): rdd = self.sc.parallelize([("dense", self.dm1), ("sparse", self.sm1)]) df = rdd.toDF() schema = df.schema self.assertTrue(schema.fields[1].dataType, self.udt) matrices = df.rdd.map(lambda x: x._2).collect() self.assertEqual(len(matrices), 2) for m in matrices: if isinstance(m, DenseMatrix): self.assertTrue(m, self.dm1) elif isinstance(m, SparseMatrix): self.assertTrue(m, self.sm1) else: raise ValueError("Expected a matrix but got type %r" % type(m))
def test_repr_sparse_matrix(self): sm1t = SparseMatrix( 3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0], isTransposed=True) self.assertTrue( repr(sm1t), 'SparseMatrix(3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0], True)') indices = tile(arange(6), 3) values = ones(18) sm = SparseMatrix(6, 3, [0, 6, 12, 18], indices, values) self.assertTrue( repr(sm), "SparseMatrix(6, 3, [0, 6, 12, 18], \ [0, 1, 2, 3, 4, 5, 0, 1, ..., 4, 5, 0, 1, 2, 3, 4, 5], \ [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..., \ 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], False)") self.assertTrue( str(sm), "6 X 3 CSCMatrix\n\ (0,0) 1.0\n(1,0) 1.0\n(2,0) 1.0\n(3,0) 1.0\n(4,0) 1.0\n(5,0) 1.0\n\ (0,1) 1.0\n(1,1) 1.0\n(2,1) 1.0\n(3,1) 1.0\n(4,1) 1.0\n(5,1) 1.0\n\ (0,2) 1.0\n(1,2) 1.0\n(2,2) 1.0\n(3,2) 1.0\n..\n..") sm = SparseMatrix(1, 18, zeros(19), [], []) self.assertTrue( repr(sm), 'SparseMatrix(1, 18, \ [0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0], [], [], False)')
def difun(x, vect): if x[0] == x[1]: sm = SparseMatrix(p, p, np.linspace(0, p, num = (p+1)), \ np.linspace(0, p-1, num = p), vect[(x[0]*p):((x[0]+1)*p)]) return (x, sm) else: h = sparse.csc_matrix((p, p)) return (x, Matrices.sparse(p, p, h.indptr, h.indices, h.data))
def test_ml_mllib_matrix_conversion(self): # to ml # dense mllibDM = Matrices.dense(2, 2, [0, 1, 2, 3]) mlDM1 = newlinalg.Matrices.dense(2, 2, [0, 1, 2, 3]) mlDM2 = mllibDM.asML() self.assertEqual(mlDM2, mlDM1) # transposed mllibDMt = DenseMatrix(2, 2, [0, 1, 2, 3], True) mlDMt1 = newlinalg.DenseMatrix(2, 2, [0, 1, 2, 3], True) mlDMt2 = mllibDMt.asML() self.assertEqual(mlDMt2, mlDMt1) # sparse mllibSM = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) mlSM1 = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) mlSM2 = mllibSM.asML() self.assertEqual(mlSM2, mlSM1) # transposed mllibSMt = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) mlSMt1 = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) mlSMt2 = mllibSMt.asML() self.assertEqual(mlSMt2, mlSMt1) # from ml # dense mllibDM1 = Matrices.dense(2, 2, [1, 2, 3, 4]) mlDM = newlinalg.Matrices.dense(2, 2, [1, 2, 3, 4]) mllibDM2 = Matrices.fromML(mlDM) self.assertEqual(mllibDM1, mllibDM2) # transposed mllibDMt1 = DenseMatrix(2, 2, [1, 2, 3, 4], True) mlDMt = newlinalg.DenseMatrix(2, 2, [1, 2, 3, 4], True) mllibDMt2 = Matrices.fromML(mlDMt) self.assertEqual(mllibDMt1, mllibDMt2) # sparse mllibSM1 = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) mlSM = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) mllibSM2 = Matrices.fromML(mlSM) self.assertEqual(mllibSM1, mllibSM2) # transposed mllibSMt1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) mlSMt = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) mllibSMt2 = Matrices.fromML(mlSMt) self.assertEqual(mllibSMt1, mllibSMt2)
def test_serialize(self): self._test_serialize(DenseVector(range(10))) self._test_serialize(DenseVector(array([1.0, 2.0, 3.0, 4.0]))) self._test_serialize(DenseVector(pyarray.array("d", range(10)))) self._test_serialize(SparseVector(4, {1: 1, 3: 2})) self._test_serialize(SparseVector(3, {})) self._test_serialize(DenseMatrix(2, 3, range(6))) sm1 = SparseMatrix(3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0]) self._test_serialize(sm1)
def difun(x, vect): if (x[0] == x[1]): sm = SparseMatrix( SQUARE_BLOCK_SIZE, SQUARE_BLOCK_SIZE, np.linspace(0, SQUARE_BLOCK_SIZE, num=(SQUARE_BLOCK_SIZE + 1)), np.linspace(0, SQUARE_BLOCK_SIZE - 1, num=SQUARE_BLOCK_SIZE), vect[(x[0] * SQUARE_BLOCK_SIZE):((x[0] + 1) * SQUARE_BLOCK_SIZE)]) return (x, sm) else: h = sparse.csc_matrix((SQUARE_BLOCK_SIZE, SQUARE_BLOCK_SIZE)) return (x, Matrices.sparse(SQUARE_BLOCK_SIZE, SQUARE_BLOCK_SIZE, h.indptr, h.indices, h.data))
def test_sparse_matrix(self): # Test sparse matrix creation. sm1 = SparseMatrix( 3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0]) self.assertEqual(sm1.numRows, 3) self.assertEqual(sm1.numCols, 4) self.assertEqual(sm1.colPtrs.tolist(), [0, 2, 2, 4, 4]) self.assertEqual(sm1.rowIndices.tolist(), [1, 2, 1, 2]) self.assertEqual(sm1.values.tolist(), [1.0, 2.0, 4.0, 5.0]) self.assertTrue( repr(sm1), 'SparseMatrix(3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0], False)') # Test indexing expected = [ [0, 0, 0, 0], [1, 0, 4, 0], [2, 0, 5, 0]] for i in range(3): for j in range(4): self.assertEqual(expected[i][j], sm1[i, j]) self.assertTrue(array_equal(sm1.toArray(), expected)) for i, j in [(-1, 1), (4, 3), (3, 5)]: self.assertRaises(IndexError, sm1.__getitem__, (i, j)) # Test conversion to dense and sparse. smnew = sm1.toDense().toSparse() self.assertEqual(sm1.numRows, smnew.numRows) self.assertEqual(sm1.numCols, smnew.numCols) self.assertTrue(array_equal(sm1.colPtrs, smnew.colPtrs)) self.assertTrue(array_equal(sm1.rowIndices, smnew.rowIndices)) self.assertTrue(array_equal(sm1.values, smnew.values)) sm1t = SparseMatrix( 3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0], isTransposed=True) self.assertEqual(sm1t.numRows, 3) self.assertEqual(sm1t.numCols, 4) self.assertEqual(sm1t.colPtrs.tolist(), [0, 2, 3, 5]) self.assertEqual(sm1t.rowIndices.tolist(), [0, 1, 2, 0, 2]) self.assertEqual(sm1t.values.tolist(), [3.0, 2.0, 4.0, 9.0, 8.0]) expected = [ [3, 2, 0, 0], [0, 0, 4, 0], [9, 0, 8, 0]] for i in range(3): for j in range(4): self.assertEqual(expected[i][j], sm1t[i, j]) self.assertTrue(array_equal(sm1t.toArray(), expected))
def difun(self, x, vect): squareBlockSize = copy.deepcopy(self.squareBlockSize) if (x[0] == x[1]): sm = SparseMatrix( squareBlockSize, squareBlockSize, np.linspace(0, squareBlockSize, num=(squareBlockSize + 1)), np.linspace(0, squareBlockSize - 1, num=squareBlockSize), vect[(x[0] * squareBlockSize):((x[0] + 1) * squareBlockSize)]) return (x, sm) else: h = sparse.csc_matrix((squareBlockSize, squareBlockSize)) return (x, Matrices.sparse(squareBlockSize, squareBlockSize, h.indptr, h.indices, h.data))
def test_eq(self): v1 = DenseVector([0.0, 1.0, 0.0, 5.5]) v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) v3 = DenseVector([0.0, 1.0, 0.0, 5.5]) v4 = SparseVector(6, [(1, 1.0), (3, 5.5)]) v5 = DenseVector([0.0, 1.0, 0.0, 2.5]) v6 = SparseVector(4, [(1, 1.0), (3, 2.5)]) dm1 = DenseMatrix(2, 2, [2, 0, 0, 0]) sm1 = SparseMatrix(2, 2, [0, 2, 3], [0], [2]) self.assertEqual(v1, v2) self.assertEqual(v1, v3) self.assertFalse(v2 == v4) self.assertFalse(v1 == v5) self.assertFalse(v1 == v6) # this is done as Dense and Sparse matrices can be semantically # equal while still implementing a different __eq__ method self.assertEqual(dm1, sm1) self.assertEqual(sm1, dm1)