def test_svd_daal_vs_sklearn(rows=1000, columns=1000):
    '''
    Compare DAAL's batch SVD with ``np.linalg.svd`` on a random matrix.

    The singular values of both implementations must agree. When the
    ``CHECKPERFORMANCE`` environment variable is set, the DAAL run must
    additionally be no slower than the NumPy reference run.

    :param rows: number of rows of the generated input matrix
    :param columns: number of columns of the generated input matrix
    '''
    # default_timer is the benchmark clock recommended by the standard
    # library; unlike time.time() it is not meant to follow wall-clock
    # adjustments. Imported locally so the block stays self-contained.
    from timeit import default_timer

    indata = get_random_array(rows, columns)
    daal_input = HomogenNumericTable(indata)
    algorithm = svd.Batch()
    algorithm.input.set(svd.data, daal_input)

    # Reference run.
    start_reference = default_timer()
    _U, s, _Vh = np.linalg.svd(indata, full_matrices=False)
    reference_elapsed = default_timer() - start_reference

    # DAAL run.
    start_daal = default_timer()
    result = algorithm.compute()
    daal_elapsed = default_timer() - start_daal

    if os.getenv("CHECKPERFORMANCE") is not None:
        assert daal_elapsed <= reference_elapsed, (
            "DAAL SVD took {} s, reference SVD took {} s".format(
                daal_elapsed, reference_elapsed))

    # The singular values are unpacked as a 2-D array; flatten them to a
    # vector of length ``cols`` so they can be compared with NumPy's ``s``.
    sigma = getNumpyArray(result.get(svd.singularValues))
    _rows, cols = sigma.shape
    d_sigma = sigma.reshape(cols)
    assert_array_almost_equal(d_sigma, s)

    print("SVD for matrix[{}][{}]".format(rows, columns))
    print("+ Sklearn SVD: {}".format(reference_elapsed))
    # Previously mislabelled as "Sklearn Daal"; this is the DAAL timing.
    print("+ Daal SVD: {}".format(daal_elapsed))
def test_svd_simple():
    '''
    Check DAAL's SVD of a fixed 4x2 matrix against known factor values.

    Both singular matrices are requested with ``svd.requiredInPackedForm``;
    the test verifies the result shapes and then the numeric values of
    sigma, U and V.
    '''
    expected_sigma = np.array([[14.269, 0.6268]])
    expected_u = np.array([[-0.152, -0.823],
                           [-0.350, -0.421],
                           [-0.547, -0.020],
                           [-0.745, 0.381]])
    expected_v = np.array([[-0.641, -0.767],
                           [0.767, -0.641]])

    indata = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    table = HomogenNumericTable(indata)
    n_features = indata.shape[1]

    algorithm = svd.Batch(
        method=svd.defaultDense,
        leftSingularMatrix=svd.requiredInPackedForm,
        rightSingularMatrix=svd.requiredInPackedForm,
    )
    algorithm.input.set(svd.data, table)
    result = algorithm.compute()

    sigma = getNumpyArray(result.get(svd.singularValues))
    U = getNumpyArray(result.get(svd.leftSingularMatrix))
    V = getNumpyArray(result.get(svd.rightSingularMatrix))

    # Shape checks: one singular value per column, U shaped like the input,
    # V square with one row/column per input column.
    assert sigma.shape[1] == n_features
    assert indata.shape == U.shape
    assert n_features == V.shape[0] and V.shape[0] == V.shape[1]

    # Value checks against the precomputed decomposition.
    assert_array_almost_equal(expected_sigma, sigma, decimal=4)
    assert_array_almost_equal(expected_u, U, decimal=3)
    assert_array_almost_equal(expected_v, V, decimal=3)
def fit_transform(self, X, y=None):
    '''
    Fit SVD to X and return the transformed data.

    Stores the left singular matrix in ``self._U``, the right singular
    matrix in ``self._Q``, the singular values (flattened to a vector) in
    ``self._w``, and the variance statistics in ``self.explained_variance``
    and ``self.explained_variance_ratio_``.

    :param X: array-like shape n_samples x n_features(n_components)
              TODO(monika): sparse matrix
    :param y: None, ignored
    :return: the transformed data ``U * sigma`` as a numpy array
    '''
    _ = y
    hdd = IInput.HomogenousDaalData(X)
    input_type = hdd.informat

    def first_columns(input_, components):
        # Keep only the leading ``components`` columns when the input has
        # at least that many; otherwise return the input untouched.
        if components <= input_.shape[1]:
            return input_[:, 0:components]
        return input_

    # NOTE(review): ``hdd`` was built from the original X above, so the
    # truncation below appears to affect only the total-variance
    # denominator computed at the end, not the data handed to DAAL --
    # confirm this is intended.
    if input_type == 'numpy':
        X = first_columns(X, self.n_components)
    elif input_type == 'pandas':
        # DataFrame.as_matrix() was removed in pandas 1.0; ``.values``
        # yields the same ndarray and exists in every pandas version.
        X = first_columns(X.values, self.n_components)
    else:
        pass  # CSV column size is not supported

    Input = hdd.getNumericTable()
    algorithm = svd.Batch(
        method=svd.defaultDense,
        leftSingularMatrix=self.parameters['leftSingularMatrix'],
        rightSingularMatrix=self.parameters['rightSingularMatrix'])
    algorithm.input.set(svd.data, Input)

    # compute SVD decomposition
    result = algorithm.compute()
    U = result.get(svd.leftSingularMatrix)
    Sigma = result.get(svd.singularValues)
    VT = result.get(svd.rightSingularMatrix)

    # transform result to numpy array
    self._U = IInput.getNumpyArray(nT=U)
    self._Q = IInput.getNumpyArray(nT=VT)
    sigma = IInput.getNumpyArray(nT=Sigma)
    # sigma is unpacked as a 2-D array; keep it as a flat vector so it
    # broadcasts across the columns of U below.
    _, cols = sigma.shape
    self._w = sigma.reshape(cols)

    # Calculate explained variance & explained variance ratio
    X_transformed = self._U * self._w
    self.explained_variance = exp_var = np.var(X_transformed, axis=0)
    # TODO(monika): support csr, crs
    full_var = np.var(X, axis=0).sum()
    self.explained_variance_ratio_ = exp_var / full_var
    return X_transformed
def test_svd_simple_check():
    '''
    Verify that the factors of DAAL's default batch SVD rebuild the input.

    Multiplies ``U``, the diagonal matrix of singular values and ``V`` back
    together and compares the product with the original 4x3 matrix.
    '''
    indata = np.array([[1, 3, 4], [5, 6, 9], [1, 2, 3], [7, 6, 8]])
    table = HomogenNumericTable(indata)

    algorithm = svd.Batch()
    algorithm.input.set(svd.data, table)
    result = algorithm.compute()

    sigma = getNumpyArray(result.get(svd.singularValues))
    U = getNumpyArray(result.get(svd.leftSingularMatrix))
    V = getNumpyArray(result.get(svd.rightSingularMatrix))

    # Flatten the singular values to a vector and place them on a diagonal.
    _, n_values = sigma.shape
    sigma_diag = np.diag(sigma.reshape(n_values))

    # Reconstruct as U . (diag(sigma) . V) and compare with the input.
    scaled_v = np.dot(sigma_diag, V)
    reconstructed = np.dot(U, scaled_v)
    assert_array_almost_equal(reconstructed, indata)