def test_pca_fit(datatype, input_type):
    """Fit cuPCA and skPCA on the same 6x2 data and compare fitted attributes.

    Parameters
    ----------
    datatype
        dtype used when building the input arrays.
    input_type
        ``'dataframe'`` fits cuPCA on a ``cudf.DataFrame``; any other value
        fits it on the NumPy array directly.
    """
    samples = np.array(
        [[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
        dtype=datatype,
    )

    # Reference model.
    skpca = skPCA(n_components=2)
    skpca.fit(samples)

    # Model under test, fed either a DataFrame or the raw array.
    cupca = cuPCA(n_components=2)
    if input_type == 'dataframe':
        gdf = cudf.DataFrame()
        gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
        gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)
        cupca.fit(gdf)
    else:
        cupca.fit(samples)

    fitted_attrs = (
        'singular_values_',
        'components_',
        'explained_variance_',
        'explained_variance_ratio_',
        'noise_variance_',
    )
    for attr in fitted_attrs:
        # Only components_ is compared with with_sign=False.
        with_sign = attr != 'components_'

        print(attr)
        print(getattr(cupca, attr))
        print(getattr(skpca, attr))

        # Convert the cuML result to a host-side array before comparing.
        cuml_res = getattr(cupca, attr)
        cuml_res = (
            cuml_res.to_array()
            if isinstance(cuml_res, cudf.Series)
            else cuml_res.as_matrix()
        )
        skl_res = getattr(skpca, attr)

        assert array_equal(cuml_res, skl_res, 1e-3, with_sign=with_sign)
def test_pca_inverse_transform(datatype):
    """Round-trip a pygdf DataFrame through fit_transform / inverse_transform.

    NOTE(review): despite ``pca`` in the name, the model exercised here is
    ``cuTSVD`` with a single component; the reconstruction is compared to
    the input with 0.4 passed as the tolerance argument of ``array_equal``.

    Parameters
    ----------
    datatype
        dtype used when building the DataFrame columns.
    """
    first_col = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    second_col = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)

    gdf = pygdf.DataFrame()
    gdf['0'] = first_col
    gdf['1'] = second_col

    cutsvd = cuTSVD(n_components=1)
    reduced = cutsvd.fit_transform(gdf)

    print("Calling inverse_transform")
    restored = cutsvd.inverse_transform(reduced)
    print(restored)

    assert array_equal(restored, gdf, 0.4, with_sign=True)
def test_pca_inverse_transform(datatype):
    """Round-trip a cudf DataFrame through cuPCA and compare with the input.

    The data is transformed with a 2-component cuPCA, mapped back with
    ``inverse_transform``, and compared to the original frame with 1e-3
    passed as the tolerance argument of ``array_equal``.

    Parameters
    ----------
    datatype
        dtype used when building the DataFrame columns.
    """
    first_col = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    second_col = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)

    gdf = cudf.DataFrame()
    gdf['0'] = first_col
    gdf['1'] = second_col

    cupca = cuPCA(n_components=2)
    projected = cupca.fit_transform(gdf)

    print("Calling inverse_transform")
    restored = cupca.inverse_transform(projected)

    assert array_equal(restored, gdf, 1e-3, with_sign=True)
def test_pca_fit_transform(datatype):
    """Compare cuPCA.fit_transform on a pygdf DataFrame with skPCA.fit_transform.

    Both models use two components and see the same 6x2 values; the results
    are compared with ``with_sign=False``.

    Parameters
    ----------
    datatype
        dtype used when building the input arrays.
    """
    # Device-side input: the same values as `samples`, one column at a time.
    gdf = pygdf.DataFrame()
    gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)

    # Host-side input for the reference model.
    samples = np.array(
        [[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
        dtype=datatype,
    )

    print("Calling fit_transform")
    cupca = cuPCA(n_components=2)
    cu_projection = cupca.fit_transform(gdf)

    skpca = skPCA(n_components=2)
    sk_projection = skpca.fit_transform(samples)

    assert array_equal(cu_projection, sk_projection, 1e-3, with_sign=False)
def test_pca_fit_transform(datatype):
    """Compare cuTSVD.fit_transform on a cudf DataFrame with skTSVD.fit_transform.

    NOTE(review): despite ``pca`` in the name, the models exercised here are
    ``cuTSVD`` and ``skTSVD`` with a single component.

    Parameters
    ----------
    datatype
        dtype used when building the input arrays.
    """
    # Device-side input: the same values as `samples`, one column at a time.
    gdf = cudf.DataFrame()
    gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)

    # Host-side input for the reference model.
    samples = np.array(
        [[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
        dtype=datatype,
    )

    print("Calling fit_transform")
    cutsvd = cuTSVD(n_components=1)
    cu_projection = cutsvd.fit_transform(gdf)

    sktsvd = skTSVD(n_components=1)
    sk_projection = sktsvd.fit_transform(samples)

    assert array_equal(cu_projection, sk_projection, 1e-3, with_sign=False)
def test_tsvd_inverse_transform(datatype, input_type):
    """Round-trip the sample data through a 1-component cuTSVD.

    The reconstruction returned by ``inverse_transform`` is compared with a
    cudf DataFrame holding the original values, with 0.4 passed as the
    tolerance argument of ``array_equal``.

    Parameters
    ----------
    datatype
        dtype used when building the input arrays.
    input_type
        ``'dataframe'`` feeds cuTSVD the cudf DataFrame; any other value
        feeds it an equivalent NumPy array.
    """
    # The DataFrame is always built: it is also the expected result.
    gdf = cudf.DataFrame()
    gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)

    cutsvd = cuTSVD(n_components=1)

    if input_type != 'dataframe':
        samples = np.array(
            [[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
            dtype=datatype,
        )
        reduced = cutsvd.fit_transform(samples)
    else:
        reduced = cutsvd.fit_transform(gdf)

    restored = cutsvd.inverse_transform(reduced)

    assert array_equal(restored, gdf, 0.4, with_sign=True)
def test_pca_fit(datatype):
    """Fit cuPCA on a pygdf DataFrame and skPCA on an array; compare attributes.

    Parameters
    ----------
    datatype
        dtype used when building the input arrays.
    """
    # Device-side input: the same values as `samples`, one column at a time.
    gdf = pygdf.DataFrame()
    gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)

    # Host-side input for the reference model.
    samples = np.array(
        [[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
        dtype=datatype,
    )

    print("Calling fit")
    cupca = cuPCA(n_components=2)
    cupca.fit(gdf)

    skpca = skPCA(n_components=2)
    skpca.fit(samples)

    fitted_attrs = (
        'singular_values_',
        'components_',
        'explained_variance_',
        'explained_variance_ratio_',
        'noise_variance_',
    )
    for attr in fitted_attrs:
        # Only components_ is compared with with_sign=False.
        with_sign = attr != 'components_'
        cu_value = getattr(cupca, attr)
        sk_value = getattr(skpca, attr)
        assert array_equal(cu_value, sk_value, 1e-3, with_sign=with_sign)
def test_pca_inverse_transform(datatype, input_type):
    """Round-trip the sample data through a 2-component cuPCA.

    The reconstruction returned by ``inverse_transform`` is compared with a
    cudf DataFrame holding the original values, with 1e-3 passed as the
    tolerance argument of ``array_equal``.

    Parameters
    ----------
    datatype
        dtype used when building the input arrays.
    input_type
        ``'dataframe'`` feeds cuPCA the cudf DataFrame; any other value
        feeds it an equivalent NumPy array.
    """
    # The DataFrame is always built: it is also the expected result.
    gdf = cudf.DataFrame()
    gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)

    cupca = cuPCA(n_components=2)

    if input_type != 'dataframe':
        samples = np.array(
            [[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
            dtype=datatype,
        )
        projected = cupca.fit_transform(samples)
    else:
        projected = cupca.fit_transform(gdf)

    restored = cupca.inverse_transform(projected)

    assert array_equal(restored, gdf, 1e-3, with_sign=True)
def test_tsvd_fit_transform(datatype, input_type):
    """Compare cuTSVD.fit_transform with skTSVD.fit_transform on the same data.

    Both models use a single component; results are compared with
    ``with_sign=True`` and 1e-3 passed as the tolerance argument.

    Parameters
    ----------
    datatype
        dtype used when building the input arrays.
    input_type
        ``'dataframe'`` feeds cuTSVD a cudf DataFrame; any other value feeds
        it the NumPy array directly.
    """
    samples = np.array(
        [[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
        dtype=datatype,
    )

    # Reference projection.
    sktsvd = skTSVD(n_components=1)
    expected = sktsvd.fit_transform(samples)

    # Projection under test.
    cutsvd = cuTSVD(n_components=1)
    if input_type == 'dataframe':
        gdf = cudf.DataFrame()
        gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
        gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)
        actual = cutsvd.fit_transform(gdf)
    else:
        actual = cutsvd.fit_transform(samples)

    assert array_equal(actual, expected, 1e-3, with_sign=True)
def test_dbscan_helper(X, eps, min_samples, threshold, use_assert, test_model):
    """Run DBSCAN with sklearn and with ``test_model`` and compare the results.

    The comparison outcome for each attribute is printed and passed to
    ``write_log``; it is asserted only when ``use_assert`` is truthy.

    Parameters
    ----------
    X
        Input data handed to ``run_dbscan``. When ``test_model`` is
        ``'cuml'`` it is converted with ``pd2cudf`` before the second run.
    eps, min_samples
        Forwarded unchanged to ``run_dbscan`` for both runs.
    threshold
        Forwarded to ``array_equal`` as its tolerance argument.
    use_assert
        When truthy, a mismatch raises ``AssertionError`` carrying the
        logged message.
    test_model
        Name of the implementation compared against sklearn.
    """
    reference = run_dbscan(X, eps, min_samples, model='sklearn')
    print()

    if test_model == 'cuml':
        X = pd2cudf(X)

    candidate = run_dbscan(X, eps, min_samples, model=test_model)
    print()

    for attr in ['labels_']:
        passed = array_equal(getattr(reference, attr),
                             getattr(candidate, attr),
                             threshold, with_sign=True)
        # The message previously said "pca" (copied from the PCA helper),
        # which mislabelled DBSCAN results in the log.
        message = 'compare dbscan: %s vs sklearn %s %s' % (
            test_model, attr, 'equal' if passed else 'NOT equal')
        print(message)
        write_log(message)
        if use_assert:
            assert passed, message
    print()

    # Explicitly drop the local references, as the original did; presumably
    # this is meant to release model/data memory promptly - TODO confirm.
    del reference, candidate, X
def test_tsvd_fit(datatype, input_type):
    """Fit cuTSVD and skTSVD on the same 6x2 data and compare fitted attributes.

    Parameters
    ----------
    datatype
        dtype used when building the input arrays.
    input_type
        ``'dataframe'`` fits cuTSVD on a cudf DataFrame; any other value
        fits it on the NumPy array directly.
    """
    samples = np.array(
        [[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
        dtype=datatype,
    )

    # Reference model.
    sktsvd = skTSVD(n_components=1)
    sktsvd.fit(samples)

    # Model under test.
    cutsvd = cuTSVD(n_components=1)
    if input_type == 'dataframe':
        gdf = cudf.DataFrame()
        gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
        gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)
        cutsvd.fit(gdf)
    else:
        cutsvd.fit(samples)

    fitted_attrs = (
        'singular_values_',
        'components_',
        'explained_variance_ratio_',
    )
    for attr in fitted_attrs:
        # Only components_ is compared with with_sign=False.
        with_sign = attr != 'components_'
        cu_value = getattr(cutsvd, attr)
        sk_value = getattr(sktsvd, attr)
        assert array_equal(cu_value, sk_value, 0.4, with_sign=with_sign)
def test_pca_helper(X, n_components, svd_solver, whiten, random_state,
                    threshold, use_assert, test_model):
    """Run PCA with sklearn and with ``test_model`` and compare the results.

    The comparison outcome for each attribute is printed and passed to
    ``write_log``; it is asserted only when ``use_assert`` is truthy.

    Parameters
    ----------
    X
        Input data handed to ``run_pca``. It is converted with ``pd2pygdf``
        for ``'cuml'`` and to a float32 NumPy array for ``'h2o4gpu'`` before
        the second run.
    n_components, svd_solver, whiten, random_state
        Forwarded unchanged to ``run_pca`` for both runs.
    threshold
        Forwarded to ``array_equal`` as its tolerance argument.
    use_assert
        When truthy, a mismatch raises ``AssertionError`` carrying the
        logged message.
    test_model
        Name of the implementation compared against sklearn.
    """
    reference = run_pca(X, n_components, svd_solver, whiten, random_state,
                        model='sklearn')
    print()

    # Put the input into the container the tested backend is given.
    if test_model == 'cuml':
        X = pd2pygdf(X)
    elif test_model == 'h2o4gpu':
        X = np.array(X).astype(np.float32)

    candidate = run_pca(X, n_components, svd_solver, whiten, random_state,
                        model=test_model)
    print()

    # These attributes are compared with with_sign=False.
    unsigned_attrs = ('components_', 'transformed_result')
    compared_attrs = (
        'singular_values_',
        'components_',
        'explained_variance_',
        'explained_variance_ratio_',
        'transformed_result',
    )
    for attr in compared_attrs:
        with_sign = attr not in unsigned_attrs
        passed = array_equal(getattr(reference, attr),
                             getattr(candidate, attr),
                             threshold, with_sign=with_sign)
        verdict = 'equal' if passed else 'NOT equal'
        message = 'compare pca: %s vs sklearn %s %s' % (
            test_model, attr, verdict)
        print(message)
        write_log(message)
        if use_assert:
            assert passed, message
    print()

    # Explicitly drop the local references once all attributes are compared.
    del reference, candidate, X