def test_tsvd_fit_transform(datatype, name, use_handle):
    """Check ``cuTSVD.fit_transform`` against the ``skTSVD`` reference.

    ``name`` selects the dataset:

    * ``'blobs'``  -- large ``make_blobs`` data; only cuTSVD is run and no
      comparison is made.
    * ``'random'`` -- skipped.
    * otherwise    -- a small seeded Gaussian dataset; the cuTSVD output is
      compared with the ``skTSVD`` output.

    ``use_handle`` is forwarded to ``get_handle`` and the resulting handle is
    passed to the cuTSVD constructor.
    """
    # NOTE(review): ``datatype`` is never applied to X in this test --
    # confirm whether the data was meant to be cast to it.
    if name == 'blobs':
        X, _ = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    elif name == 'random':
        pytest.skip('fails when using random dataset '
                    'used by sklearn for testing')
        # Not reached while the skip above is in place; kept so the case
        # can be re-enabled by deleting the skip.
        shape = 5000, 100
        rng = check_random_state(42)
        # np.prod replaces np.product, which was removed in NumPy 2.0.
        X = rng.randint(-100, 20, np.prod(shape)).reshape(shape)
    else:
        n, p = 500, 5
        rng = np.random.RandomState(0)
        X = rng.randn(n, p) * .1 + np.array([3, 4, 2, 3, 5])

    compare = name != 'blobs'
    if compare:
        sktsvd = skTSVD(n_components=1)
        Xsktsvd = sktsvd.fit_transform(X)

    handle, _ = get_handle(use_handle)
    cutsvd = cuTSVD(n_components=1, handle=handle)
    Xcutsvd = cutsvd.fit_transform(X)
    # Synchronize the handle before inspecting the result.
    cutsvd.handle.sync()

    if compare:
        assert array_equal(Xcutsvd, Xsktsvd, 1e-3, with_sign=True)
def test_tsvd_inverse_transform(datatype, input_type, name, use_handle):
    """Check that ``inverse_transform`` approximately restores the input.

    The data is fitted and transformed with a one-component cuTSVD, mapped
    back with ``inverse_transform`` and compared with the cudf version of
    the input using a tolerance of 0.4.

    ``name`` selects the dataset; ``'blobs'`` and ``'random'`` are skipped,
    so only the small hand-written matrix is exercised. ``input_type``
    chooses whether the model is fed the cudf DataFrame (``'dataframe'``)
    or the NumPy array. ``use_handle`` is forwarded to ``get_handle`` and
    the resulting handle is passed to the cuTSVD constructor, as in the
    other handle-parametrized tests in this file.
    """
    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        # Not reached while the skip above is in place.
        X, _ = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    elif name == 'random':
        pytest.skip('fails when using random dataset '
                    'used by sklearn for testing')
        # Not reached while the skip above is in place.
        shape = 5000, 100
        rng = check_random_state(42)
        # np.prod replaces np.product, which was removed in NumPy 2.0.
        X = rng.randint(-100, 20, np.prod(shape)).reshape(shape)
    else:
        X = np.array([[-1, -1], [-2, -1], [-3, -2],
                      [1, 1], [2, 1], [3, 2]], dtype=datatype)

    # One column per feature, named fea0, fea1, ...
    X_pd = pd.DataFrame(
        {'fea%d' % i: X[0:, i] for i in range(X.shape[1])})
    X_cudf = cudf.DataFrame.from_pandas(X_pd)

    # Previously ``use_handle`` was accepted but ignored, so both
    # parametrizations ran identical code.
    handle, _ = get_handle(use_handle)
    cutsvd = cuTSVD(n_components=1, handle=handle)

    if input_type == 'dataframe':
        Xcutsvd = cutsvd.fit_transform(X_cudf)
    else:
        Xcutsvd = cutsvd.fit_transform(X)

    input_gdf = cutsvd.inverse_transform(Xcutsvd)
    cutsvd.handle.sync()

    assert array_equal(input_gdf, X_cudf, 0.4, with_sign=True)
def test_tsvd_fit(datatype, input_type):
    """Compare fitted cuTSVD attributes with skTSVD on a six-row matrix.

    Both models are fitted with ``n_components=1``. When ``input_type`` is
    ``'dataframe'`` the cuTSVD model is fitted on a cudf DataFrame holding
    the same values column by column; otherwise it is fitted on the NumPy
    array directly.
    """
    rows = [[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]
    X = np.array(rows, dtype=datatype)

    reference = skTSVD(n_components=1)
    reference.fit(X)

    model = cuTSVD(n_components=1)
    if input_type != 'dataframe':
        model.fit(X)
    else:
        frame = cudf.DataFrame()
        frame['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
        frame['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)
        model.fit(frame)

    # (attribute, with_sign) pairs; components_ is the only attribute
    # compared with with_sign=False.
    checks = (
        ('singular_values_', True),
        ('components_', False),
        ('explained_variance_ratio_', True),
    )
    for attr, with_sign in checks:
        actual = getattr(model, attr)
        expected = getattr(reference, attr)
        assert array_equal(actual, expected, 0.4, with_sign=with_sign)
def test_tsvd_fit(datatype, name, use_handle):
    """Compare fitted cuTSVD attributes with the ``skTSVD`` reference.

    ``name`` selects the dataset:

    * ``'blobs'``  -- large ``make_blobs`` data; only cuTSVD is fitted and
      no comparison is made.
    * ``'random'`` -- skipped.
    * otherwise    -- a small seeded Gaussian dataset; ``singular_values_``,
      ``components_`` and ``explained_variance_ratio_`` are compared with a
      tolerance of 0.4.

    ``use_handle`` is forwarded to ``get_handle`` and the resulting handle is
    passed to the cuTSVD constructor.
    """
    # NOTE(review): ``datatype`` is never applied to X in this test --
    # confirm whether the data was meant to be cast to it.
    if name == 'blobs':
        X, _ = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    elif name == 'random':
        pytest.skip('fails when using random dataset '
                    'used by sklearn for testing')
        # Not reached while the skip above is in place; kept so the case
        # can be re-enabled by deleting the skip.
        shape = 5000, 100
        rng = check_random_state(42)
        # np.prod replaces np.product, which was removed in NumPy 2.0.
        X = rng.randint(-100, 20, np.prod(shape)).reshape(shape)
    else:
        n, p = 500, 5
        rng = np.random.RandomState(0)
        X = rng.randn(n, p) * .1 + np.array([3, 4, 2, 3, 5])

    compare = name != 'blobs'
    if compare:
        sktsvd = skTSVD(n_components=1)
        sktsvd.fit(X)

    handle, _ = get_handle(use_handle)
    cutsvd = cuTSVD(n_components=1, handle=handle)
    cutsvd.fit(X)
    # Synchronize the handle before reading the fitted attributes.
    cutsvd.handle.sync()

    if compare:
        for attr in ['singular_values_', 'components_',
                     'explained_variance_ratio_']:
            # components_ is the only attribute compared with
            # with_sign=False.
            with_sign = attr != 'components_'
            assert array_equal(getattr(cutsvd, attr),
                               getattr(sktsvd, attr),
                               0.4, with_sign=with_sign)
def test_tsvd_inverse_transform(datatype, name, use_handle):
    """Check that ``inverse_transform`` approximately restores the input.

    The data is fitted and transformed with a one-component cuTSVD, mapped
    back with ``inverse_transform`` and compared with the original array
    using a tolerance of 0.4.

    ``name`` selects the dataset; ``'blobs'`` and ``'random'`` are skipped,
    so only the small seeded Gaussian dataset is exercised. ``use_handle``
    is forwarded to ``get_handle`` and the resulting handle is passed to the
    cuTSVD constructor, as in the other handle-parametrized tests in this
    file.
    """
    # NOTE(review): ``datatype`` is never applied to X in this test --
    # confirm whether the data was meant to be cast to it.
    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        # Not reached while the skip above is in place.
        X, _ = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    elif name == 'random':
        pytest.skip('fails when using random dataset '
                    'used by sklearn for testing')
        # Not reached while the skip above is in place.
        shape = 5000, 100
        rng = check_random_state(42)
        # np.prod replaces np.product, which was removed in NumPy 2.0.
        X = rng.randint(-100, 20, np.prod(shape)).reshape(shape)
    else:
        n, p = 500, 5
        rng = np.random.RandomState(0)
        X = rng.randn(n, p) * .1 + np.array([3, 4, 2, 3, 5])

    # Previously ``use_handle`` was accepted but ignored, so both
    # parametrizations ran identical code.
    handle, _ = get_handle(use_handle)
    cutsvd = cuTSVD(n_components=1, handle=handle)

    Xcutsvd = cutsvd.fit_transform(X)
    input_gdf = cutsvd.inverse_transform(Xcutsvd)
    cutsvd.handle.sync()

    assert array_equal(input_gdf, X, 0.4, with_sign=True)
def test_pca_inverse_transform(datatype):
    """Round-trip a two-column cudf DataFrame through cuTSVD.

    Despite the ``pca`` in its name, this test exercises ``cuTSVD``: the
    frame is reduced to one component with ``fit_transform``, mapped back
    with ``inverse_transform`` and compared with the original frame using
    a tolerance of 0.4.
    """
    column_values = (
        ('0', [-1, -2, -3, 1, 2, 3]),
        ('1', [-1, -1, -2, 1, 1, 2]),
    )
    gdf = cudf.DataFrame()
    for label, values in column_values:
        gdf[label] = np.asarray(values, dtype=datatype)

    model = cuTSVD(n_components=1)
    reduced = model.fit_transform(gdf)

    print("Calling inverse_transform")
    restored = model.inverse_transform(reduced)
    print(restored)

    assert array_equal(restored, gdf, 0.4, with_sign=True)
def test_pca_fit_transform(datatype):
    """Compare ``cuTSVD.fit_transform`` with ``skTSVD.fit_transform``.

    Despite the ``pca`` in its name, this test exercises ``cuTSVD``. The
    cuTSVD model is fed a cudf DataFrame and the skTSVD model a NumPy array
    holding the same values; the outputs are compared with a tolerance of
    1e-3 and ``with_sign=False``.
    """
    first_column = [-1, -2, -3, 1, 2, 3]
    second_column = [-1, -1, -2, 1, 1, 2]

    gdf = cudf.DataFrame()
    gdf['0'] = np.asarray(first_column, dtype=datatype)
    gdf['1'] = np.asarray(second_column, dtype=datatype)

    # Same values as the frame above, laid out row by row.
    X = np.array(
        [[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
        dtype=datatype,
    )

    print("Calling fit_transform")
    gpu_model = cuTSVD(n_components=1)
    gpu_result = gpu_model.fit_transform(gdf)

    cpu_model = skTSVD(n_components=1)
    cpu_result = cpu_model.fit_transform(X)

    assert array_equal(gpu_result, cpu_result, 1e-3, with_sign=False)
def test_tsvd_inverse_transform(datatype, input_type):
    """Round-trip a small dataset through cuTSVD and compare with the input.

    The model is fitted on a cudf DataFrame when ``input_type`` is
    ``'dataframe'`` and on an equivalent NumPy array otherwise. In both
    cases the reconstruction is compared with the cudf DataFrame using a
    tolerance of 0.4.
    """
    gdf = cudf.DataFrame()
    gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)

    model = cuTSVD(n_components=1)

    if input_type != 'dataframe':
        rows = [[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]
        X = np.array(rows, dtype=datatype)
        reduced = model.fit_transform(X)
    else:
        reduced = model.fit_transform(gdf)

    restored = model.inverse_transform(reduced)

    # The cudf frame is the reference even when the array was the input.
    assert array_equal(restored, gdf, 0.4, with_sign=True)
def test_tsvd_fit_transform(datatype, input_type):
    """Compare ``cuTSVD.fit_transform`` with ``skTSVD`` on a six-row matrix.

    Both models use ``n_components=1``. When ``input_type`` is
    ``'dataframe'`` the cuTSVD model is fed a cudf DataFrame holding the
    same values column by column; otherwise it is fed the NumPy array.
    Outputs are compared with a tolerance of 1e-3.
    """
    rows = [[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]
    X = np.array(rows, dtype=datatype)

    reference = skTSVD(n_components=1)
    expected = reference.fit_transform(X)

    model = cuTSVD(n_components=1)
    if input_type != 'dataframe':
        actual = model.fit_transform(X)
    else:
        frame = cudf.DataFrame()
        frame['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
        frame['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)
        actual = model.fit_transform(frame)

    assert array_equal(actual, expected, 1e-3, with_sign=True)
def test_tsvd_fit(datatype, input_type, name, use_handle):
    """Compare fitted cuTSVD attributes with the ``skTSVD`` reference.

    ``name`` selects the dataset:

    * ``'blobs'``  -- large ``make_blobs`` data; only cuTSVD is fitted and
      no comparison is made.
    * ``'random'`` -- skipped.
    * otherwise    -- a small hand-written matrix of dtype ``datatype``;
      ``singular_values_``, ``components_`` and
      ``explained_variance_ratio_`` are compared with a tolerance of 0.4.

    ``input_type`` chooses whether cuTSVD is fitted on a cudf DataFrame
    (``'dataframe'``) or on the NumPy array. ``use_handle`` is forwarded to
    ``get_handle`` and the resulting handle is passed to the cuTSVD
    constructor.
    """
    if name == 'blobs':
        X, _ = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    elif name == 'random':
        pytest.skip('fails when using random dataset '
                    'used by sklearn for testing')
        # Not reached while the skip above is in place; kept so the case
        # can be re-enabled by deleting the skip.
        shape = 5000, 100
        rng = check_random_state(42)
        # np.prod replaces np.product, which was removed in NumPy 2.0.
        X = rng.randint(-100, 20, np.prod(shape)).reshape(shape)
    else:
        X = np.array([[-1, -1], [-2, -1], [-3, -2],
                      [1, 1], [2, 1], [3, 2]], dtype=datatype)

    compare = name != 'blobs'
    if compare:
        sktsvd = skTSVD(n_components=1)
        sktsvd.fit(X)

    handle, _ = get_handle(use_handle)
    cutsvd = cuTSVD(n_components=1, handle=handle)

    if input_type == 'dataframe':
        # One column per feature, named fea0, fea1, ...; a separate name is
        # used so X keeps referring to the NumPy array.
        X_pd = pd.DataFrame(
            {'fea%d' % i: X[0:, i] for i in range(X.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X_pd)
        cutsvd.fit(X_cudf)
    else:
        cutsvd.fit(X)

    # Synchronize the handle before reading the fitted attributes.
    cutsvd.handle.sync()

    if compare:
        for attr in ['singular_values_', 'components_',
                     'explained_variance_ratio_']:
            # components_ is the only attribute compared with
            # with_sign=False.
            with_sign = attr != 'components_'
            assert array_equal(getattr(cutsvd, attr),
                               getattr(sktsvd, attr),
                               0.4, with_sign=with_sign)