def test_pipeline_same_results(self):
    """Check that the local and the Spark pipeline predict alike.

    Both pipelines chain a variance threshold step with a logistic
    regression step. After fitting each one on its own view of the data,
    the mean absolute difference of their predictions must stay below 0.1.
    """
    X, y, Z = self.make_classification(2, 10000, 2000)

    # Local reference pipeline.
    local_logistic = LogisticRegression()
    local_threshold = VarianceThreshold()
    local_pipeline = Pipeline([
        ('threshold', local_threshold),
        ('logistic', local_logistic),
    ])

    # Distributed counterpart using the same step names.
    spark_logistic = SparkLogisticRegression()
    spark_threshold = SparkVarianceThreshold()
    spark_pipeline = SparkPipeline([
        ('threshold', spark_threshold),
        ('logistic', spark_logistic),
    ])

    spark_threshold.fit(Z)
    local_pipeline.fit(X, y)
    # The class labels are handed to the 'logistic' step explicitly.
    spark_pipeline.fit(Z, logistic__classes=np.unique(y))

    local_pred = local_pipeline.predict(X)
    spark_pred = np.concatenate(spark_pipeline.predict(Z[:, 'X']).collect())
    mean_abs_diff = np.mean(np.abs(local_pred - spark_pred))
    assert_true(mean_abs_diff < 0.1)
def test_pipeline_same_results(self):
    """Compare predictions of ``Pipeline`` and ``SparkPipeline``.

    Each pipeline is a 'threshold' step followed by a 'logistic' step.
    The test passes when the predictions of the two fitted pipelines
    differ by less than 0.1 in mean absolute value.
    """
    X, y, Z = self.make_classification(2, 10000, 2000)

    reference_clf = LogisticRegression()
    reference_filter = VarianceThreshold()
    reference_steps = [('threshold', reference_filter),
                       ('logistic', reference_clf)]
    reference_pipe = Pipeline(reference_steps)

    distributed_clf = SparkLogisticRegression()
    distributed_filter = SparkVarianceThreshold()
    distributed_steps = [('threshold', distributed_filter),
                         ('logistic', distributed_clf)]
    distributed_pipe = SparkPipeline(distributed_steps)

    distributed_filter.fit(Z)
    reference_pipe.fit(X, y)
    # Labels are forwarded to the 'logistic' step via the fit parameter.
    distributed_pipe.fit(Z, logistic__classes=np.unique(y))

    expected = reference_pipe.predict(X)
    predicted_blocks = distributed_pipe.predict(Z[:, 'X']).collect()
    difference = expected - np.concatenate(predicted_blocks)
    assert_true(np.mean(np.abs(difference)) < 0.1)
def test_same_transform_with_treshold(self):
    """Compare local and distributed ``fit_transform`` at threshold 0.03.

    The collected distributed output is stacked row-wise and must match
    the local result, both for the data from ``generate_dataset`` and for
    the data from ``generate_sparse_dataset``.
    """
    local_selector = VarianceThreshold(.03)
    spark_selector = SparkVarianceThreshold(.03)

    # First data set: blocks are stacked with numpy.
    data, data_rdd = self.generate_dataset()
    expected = local_selector.fit_transform(data)
    blocks = spark_selector.fit_transform(data_rdd).collect()
    assert_array_almost_equal(expected, np.vstack(blocks))

    # Sparse data set: blocks are stacked with scipy.sparse and both sides
    # are densified before the comparison.
    data, data_rdd = self.generate_sparse_dataset()
    expected = local_selector.fit_transform(data)
    blocks = spark_selector.fit_transform(data_rdd).collect()
    stacked = sp.vstack(blocks)
    assert_array_almost_equal(expected.toarray(), stacked.toarray())
def test_pipeline_init(self):
    """Test the various init parameters of the pipeline.

    Covers construction errors, parameter getting/setting on one- and
    two-step pipelines, duplicate stage names, and cloning.
    """
    assert_raises(TypeError, SparkPipeline)

    # Check that we can't instantiate pipelines with objects without fit
    # method
    assert_raises(TypeError, SparkPipeline, [('svc', IncorrectT)])

    # Smoke test with only an estimator
    clf = T()
    pipe = SparkPipeline([('svc', clf)])
    assert_equal(
        pipe.get_params(deep=True),
        dict(svc__a=None, svc__b=None, svc=clf,
             **pipe.get_params(deep=False))
    )

    # Check that params are set
    pipe.set_params(svc__a=0.1)
    assert_equal(clf.a, 0.1)
    assert_equal(clf.b, None)
    # Smoke test the repr:
    repr(pipe)

    # Test with two objects
    vect = SparkCountVectorizer()
    # Named ``var_filter`` so the builtin ``filter`` is not shadowed; the
    # stage itself is still registered under the name 'filter'.
    var_filter = SparkVarianceThreshold()
    pipe = SparkPipeline([('vect', vect), ('filter', var_filter)])

    # Check that we can't use the same stage name twice
    assert_raises(ValueError, SparkPipeline,
                  [('vect', vect), ('vect', vect)])

    # Check that params are set
    pipe.set_params(vect__min_df=0.1)
    assert_equal(vect.min_df, 0.1)
    # Smoke test the repr:
    repr(pipe)

    # Check that params are not set when naming them wrong
    assert_raises(ValueError, pipe.set_params, filter__min_df=0.1)

    # Test clone
    pipe2 = clone(pipe)
    assert_false(pipe.named_steps['vect'] is pipe2.named_steps['vect'])

    # Check that apart from estimators, the parameters are the same
    params = pipe.get_params(deep=True)
    params2 = pipe2.get_params(deep=True)

    # Drop the pipeline-level (shallow) parameters from both dicts.
    for x in pipe.get_params(deep=False):
        params.pop(x)

    for x in pipe2.get_params(deep=False):
        params2.pop(x)

    # Remove estimators that were copied
    params.pop('vect')
    params.pop('filter')
    params2.pop('vect')
    params2.pop('filter')
    assert_equal(params, params2)
def test_same_variances(self):
    """Check that local and distributed fits agree on ``variances_``.

    The comparison is made for the dense data, for the sparse data, and
    for a ``DictRDD`` whose 'X' column is the sparse RDD.
    """
    reference = VarianceThreshold()
    distributed = SparkVarianceThreshold()

    shapes = [
        ((10, 5), None),
        ((1e3, 20), None),
        ((1e3, 20), 100),
        ((1e4, 100), None),
        ((1e4, 100), 600),
    ]

    # NOTE(review): ``shape`` and ``block_size`` are not passed to the
    # helpers below, so the loop values do not influence the generated
    # data -- confirm whether they should be forwarded.
    for shape, block_size in shapes:
        X_dense, X_dense_rdd = self.make_dense_rdd()
        X_sparse, X_sparse_rdd = self.make_sparse_rdd()
        Z = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))

        reference.fit(X_dense)
        distributed.fit(X_dense_rdd)
        assert_array_almost_equal(reference.variances_,
                                  distributed.variances_)

        reference.fit(X_sparse)
        distributed.fit(X_sparse_rdd)
        assert_array_almost_equal(reference.variances_,
                                  distributed.variances_)

        # ``reference`` still holds the fit on X_sparse at this point.
        distributed.fit(Z)
        assert_array_almost_equal(reference.variances_,
                                  distributed.variances_)
def test_same_transform_with_treshold(self):
    """Compare local and distributed ``fit_transform`` at threshold 0.03.

    Checks the dense RDD, the sparse RDD, and the 'X' column of a
    ``DictRDD`` built from both; distributed output is collected and
    stacked before being compared to the local result.
    """
    reference = VarianceThreshold(.03)
    distributed = SparkVarianceThreshold(.03)

    X_dense, X_dense_rdd = self.make_dense_rdd()
    X_sparse, X_sparse_rdd = self.make_sparse_rdd()
    Z_rdd = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))

    # Dense input.
    expected = reference.fit_transform(X_dense)
    blocks = distributed.fit_transform(X_dense_rdd).collect()
    assert_array_almost_equal(expected, np.vstack(blocks))

    # Sparse input; both sides are densified for the comparison.
    expected = reference.fit_transform(X_sparse)
    blocks = distributed.fit_transform(X_sparse_rdd).collect()
    stacked = sp.vstack(blocks)
    assert_array_almost_equal(expected.toarray(), stacked.toarray())

    # DictRDD input: column 'X' holds the sparse RDD, so the sparse
    # ``expected`` from the previous step is reused.
    blocks = distributed.fit_transform(Z_rdd)[:, 'X'].collect()
    stacked = sp.vstack(blocks)
    assert_array_almost_equal(expected.toarray(), stacked.toarray())
def test_same_variances(self):
    """Check that local and distributed fits agree on ``variances_``.

    Data sets of several shapes and block sizes are generated and fitted
    by both estimators, followed by one check on the sparse data set.
    """
    reference = VarianceThreshold()
    distributed = SparkVarianceThreshold()

    # Each entry is a (shape, block_size) pair handed to generate_dataset.
    shapes = [
        ((10, 5), None),
        ((1e3, 20), None),
        ((1e3, 20), 100),
        ((1e4, 100), None),
        ((1e4, 100), 600),
    ]

    for shape, block_size in shapes:
        data, data_rdd = self.generate_dataset(shape, block_size)
        reference.fit(data)
        distributed.fit(data_rdd)
        assert_array_almost_equal(reference.variances_,
                                  distributed.variances_)

    # The sparse data set is generated without shape arguments.
    data, data_rdd = self.generate_sparse_dataset()
    reference.fit(data)
    distributed.fit(data_rdd)
    assert_array_almost_equal(reference.variances_, distributed.variances_)
def test_same_transform_with_treshold(self):
    """Compare local and distributed ``fit_transform`` at threshold 0.03.

    For the dense RDD, the sparse RDD, and the 'X' column of a
    ``DictRDD``, the distributed result must pass ``check_rdd_dtype`` for
    the expected block type and match the local result.
    """
    reference = VarianceThreshold(.03)
    distributed = SparkVarianceThreshold(.03)

    X_dense, X_dense_rdd = self.make_dense_rdd()
    X_sparse, X_sparse_rdd = self.make_sparse_rdd()
    Z_rdd = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))

    dense_types = (np.ndarray,)
    sparse_types = (sp.spmatrix,)

    # Dense input.
    expected = reference.fit_transform(X_dense)
    transformed = distributed.fit_transform(X_dense_rdd)
    assert_true(check_rdd_dtype(transformed, dense_types))
    assert_array_almost_equal(expected, transformed.toarray())

    # Sparse input.
    expected = reference.fit_transform(X_sparse)
    transformed = distributed.fit_transform(X_sparse_rdd)
    assert_true(check_rdd_dtype(transformed, sparse_types))
    assert_array_almost_equal(expected.toarray(), transformed.toarray())

    # DictRDD input: column 'X' holds the sparse RDD, so the sparse
    # ``expected`` from the previous step is reused.
    transformed = distributed.fit_transform(Z_rdd)[:, 'X']
    assert_true(check_rdd_dtype(transformed, sparse_types))
    assert_array_almost_equal(expected.toarray(), transformed.toarray())
def test_same_transform_with_treshold(self):
    """Verify distributed ``fit_transform`` against the local one (0.03).

    Three inputs are checked in turn: the dense RDD, the sparse RDD and
    the 'X' column of a ``DictRDD``. Each distributed result is checked
    with ``check_rdd_dtype`` and then compared to the local output.
    """
    local_selector = VarianceThreshold(.03)
    spark_selector = SparkVarianceThreshold(.03)

    X_dense, X_dense_rdd = self.make_dense_rdd()
    X_sparse, X_sparse_rdd = self.make_sparse_rdd()
    Z_rdd = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))

    # Dense case.
    local_out = local_selector.fit_transform(X_dense)
    spark_out = spark_selector.fit_transform(X_dense_rdd)
    is_dense = check_rdd_dtype(spark_out, (np.ndarray, ))
    assert_true(is_dense)
    assert_array_almost_equal(local_out, spark_out.toarray())

    # Sparse case.
    local_out = local_selector.fit_transform(X_sparse)
    spark_out = spark_selector.fit_transform(X_sparse_rdd)
    is_sparse = check_rdd_dtype(spark_out, (sp.spmatrix, ))
    assert_true(is_sparse)
    assert_array_almost_equal(local_out.toarray(), spark_out.toarray())

    # DictRDD case; ``local_out`` is still the sparse result, matching the
    # sparse RDD stored in column 'X'.
    spark_out = spark_selector.fit_transform(Z_rdd)[:, 'X']
    is_sparse = check_rdd_dtype(spark_out, (sp.spmatrix, ))
    assert_true(is_sparse)
    assert_array_almost_equal(local_out.toarray(), spark_out.toarray())
def test_same_variances(self):
    """Verify that ``variances_`` match between local and Spark fits.

    Dense data, sparse data and a ``DictRDD`` (sparse RDD in column 'X')
    are fitted in turn and the resulting variances compared.
    """
    local_selector = VarianceThreshold()
    spark_selector = SparkVarianceThreshold()

    shapes = [((10, 5), None),
              ((1e3, 20), None),
              ((1e3, 20), 100),
              ((1e4, 100), None),
              ((1e4, 100), 600)]

    # NOTE(review): the unpacked ``shape`` and ``block_size`` are unused
    # inside the loop body; the helpers are called without arguments --
    # confirm whether that is intended.
    for shape, block_size in shapes:
        X_dense, X_dense_rdd = self.make_dense_rdd()
        X_sparse, X_sparse_rdd = self.make_sparse_rdd()
        Z = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))

        # Dense data.
        local_selector.fit(X_dense)
        spark_selector.fit(X_dense_rdd)
        assert_array_almost_equal(local_selector.variances_,
                                  spark_selector.variances_)

        # Sparse data.
        local_selector.fit(X_sparse)
        spark_selector.fit(X_sparse_rdd)
        assert_array_almost_equal(local_selector.variances_,
                                  spark_selector.variances_)

        # DictRDD; the local estimator keeps its fit on X_sparse.
        spark_selector.fit(Z)
        assert_array_almost_equal(local_selector.variances_,
                                  spark_selector.variances_)