def test_pipeline_same_results(self):
    X, y, Z = self.make_classification(2, 10000, 2000)

    # Local scikit-learn pipeline
    loc_clf = LogisticRegression()
    loc_filter = VarianceThreshold()
    loc_pipe = Pipeline([
        ('threshold', loc_filter),
        ('logistic', loc_clf)
    ])

    # Equivalent distributed pipeline
    dist_clf = SparkLogisticRegression()
    dist_filter = SparkVarianceThreshold()
    dist_pipe = SparkPipeline([
        ('threshold', dist_filter),
        ('logistic', dist_clf)
    ])

    dist_filter.fit(Z)
    loc_pipe.fit(X, y)
    dist_pipe.fit(Z, logistic__classes=np.unique(y))

    # The distributed predictions (collected and concatenated block-wise)
    # should disagree with the local ones on fewer than 10% of the samples
    assert_true(np.mean(np.abs(
        loc_pipe.predict(X) -
        np.concatenate(dist_pipe.predict(Z[:, 'X']).collect())
    )) < 0.1)
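The test relies on a make_classification helper from its test base class that is not shown here. Below is a minimal sketch of what such a helper might look like, assuming scikit-learn's make_classification and splearn's ArrayRDD/DictRDD; the standalone name make_classification_rdd, its signature, and the reading of the third argument as a partition count are assumptions, and the project's actual helper may differ.

import numpy as np
from sklearn.datasets import make_classification
from splearn.rdd import ArrayRDD, DictRDD

def make_classification_rdd(sc, n_classes, n_samples, n_partitions):
    # Hypothetical helper: generate a local dataset, then mirror it as a
    # distributed DictRDD with the ('X', 'y') column layout the tests use.
    X, y = make_classification(n_classes=n_classes, n_samples=n_samples,
                               random_state=42)
    X_rdd = ArrayRDD(sc.parallelize(X, n_partitions))
    y_rdd = ArrayRDD(sc.parallelize(y, n_partitions))
    Z = DictRDD((X_rdd, y_rdd), columns=('X', 'y'),
                dtype=[np.ndarray, np.ndarray])
    return X, y, Z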
def test_pipeline_init(self):
    # Test the various init parameters of the pipeline.
    assert_raises(TypeError, SparkPipeline)

    # Check that we can't instantiate pipelines with objects without fit
    # method
    assert_raises(TypeError, SparkPipeline, [('svc', IncorrectT)])

    # Smoke test with only an estimator
    clf = T()
    pipe = SparkPipeline([('svc', clf)])
    assert_equal(pipe.get_params(deep=True),
                 dict(svc__a=None, svc__b=None, svc=clf,
                      **pipe.get_params(deep=False)))

    # Check that params are set
    pipe.set_params(svc__a=0.1)
    assert_equal(clf.a, 0.1)
    assert_equal(clf.b, None)
    # Smoke test the repr:
    repr(pipe)

    # Test with two objects
    vect = SparkCountVectorizer()
    filter = SparkVarianceThreshold()
    pipe = SparkPipeline([('vect', vect), ('filter', filter)])

    # Check that we can't use the same stage name twice
    assert_raises(ValueError, SparkPipeline,
                  [('vect', vect), ('vect', vect)])

    # Check that params are set
    pipe.set_params(vect__min_df=0.1)
    assert_equal(vect.min_df, 0.1)
    # Smoke test the repr:
    repr(pipe)

    # Check that params are not set when naming them wrong
    assert_raises(ValueError, pipe.set_params, filter__min_df=0.1)

    # Test clone
    pipe2 = clone(pipe)
    assert_false(pipe.named_steps['vect'] is pipe2.named_steps['vect'])

    # Check that apart from estimators, the parameters are the same
    params = pipe.get_params(deep=True)
    params2 = pipe2.get_params(deep=True)
    for x in pipe.get_params(deep=False):
        params.pop(x)
    for x in pipe2.get_params(deep=False):
        params2.pop(x)

    # Remove estimators that were copied
    params.pop('vect')
    params.pop('filter')
    params2.pop('vect')
    params2.pop('filter')
    assert_equal(params, params2)
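test_pipeline_init references two helper stubs, T and IncorrectT, that are not defined in this section. A minimal sketch of what they look like, modeled on the helper classes in scikit-learn's own pipeline tests (the exact bodies here are an assumption): IncorrectT exposes constructor parameters but no fit method, while T adds the minimal estimator interface.

class IncorrectT(object):
    """Small class to test parameter dispatching: has params but no fit."""

    def __init__(self, a=None, b=None):
        self.a = a
        self.b = b


class T(IncorrectT):
    """A trivial but valid estimator: adds fit, get_params and set_params."""

    def fit(self, X, y):
        return self

    def get_params(self, deep=False):
        return {'a': self.a, 'b': self.b}

    def set_params(self, **params):
        self.a = params['a']
        return self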
# Assumed imports for this example; `data` and `target` are taken to be
# preloaded arrays of documents and labels, and `sc` an active SparkContext.
import numpy as np
from sklearn import cross_validation
from splearn.rdd import ArrayRDD, DictRDD
from splearn.feature_extraction.text import SparkHashingVectorizer
from splearn.feature_extraction.text import SparkTfidfTransformer
from splearn.naive_bayes import SparkMultinomialNB
from splearn.pipeline import SparkPipeline

data_train, data_test, target_train, target_test = \
    cross_validation.train_test_split(data, target,
                                      test_size=0.25,
                                      random_state=43)

# Distribute the training data as block-wise RDDs
train_x = ArrayRDD(sc.parallelize(data_train))
train_y = ArrayRDD(sc.parallelize(target_train))
Z = DictRDD((train_x, train_y),
            columns=('X', 'y'),
            dtype=[np.ndarray, np.ndarray])

# Build the distributed pipeline
dist_pipeline = SparkPipeline((
    ('vect', SparkHashingVectorizer(non_negative=True)),  # hashing TF for NB
    ('tfidf', SparkTfidfTransformer()),                   # IDF weighting
    ('clf', SparkMultinomialNB(alpha=0.05))               # Naive Bayes
))

# Fit; the class labels must be passed explicitly for the distributed fit
dist_pipeline.fit(Z, clf__classes=np.array([0, 1]))

# Distribute the test data the same way
test_x = ArrayRDD(sc.parallelize(data_test))
test_y = ArrayRDD(sc.parallelize(target_test))
test_Z = DictRDD((test_x, test_y),
                 columns=('X', 'y'),
                 dtype=[np.ndarray, np.ndarray])

# Predict on the X column of the test DictRDD
predicts = dist_pipeline.predict(test_Z[:, 'X'])
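`predicts` is itself a distributed ArrayRDD. Following the collect-and-concatenate pattern from test_pipeline_same_results above, the prediction blocks can be pulled back to the driver and scored locally; the accuracy computation below is an illustrative addition, not part of the original example.

# Collect the prediction blocks into one local array and score them
y_pred = np.concatenate(predicts.collect())
accuracy = np.mean(y_pred == target_test)
print("Test accuracy: {:.3f}".format(accuracy))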