示例#1
0
    def test_cv_lasso_with_mllib_featurization(self):
        """Grid-search a Lasso over features produced by an MLlib pipeline.

        The reviews are tokenized and hashed with MLlib, the resulting
        "features" column is handed to ``self.converter.toPandas`` under the
        name "review", and a scikit-learn Lasso pipeline is then tuned over
        three ``alpha`` values. The test checks that one result entry is
        reported per candidate ``alpha``.
        """
        rows = [
            ('hi there', 0.0),
            ('what is up', 1.0),
            ('huh', 1.0),
            ('now is the time', 5.0),
            ('for what', 0.0),
            ('the spark was there', 5.0),
            ('and so', 3.0),
            ('were many socks', 0.0),
            ('really', 1.0),
            ('too cool', 2.0),
        ]
        raw_df = self.sql.createDataFrame(rows, ["review", "rating"])

        # Feature extraction using MLlib: tokenize, then hash term frequencies.
        featurizer = Pipeline(stages=[
            Tokenizer(inputCol="review", outputCol="words"),
            HashingTF(inputCol="words", outputCol="features", numFeatures=20000),
        ])
        featurized = featurizer.fit(raw_df).transform(raw_df)

        # Expose the hashed features under the "review" name so the
        # scikit-learn side reads them via ``df.review``.
        projected = featurized.select(featurized.features.alias("review"), "rating")
        df = self.converter.toPandas(projected)

        estimator = SKL_Pipeline([('lasso', SKL_Lasso())])
        parameters = {'lasso__alpha': (0.001, 0.005, 0.01)}
        expected_candidates = len(parameters['lasso__alpha'])

        grid_search = GridSearchCV(self.sc, estimator, parameters)
        skl_gs = grid_search.fit(df.review.values, df.rating.values)
        assert len(skl_gs.cv_results_['params']) == expected_candidates
 def test_cv_pipeline(self):
     """Grid-search a text -> tf-idf -> Lasso scikit-learn pipeline.

     A small Spark DataFrame of (review, rating) rows is converted to
     pandas and used to fit a ``GridSearchCV`` over three ``alpha`` values.
     The test checks that one parameter setting is reported per candidate.
     """
     pipeline = SKL_Pipeline([
         ('vect', SKL_HashingVectorizer(n_features=20)),
         ('tfidf', SKL_TfidfTransformer(use_idf=False)),
         ('lasso', SKL_Lasso(max_iter=1))
     ])
     parameters = {
         'lasso__alpha': (0.001, 0.005, 0.01)
     }
     grid_search = GridSearchCV(self.sc, pipeline, parameters)
     data = [('hi there', 0.0),
             ('what is up', 1.0),
             ('huh', 1.0),
             ('now is the time', 5.0),
             ('for what', 0.0),
             ('the spark was there', 5.0),
             ('and so', 3.0),
             ('were many socks', 0.0),
             ('really', 1.0),
             ('too cool', 2.0)]
     df = self.sql.createDataFrame(data, ["review", "rating"]).toPandas()
     skl_gs = grid_search.fit(df.review.values, df.rating.values)
     # Use ``cv_results_`` (as the sibling tests do) rather than the legacy
     # ``grid_scores_`` attribute, which scikit-learn removed in 0.20.
     assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])
     # TODO: add assertions on the per-candidate scores.
示例#3
0
 def test_cv_linreg(self):
     """Grid-search a Lasso on a sparse, synthetic regression problem.

     Builds 100 two-feature rows ``[x, x + 1.0]`` for ``x`` in ``0..99``
     (each converted with ``self.list2csr`` and stacked vertically), with a
     target column ``0..99`` of shape ``(100, 1)``. The test checks that one
     parameter setting is reported per candidate ``alpha``.
     """
     pipeline = SKL_Pipeline([('lasso', SKL_Lasso(max_iter=1))])
     parameters = {'lasso__alpha': (0.001, 0.005, 0.01)}
     grid_search = GridSearchCV(self.sc, pipeline, parameters)
     # Materialise the blocks as a list: on Python 3 ``map`` returns a
     # one-shot iterator, which ``scipy.sparse.vstack`` does not reliably
     # accept (it expects a sequence of matrices).
     rows = [self.list2csr([x, x + 1.0]) for x in range(100)]
     X = scipy.sparse.vstack(rows)
     y = np.arange(100).reshape((100, 1))
     skl_gs = grid_search.fit(X, y)
     assert len(skl_gs.cv_results_['params']) == len(
         parameters['lasso__alpha'])