示例#1
0
class CVTests(MLlibTestCase):
    """Smoke tests that run ``GridSearchCV`` (built with ``self.sc``) over
    small scikit-learn pipelines.

    Every test fits a Lasso pipeline over three candidate ``alpha`` values
    and checks that ``cv_results_['params']`` has one entry per candidate.
    """

    # (review text, rating) rows shared by the text-based tests below.
    # Never mutated; each test hands it straight to ``createDataFrame``.
    _REVIEW_RATINGS = [
        ('hi there', 0.0), ('what is up', 1.0), ('huh', 1.0),
        ('now is the time', 5.0), ('for what', 0.0),
        ('the spark was there', 5.0), ('and so', 3.0),
        ('were many socks', 0.0), ('really', 1.0), ('too cool', 2.0),
    ]

    def setUp(self):
        """Run the base-class setup, then build a ``Converter`` from ``self.sc``."""
        super(CVTests, self).setUp()
        self.converter = Converter(self.sc)

    def test_cv_linreg(self):
        """Grid-search Lasso over a stacked sparse matrix of numeric features."""
        pipeline = SKL_Pipeline([('lasso', SKL_Lasso(max_iter=1))])
        parameters = {'lasso__alpha': (0.001, 0.005, 0.01)}
        grid_search = GridSearchCV(self.sc, pipeline, parameters)
        # Materialize the blocks as a list: on Python 3 ``map`` yields a
        # one-shot iterator, while ``scipy.sparse.vstack`` is documented to
        # take a *sequence* of sparse matrices.
        blocks = [self.list2csr([x, x + 1.0]) for x in range(0, 100)]
        X = scipy.sparse.vstack(blocks)
        y = np.array(list(range(0, 100))).reshape((100, 1))
        skl_gs = grid_search.fit(X, y)
        assert len(skl_gs.cv_results_['params']) == len(
            parameters['lasso__alpha'])

    def test_cv_pipeline(self):
        """Grid-search a hashing -> tf-idf -> Lasso text pipeline."""
        pipeline = SKL_Pipeline([
            ('vect', SKL_HashingVectorizer(n_features=20)),
            ('tfidf', SKL_TfidfTransformer(use_idf=False)),
            ('lasso', SKL_Lasso(max_iter=1))
        ])
        parameters = {'lasso__alpha': (0.001, 0.005, 0.01)}
        grid_search = GridSearchCV(self.sc, pipeline, parameters)
        df = self.sql.createDataFrame(
            self._REVIEW_RATINGS, ["review", "rating"]).toPandas()
        skl_gs = grid_search.fit(df.review.values, df.rating.values)
        assert len(skl_gs.cv_results_['params']) == len(
            parameters['lasso__alpha'])

    @unittest.skip(
        "disable this test until we have numpy <-> dataframe conversion")
    def test_cv_lasso_with_mllib_featurization(self):
        """Featurize with MLlib, then grid-search Lasso on the result.

        Currently skipped (see the decorator's reason string).
        """
        data = self.sql.createDataFrame(
            self._REVIEW_RATINGS, ["review", "rating"])

        # Feature extraction using MLlib
        tokenizer = Tokenizer(inputCol="review", outputCol="words")
        hashingTF = HashingTF(inputCol="words",
                              outputCol="features",
                              numFeatures=20000)
        pipeline = Pipeline(stages=[tokenizer, hashingTF])
        data = pipeline.fit(data).transform(data)

        # Expose the hashed features under the "review" name so the fit
        # call below can read them via ``df.review``.
        df = self.converter.toPandas(
            data.select(data.features.alias("review"), "rating"))

        pipeline = SKL_Pipeline([('lasso', SKL_Lasso(max_iter=1))])
        parameters = {'lasso__alpha': (0.001, 0.005, 0.01)}

        grid_search = GridSearchCV(self.sc, pipeline, parameters)
        skl_gs = grid_search.fit(df.review.values, df.rating.values)
        assert len(skl_gs.cv_results_['params']) == len(
            parameters['lasso__alpha'])
 def setUp(self):
     """Run the parent ``setUp``, then attach a ``Converter`` built from ``self.sc``."""
     super(CVTests, self).setUp()
     ctx = self.sc
     self.converter = Converter(ctx)
class CVTests(MLlibTestCase):
    """Smoke tests that run ``GridSearchCV`` (built with ``self.sc``) over
    small scikit-learn pipelines.

    Every test fits a Lasso pipeline over three candidate ``alpha`` values
    and checks that ``cv_results_['params']`` has one entry per candidate.
    """

    # (review text, rating) rows shared by the text-based tests below.
    # Never mutated; each test hands it straight to ``createDataFrame``.
    _REVIEW_RATINGS = [('hi there', 0.0),
                       ('what is up', 1.0),
                       ('huh', 1.0),
                       ('now is the time', 5.0),
                       ('for what', 0.0),
                       ('the spark was there', 5.0),
                       ('and so', 3.0),
                       ('were many socks', 0.0),
                       ('really', 1.0),
                       ('too cool', 2.0)]

    def setUp(self):
        """Run the base-class setup, then build a ``Converter`` from ``self.sc``."""
        super(CVTests, self).setUp()
        self.converter = Converter(self.sc)

    def test_cv_linreg(self):
        """Grid-search Lasso over a stacked sparse matrix of numeric features."""
        pipeline = SKL_Pipeline([
            ('lasso', SKL_Lasso(max_iter=1))
        ])
        parameters = {
            'lasso__alpha': (0.001, 0.005, 0.01)
        }
        grid_search = GridSearchCV(self.sc, pipeline, parameters)
        # Materialize the blocks as a list: on Python 3 ``map`` yields a
        # one-shot iterator, while ``scipy.sparse.vstack`` is documented to
        # take a *sequence* of sparse matrices.
        blocks = [self.list2csr([x, x + 1.0]) for x in range(0, 100)]
        X = scipy.sparse.vstack(blocks)
        y = np.array(list(range(0, 100))).reshape((100, 1))
        skl_gs = grid_search.fit(X, y)
        assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])

    def test_cv_pipeline(self):
        """Grid-search a hashing -> tf-idf -> Lasso text pipeline."""
        pipeline = SKL_Pipeline([
            ('vect', SKL_HashingVectorizer(n_features=20)),
            ('tfidf', SKL_TfidfTransformer(use_idf=False)),
            ('lasso', SKL_Lasso(max_iter=1))
        ])
        parameters = {
            'lasso__alpha': (0.001, 0.005, 0.01)
        }
        grid_search = GridSearchCV(self.sc, pipeline, parameters)
        df = self.sql.createDataFrame(self._REVIEW_RATINGS, ["review", "rating"]).toPandas()
        skl_gs = grid_search.fit(df.review.values, df.rating.values)
        assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])

    @unittest.skip("disable this test until we have numpy <-> dataframe conversion")
    def test_cv_lasso_with_mllib_featurization(self):
        """Featurize with MLlib, then grid-search Lasso on the result.

        Currently skipped (see the decorator's reason string).
        """
        data = self.sql.createDataFrame(self._REVIEW_RATINGS, ["review", "rating"])

        # Feature extraction using MLlib
        tokenizer = Tokenizer(inputCol="review", outputCol="words")
        hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20000)
        pipeline = Pipeline(stages=[tokenizer, hashingTF])
        data = pipeline.fit(data).transform(data)

        # Expose the hashed features under the "review" name so the fit
        # call below can read them via ``df.review``.
        df = self.converter.toPandas(data.select(data.features.alias("review"), "rating"))

        pipeline = SKL_Pipeline([
            ('lasso', SKL_Lasso(max_iter=1))
        ])
        parameters = {
            'lasso__alpha': (0.001, 0.005, 0.01)
        }

        grid_search = GridSearchCV(self.sc, pipeline, parameters)
        skl_gs = grid_search.fit(df.review.values, df.rating.values)
        assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])
示例#4
0
 def setUp(self):
     """Run the parent ``setUp``, then attach a ``Converter`` built from ``self.sc``."""
     super(CVTests, self).setUp()
     ctx = self.sc
     self.converter = Converter(ctx)
 def setUp(self):
     """Run the parent ``setUp``, attach a ``Converter`` built from
     ``self.sc``, then set the log level on ``self.sc`` to "WARN"."""
     super(CVTests, self).setUp()
     ctx = self.sc
     self.converter = Converter(ctx)
     ctx.setLogLevel("WARN")