Python Converter示例，spark_sklearn.converter.Converter Python示例

示例#1

0

显示文件

class CVTests(MLlibTestCase):
    """Grid-search tests that fit ``SKL_Pipeline`` estimators through
    ``GridSearchCV(self.sc, ...)``.

    Every test asserts that ``cv_results_['params']`` contains exactly one
    entry per candidate ``lasso__alpha`` value.
    """

    # (review text, rating) rows shared by the text-based tests.
    _REVIEW_RATINGS = [
        ('hi there', 0.0), ('what is up', 1.0), ('huh', 1.0),
        ('now is the time', 5.0), ('for what', 0.0),
        ('the spark was there', 5.0), ('and so', 3.0),
        ('were many socks', 0.0), ('really', 1.0), ('too cool', 2.0),
    ]

    def setUp(self):
        """Run the base-class setup, then build a ``Converter`` from ``self.sc``."""
        super(CVTests, self).setUp()
        self.converter = Converter(self.sc)

    def test_cv_linreg(self):
        """Grid-search a single-step Lasso pipeline on a sparse 100-row input."""
        pipeline = SKL_Pipeline([('lasso', SKL_Lasso(max_iter=1))])
        parameters = {'lasso__alpha': (0.001, 0.005, 0.01)}
        grid_search = GridSearchCV(self.sc, pipeline, parameters)
        # Materialize the rows as a list: on Python 3 ``map`` yields a lazy
        # iterator, which scipy.sparse.vstack does not reliably accept.
        # ``self.list2csr`` is defined outside this block (presumably it
        # turns a Python list into a CSR row -- confirm against the base class).
        rows = [self.list2csr([x, x + 1.0]) for x in range(0, 100)]
        X = scipy.sparse.vstack(rows)
        # Targets 0..99 as a (100, 1) column.
        y = np.array(list(range(0, 100))).reshape((100, 1))
        skl_gs = grid_search.fit(X, y)
        assert len(skl_gs.cv_results_['params']) == len(
            parameters['lasso__alpha'])

    def test_cv_pipeline(self):
        """Grid-search a vectorizer -> tf-idf -> Lasso pipeline on raw review text."""
        pipeline = SKL_Pipeline([
            ('vect', SKL_HashingVectorizer(n_features=20)),
            ('tfidf', SKL_TfidfTransformer(use_idf=False)),
            ('lasso', SKL_Lasso(max_iter=1))
        ])
        parameters = {'lasso__alpha': (0.001, 0.005, 0.01)}
        grid_search = GridSearchCV(self.sc, pipeline, parameters)
        # Round-trip through a Spark DataFrame, then hand pandas column
        # values to the grid search.
        df = self.sql.createDataFrame(
            self._REVIEW_RATINGS, ["review", "rating"]).toPandas()
        skl_gs = grid_search.fit(df.review.values, df.rating.values)
        assert len(skl_gs.cv_results_['params']) == len(
            parameters['lasso__alpha'])

    @unittest.skip(
        "disable this test until we have numpy <-> dataframe conversion")
    def test_cv_lasso_with_mllib_featurization(self):
        """Featurize with MLlib (Tokenizer + HashingTF), then grid-search Lasso.

        Currently skipped; see the decorator message.
        """
        data = self.sql.createDataFrame(
            self._REVIEW_RATINGS, ["review", "rating"])

        # Feature extraction using MLlib
        tokenizer = Tokenizer(inputCol="review", outputCol="words")
        hashingTF = HashingTF(inputCol="words",
                              outputCol="features",
                              numFeatures=20000)
        featurizer = Pipeline(stages=[tokenizer, hashingTF])
        data = featurizer.fit(data).transform(data)

        # Expose the hashed features under the "review" column name so the
        # fit call below has the same shape as in test_cv_pipeline.
        df = self.converter.toPandas(
            data.select(data.features.alias("review"), "rating"))

        pipeline = SKL_Pipeline([('lasso', SKL_Lasso(max_iter=1))])
        parameters = {'lasso__alpha': (0.001, 0.005, 0.01)}

        grid_search = GridSearchCV(self.sc, pipeline, parameters)
        skl_gs = grid_search.fit(df.review.values, df.rating.values)
        assert len(skl_gs.cv_results_['params']) == len(
            parameters['lasso__alpha'])

示例#2

0

显示文件

文件： test_grid_search_2.py 项目： smurching/spark-sklearn

 def setUp(self):
     """Run the parent class's setUp, then attach a Converter built from ``self.sc``."""
     parent = super(CVTests, self)
     parent.setUp()
     self.converter = Converter(self.sc)

示例#3

0

显示文件

文件： test_grid_search_2.py 项目： smurching/spark-sklearn

class CVTests(MLlibTestCase):
    """Tests that run ``GridSearchCV(self.sc, ...)`` over ``SKL_Pipeline`` estimators.

    Each test checks that the fitted search reports one ``cv_results_['params']``
    entry for every ``lasso__alpha`` candidate.
    """

    # (review text, rating) sample rows used by the text-based tests.
    _REVIEW_RATINGS = [('hi there', 0.0),
                       ('what is up', 1.0),
                       ('huh', 1.0),
                       ('now is the time', 5.0),
                       ('for what', 0.0),
                       ('the spark was there', 5.0),
                       ('and so', 3.0),
                       ('were many socks', 0.0),
                       ('really', 1.0),
                       ('too cool', 2.0)]

    def setUp(self):
        """Call the base-class setUp and create ``self.converter`` from ``self.sc``."""
        super(CVTests, self).setUp()
        self.converter = Converter(self.sc)

    def test_cv_linreg(self):
        """Fit a Lasso-only pipeline on 100 stacked sparse rows."""
        pipeline = SKL_Pipeline([
            ('lasso', SKL_Lasso(max_iter=1))
        ])
        parameters = {
            'lasso__alpha': (0.001, 0.005, 0.01)
        }
        grid_search = GridSearchCV(self.sc, pipeline, parameters)
        # Build a real list of blocks; a Python 3 ``map`` object is a one-shot
        # iterator and is not a safe input for scipy.sparse.vstack.
        # NOTE(review): ``self.list2csr`` comes from outside this class body --
        # presumably it builds a CSR row from a list; verify.
        blocks = [self.list2csr([x, x+1.0]) for x in range(0, 100)]
        X = scipy.sparse.vstack(blocks)
        # Column vector of targets 0..99, shape (100, 1).
        y = np.array(list(range(0, 100))).reshape((100,1))
        skl_gs = grid_search.fit(X, y)
        assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])

    def test_cv_pipeline(self):
        """Fit a hashing-vectorizer / tf-idf / Lasso pipeline on review strings."""
        pipeline = SKL_Pipeline([
            ('vect', SKL_HashingVectorizer(n_features=20)),
            ('tfidf', SKL_TfidfTransformer(use_idf=False)),
            ('lasso', SKL_Lasso(max_iter=1))
        ])
        parameters = {
            'lasso__alpha': (0.001, 0.005, 0.01)
        }
        grid_search = GridSearchCV(self.sc, pipeline, parameters)
        # Create a Spark DataFrame from the sample rows and convert it to pandas.
        df = self.sql.createDataFrame(self._REVIEW_RATINGS, ["review", "rating"]).toPandas()
        skl_gs = grid_search.fit(df.review.values, df.rating.values)
        assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])

    @unittest.skip("disable this test until we have numpy <-> dataframe conversion")
    def test_cv_lasso_with_mllib_featurization(self):
        """Tokenize and hash reviews with MLlib stages, then grid-search Lasso.

        Skipped for now; the decorator message gives the reason.
        """
        data = self.sql.createDataFrame(self._REVIEW_RATINGS, ["review", "rating"])

        # Feature extraction using MLlib
        tokenizer = Tokenizer(inputCol="review", outputCol="words")
        hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20000)
        mllib_pipeline = Pipeline(stages=[tokenizer, hashingTF])
        data = mllib_pipeline.fit(data).transform(data)

        # Alias the "features" column to "review" before converting to pandas.
        df = self.converter.toPandas(data.select(data.features.alias("review"), "rating"))

        pipeline = SKL_Pipeline([
            ('lasso', SKL_Lasso(max_iter=1))
        ])
        parameters = {
            'lasso__alpha': (0.001, 0.005, 0.01)
        }

        grid_search = GridSearchCV(self.sc, pipeline, parameters)
        skl_gs = grid_search.fit(df.review.values, df.rating.values)
        assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])

示例#4

0

显示文件

 def setUp(self):
     """Perform the inherited setup first, then create ``self.converter`` from ``self.sc``."""
     inherited = super(CVTests, self)
     inherited.setUp()
     self.converter = Converter(self.sc)

示例#5

0

显示文件

文件： test_grid_search_2.py 项目： weirichd/spark-sklearn

 def setUp(self):
     """Run the inherited setup, build ``self.converter``, and set the log level.

     The statement order is kept as written: the converter is created from
     ``self.sc`` before ``setLogLevel("WARN")`` is called on it.
     """
     base = super(CVTests, self)
     base.setUp()
     self.converter = Converter(self.sc)
     self.sc.setLogLevel("WARN")