def test_example(self):
    # The classic example from the sklearn documentation
    iris = datasets.load_iris()
    parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
    svr = svm.SVC(gamma='auto')

    clf = grid_search.GridSearchCV(svr, parameters)
    clf.fit(iris.data, iris.target)

    clf2 = GridSearchCV(self.sc, svr, parameters)
    clf2.fit(iris.data, iris.target)

    b1 = clf.estimator
    b2 = clf2.estimator
    self.assertEqual(b1.get_params(), b2.get_params())
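# ---------------------------------------------------------------------------
# The tests in this file reference names such as datasets, svm, grid_search,
# SKL_Pipeline, and GridSearchCV without showing an import block. A minimal
# sketch of the imports they appear to assume follows; the aliases are an
# assumption chosen to match the names used in the test bodies, not taken
# from the original file.
# ---------------------------------------------------------------------------
import unittest

import numpy as np
import scipy.sparse
from sklearn import datasets, svm, grid_search  # sklearn.grid_search exists only in older releases
from sklearn.feature_extraction.text import (HashingVectorizer as SKL_HashingVectorizer,
                                             TfidfTransformer as SKL_TfidfTransformer)
from sklearn.linear_model import Lasso as SKL_Lasso
from sklearn.pipeline import Pipeline as SKL_Pipeline
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, Tokenizer
from spark_sklearn import GridSearchCV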
def test_cv_lasso_with_mllib_featurization(self):
    data = [('hi there', 0.0),
            ('what is up', 1.0),
            ('huh', 1.0),
            ('now is the time', 5.0),
            ('for what', 0.0),
            ('the spark was there', 5.0),
            ('and so', 3.0),
            ('were many socks', 0.0),
            ('really', 1.0),
            ('too cool', 2.0)]
    data = self.sql.createDataFrame(data, ["review", "rating"])

    # Feature extraction using MLlib
    tokenizer = Tokenizer(inputCol="review", outputCol="words")
    hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20000)
    pipeline = Pipeline(stages=[tokenizer, hashingTF])
    data = pipeline.fit(data).transform(data)
    df = self.converter.toPandas(data.select(data.features.alias("review"), "rating"))

    pipeline = SKL_Pipeline([
        ('lasso', SKL_Lasso())
    ])
    parameters = {
        'lasso__alpha': (0.001, 0.005, 0.01)
    }

    grid_search = GridSearchCV(self.sc, pipeline, parameters)
    skl_gs = grid_search.fit(df.review.values, df.rating.values)
    assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])
def test_cv_pipeline(self):
    pipeline = SKL_Pipeline([
        ('vect', SKL_HashingVectorizer(n_features=20)),
        ('tfidf', SKL_TfidfTransformer(use_idf=False)),
        ('lasso', SKL_Lasso(max_iter=1))
    ])
    parameters = {
        'lasso__alpha': (0.001, 0.005, 0.01)
    }
    grid_search = GridSearchCV(self.sc, pipeline, parameters)

    data = [('hi there', 0.0),
            ('what is up', 1.0),
            ('huh', 1.0),
            ('now is the time', 5.0),
            ('for what', 0.0),
            ('the spark was there', 5.0),
            ('and so', 3.0),
            ('were many socks', 0.0),
            ('really', 1.0),
            ('too cool', 2.0)]
    df = self.sql.createDataFrame(data, ["review", "rating"]).toPandas()

    skl_gs = grid_search.fit(df.review.values, df.rating.values)
    assert len(skl_gs.grid_scores_) == len(parameters['lasso__alpha'])
    # TODO
    for gs in skl_gs.grid_scores_:
        pass  # assert(gs.)
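# Note: grid_scores_ is the pre-0.18 scikit-learn search-results attribute; it
# was deprecated in 0.18 and removed in 0.20 in favour of cv_results_, which
# the neighbouring tests already use. On a newer scikit-learn the assertion
# above would likely read:
#
#     assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])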
def test_cv_linreg(self):
    pipeline = SKL_Pipeline([('lasso', SKL_Lasso(max_iter=1))])
    parameters = {'lasso__alpha': (0.001, 0.005, 0.01)}
    grid_search = GridSearchCV(self.sc, pipeline, parameters)

    X = scipy.sparse.vstack(
        map(lambda x: self.list2csr([x, x + 1.0]), range(0, 100)))
    y = np.array(list(range(0, 100))).reshape((100, 1))

    skl_gs = grid_search.fit(X, y)
    assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])
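# ---------------------------------------------------------------------------
# The tests above also rely on fixtures that are not shown here: self.sc,
# self.sql, self.converter, and self.list2csr. A minimal sketch of what that
# setUp might look like follows; the class name, master URL, and the exact
# list2csr implementation are assumptions, not the original fixture code.
# ---------------------------------------------------------------------------
from pyspark.sql import SparkSession
from spark_sklearn import Converter  # assumed import path for spark-sklearn's DataFrame converter


class GridSearchCVTests(unittest.TestCase):

    def setUp(self):
        spark = SparkSession.builder.master("local[2]").appName("spark-sklearn tests").getOrCreate()
        self.sc = spark.sparkContext          # used by GridSearchCV(self.sc, ...)
        self.sql = spark                      # used via self.sql.createDataFrame(...)
        self.converter = Converter(self.sc)   # used via self.converter.toPandas(...)

    @staticmethod
    def list2csr(x):
        # Convert a plain Python list into a single-row CSR matrix.
        return scipy.sparse.csr_matrix(np.array([x]))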
# Standardize the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

from pyspark.sql import SparkSession

# Spark context
spark = SparkSession.builder.appName("Regression_worker_2").getOrCreate()
sc = spark.sparkContext

# Initialize the model
MLP_model = GridSearchCV(sc,
                         MLPRegressor(alpha=0.005, random_state=42),
                         {'hidden_layer_sizes': [[512, 4], [256, 4]],
                          'max_iter': [5000]})

# linear_model.fit(X_train, y_train)
MLP_model.fit(X_train, y_train)
# RandomForest_model.fit(X_train, y_train)
# GradientBoosting_model.fit(X_train, y_train)

# Print scores
models = [MLP_model]
with open('./model_scores_worker_2.txt', 'w') as f:
    for m in models:
        f.write('Training Set Mean Squared Error: {:.2f}\n'.format(
            mean_squared_error(y_train, m.predict(X_train))))
        f.write('Training Set R^2: {:.2f}\n'.format(
            r2_score(y_train, m.predict(X_train))))
        f.write('Testing Set Mean Squared Error: {:.2f}\n'.format(
            mean_squared_error(y_test, m.predict(X_test))))
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Standardize the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

from pyspark.sql import SparkSession

# Spark context
spark = SparkSession.builder.appName("Regression_compare_models").getOrCreate()
sc = spark.sparkContext

# Initialize the models
linear_model = GridSearchCV(sc, LinearRegression(), {})
MLP_model = GridSearchCV(
    sc,
    MLPRegressor(hidden_layer_sizes=[512, 4], max_iter=5000, alpha=0.005, random_state=42),
    {})
RandomForest_model = GridSearchCV(
    sc, RandomForestRegressor(n_estimators=100, random_state=0), {})
GradientBoosting_model = GridSearchCV(
    sc, GradientBoostingRegressor(n_estimators=100, max_depth=10, criterion='mse'), {})

linear_model.fit(X_train, y_train)
MLP_model.fit(X_train, y_train)
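# The comparison script presumably goes on to fit the remaining models and to
# record their scores. A sketch of that continuation, mirroring the metrics
# used in the worker script above (the output file name and this whole block
# are an assumption, not the original code):
RandomForest_model.fit(X_train, y_train)
GradientBoosting_model.fit(X_train, y_train)

models = [linear_model, MLP_model, RandomForest_model, GradientBoosting_model]
with open('./model_scores_compare.txt', 'w') as f:
    for m in models:
        f.write('Training Set Mean Squared Error: {:.2f}\n'.format(
            mean_squared_error(y_train, m.predict(X_train))))
        f.write('Testing Set Mean Squared Error: {:.2f}\n'.format(
            mean_squared_error(y_test, m.predict(X_test))))
        f.write('Testing Set R^2: {:.2f}\n'.format(
            r2_score(y_test, m.predict(X_test))))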
digits = datasets.load_digits()
X, y = digits.data, digits.target

sc = createLocalSparkSession().sparkContext

param_grid = {
    "max_depth": [3, None],
    "max_features": [1, 3, 10],
    "min_samples_split": [0.1, 0.2, 0.3],
    "min_samples_leaf": [1, 3, 10],
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
    "n_estimators": [10, 20, 40, 80]
}
gs = GridSearchCV(sc, RandomForestClassifier(), param_grid=param_grid)
gs.fit(X, y)

# Retrieve the best parameters
best_params_ = None
best_score_ = 0
params = gs.cv_results_['params']
mean_train_score = gs.cv_results_['mean_train_score']
for i, score in enumerate(mean_train_score):
    if i == 0:
        best_score_ = score
        best_params_ = params[i]
    if score > best_score_:
        best_score_ = score
        best_params_ = params[i]
print(best_params_)
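# Note: the manual scan above picks the parameter set with the highest
# mean_train_score. scikit-learn's own best_params_/best_score_ are chosen by
# mean_test_score (the cross-validated score), and if this spark-sklearn
# version follows the BaseSearchCV convention the block can likely be reduced
# to the following (an assumption about this version, not taken from the
# original script):
#
#     print(gs.best_params_)
#     print(gs.best_score_)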