def main():
    cv = 5
    clf_scoring = "accuracy"
    reg_scoring = "neg_mean_squared_error"

    # load sample data (multiclass target)
    data = load_iris()
    X = data["data"]
    y = data["target"]

    # hyperparameter grid: 2 * 3 * 3 * 3 = 54 candidate param sets
    grid = dict(
        learning_rate=[0.05, 0.01],
        max_depth=[4, 6, 8],
        colsample_bytree=[0.6, 0.8, 1.0],
        n_estimators=[100, 200, 300],
    )

    # distributed grid search with XGBoost's sklearn wrapper
    model = DistGridSearchCV(
        XGBClassifier(), grid, spark.sparkContext, cv=cv, scoring=clf_scoring
    )
    model.fit(X, y)

    # predictions on the driver
    preds = model.predict(X)
    probs = model.predict_proba(X)

    # results
    print("-- Grid Search --")
    print("Best Score: {0}".format(model.best_score_))
    print("Best colsample_bytree: {0}".format(
        model.best_estimator_.colsample_bytree))
    print("Best learning_rate: {0}".format(
        model.best_estimator_.learning_rate))
    print("Best max_depth: {0}".format(model.best_estimator_.max_depth))
    print("Best n_estimators: {0}".format(model.best_estimator_.n_estimators))
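# The module docstring later in this section notes the example also covers
# regression, and reg_scoring above is otherwise unused. A minimal sketch of
# that half, assuming grid, cv, reg_scoring, and spark from main() above are
# in scope, and using illustrative make_regression data since the original
# regression dataset is not shown in this excerpt:

from sklearn.datasets import make_regression
from xgboost import XGBRegressor

X_r, y_r = make_regression(n_samples=500, n_features=10, random_state=0)

# reuse the classifier's grid, scored with neg_mean_squared_error
reg_model = DistGridSearchCV(
    XGBRegressor(), grid, spark.sparkContext, cv=cv, scoring=reg_scoring
)
reg_model.fit(X_r, y_r)
print("Best Regression Score: {0}".format(reg_model.best_score_))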
def test_search(spark_session):
    sc = spark_session.sparkContext

    # sklearn variables
    Cs = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
    cv = 5
    test_size = 0.2
    scoring = "roc_auc"
    solver = "liblinear"

    # load sample data (binary target)
    data = load_breast_cancer()
    X = data["data"]
    y = data["target"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=10
    )

    # distributed grid search
    model = DistGridSearchCV(
        LogisticRegression(solver=solver), dict(C=Cs), sc, cv=cv, scoring=scoring
    )

    # distributed fitting with spark
    model.fit(X_train, y_train)

    # predictions on the driver
    preds = model.predict(X_test)
    assert preds.shape == y_test.shape
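# The grid-search example docstring later in this section notes that spark
# objects are stripped from fitted skdist estimators, leaving them pickle-able.
# A companion test along those lines might look like this; the test name and
# round-trip check are illustrative, not taken from the original suite:

import pickle

def test_search_pickle(spark_session):
    sc = spark_session.sparkContext
    data = load_breast_cancer()
    X = data["data"]
    y = data["target"]
    model = DistGridSearchCV(
        LogisticRegression(solver="liblinear"), dict(C=[0.1, 1.0]), sc, cv=3
    )
    model.fit(X, y)
    # the fitted estimator should survive a pickle round trip and
    # predict identically on the driver
    restored = pickle.loads(pickle.dumps(model))
    assert (restored.predict(X) == model.predict(X)).all()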
def test_gs():
    X = np.array([[1, 1, 1], [0, 0, 0], [-1, -1, -1]] * 100)
    y = np.array([0, 0, 1] * 100)
    gs = DistGridSearchCV(
        LogisticRegression(solver="liblinear"), {"C": [0.1, 1.0]}, cv=3
    )
    gs.fit(X, y)
    preds = gs.predict(X[:3])
    assert np.allclose(preds, np.array([0, 0, 1]))
import time

import pandas as pd
from pyspark.sql import SparkSession
from sklearn import datasets, svm
from skdist.distribute.search import DistGridSearchCV

# instantiate spark session
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# the digits dataset
digits = datasets.load_digits()
X = digits["data"]
y = digits["target"]

# create a classifier: a support vector classifier
classifier = svm.SVC(gamma="scale")
param_grid = {
    "C": [0.001, 0.01, 0.1, 1.0, 10.0],
    "gamma": ["scale", "auto", 0.001, 0.01, 0.1],
    "kernel": ["rbf", "poly", "sigmoid"],
}
scoring = "f1_weighted"
cv = 10

# hyperparameter optimization
# total fits: 5 * 5 * 3 param sets x 10 folds = 750
start = time.time()
model = DistGridSearchCV(classifier, param_grid, sc=sc, cv=cv, scoring=scoring)
model.fit(X, y)
print("Train time: {0}".format(time.time() - start))
print("Best score: {0}".format(model.best_score_))

results = pd.DataFrame(model.cv_results_).sort_values(
    "mean_test_score", ascending=False
)
print("-- CV Results --")
print(results[["param_C", "param_kernel", "mean_test_score"]].head(10))
limit = 1000

# convert training data to pandas
df = pd.DataFrame({"text": dataset["data"]})
df = df[:limit]
dataset["target"] = dataset["target"][:limit]

# fit a small encoder
encoder = Encoderizer(size="small")
X_t = encoder.fit_transform(df)

# train logistic regression
lr = DistGridSearchCV(
    LogisticRegression(solver="liblinear"),
    dict(C=[0.1, 1.0, 10.0]),
    sc,
    scoring=scoring,
    cv=cv,
)
lr.fit(X_t, dataset["target"])

# train random forest
rf = DistGridSearchCV(
    RandomForestClassifier(n_estimators=10),
    dict(max_depth=[5, 10]),
    sc,
    scoring=scoring,
    cv=cv,
)
rf.fit(X_t, dataset["target"])
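# Both fitted searches expose the usual sklearn attributes on the driver, so
# comparing the two model families is straightforward. A short follow-up
# sketch, assuming the fits above succeeded (best_params_ is assumed to
# mirror sklearn's GridSearchCV attribute of the same name):

for name, m in [("logistic regression", lr), ("random forest", rf)]:
    print("{0}: best score {1}, best params {2}".format(
        name, m.best_score_, m.best_params_))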
y = data["target"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=4
)

# initial scaling
scaler = StandardScaler()
X_train_t = scaler.fit_transform(X_train)
X_test_t = scaler.transform(X_test)

# sk-dist logistic regression w/ grid search
start = time.time()
lr = LogisticRegression(solver="lbfgs", multi_class="auto")
model = DistGridSearchCV(
    lr, {"C": [10.0, 1.0, 0.1, 0.01]}, sc=sc, cv=5, scoring="f1_weighted"
)
model.fit(X_train_t, y_train)
print("-- sk-dist LR --")
print("Train Time: {0}".format(time.time() - start))
print("Best Model CV Score: {0}".format(model.best_score_))
print("Holdout F1: {0}".format(
    f1_score(y_test, model.predict(X_test_t), average="weighted")))

# sk-dist random forest
start = time.time()
rf = DistRandomForestClassifier(n_estimators=100, max_depth=None, sc=sc)
rf.fit(X_train_t, y_train)
print("-- sk-dist RF --")
print("Train Time: {0}".format(time.time() - start))
print("Holdout F1: {0}".format(
    f1_score(y_test, rf.predict(X_test_t), average="weighted")))
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2
)

# define word vector -> regression model
word_pipe = Pipeline(steps=[
    ("vec", HashingVectorizer(analyzer="word", decode_error="ignore")),
    ("clf", LogisticRegression()),
])
word_params = {
    "vec__ngram_range": [(1, 1), (1, 2), (1, 3), (1, 4), (2, 4)],
    "clf__C": [0.1, 1.0, 10.0],
    "clf__solver": ["liblinear", "lbfgs"],
}
word_model = DistGridSearchCV(word_pipe, word_params, sc=sc, cv=cv, scoring=scoring)

# define character vector -> regression model
char_pipe = Pipeline(steps=[
    ("vec", HashingVectorizer(analyzer="char_wb", decode_error="ignore")),
    ("clf", LogisticRegression()),
])
char_params = {
    "vec__ngram_range": [(2, 2), (2, 3), (2, 4), (2, 5), (3, 3), (3, 5)],
    "clf__C": [0.1, 1.0, 10.0],
    "clf__solver": ["liblinear", "lbfgs"],
}
char_model = DistGridSearchCV(char_pipe, char_params, sc=sc, cv=cv, scoring=scoring)
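# Neither search above has been fitted at this point in the excerpt. A
# plausible continuation, mirroring the holdout evaluation used in the other
# examples in this section (the weighted-F1 scoring call is an assumption):

from sklearn.metrics import f1_score

word_model.fit(X_train, y_train)
char_model.fit(X_train, y_train)

# compare holdout performance of word vs character n-grams
for name, m in [("word", word_model), ("char", char_model)]:
    score = f1_score(y_test, m.predict(X_test), average="weighted")
    print("{0} holdout F1: {1}".format(name, score))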
X = digits["data"]
y = digits["target"]

# create a classifier: a support vector classifier
classifier = svm.SVC()
param_grid = {
    "C": [0.001, 0.01, 0.1, 1.0, 10.0, 20.0, 50.0],
    "gamma": ["scale", "auto", 0.001, 0.01, 0.1],
    "kernel": ["rbf", "poly", "sigmoid"],
}
scoring = "f1_weighted"
cv = 10

# hyperparameter optimization: single-node scikit-learn vs distributed sk-dist
model_DIST = DistGridSearchCV(
    classifier, param_grid, sc=sc, cv=cv, scoring=scoring, verbose=True
)
model = GridSearchCV(
    classifier, param_grid, cv=cv, scoring=scoring, verbose=True
)

# time the scikit-learn fit
start = time.time()
model.fit(X, y)
print("Train time for scikit-learn: {0}".format(time.time() - start))
print("Best score: {0}".format(model.best_score_))

# time the sk-dist fit with its own timer
start = time.time()
model_DIST.fit(X, y)
print("Train time for sk-dist: {0}".format(time.time() - start))
print("Best score: {0}".format(model_DIST.best_score_))
# define encoder config
encoder_config = {
    "text_col": "string_vectorizer",
    "categorical_str_col": "onehotencoder",
    "categorical_int_col": "onehotencoder",
    "numeric_col": "numeric",
    "dict_col": "dict",
    "multilabel_col": "multihotencoder",
}

# variables
Cs = [0.1, 1.0, 10.0]
cv = 5
scoring = "f1_weighted"
solver = "liblinear"

# instantiate encoder with encoder_config, fit/transform on data
encoder = Encoderizer(size="small", config=encoder_config)
df_transformed = encoder.fit_transform(df)
print([i[0] for i in encoder.transformer_list])

# define and fit model
model = DistGridSearchCV(
    LogisticRegression(solver=solver, multi_class="auto"),
    dict(C=Cs), sc, scoring=scoring, cv=cv
)
model.fit(df_transformed, df["target"])
print(model.best_score_)
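# The fitted search behaves like any sklearn estimator on the driver; the
# Encoderizer is the only custom step. A short follow-up sketch, assuming
# Encoderizer supports the usual transform() on unseen rows (its
# transformer_list attribute above suggests a FeatureUnion-style interface):

df_new = df.sample(5, random_state=0)
print(model.predict(encoder.transform(df_new)))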
y = dataset["target"]

# instantiate a pipeline and grid
pipe = Pipeline(steps=[
    ("vec", TfidfVectorizer(decode_error="ignore", analyzer="word")),
    ("svd", TruncatedSVD()),
    ("clf", LogisticRegression(solver="liblinear", multi_class="auto")),
])
params = {
    "clf__C": [0.1, 1.0, 10.0],
    "vec__ngram_range": [(1, 1), (1, 2)],
    "svd__n_components": [50, 100],
}

# fit and select hyperparameters with skdist
model0 = DistGridSearchCV(pipe, params, sc, scoring=scoring, cv=cv)
model0.fit(X, y)
print("A Pipeline used as the base estimator for DistGridSearchCV: {0}".format(
    model0.best_score_))

# assemble a pipeline with skdist distributed
# grid search as the final estimator step
model1 = Pipeline(steps=[
    ("vec", TfidfVectorizer(decode_error="ignore", analyzer="word")),
    ("svd", TruncatedSVD(n_components=50)),
    ("clf", DistGridSearchCV(
        LogisticRegression(solver="liblinear", multi_class="auto"),
        {"C": [0.1, 1.0, 10.0]}, sc,
        scoring=scoring, cv=cv,
    )),
])
model1.fit(X, y)
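# With the grid search embedded as the final step, model1 can be used like any
# other fitted Pipeline. A brief usage sketch; reaching the nested search's
# best_score_ through named_steps is an assumption about the attribute layout:

inner = model1.named_steps["clf"]
print("Nested DistGridSearchCV best score: {0}".format(inner.best_score_))
print(model1.predict(X[:5]))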
""" =================================================================================== Train distributed CV search with a logistic regression on the breast cancer dataset =================================================================================== In this example we optimize hyperparameters (C) for a logistic regression on the breast cancer dataset. This is a binary target. We use both grid search and randomized search. Here the core difference between skdist and sklearn is to use the sparkContext variable as an argument to the grid search and randomized search class instantiation. Under the hood, skdist will then broadcast the training data out to the executors for each param set, fit the estimator for each param set, return the cross validation score to the driver for each fit, and finally refit the model with the best param set back on the driver. The final estimators are then nearly identical to a fitted sklearn GridSearchCV or RandomizedSearchCV estimator as shown by looking at some of their methods and attributes. Finally, all spark objects are removed from the fitted skdist estimator objects so that these objects are pickle-able as shown. Here is a sample output run: -- Grid Search -- Best Score: 0.9925297825837328 Best C: 1.0 param_C mean_test_score 0 0.001 0.973818
# create dataset
X, y = make_classification(
    n_samples=100000,
    n_features=40,
    n_informative=36,
    n_redundant=1,
    n_repeated=1,
    n_classes=40,
    n_clusters_per_class=1,
    random_state=5,
)

# one nested example
model = DistGridSearchCV(
    DistOneVsRestClassifier(LogisticRegression(solver="liblinear"), sc=sc),
    {"estimator__C": params},
    cv=cv,
    scoring=scoring,
)
model.fit(X, y)
print(pd.DataFrame(model.cv_results_)[["mean_test_score", "param_estimator__C"]])

# another nested example
model = DistGridSearchCV(
    DistOneVsOneClassifier(LogisticRegression(solver="liblinear"), sc=sc),
    {"estimator__C": params},
    cv=cv,
    scoring=scoring,
)
model.fit(X, y)
print(pd.DataFrame(model.cv_results_)[["mean_test_score", "param_estimator__C"]])
""" ==================================================================================== Distribute hyperparameter tuning with gradient boosting trees via DistGridSearchCV ==================================================================================== In this example we train a classifier and regression with XGBoost by distributing the hyperparameter tuning through DistGridSearchCV. This should work right out of the box with XGBoost's sklearn wrapper. Given the sequential nature of training estimators on gradient boosting trees, it makes sense to distribute the hyperparameters and cross validation folds, rather than trying to train multiple estimators in parallel. Skdist excels in this functionality by leveraging DistGridSearchCV. In this example, we are able to train 54 unique sets of hyperparameters in parallel and return the the best model to the driver. NOTE: This example uses xgboost==0.90 Here is a sample output run: -- Grid Search -- Best Score: 0.9936882800963308 Best colsample_bytree: 1.0 Best learning_rate: 0.05 Best max_depth: 4 Best n_estimators: 300 DistGridSearchCV(cv=5, error_score='raise-deprecating', estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,