def test_search(spark_session):
    """Distributed grid search over LogisticRegression on the breast
    cancer dataset (binary target); checks prediction shape only."""
    sc = spark_session.sparkContext

    # hyperparameter search space and cross-validation settings
    reg_strengths = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
    n_folds = 5
    holdout_frac = 0.2
    metric = "roc_auc"
    lr_solver = "liblinear"

    # load sample data (binary target) and carve out a holdout set
    data = load_breast_cancer()
    X_train, X_test, y_train, y_test = train_test_split(
        data["data"], data["target"],
        test_size=holdout_frac, random_state=10)

    # distributed grid search, fit across the spark cluster
    model = DistGridSearchCV(
        LogisticRegression(solver=lr_solver),
        dict(C=reg_strengths), sc, cv=n_folds, scoring=metric)
    model.fit(X_train, y_train)

    # predictions happen locally on the driver
    preds = model.predict(X_test)
    assert preds.shape == y_test.shape
def main():
    """Grid-search an XGBoost classifier on the iris dataset with spark
    and print the best score and winning hyperparameters."""
    n_folds = 5
    clf_scoring = "accuracy"
    reg_scoring = "neg_mean_squared_error"

    # load sample data
    data = load_iris()
    X = data["data"]
    y = data["target"]

    # candidate hyperparameters for the boosted trees
    grid = {
        "learning_rate": [.05, .01],
        "max_depth": [4, 6, 8],
        "colsample_bytree": [.6, .8, 1.0],
        "n_estimators": [100, 200, 300],
    }

    # NOTE(review): relies on a module-level `spark` session — defined
    # outside this function
    model = DistGridSearchCV(
        XGBClassifier(), grid, spark.sparkContext,
        cv=n_folds, scoring=clf_scoring)
    model.fit(X, y)

    # predictions on the driver
    preds = model.predict(X)
    probs = model.predict_proba(X)

    # report the winning configuration
    print("-- Grid Search --")
    print("Best Score: {0}".format(model.best_score_))
    print("Best colsample_bytree: {0}".format(
        model.best_estimator_.colsample_bytree))
    print("Best learning_rate: {0}".format(
        model.best_estimator_.learning_rate))
    print("Best max_depth: {0}".format(model.best_estimator_.max_depth))
    print("Best n_estimators: {0}".format(model.best_estimator_.n_estimators))
def test_gs():
    """DistGridSearchCV with no SparkContext should still fit and predict
    a trivially separable dataset correctly."""
    features = np.array([[1, 1, 1], [0, 0, 0], [-1, -1, -1]] * 100)
    labels = np.array([0, 0, 1] * 100)
    searcher = DistGridSearchCV(
        LogisticRegression(solver="liblinear"), {"C": [0.1, 1.0]}, cv=3)
    searcher.fit(features, labels)
    # first three rows are one copy of each training pattern
    assert np.allclose(searcher.predict(features[:3]), np.array([0, 0, 1]))
# instantiate spark session
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# the digits dataset
digits = datasets.load_digits()
X = digits["data"]
y = digits["target"]

# create a classifier: a support vector classifier
classifier = svm.SVC(gamma="scale")
# FIX: the grid previously listed 0.01 twice for "C"; 0.001 restores the
# intended five distinct values (5 C x 5 gamma x 3 kernels x 10 folds = 750
# fits) instead of wasting 150 fits on a duplicate setting
param_grid = {
    "C": [0.001, 0.01, 0.1, 1.0, 10.0],
    "gamma": ["scale", "auto", 0.001, 0.01, 0.1],
    "kernel": ["rbf", "poly", "sigmoid"],
}
scoring = "f1_weighted"
cv = 10

# hyperparameter optimization
# total fits: 750
start = time.time()
model = DistGridSearchCV(classifier, param_grid, sc=sc, cv=cv, scoring=scoring)
model.fit(X, y)
print("Train time: {0}".format(time.time() - start))
print("Best score: {0}".format(model.best_score_))

# rank configurations by mean cross-validated score, best first
results = pd.DataFrame(model.cv_results_).sort_values(
    "mean_test_score", ascending=False)
print("-- CV Results --")
print(results[["param_C", "param_kernel", "mean_test_score"]].head(10))
# truncate both the frame and the labels to the same row limit
df = df[:limit]
dataset["target"] = dataset["target"][:limit]

# fit a small encoder
encoder = Encoderizer(size="small")
X_t = encoder.fit_transform(df)

# grid-searched logistic regression, distributed over spark
lr = DistGridSearchCV(
    LogisticRegression(solver="liblinear"),
    dict(C=[0.1, 1.0, 10.0]),
    sc,
    scoring=scoring,
    cv=cv,
)
lr.fit(X_t, dataset["target"])

# grid-searched random forest, distributed over spark
rf = DistGridSearchCV(
    RandomForestClassifier(n_estimators=10),
    dict(max_depth=[5, 10]),
    sc,
    scoring=scoring,
    cv=cv,
)
rf.fit(X_t, dataset["target"])

# hard-voting ensemble over the two fitted searches
# NOTE(review): `model.classes_` references a name not defined in this
# fragment — presumably a model fitted earlier in the file; verify upstream
voter = SimpleVoter(
    [("lr", lr), ("rf", rf)], classes=model.classes_, voting="hard")
# NOTE(review): this chunk begins mid-statement — these keywords close a
# train/test split call whose opening is outside this view
test_size=0.2, random_state=4)

# initial scaling: fit the scaler on train only, then apply to both splits
scaler = StandardScaler()
X_train_t = scaler.fit_transform(X_train)
X_test_t = scaler.transform(X_test)

# sk-dist logistic regression w/ grid search
start = time.time()
lr = LogisticRegression(solver="lbfgs", multi_class="auto")
model = DistGridSearchCV(lr, {"C": [10.0, 1.0, 0.1, 0.01]}, sc=sc, cv=5, scoring="f1_weighted")
model.fit(X_train_t, y_train)
print("-- sk-dist LR --")
print("Train Time: {0}".format(time.time() - start))
print("Best Model CV Score: {0}".format(model.best_score_))
# holdout score uses the CV-selected best estimator
print("Holdout F1: {0}".format(
    f1_score(y_test, model.predict(X_test_t), average="weighted")))

# sk-dist random forest (no grid search here — single fit, trees
# distributed across the cluster via sc)
start = time.time()
rf = DistRandomForestClassifier(n_estimators=100, max_depth=None, sc=sc)
rf.fit(X_train_t, y_train)
print("-- sk-dist RF --")
print("Train Time: {0}".format(time.time() - start))
print("Holdout F1: {0}".format(
    f1_score(y_test, rf.predict(X_test_t), average="weighted")))
# NOTE(review): this chunk begins mid-statement — `scoring=scoring)` closes
# a call whose opening lines are outside this view
scoring=scoring)

# define word/character vector -> feature selection -> tree ensemble
both_model = Pipeline(steps=[(
    "vec",
    FeatureUnion([(
        "word",
        CountVectorizer(analyzer="word", decode_error="ignore")
    ), ("char", CountVectorizer(analyzer="char_wb", decode_error="ignore"))])
), (
    "select",
    # keep the 1000 features with the strongest ANOVA F-scores
    SelectKBest(f_classif, 1000)
), ("clf", DistExtraTreesClassifier(n_estimators=1000, max_depth=None, sc=sc))])

# fit all models, timing each fit individually plus the overall wall time
start = time.time()
word_model.fit(X_train, y_train)
print("Word Model Fit Time: {0}".format(time.time() - start))
start1 = time.time()
char_model.fit(X_train, y_train)
print("Char Model Fit Time: {0}".format(time.time() - start1))
start2 = time.time()
both_model.fit(X_train, y_train)
print("Tree Model Fit Time: {0}".format(time.time() - start2))
print("Total Fit Time: {0}".format(time.time() - start))

# construct voter over the three fitted models
# NOTE(review): the remaining SimpleVoter arguments continue past this view
model = SimpleVoter([("word", word_model), ("char", char_model),
                     ("both", both_model)], classes=word_model.classes_,
# define encoder config: map each raw column to the Encoderizer
# feature type used to encode it
encoder_config = {
    "text_col": "string_vectorizer",
    "categorical_str_col": "onehotencoder",
    "categorical_int_col": "onehotencoder",
    "numeric_col": "numeric",
    "dict_col": "dict",
    "multilabel_col": "multihotencoder",
}

# search-space and cross-validation settings
Cs = [0.1, 1.0, 10.0]
cv = 5
scoring = "f1_weighted"
solver = "liblinear"

# instantiate encoder with encoder_config, fit/transform on data
encoder = Encoderizer(size="small", config=encoder_config)
df_transformed = encoder.fit_transform(df)
# show the names of the transformers the encoder assembled
print([pair[0] for pair in encoder.transformer_list])

# define and fit model
model = DistGridSearchCV(
    LogisticRegression(solver=solver, multi_class="auto"),
    dict(C=Cs),
    sc,
    scoring=scoring,
    cv=cv,
)
model.fit(df_transformed, df["target"])
print(model.best_score_)
# instantiate a pipeline and grid pipe = Pipeline(steps=[ ("vec", TfidfVectorizer(decode_error="ignore", analyzer="word")), ("svd", TruncatedSVD()), ("clf", LogisticRegression(solver="liblinear", multi_class="auto")), ]) params = { "clf__C": [0.1, 1.0, 10.0], "vec__ngram_range": [(1, 1), (1, 2)], "svd__n_components": [50, 100], } # fit and select hyperparameters with skdist model0 = DistGridSearchCV(pipe, params, sc, scoring=scoring, cv=cv) model0.fit(X, y) print("A Pipeline used as the base estimator for DistGridSearchCV: {0}".format( model0.best_score_)) # assemble a pipeline with skdist distributed # grid search as the final estimator step model1 = Pipeline(steps=[ ("vec", TfidfVectorizer(decode_error="ignore", analyzer="word")), ("svd", TruncatedSVD(n_components=50)), ( "clf", DistGridSearchCV( LogisticRegression(solver="liblinear", multi_class="auto"), {"C": [0.1, 1.0, 10.0]}, sc, scoring=scoring,
solver = "liblinear"

# convert training data to pandas, keeping the first 1000 rows of both
# the text and the labels
df = pd.DataFrame({"text": dataset["data"]})
df = df[:1000]
dataset["target"] = dataset["target"][:1000]

# fit encoders of increasing size and train a grid-searched classifier
# on each transformed feature set
for size in ("small", "medium"):
    encoder = Encoderizer(size=size)
    X_t = encoder.fit_transform(df)
    model = DistGridSearchCV(
        LogisticRegression(solver=solver, multi_class="auto"),
        dict(C=Cs),
        sc,
        scoring=scoring,
        cv=cv,
    )
    model.fit(X_t, dataset["target"])
    print(model.best_score_)

# fit a large encoder and train classifier
encoder = Encoderizer(size="large")