def gridsearch(x, y, cv):
    scoring = {
        'auc_score': 'roc_auc',
        'accuracy': 'accuracy',
        'scores_p_1': 'precision',
        'scores_r_1': 'recall',
        'scores_f_1_1': 'f1',
        'scores_p_0': make_scorer(precision_0),
        'scores_r_0': make_scorer(recall_0),
        'scores_f_1_0': make_scorer(f1_0),
        'mcc': make_scorer(matthews_corrcoef),
        'precision_micro': 'precision_micro',
        'precision_macro': 'precision_macro',
        'recall_macro': 'recall_macro',
        'recall_micro': 'recall_micro',
        'f1_macro': 'f1_macro',
        'f1_micro': 'f1_micro'
    }
    grid_search = GridSearchCV(
        SVC(kernel='rbf', probability=True),
        param_grid={
            'C': [1000, 500, 250, 100, 50, 25, 1, 0.1, 0.01, 0.001, 0.0001],
            'gamma': [100, 10, 1, 0.1, 0.01, 0.001, 0.0001]
        },
        scoring=scoring,
        cv=cv,
        n_jobs=40,
        refit='auc_score',
        verbose=2)
    grid_search.fit(x, y)
    return grid_search
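# Note (added sketch, not part of the original snippets): the grid searches above and
# below reference precision_0 / recall_0 / f1_0 helpers whose definitions are not shown.
# A minimal assumed implementation scores the negative (label 0) class by passing
# pos_label=0 to the standard sklearn metrics:
from sklearn.metrics import precision_score, recall_score, f1_score


def precision_0(y_true, y_pred):
    # precision of the negative (label 0) class
    return precision_score(y_true, y_pred, pos_label=0)


def recall_0(y_true, y_pred):
    # recall of the negative (label 0) class
    return recall_score(y_true, y_pred, pos_label=0)


def f1_0(y_true, y_pred):
    # F1 of the negative (label 0) class
    return f1_score(y_true, y_pred, pos_label=0)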
def gridsearch_linear(x, y, cv):
    # 1000, 500, 200, 100, 50, 20, 10, 2, 1, 0.2, 0.5, 0.01, 0.02, 0.05, 0.001
    '''
    param_grid={'C': [1000, 500, 200, 100, 50, 20, 10, 2, 1, 0.2, 0.5, 0.01, 0.02, 0.05, 0.001],
                'gamma': [1000, 500, 200, 100, 50, 20, 10, 5, 2, 1, 0.2, 0.5, 0.01, 0.02, 0.05, 0.001, 0.0001]},
    '''
    scoring = {
        'auc_score': 'roc_auc',
        'accuracy': 'accuracy',
        'scores_p_1': 'precision',
        'scores_r_1': 'recall',
        'scores_f_1_1': 'f1',
        'scores_p_0': make_scorer(precision_0),
        'scores_r_0': make_scorer(recall_0),
        'scores_f_1_0': make_scorer(f1_0),
        'mcc': make_scorer(matthews_corrcoef),
        'precision_micro': 'precision_micro',
        'precision_macro': 'precision_macro',
        'recall_macro': 'recall_macro',
        'recall_micro': 'recall_micro',
        'f1_macro': 'f1_macro',
        'f1_micro': 'f1_micro'
    }
    grid_search = GridSearchCV(
        LinearSVC(max_iter=1000),
        param_grid={
            'penalty': ['l2'],
            'C': [1000, 500, 200, 100, 50, 20, 10, 2, 1,
                  0.2, 0.5, 0.01, 0.02, 0.05, 0.001]
        },
        scoring=scoring,
        cv=cv,
        n_jobs=40,
        refit='auc_score',
        verbose=2)
    '''
    grid_search = GridSearchCV(LinearSVC(max_iter=1000),
                               param_grid={'penalty': ['l2'],
                                           'C': [1000, 500, 200, 100, 50, 20, 10, 2, 1,
                                                 0.2, 0.5, 0.01, 0.02, 0.05, 0.001]},
                               scoring={'accuracy', 'roc_auc'}, cv=cv, n_jobs=-1, refit='accuracy')
    grid_search = GridSearchCV(SVC(kernel='rbf', cache_size=2000, probability=True),
                               param_grid={'C': [10000, 5000, 1], 'gamma': ['scale']},
                               scoring={'accuracy', 'roc_auc'}, cv=cv, n_jobs=-1, refit='accuracy')
    '''
    grid_search.fit(x, y)
    return grid_search
def scale_svd_rf_pipe():
    from h2o.transforms.decomposition import H2OSVD

    print("Importing USArrests.csv data...")
    arrests = h2o.upload_file(
        pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))

    # build transformation pipeline using sklearn's Pipeline and H2OSVD
    pipe = Pipeline([("standardize", H2OScaler()),
                     ("svd", H2OSVD()),
                     ("rf", H2ORandomForestEstimator())])
    params = {
        "standardize__center": [True, False],
        "standardize__scale": [True, False],
        "svd__nv": [2, 3],
        "rf__ntrees": randint(50, 60),
        "rf__max_depth": randint(4, 8),
        "rf__min_rows": randint(5, 10),
        "svd__transform": ["none", "standardize"],
    }

    custom_cv = H2OKFold(arrests, n_folds=5, seed=42)
    random_search = RandomizedSearchCV(pipe, params,
                                       n_iter=5,
                                       scoring=make_scorer(h2o_r2_score),
                                       cv=custom_cv,
                                       random_state=42,
                                       n_jobs=1)
    random_search.fit(arrests[1:], arrests[0])
    print(random_search.best_estimator_)
def scale_pca_rf_pipe_new_import():
    from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator

    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # build transformation pipeline using sklearn's Pipeline and H2O transforms
    pipe = Pipeline([
        ("standardize", H2OScaler()),
        ("pca", H2OPrincipalComponentAnalysisEstimator().init_for_pipeline()),
        ("rf", H2ORandomForestEstimator())
    ])

    params = {"standardize__center": [True, False],          # Parameters to test
              "standardize__scale": [True, False],
              "pca__k": randint(2, iris[1:].shape[1]),
              "rf__ntrees": randint(50, 60),
              "rf__max_depth": randint(4, 8),
              "rf__min_rows": randint(5, 10),
              "pca__transform": ["none", "standardize"],
              }

    custom_cv = H2OKFold(iris, n_folds=5, seed=42)
    random_search = RandomizedSearchCV(pipe, params,
                                       n_iter=5,
                                       scoring=make_scorer(h2o_r2_score),
                                       cv=custom_cv,
                                       random_state=42,
                                       n_jobs=1)
    random_search.fit(iris[1:], iris[0])
    print(random_search.best_estimator_)
def selectTasks():
    while True:
        print "\nSelect the Model for classification:"
        print "Enter 1 : Logistic Regression"
        print "Enter 2 : Naive Bayes"
        print "Enter 3 : Support Vector Machine Model using SKlearn library"
        print "Enter 4 : Random Forest Model using SKlearn library"
        print "Enter 5 : To exit!!!!"
        options = {
            1: ModelLogisticRegression,
            2: NaiveBayes,
            3: svm.SVC,
            4: RandomForestClassifier
        }
        print "Enter Your Choice >>> "
        x = input()
        if x == 5:
            break
        elif x == 4:
            print "Classification on Random Forest Model using SKLearn Library"
            runModel(options[x](n_jobs=2, random_state=0))
        elif x == 3:
            print "Classification on Support Vector Machine Model using SKLearn Library"
            parameters = {
                'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10],
                'kernel': ['linear'],
                'random_state': [1]
            }
            svc = svm.SVC(kernel='linear', probability=True, random_state=0)
            roc_auc_scorer = make_scorer(roc_auc_score)
            modelObj = GridSearchCV(svc, parameters, scoring=roc_auc_scorer)
            runModel(modelObj)
        else:
            print "Classification on " + MODEL[x]
            runModel(options[x](PARAMS[x]))
def make_scorer_for_search(self, kwargs):
    if self.check_should_run_search():
        custom_scorer = make_scorer(
            self.search_scoring.score,
            greater_is_better=self.search_scoring.greater_is_better,
            needs_proba=self.search_scoring.needs_proba,
            needs_threshold=False,
            **kwargs)
    else:
        custom_scorer = make_scorer(
            self.scorer.score,
            greater_is_better=self.scorer.greater_is_better,
            needs_proba=self.scorer.needs_proba,
            needs_threshold=False,
            **kwargs)
    return custom_scorer
def report(self, pipeline: AbstractPipeline):
    report_dict = defaultdict(list)
    for model in pipeline.get_models():
        model_name = model.short_name
        custom_scorer = make_scorer(
            model.scorer.score,
            greater_is_better=model.scorer.greater_is_better,
            needs_proba=model.scorer.needs_proba,
            needs_threshold=False)
        try:
            cv = cross_val_score(model.best_model,
                                 pipeline.train,
                                 pipeline.train_y,
                                 verbose=self.verbose,
                                 scoring=custom_scorer,
                                 cv=self.cv_num,
                                 n_jobs=-1)
            report_dict['model_name'].append(model_name)
            report_dict['cross_val_score'].append(cv)
        except Exception:
            print('Cross Val Failed: ' + model_name)

    report_df = pd.DataFrame(report_dict)
    folder = Configuration.get_cache_subfolder()
    path = pkg_resources.resource_filename(
        'crcdal', 'cache/' + folder + '/' + self.sub_folder + '/')
    pkg_resources.ensure_directory(path)
    report_df.to_csv(path + pipeline.dataset_tag + '_model_cross_val_report.csv')
def steam_learning_forest(data, NUM_FOLDS):
    """
    Trains a random forest model using the given data.
    Uses K-Fold validation with NUM_FOLDS folds.
    A string describing the results is returned.
    Takes roughly 8 minutes to run.

    Number of trees was measured for time efficiency after the rate of decrease
    in the error diminished. At ~200, this peaks. If we choose arbitrarily larger,
    1500 trees, we only achieve a decrease in the thousandths.
    """
    trees = 200
    forest_train = data[["positive_ratings_", "negative_ratings_", "owners_",
                         "average_playtime_", "median_playtime_"]]
    forest_label = data[["price_"]]
    kfold = KFold(n_splits=NUM_FOLDS, random_state=None, shuffle=True)
    forest_regressor = RandomForestRegressor(n_estimators=trees, random_state=0)
    mse_scorer = make_scorer(mean_squared_error)
    results = cross_val_score(forest_regressor, forest_train,
                              forest_label.values.ravel(),
                              scoring=mse_scorer, cv=kfold)
    print(f"Random Forest - MSE Array: {results}")
    mean_overall = np.mean(results)
    final_results = f"Random Forest - Mean MSE over {NUM_FOLDS} folds: {mean_overall}"
    print(final_results)
    return final_results
def categorize(clf, target, data, njobs=6):
    """
    Expects a pandas series and a pandas data frame.
    Both need to be indexed with the same index.
    """
    from imblearn.pipeline import Pipeline
    from sklearn.metrics.scorer import make_scorer
    from sklearn.metrics import recall_score, precision_score
    from sklearn.utils.multiclass import type_of_target

    # Determine prediction target:
    y_type = type_of_target(target)
    if y_type == "multiclass":
        metrics = {"roc_auc": make_scorer(multiclass_roc, average="weighted")}
    else:
        metrics = ["roc_auc"]

    score = cross_validate(
        clf,
        data,
        target,
        cv=10,
        scoring=metrics,
        return_train_score=False,
        n_jobs=njobs,
    )
    del score["fit_time"]
    del score["score_time"]
    score = {k: np.mean(v) for k, v in list(score.items())}
    print(score)
    return score
def fit_model(X_train, y_train):
    # Create a decision tree regression model
    from sklearn.tree import DecisionTreeRegressor
    model = DecisionTreeRegressor()

    from sklearn.cross_validation import KFold
    from sklearn.metrics import make_scorer
    from sklearn import grid_search
    from sklearn import metrics

    cross_validator = KFold(5)
    param_grid = {"max_depth": [4, 5, 6, 7],
                  # "min_samples_split": [30, 20, 40],
                  # "min_samples_leaf": [10, 20, 30]
                  }

    from sklearn.metrics import r2_score

    def performance_metric(y_test, y_pred):
        score = r2_score(y_test, y_pred)
        return score

    scoring_fnc = make_scorer(performance_metric)
    model = grid_search.GridSearchCV(estimator=model,
                                     param_grid=param_grid,
                                     n_jobs=1,
                                     cv=cross_validator,
                                     scoring=scoring_fnc,
                                     verbose=10)
    model.fit(X_train, y_train)
    print(model.best_estimator_)
    print(model.grid_scores_)
    print(model.best_params_)
    print(model.best_score_)
    return model.best_estimator_
def train_7(X_train_dev, y_train_dev, dev_size=0.1, n_folds=10):
    print('Model 7 - (MinMaxScaler) + RandomForestRegressor on log1p(n_clicks) with GridSearchCV')

    num_transformer = Pipeline([('normalizer', MinMaxScaler(feature_range=(0, 1)))])
    regressor = RandomForestRegressor(random_state=0)
    model = Pipeline([('preprocessor', num_transformer),
                      ('regressor', regressor)])

    wmse_scorer = make_scorer(evaluate.wmse_log, greater_is_better=False)
    y_train_dev = np.log1p(y_train_dev)

    param_grid = {
        'regressor__n_estimators': [50, 100, 200],
        'regressor__max_features': [None, 1 / 3]
    }
    grid = GridSearchCV(model,
                        param_grid,
                        cv=10,
                        scoring=wmse_scorer,
                        iid=False,
                        return_train_score=True,
                        error_score=np.nan,
                        n_jobs=-1,
                        verbose=5)
    grid.fit(X_train_dev, y_train_dev)

    print('Best Parameters:', grid.best_params_)
    print('Validation WMSE:', round(-grid.best_score_, 5))

    best_model = clone(grid.best_estimator_)
    best_model.fit(X_train_dev, y_train_dev)
    return best_model
def compute_SVR(train_x, train_y, test_x):
    # make MAE scoring
    MAE = make_scorer(compute_error, greater_is_better=False)

    # ######## SVR - Polynomial/rbf Kernel ########
    # make pipeline
    std_SVR = make_pipeline(StandardScaler(), SVR())
    params = {'svr__kernel': ['poly', 'rbf'], 'svr__degree': [1, 2]}
    gs = GridSearchCV(estimator=std_SVR, param_grid=params, scoring=MAE,
                      n_jobs=-1, cv=5, return_train_score=True)

    # fit grid search
    gs.fit(train_x, train_y)
    print('SVR train score', -gs.cv_results_['mean_train_score'])
    print('SVR test score', -gs.cv_results_['mean_test_score'])
    print('Best Parameter', gs.best_params_)
    print('Best score', -gs.best_score_)
    print('Parameters', gs.cv_results_['params'])

    # Train the best Model
    best_SVR = make_pipeline(StandardScaler(), SVR(kernel='poly', degree=1))
    best_SVR.fit(train_x, train_y)

    # Make Prediction
    test_y = best_SVR.predict(test_x)

    # Create test output values
    predicted_y = test_y * -1

    # Output file location
    file_name = '../Predictions/SVR_best.csv'
    # Writing output in Kaggle format
    print('Writing output to ', file_name)
    kaggle.kaggleize(predicted_y, file_name)
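# Added sketch (assumption): compute_error is not defined in the snippet above. Given
# the "make MAE scoring" comment and the greater_is_better=False wrapping, a plausible
# implementation is the mean absolute error:
from sklearn.metrics import mean_absolute_error


def compute_error(y_true, y_pred):
    # mean absolute error; GridSearchCV minimizes it because the scorer is built
    # with greater_is_better=False
    return mean_absolute_error(y_true, y_pred)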
def OptimizeClassifier(data, target, clf, grid,
                       scores={'f1': make_scorer(f1)}, cv=10, refit='f1'):
    data_train, data_test, target_train, target_test = train_test_split(
        data, target, test_size=0.3)

    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()

        clf = GridSearchCV(clf, grid, cv=cv, scoring=scores, refit=refit)
        clf.fit(data_train, target_train)

        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_f1']
        stds = clf.cv_results_['std_test_f1']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.5f (+/-%0.03f) for %r" % (mean, std * 2, params))
        print()

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        y_true, y_pred = target_test, clf.predict(data_test)
        print(classification_report(y_true, y_pred))
        print()
def check_cv():
    df_cv = filter_one_vs_one(df_train, 1, 5)
    model = SVC(kernel="poly", degree=2, gamma=1, coef0=1)
    params = {"C": [.0001, .001, .01, .1, 1]}
    scorer = make_scorer(calc_error, greater_is_better=False)
    scores = {}
    scores_list = {}
    for i in range(100):
        cv = KFold(n_splits=10, shuffle=True, random_state=i)
        gs = GridSearchCV(model, params, scoring=scorer, cv=cv)
        gs.fit(df_cv[predictors], df_cv.digit)
        best_param = gs.best_params_["C"]
        best_score = gs.best_score_
        if best_param not in scores:
            scores[best_param] = 1
        else:
            scores[best_param] += 1
        # record every best score observed for this C value
        if best_param not in scores_list:
            scores_list[best_param] = []
        scores_list[best_param].append(best_score)
    print("Score count:")
    for k, v in scores.items():
        print("C={}: {} with average score {:.3f}".format(
            k, v, pd.np.abs(pd.np.mean(scores_list[k]))))
def reducer_creation(df_final, target_column, reducer, dataset):
    X = df_final.loc[:, df_final.columns != target_column]
    Y = df_final.loc[:, df_final.columns == target_column]
    my_scorer = make_scorer(cluster_acc, greater_is_better=True)
    km = kmeans(random_state=5)
    gmm = GMM(random_state=5)
    components = np.linspace(2, len(X.columns) - 1, 5, dtype=np.int64, endpoint=True)

    estimators = [('reduce_dim', reducer), ('clf', km)]
    param_grid = [
        dict(reduce_dim__n_components=components, clf__n_clusters=components)
    ]
    pipe = Pipeline(estimators)
    grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring=my_scorer)
    grid_search.fit(X, Y)
    mean_scores = np.array(grid_search.cv_results_['mean_test_score']).reshape(
        len(components), -1, len(components))
    plotReducerAndCluster(mean_scores, components)

    estimators = [('reduce_dim', reducer), ('clf', gmm)]
    param_grid = [
        dict(reduce_dim__n_components=components, clf__n_components=components)
    ]
    pipe = Pipeline(estimators)
    grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring=my_scorer)
    grid_search.fit(X, Y)
    mean_scores = np.array(grid_search.cv_results_['mean_test_score']).reshape(
        len(components), -1, len(components))
    plotReducerAndCluster(mean_scores, components)
def runTPOT(X, y, metric, algo):
    aml_config_dict = aml_config()
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                         train_size=0.75,
                                                         test_size=0.25)
    if algo == "Classifier":
        pipeline_optimizer = TPOTClassifier(generations=1, population_size=5,
                                            verbosity=2, warm_start=True)
        pipeline_optimizer.fit(X_train, y_train)
        print(pipeline_optimizer.score(X_test, y_test))
    elif algo == 'Regressor':
        def aml_reg_scorer(y_true, y_pred):
            # root mean squared error (RMSE)
            rmse = sqrt(mean_squared_error(y_true, y_pred))
            return rmse

        aml_custom_scorer = make_scorer(aml_reg_scorer, greater_is_better=False)
        pipeline_optimizer = TPOTRegressor(generations=1, population_size=5,
                                           verbosity=2, warm_start=True,
                                           scoring=aml_custom_scorer)
        pipeline_optimizer.fit(X_train, y_train)
        print(pipeline_optimizer.score(X_test, y_test))
    else:
        raise Exception('Incorrect Problem Type')
    return (pipeline_optimizer,
            pipeline_optimizer.score(X_test, y_test),
            len(pipeline_optimizer.evaluated_individuals_))
def __init__(self, algorithm, params=None):
    '''
    Initialize the class with a list of possible algorithms and
    recommended hyperparameter ranges.
    '''
    if algorithm == 'etr':
        # Extra trees regressor
        from sklearn.ensemble import ExtraTreesRegressor
        self.hyper_range = {
            "max_depth": [4, 8, 12, 16, 20],
            "min_samples_split": np.arange(2, 11),
            "min_samples_leaf": np.arange(1, 11),
            "n_estimators": np.arange(10, 801, 40)
        }
        self.algorithm = ExtraTreesRegressor()
    elif algorithm == 'gbm':
        # Gradient boosting model
        from sklearn.ensemble import GradientBoostingRegressor
        self.hyper_range = {
            "max_depth": [4, 8, 12, 16, 20],
            "min_samples_split": np.arange(2, 11),
            "min_samples_leaf": np.arange(1, 11),
            "n_estimators": np.arange(10, 801, 40)
        }
        self.algorithm = GradientBoostingRegressor()
    elif algorithm == 'gam':
        # Generalized additive model
        from pygam import GAM
        self.hyper_range = {'n_splines': np.arange(5, 40)}
        self.algorithm = GAM()

    # Set scorer as R2
    self.my_scorer = make_scorer(r2_score, greater_is_better=True)
def steam_learning_boosting(data, NUM_FOLDS):
    """
    Ensemble AdaBoosting to boost over each fold.
    Uses K-Fold validation with NUM_FOLDS folds.
    A string describing the results is returned.

    Number of trees was measured for time efficiency after the rate of decrease
    in the error diminished. At ~200, this peaks. If we choose arbitrarily larger,
    1500 trees, we only achieve a decrease in the thousandths.
    Seed set for predictable results.
    """
    trees = 200
    X = data[["positive_ratings_", "negative_ratings_", "owners_",
              "average_playtime_", "median_playtime_"]]
    y = data[["price_"]]
    kfold = KFold(n_splits=NUM_FOLDS)
    model = AdaBoostRegressor(n_estimators=trees)
    mse_scorer = make_scorer(mean_squared_error)
    results = cross_val_score(model, X, y.values.ravel(),
                              scoring=mse_scorer, cv=kfold)
    print(f"Boosting - MSE Array: {results}")
    final_results = f"Boosting - Mean MSE over {NUM_FOLDS} folds: {np.mean(results)}"
    print(final_results)
    return final_results
def model(X, y, z):
    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestRegressor
    from sklearn import linear_model
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.neighbors import KNeighborsRegressor
    import xgboost as xgb
    from sklearn.model_selection import RandomizedSearchCV
    from xgboost import XGBRegressor
    from sklearn.metrics.scorer import make_scorer

    def rmse_eval(y, y0):
        error = np.sqrt(np.mean(np.power(y - y0, 2)))
        return error

    # RMSE is an error metric, so lower is better
    my_scorer = make_scorer(rmse_eval, greater_is_better=False)

    par_rf = {'n_estimators': [100, 150, 200, 300], 'max_depth': [3, 6, 9, 12]}
    par_dt = {'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [3, 6, 9, 12]}
    par_xg = {'n_estimators': [100, 150, 200, 300], 'max_depth': [3, 6, 9, 12]}

    model1 = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=42),
                                param_distributions=par_rf, cv=10, n_iter=1,
                                scoring=my_scorer)
    model2 = RandomizedSearchCV(estimator=DecisionTreeRegressor(random_state=42),
                                param_distributions=par_dt, cv=10, n_iter=1,
                                scoring=my_scorer)
    model3 = RandomizedSearchCV(estimator=xgb.XGBRegressor(random_state=42),
                                param_distributions=par_xg, cv=5, n_iter=10,
                                scoring=my_scorer)

    # Fit it to the data
    model1.fit(X, y)
    model2.fit(X, y)
    model3.fit(X, y)

    # store preds on test and train data
    preds1 = model1.predict(X)
    preds2 = model2.predict(X)
    preds3 = model3.predict(X)
    test_preds1 = model1.predict(z)
    test_preds2 = model2.predict(z)
    test_preds3 = model3.predict(z)
    print(X.values)
    print(preds1)

    # store predictions
    stacked_predictions = np.column_stack((preds1, preds2, preds3))
    stacked_test_predictions = np.column_stack((test_preds1, test_preds2, test_preds3))
    print(stacked_predictions)

    # Fit & predict with the meta model
    meta_model = linear_model.LinearRegression()
    meta_model.fit(stacked_predictions, y)
    final_predictions = np.expm1(meta_model.predict(stacked_test_predictions))

    df2 = pd.DataFrame(data=[])
    df2['true'] = np.expm1(y)
    df2['pred_rf'] = np.expm1(meta_model.predict(stacked_predictions))
    df2[['true', 'pred_rf']].plot()

    print('Train score of model1:', rmse_eval(y, preds1))
    print('Train score of model2:', rmse_eval(y, preds2))
    print('Train score of model3:', rmse_eval(y, preds3))
    print('Train score of stacked model:',
          rmse_eval(y, meta_model.predict(stacked_predictions)))
    return final_predictions
def crossValidatedScores(data, target, hlayers, clf):
    data_train, data_test, target_train, target_test = train_test_split(
        data, target, test_size=0.3)
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                        hidden_layer_sizes=hlayers, random_state=1)
    scoring = {'tp': make_scorer(metrics.tp),
               'tn': make_scorer(metrics.tn),
               'fp': make_scorer(metrics.fp),
               'fn': make_scorer(metrics.fn),
               'f1': make_scorer(metrics.f1),
               'precision': make_scorer(metrics.precision),
               'sensitivity': make_scorer(metrics.sensitivity),
               'specificity': make_scorer(metrics.specificity)}
               # 'ROC': make_scorer(roc)}
    results = cross_validate(clf.fit(data_train, target_train),
                             data_test, target_test,
                             scoring=scoring, cv=10)
    return results
def convert_sklearn_metric_function(scoring):
    if callable(scoring):
        module = getattr(scoring, '__module__', None)
        if (hasattr(module, 'startswith') and
                module.startswith('sklearn.metrics.') and
                not module.startswith('sklearn.metrics.scorer') and
                not module.startswith('sklearn.metrics.tests')):
            return make_scorer(scoring)
    return scoring
def train_9(X_train_dev, y_train_dev, dev_size=0.1, n_folds=10):
    print('Model 9 - (MinMaxScaler) + (OneHotEncoder), XGBRegressor on log1p(n_clicks) with GridSearchCV')

    cat_transformer = Pipeline([('encoder',
                                 OneHotEncoder(categories='auto',
                                               handle_unknown='ignore'))])
    num_transformer = Pipeline([('normalizer', MinMaxScaler(feature_range=(0, 1)))])
    no_transformer = Pipeline([('transformer', DummyTransformer())])
    preprocessor = ColumnTransformer([('cat', cat_transformer, [4]),
                                      ('num', num_transformer, [1, 2, 5, 6, 7]),
                                      ('no', no_transformer, [0, 3, 8, 9, 10])])
    regressor = xgb.XGBRegressor()
    model = Pipeline([('preprocessor', preprocessor),
                      ('regressor', regressor)])

    wmse_scorer = make_scorer(evaluate.wmse_log, greater_is_better=False)
    y_train_dev = np.log1p(y_train_dev)

    param_grid = {
        'regressor__n_estimators': [500],
        'regressor__learning_rate': [.005, 0.01, .03, .05],
        'regressor__max_depth': [5, 7, 9, 11],
        'regressor__n_jobs': [-1],
        'regressor__random_state': [0]
    }
    cv = KFold(n_splits=n_folds, shuffle=True, random_state=0)
    grid = GridSearchCV(model,
                        param_grid,
                        cv=cv,
                        scoring=wmse_scorer,
                        iid=False,
                        return_train_score=True,
                        error_score=np.nan,
                        n_jobs=-1,
                        verbose=3)

    X_xgb_train_dev, X_xgb_test, y_xgb_train_dev, y_xgb_test = train_test_split(
        X_train_dev, y_train_dev, test_size=dev_size, shuffle=True, random_state=0)
    fit_params = {
        'regressor__early_stopping_rounds': 10,
        'regressor__sample_weight': 1 + y_xgb_train_dev,
        'regressor__sample_weight_eval_set': [1 + y_xgb_test],
        'regressor__eval_metric': wmse_log_xgb,
        'regressor__eval_set': [[X_xgb_test, y_xgb_test]]
    }
    grid.fit(X_xgb_train_dev, y_xgb_train_dev, **fit_params)

    print('Best Parameters:', grid.best_params_)
    print('Validation WMSE:', round(-grid.best_score_, 5))

    best_model = clone(grid.best_estimator_)
    best_model.fit(X_train_dev, y_train_dev)
    return best_model
def crossValidatedScores(data, target, clf, cv=3):
    scoring = {'tp': make_scorer(tp),
               'tn': make_scorer(tn),
               'fp': make_scorer(fp),
               'fn': make_scorer(fn),
               'accuracy': make_scorer(accuracy),
               'f1': make_scorer(f1),
               'precision': make_scorer(precision),
               'sensitivity': make_scorer(sensitivity),
               'specificity': make_scorer(specificity)}
               # 'ROC': make_scorer(roc)}
    results = cross_validate(clf, data, target, scoring=scoring, cv=cv,
                             return_train_score=False)
    return results
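# Added sketch (assumption, not from the original code): the tp / tn / fp / fn helpers
# wrapped with make_scorer above are plain metric functions over a binary confusion
# matrix, for example:
from sklearn.metrics import confusion_matrix


def tp(y_true, y_pred):
    # true positives: actual 1 predicted as 1
    return confusion_matrix(y_true, y_pred)[1, 1]


def tn(y_true, y_pred):
    # true negatives: actual 0 predicted as 0
    return confusion_matrix(y_true, y_pred)[0, 0]


def fp(y_true, y_pred):
    # false positives: actual 0 predicted as 1
    return confusion_matrix(y_true, y_pred)[0, 1]


def fn(y_true, y_pred):
    # false negatives: actual 1 predicted as 0
    return confusion_matrix(y_true, y_pred)[1, 0]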
def __init__(self, X, Y):
    self.X = X
    self.Y = Y
    self.scoring = {
        # 'auc_score_macro': make_scorer(self.roc_auc_macro),
        # 'auc_score_micro': make_scorer(self.roc_auc_micro),
        'accuracy': 'accuracy',
        'scores_p_1': 'precision',
        'scores_r_1': 'recall',
        'scores_f1_1': 'f1',
        'scores_f1_0': make_scorer(self.f1_0),
        'scores_p_0': make_scorer(self.precision_0),
        'scores_r_0': make_scorer(self.recall_0),
        'precision_micro': 'precision_micro',
        'precision_macro': 'precision_macro',
        'recall_macro': 'recall_macro',
        'recall_micro': 'recall_micro',
        'f1_macro': 'f1_macro',
        'f1_micro': 'f1_micro'
    }
def test_model(**params):
    model.set_params(**params)
    scores = cross_val_score(model, X_full, y_full,
                             cv=ShuffleSplit(n_splits=1, test_size=0.10,
                                             random_state=42),
                             scoring=make_scorer(r2_score))
    r2_now = np.mean(scores)
    if r2_now < 0:
        r2_now = 0
    return np.sqrt(r2_now)
def multiGridSearch(filename, classifiers, classNames, parameters, crossVal,
                    Nfrac, nTests, test_set_fraction, plotResults=False,
                    impute_scale=True, parallel=False):
    allResults = []
    best = 0
    bestEstim = None
    bestEstimName = None
    scorer = make_scorer(matthews_corrcoef)
    print("Grid search on %s fraction of dataset" % Nfrac)
    print("_" * 10)
    print("\n" * 5)
    for i in range(len(classifiers)):
        classif = classifiers[i]
        className = classNames[i]
        param = parameters[i]
        results = []
        print("Evaluating performance for classifier: %s" % className)
        for j in range(nTests):
            print("Test number %d for %s:" % (j + 1, className))
            X_train, y_train, X_test, y_test, test_id = importData_chunks(
                filename, Nfrac, test_set_fraction)
            if impute_scale:
                X_train, X_test = imputeAndScale(X_train, X_test)
            if parallel:
                clf = GridSearchCV(classif, param, scoring=scorer, cv=crossVal,
                                   verbose=1, n_jobs=-1)
                y_pred = GridSearch(clf, X_train, y_train, X_test)
                perf = evaluate(y_pred, y_test)
                results.append(perf)
            else:
                clf = GridSearchCV(classif, param, scoring=scorer, cv=crossVal,
                                   verbose=1)
                print(X_train.shape)
                print(y_train.shape)
                print(X_test.shape)
                y_pred = GridSearch(clf, X_train, y_train, X_test)
                print(y_pred.shape)
                perf = evaluate(y_pred, y_test)
                if className == "SGB":
                    print(clf.best_estimator_.feature_importances_)
                results.append(perf)
            if perf > best:
                best = perf
                bestEstim = clf
                bestEstimName = className
            del X_test
            del X_train
            del y_train
            del y_test
        allResults.append(results)
        print("_" * 10)
        print("\n" * 5)
    print("Best results overall:")
    print("Best classifier: %s" % bestEstimName)
    print("Best parameters:")
    print(bestEstim.best_params_)
    return allResults, bestEstim
def cross_val_full_scores(clf, df, y, cv=10):
    scoring = {
        'prec_macro': 'precision_macro',
        'rec_micro': make_scorer(recall_score, average='macro')
    }
    scores = cross_validate(clf, df, y, scoring=scoring, cv=cv,
                            return_train_score=True)
    return scores
def convert_sklearn_metric_function(scoring):
    """If ``scoring`` is a sklearn metric function, convert it to a
    sklearn scorer and return it. Otherwise, return ``scoring`` unchanged."""
    if callable(scoring):
        module = getattr(scoring, '__module__', None)
        if (hasattr(module, 'startswith') and
                module.startswith('sklearn.metrics.') and
                not module.startswith('sklearn.metrics.scorer') and
                not module.startswith('sklearn.metrics.tests.')):
            return make_scorer(scoring)
    return scoring
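# Added usage sketch for the helper above: a bare sklearn metric function is wrapped
# into a scorer via make_scorer, while strings (and non-sklearn callables) are
# returned unchanged.
from sklearn.metrics import accuracy_score

wrapped = convert_sklearn_metric_function(accuracy_score)  # becomes a make_scorer(...) scorer
unchanged = convert_sklearn_metric_function('accuracy')    # passed through as-is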
def test_model(**params):
    model.set_params(**params)
    scores = cross_val_score(model, X_full, y_full,
                             cv=ShuffleSplit(n_splits=1, test_size=0.1,
                                             random_state=42),
                             scoring=make_scorer(r2_score))
    r2_test = np.mean(scores)
    return r2_test
def cv(data_x, data_y, clf):
    '''
    Run K-fold cross-validation on the given model and return the results.
    :param data_x: training vectors
    :param data_y: training labels
    :param clf: classifier model
    :return: validation results: the various evaluation scores
    '''
    scoring = {
        'precision_macro': 'precision_macro',
        'recall_macro': make_scorer(metrics.recall_score, average='macro'),
        'roc_auc_macro': make_scorer(metrics.roc_auc_score, average='macro'),
        'f1_macro': make_scorer(metrics.f1_score, average="macro"),
        'accuracy': make_scorer(metrics.accuracy_score),
    }
    cv_results = cross_validate(clf, data_x, data_y,
                                scoring=scoring,
                                n_jobs=4,
                                cv=N_SPLIT,
                                return_train_score=False,
                                )
    for key in cv_results.keys():
        print(f"{key}:\t{np.mean(cv_results[key])}")
        # print("----------")
    return cv_results
def _crossValidate(self, y_train, X_train, refit=False):
    # Run the grid search
    print "Cross-validating for", self.numFolds, "folds"
    print "Args", self.classifierArgs
    cv = StratifiedKFold(y_train, n_folds=self.numFolds, shuffle=True, random_state=1)
    # self.getCV(y_train, self.meta.meta, numFolds=self.numFolds)
    # cv = BalancedIteratorCV(y_train, n_folds=self.numFolds, shuffle=True, random_state=1,
    #                         examples=[x for x in self.meta.db.query("SELECT * from example WHERE [set] == 'train';")],
    #                         groupBy="project_code")
    classifier, classifierArgs = self._getClassifier()
    metric = self.metric
    if metric == "bas":
        metric = make_scorer(balanced_accuracy_score)
    search = ExtendedGridSearchCV(classifier(), classifierArgs, refit=refit, cv=cv,
                                  scoring=metric, verbose=self.verbose,
                                  n_jobs=self.parallel,
                                  pre_dispatch=int(self.preDispatch) if self.preDispatch.isdigit() else self.preDispatch)
    search.fit(X_train, y_train)
    print "---------------------- Grid scores on development set --------------------------"
    results = []
    index = 0
    bestExtras = None
    bestScores = None
    for params, mean_score, scores in search.grid_scores_:
        print "Grid:", params
        results.append(self._getResult("train", classifier, cv, params, None, None,
                                       mean_score, scores,
                                       extra={"train_size": None, "test_size": None}))
        if bestScores == None or float(mean_score) > bestScores[1]:
            bestScores = (params, mean_score, scores)
            if hasattr(search, "extras_"):
                bestExtras = search.extras_[index]
        for fold in range(len(scores)):
            result = self._getResult("train", classifier, cv, params, scores[fold], fold)
            if hasattr(search, "extras_"):
                for key in search.extras_[index][fold].get("counts", {}).keys():
                    result[key + "_size"] = search.extras_[index][fold]["counts"][key]
            results.append(result)
        if hasattr(search, "extras_") and self.classes and len(self.classes) == 2:
            print ["%0.8f" % x for x in self._validateExtras(search.extras_[index], y_train)], "(eval:auc)"
        print scores, "(" + self.metric + ")"
        print "%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params)
        index += 1
    print "---------------------- Best scores on development set --------------------------"
    params, mean_score, scores = bestScores
    print scores
    print "%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params)
    baselines = self._calculateBaseline(cv, y_train)
    print "MCB = %0.3f (+/-%0.03f) for" % (np.mean(baselines), np.std(baselines) / 2), ["%0.3f" % x for x in baselines], "(" + self.metric + ")"
    print "--------------------------------------------------------------------------------"
    # Save the grid search results
    print "Saving results"
    self._insert("result", results)
    self._saveExtras(bestExtras, "train")
    self.meta.flush()
    return search
def scale_pca_rf_pipe():
    from h2o.transforms.preprocessing import H2OScaler
    from h2o.transforms.decomposition import H2OPCA
    from h2o.estimators.random_forest import H2ORandomForestEstimator
    from sklearn.pipeline import Pipeline
    from sklearn.grid_search import RandomizedSearchCV
    from h2o.cross_validation import H2OKFold
    from h2o.model.regression import h2o_r2_score
    from sklearn.metrics.scorer import make_scorer
    from scipy.stats import randint

    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # build transformation pipeline using sklearn's Pipeline and H2O transforms
    pipe = Pipeline([("standardize", H2OScaler()),
                     ("pca", H2OPCA(n_components=2)),
                     ("rf", H2ORandomForestEstimator(seed=42, ntrees=50))])
    params = {"standardize__center": [True, False],          # Parameters to test
              "standardize__scale": [True, False],
              "pca__n_components": randint(2, iris[1:].shape[1]),
              "rf__ntrees": randint(50, 60),
              "rf__max_depth": randint(4, 8),
              "rf__min_rows": randint(5, 10), }
    custom_cv = H2OKFold(iris, n_folds=5, seed=42)
    random_search = RandomizedSearchCV(pipe, params,
                                       n_iter=5,
                                       scoring=make_scorer(h2o_r2_score),
                                       cv=custom_cv,
                                       random_state=42,
                                       n_jobs=1)
    random_search.fit(iris[1:], iris[0])
    print random_search.best_estimator_
pearson_data(datasetName, model_stacking_models)

# print('normalizing!')
# X = normalizer.fit_transform(X)

# LABELING
# labelProp = sksemi.label_propagation.LabelSpreading(kernel='rbf', gamma=150, n_neighbors=3,
#                                                     alpha=0.15, max_iter=600000, tol=0.001)
# print('fitting label spreader')
# labelProp.fit(X, Y)
# print('predicting labels for Y')
# Y = labelProp.transduction_
# print('Shape of Y:', Y.shape)
# print('first row: ', Y[0])

# SCORER
scorer = make_scorer(score_func=singleLabelScore, greater_is_better=False)

# PREPROCESSING
# SCALING
minMaxScaler = MinMaxScaler(feature_range=(0.0, 1.0))
# normalizer = skprep.Normalizer()
columnDeleter = fs.FeatureDeleter()

# FEATURE SELECTION
varianceThresholdSelector = VarianceThreshold(threshold=(0))
percentileSelector = SelectPercentile(score_func=f_classif, percentile=20)
kBestSelector = SelectKBest(f_classif, 1000)

# FEATURE EXTRACTION
# rbmPipe = skpipe.Pipeline(steps=[('scaling', minMaxScaler), ('rbm', rbm)])
nmf = NMF(n_components=150)
## Create features and labels
my_dataset = df.T.to_dict('dict')
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)


## Define scoring method to return
## f1 when recall and precision are > 0.30
def score_func(y_true, y_pred, **kwargs):
    r = recall_score(y_true, y_pred, **kwargs)
    p = precision_score(y_true, y_pred, **kwargs)
    if r > 0.30 and p > 0.30:
        return f1_score(y_true, y_pred, **kwargs)
    else:
        return 0


scorer = make_scorer(score_func)

clf = Pipeline(steps=[
    # ('scaler', MinMaxScaler()),
    # ('features', FeatureUnion([
    #     ('ngram_tf_idf', Pipeline([
    #         ('kbest', SelectKBest(k=5, score_func=f_classif)),
    #         ('lda', LDA(n_components=1, priors=None, shrinkage=None, solver='svd',
    #                     store_covariance=False, tol=0.0001)),
    #         ('kmeans', MiniBatchKMeans(n_clusters=20, n_init=10, max_no_improvement=10, verbose=0)),
    #     ]))])),
    ('kbest', SelectKBest(k=5, score_func=f_classif)),
    ('lda', LDA(n_components=1, priors=None, shrinkage=None, solver='svd',
                store_covariance=False, tol=0.0001)),
    ('kmeans', MiniBatchKMeans(n_clusters=20, n_init=10, max_no_improvement=10, verbose=0)),
    ('classifier', GaussianNB())
])
import math


def MAPE_scorer(y, y_pred):
    error = 0
    num = len(y)
    for i in range(0, num):
        if y[i] > 0:
            error += math.fabs(y_pred[i] - y[i]) / y[i]
    # print('error, num:', error, num)
    if num > 0:
        return error / num
    else:
        return 0


my_scorer = make_scorer(MAPE_scorer)

# y = df['gap'][:21*66*144]
# MAPE_scorer(y, [1 for i in range(0, 21*66*144)])


def cal_dist(dist):
    global cv_pred_all, cv_real_all
    # split training, CV, test set
    df = pd.read_csv("data/season_1/features/" + str(dist) + ".csv")
    df["date"] = df["date"].apply(lambda x: pd.to_datetime(x, errors='coerce'))
    training = df.loc[(df.date < '2016-01-17') | (df.date == '2016-01-18')]
    training_time = training.loc[df.time_slice.isin([46, 58, 70, 82, 94, 106, 118, 130, 142])]
    cv = df.loc[df['date'].isin(['2016-01-17', '2016-01-19', '2016-01-20', '2016-01-21'])]
    # only keep the time slices that appear in the test set
    cv_time = cv.loc[df.time_slice.isin([46, 58, 70, 82, 94, 106, 118, 130, 142])]
def __init__(self, generations=100, population_size=100, offspring_size=None,
             mutation_rate=0.9, crossover_rate=0.1,
             scoring=None, cv=5, subsample=1.0, n_jobs=1,
             max_time_mins=None, max_eval_time_mins=5,
             random_state=None, config_dict=None, warm_start=False,
             verbosity=0, disable_update_check=False):
    """Set up the genetic programming algorithm for pipeline optimization.

    Parameters
    ----------
    generations: int, optional (default: 100)
        Number of iterations to run the pipeline optimization process.
        Generally, TPOT will work better when you give it more generations (and
        therefore time) to optimize the pipeline. TPOT will evaluate
        POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.
    population_size: int, optional (default: 100)
        Number of individuals to retain in the GP population every generation.
        Generally, TPOT will work better when you give it more individuals
        (and therefore time) to optimize the pipeline. TPOT will evaluate
        POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.
    offspring_size: int, optional (default: None)
        Number of offspring to produce in each GP generation.
        By default, offspring_size = population_size.
    mutation_rate: float, optional (default: 0.9)
        Mutation rate for the genetic programming algorithm in the range [0.0, 1.0].
        This parameter tells the GP algorithm how many pipelines to apply random
        changes to every generation. We recommend using the default parameter unless
        you understand how the mutation rate affects GP algorithms.
    crossover_rate: float, optional (default: 0.1)
        Crossover rate for the genetic programming algorithm in the range [0.0, 1.0].
        This parameter tells the genetic programming algorithm how many pipelines to
        "breed" every generation. We recommend using the default parameter unless you
        understand how the crossover rate affects GP algorithms.
    scoring: string or callable, optional
        Function used to evaluate the quality of a given pipeline for the problem.
        By default, accuracy is used for classification problems and mean squared
        error (MSE) for regression problems.
        Offers the same options as sklearn.model_selection.cross_val_score as well as
        a built-in score 'balanced_accuracy'.
        Classification metrics:
        ['accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy',
        'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'precision',
        'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted',
        'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted',
        'roc_auc']
        Regression metrics:
        ['neg_median_absolute_error', 'neg_mean_absolute_error',
        'neg_mean_squared_error', 'r2']
        If you would like to use a custom scoring function, you can pass a callable
        function to this parameter with the signature scorer(y_true, y_pred).
        See the section on scoring functions in the documentation for more details.
        TPOT assumes that any custom scoring function with "error" or "loss" in the
        name is meant to be minimized, whereas any other functions will be maximized.
    cv: int or cross-validation generator, optional (default: 5)
        If CV is a number, then it is the number of folds to evaluate each pipeline
        over in k-fold cross-validation during the TPOT optimization process.
        If it is an object then it is an object to be used as a cross-validation
        generator.
    subsample: float, optional (default: 1.0)
        Subsample ratio of the training instance. Setting it to 0.5 means that TPOT
        randomly collects half of the training samples for the pipeline optimization
        process.
    n_jobs: int, optional (default: 1)
        Number of CPUs for evaluating pipelines in parallel during the TPOT
        optimization process. Assigning this to -1 will use as many cores as
        available on the computer.
    max_time_mins: int, optional (default: None)
        How many minutes TPOT has to optimize the pipeline. If provided, this
        setting will override the "generations" parameter and allow TPOT to run
        until it runs out of time.
    max_eval_time_mins: int, optional (default: 5)
        How many minutes TPOT has to optimize a single pipeline. Setting this
        parameter to higher values will allow TPOT to explore more complex
        pipelines, but will also allow TPOT to run longer.
    random_state: int, optional (default: None)
        Random number generator seed for TPOT. Use this parameter to make sure that
        TPOT will give you the same results each time you run it against the same
        data set with that seed.
    config_dict: a Python dictionary or string, optional (default: None)
        Python dictionary:
            A dictionary customizing the operators and parameters that TPOT uses in
            the optimization process. For examples, see config_regressor.py and
            config_classifier.py
        Path for configuration file:
            A path to a configuration file for customizing the operators and
            parameters that TPOT uses in the optimization process. For examples,
            see config_regressor.py and config_classifier.py
        String 'TPOT light':
            TPOT uses a light version of the operator configuration dictionary
            instead of the default one.
        String 'TPOT MDR':
            TPOT uses a list of TPOT-MDR operator configuration dictionaries
            instead of the default one.
    warm_start: bool, optional (default: False)
        Flag indicating whether the TPOT instance will reuse the population from
        previous calls to fit().
    verbosity: int, optional (default: 0)
        How much information TPOT communicates while it's running.
        0 = none, 1 = minimal, 2 = high, 3 = all.
        A setting of 2 or higher will add a progress bar during the optimization
        procedure.
    disable_update_check: bool, optional (default: False)
        Flag indicating whether the TPOT version checker should be disabled.
    Returns
    -------
    None

    """
    if self.__class__.__name__ == 'TPOTBase':
        raise RuntimeError('Do not instantiate the TPOTBase class directly; '
                           'use TPOTRegressor or TPOTClassifier instead.')

    # Prompt the user if their version is out of date
    self.disable_update_check = disable_update_check
    if not self.disable_update_check:
        update_check('tpot', __version__)

    self._pareto_front = None
    self._optimized_pipeline = None
    self.fitted_pipeline_ = None
    self._fitted_imputer = None
    self._pop = None
    self.warm_start = warm_start
    self.population_size = population_size
    self.generations = generations
    self.max_time_mins = max_time_mins
    self.max_eval_time_mins = max_eval_time_mins

    # Set offspring_size equal to population_size by default
    if offspring_size:
        self.offspring_size = offspring_size
    else:
        self.offspring_size = population_size

    self._setup_config(config_dict)

    self.operators = []
    self.arguments = []
    for key in sorted(self.config_dict.keys()):
        op_class, arg_types = TPOTOperatorClassFactory(
            key,
            self.config_dict[key],
            BaseClass=Operator,
            ArgBaseClass=ARGType
        )
        if op_class:
            self.operators.append(op_class)
            self.arguments += arg_types

    # Schedule TPOT to run for many generations if the user specifies a
    # run-time limit; TPOT will automatically interrupt itself when the timer
    # runs out
    if max_time_mins is not None:
        self.generations = 1000000

    self.mutation_rate = mutation_rate
    self.crossover_rate = crossover_rate

    if self.mutation_rate + self.crossover_rate > 1:
        raise ValueError(
            'The sum of the crossover and mutation probabilities must be <= 1.0.'
        )

    self.verbosity = verbosity
    self.operators_context = {
        'make_pipeline': make_pipeline,
        'make_union': make_union,
        'StackingEstimator': StackingEstimator,
        'FunctionTransformer': FunctionTransformer,
        'copy': copy
    }

    self._pbar = None

    # Dictionary of individuals that have already been evaluated in previous
    # generations
    self.evaluated_individuals_ = {}
    self.random_state = random_state

    # If the user passed a custom scoring function, store it in the sklearn
    # SCORERS dictionary
    if scoring:
        if hasattr(scoring, '__call__'):
            scoring_name = scoring.__name__
            greater_is_better = ('loss' not in scoring_name and
                                 'error' not in scoring_name)
            SCORERS[scoring_name] = make_scorer(scoring,
                                                greater_is_better=greater_is_better)
            self.scoring_function = scoring_name
        else:
            if scoring not in SCORERS:
                raise ValueError(
                    'The scoring function {} is not available. Please '
                    'choose a valid scoring function from the TPOT '
                    'documentation.'.format(scoring)
                )
            self.scoring_function = scoring

    self.cv = cv
    self.subsample = subsample
    if self.subsample <= 0.0 or self.subsample > 1.0:
        raise ValueError(
            'The subsample ratio of the training instance must be in the range (0.0, 1.0].'
        )

    # On Windows, warn that Ctrl+C cannot reliably interrupt parallel workers
    if sys.platform.startswith('win') and n_jobs != 1:
        print(
            'Warning: Although parallelization is currently supported in '
            'TPOT for Windows, pressing Ctrl+C will freeze the optimization '
            'process without saving the best pipeline! Thus, please DO NOT '
            'press Ctrl+C during the optimization process if n_jobs is not '
            'equal to 1. For a quick test in Windows, please set n_jobs to 1 '
            'for saving the best pipeline in the middle of the optimization '
            'process via Ctrl+C.'
        )
    if n_jobs == -1:
        self.n_jobs = cpu_count()
    else:
        self.n_jobs = n_jobs

    self._setup_pset()
    self._setup_toolbox()
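# Added sketch of the custom-scoring path described in the docstring above: a plain
# callable with signature scorer(y_true, y_pred) is passed as `scoring` and registered
# through make_scorer internally. Names below are illustrative, not from the original
# code, and assume a TPOT version that accepts such callables.
from tpot import TPOTClassifier
from sklearn.metrics import f1_score


def macro_f1(y_true, y_pred):
    # no "loss"/"error" in the name, so TPOT treats higher values as better
    return f1_score(y_true, y_pred, average='macro')


tpot = TPOTClassifier(generations=5, population_size=20,
                      scoring=macro_f1, verbosity=2)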
def __init__(self, population_size=100, generations=100,
             mutation_rate=0.9, crossover_rate=0.05,
             scoring=None, num_cv_folds=3,
             max_time_mins=None, max_eval_time_mins=5,
             random_state=None, verbosity=0,
             disable_update_check=False):
    """Sets up the genetic programming algorithm for pipeline optimization.

    Parameters
    ----------
    population_size: int (default: 100)
        The number of pipelines in the genetic algorithm population. Must be > 0.
        The more pipelines in the population, the slower TPOT will run, but it's
        also more likely to find better pipelines.
    generations: int (default: 100)
        The number of generations to run pipeline optimization for. Must be > 0.
        The more generations you give TPOT to run, the longer it takes, but it's
        also more likely to find better pipelines.
    mutation_rate: float (default: 0.9)
        The mutation rate for the genetic programming algorithm in the range
        [0.0, 1.0]. This tells the genetic programming algorithm how many pipelines
        to apply random changes to every generation. We don't recommend that you
        tweak this parameter unless you know what you're doing.
    crossover_rate: float (default: 0.05)
        The crossover rate for the genetic programming algorithm in the range
        [0.0, 1.0]. This tells the genetic programming algorithm how many pipelines
        to "breed" every generation. We don't recommend that you tweak this
        parameter unless you know what you're doing.
    scoring: function or str
        Function used to evaluate the quality of a given pipeline for the problem.
        By default, balanced class accuracy is used for classification problems and
        mean squared error for regression problems. TPOT assumes that this scoring
        function should be maximized, i.e., higher is better.
        Offers the same options as sklearn.model_selection.cross_val_score:
        ['accuracy', 'adjusted_rand_score', 'average_precision', 'f1', 'f1_macro',
        'f1_micro', 'f1_samples', 'f1_weighted', 'precision', 'precision_macro',
        'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall',
        'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc']
    num_cv_folds: int (default: 3)
        The number of folds to evaluate each pipeline over in k-fold cross-validation
        during the TPOT pipeline optimization process.
    max_time_mins: int (default: None)
        How many minutes TPOT has to optimize the pipeline. If not None, this setting
        will override the `generations` parameter.
    max_eval_time_mins: int (default: 5)
        How many minutes TPOT has to optimize a single pipeline. Setting this
        parameter to higher values will allow TPOT to explore more complex pipelines
        but will also allow TPOT to run longer.
    random_state: int (default: 0)
        The random number generator seed for TPOT. Use this to make sure that TPOT
        will give you the same results each time you run it against the same data
        set with that seed.
    verbosity: int (default: 0)
        How much information TPOT communicates while it's running.
        0 = none, 1 = minimal, 2 = all
    disable_update_check: bool (default: False)
        Flag indicating whether the TPOT version checker should be disabled.
    Returns
    -------
    None

    """
    if self.__class__.__name__ == 'TPOTBase':
        raise RuntimeError('Do not instantiate the TPOTBase class directly; '
                           'use TPOTRegressor or TPOTClassifier instead.')

    # Prompt the user if their version is out of date
    self.disable_update_check = disable_update_check
    if not self.disable_update_check:
        update_check('tpot', __version__)

    self._hof = None
    self._optimized_pipeline = None
    self._fitted_pipeline = None
    self.population_size = population_size
    self.generations = generations
    self.max_time_mins = max_time_mins
    self.max_eval_time_mins = max_eval_time_mins

    # Schedule TPOT to run for a very long time if the user specifies a run-time
    # limit; TPOT will automatically interrupt itself when the timer runs out
    if not (max_time_mins is None):
        self.generations = 1000000

    self.mutation_rate = mutation_rate
    self.crossover_rate = crossover_rate
    self.verbosity = verbosity
    self.operators_context = {
        'make_pipeline': make_pipeline,
        'make_union': make_union,
        'VotingClassifier': VotingClassifier,
        'FunctionTransformer': FunctionTransformer
    }

    self._pbar = None
    self._gp_generation = 0
    self.random_state = random_state

    # If the user passed a custom scoring function, store it in the sklearn
    # SCORERS dictionary
    if scoring:
        if hasattr(scoring, '__call__'):
            scoring_name = scoring.__name__

            if 'loss' in scoring_name or 'error' in scoring_name:
                greater_is_better = False
            else:
                greater_is_better = True

            SCORERS[scoring_name] = make_scorer(scoring,
                                                greater_is_better=greater_is_better)
            self.scoring_function = scoring_name
        else:
            self.scoring_function = scoring

    self.num_cv_folds = num_cv_folds

    self._setup_pset()
    self._setup_toolbox()
("pca", H2OPCA(k=2)), ("gbm", H2OGradientBoostingEstimator(distribution="multinomial"))]) pipeline.fit(iris_df[:4],iris_df[4]) # Random CV using H2O and Scikit-learn from sklearn.grid_search import RandomizedSearchCV from h2o.cross_validation import H2OKFold from h2o.model.regression import h2o_r2_score from sklearn.metrics.scorer import make_scorer params = {"standardize__center": [True, False], # Parameters to test "standardize__scale": [True, False], "pca__k": [2,3], "gbm__ntrees": [10,20], "gbm__max_depth": [1,2,3], "gbm__learn_rate": [0.1,0.2]} custom_cv = H2OKFold(iris_df, n_folds=5, seed=42) pipeline = Pipeline([("standardize", H2OScaler()), ("pca", H2OPCA(k=2)), ("gbm", H2OGradientBoostingEstimator(distribution="gaussian"))]) random_search = RandomizedSearchCV(pipeline, params, n_iter=5, scoring=make_scorer(h2o_r2_score), cv=custom_cv, random_state=42, n_jobs=1) random_search.fit(iris_df[1:], iris_df[0]) print random_search.best_estimator_