# Hyperas-style search template: the {{...}} placeholders are substituted by the
# optimizer before execution, so this is a template, not plain Python.
from keras import models, layers
import hyperopt

def model(X_train, Y_train, X_test, Y_test):
    model = models.Sequential()  # Sequential lives in keras.models, not keras.layers
    model.add(layers.Dense(512, input_shape=(784,)))
    model.add(layers.Activation('relu'))
    model.add(layers.Dropout({{hyperopt.uniform(0, 1)}}))
    model.add(layers.Dense({{hyperopt.choice([256, 512, 1024])}}))
    model.add(layers.Activation({{hyperopt.choice(['relu', 'sigmoid'])}}))
    model.add(layers.Dropout({{hyperopt.uniform(0, 1)}}))

    # If we choose 'four', add an additional fourth layer
    if {{hyperopt.choice(['three', 'four'])}} == 'four':
        model.add(layers.Dense(100))
        model.add({{hyperopt.choice([layers.Dropout(0.5), layers.Activation('linear')])}})
        model.add(layers.Activation('relu'))

    model.add(layers.Dense(10))
    model.add(layers.Activation('softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer={{hyperopt.choice(['rmsprop', 'adam', 'sgd'])}},
                  metrics=['accuracy'])
    model.fit(X_train, Y_train,
              batch_size={{hyperopt.choice([64, 128])}},
              epochs=1,  # 'nb_epoch' was removed in Keras 2; use 'epochs'
              verbose=2,
              validation_data=(X_test, Y_test))
    score, acc = model.evaluate(X_test, Y_test, verbose=0)
    print('Test accuracy:', acc)
    # fmin minimizes, so return negated accuracy as the loss
    return {'loss': -acc, 'status': hyperopt.STATUS_OK, 'model': model}
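# A minimal sketch of how such a template is typically driven, assuming the
# hyperas-style optim.minimize API and a data() helper (both are assumptions
# here; standard hyperas templates write {{choice(...)}} rather than
# {{hyperopt.choice(...)}}, so the template above would need that adjustment).
from hyperopt import Trials, tpe
from hyperas import optim

def data():
    # hypothetical loader; replace with the real dataset preparation
    from keras.datasets import mnist
    from keras.utils import to_categorical
    (X_train, y_train), (X_test, y_test) = mnist.load_data()
    X_train = X_train.reshape(60000, 784).astype('float32') / 255
    X_test = X_test.reshape(10000, 784).astype('float32') / 255
    return X_train, to_categorical(y_train), X_test, to_categorical(y_test)

best_run, best_model = optim.minimize(model=model, data=data,
                                      algo=tpe.suggest, max_evals=10,
                                      trials=Trials())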
import mlflow
from hyperopt import SparkTrials, fmin, hp, tpe
from sklearn.model_selection import train_test_split

def train(df, experiment_name, run_name):
    mlflow.set_experiment(experiment_name)
    data = df.toPandas()
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(["quality"], axis=1),
        data[["quality"]].values.ravel(),
        random_state=42)
    search_space = {
        'n_estimators': hp.uniform('n_estimators', 10, 100),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 1, 20),
        'max_depth': hp.uniform('max_depth', 2, 10),
    }
    # SparkTrials distributes the trials across the cluster
    spark_trials = SparkTrials(parallelism=4)
    with mlflow.start_run(run_name=run_name):
        fmin(
            fn=evaluate_hyperparams_wrapper(X_train, X_test, y_train, y_test),
            space=search_space,
            algo=tpe.suggest,
            max_evals=10,
            trials=spark_trials,
        )
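# The snippet above assumes an evaluate_hyperparams_wrapper closure; a minimal
# hedged sketch of what it might look like (hypothetical, not from the original;
# note that hp.uniform yields floats, so integer-valued parameters are cast):
from hyperopt import STATUS_OK
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

def evaluate_hyperparams_wrapper(X_train, X_test, y_train, y_test):
    def evaluate(params):
        model = RandomForestRegressor(
            n_estimators=int(params['n_estimators']),
            min_samples_leaf=int(params['min_samples_leaf']),
            max_depth=int(params['max_depth']),
            random_state=42)
        model.fit(X_train, y_train)
        mse = mean_squared_error(y_test, model.predict(X_test))
        return {'loss': mse, 'status': STATUS_OK}
    return evaluate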
# (fragment: the start of the objective and the model-fitting call are not shown)
                    early_stopping_rounds=100, verbose=200)

    # prediction: predict on the validation features, not the labels
    pred = model.predict_proba(X_val)
    eval_df = prepare_eval_df()

    # scoring model
    score = macro_lrap(eval_df)
    # fmin expects the key 'loss', and STATUS_OK lives in hyperopt, not hyperopt.hp
    return {'loss': -score, 'status': STATUS_OK}

# initial hyperparameter space (quniform, not uniform, takes a step argument)
space = dict()
space['n_estimators'] = hp.quniform('n_estimators', 100, 2000, 1)
space['max_depth'] = hp.quniform('max_depth', 2, 20, 1)
space['learning_rate'] = hp.loguniform('learning_rate', -5, 0)

# trials object for logging information (Trials, like fmin, is a top-level
# hyperopt name, not part of hyperopt.hp)
trials = Trials()

# maximum number of evaluation rounds
max_evals = 50

# run the optimisation
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=max_evals,
    trials=trials,
)
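# After the search, the Trials object holds per-iteration records; a short
# usage sketch with the names defined above:
print(best)                                 # best raw parameter values
print(trials.best_trial['result']['loss'])  # best (negated) score
for t in trials.trials[:3]:                 # inspect the first few trials
    print(t['misc']['vals'], t['result']['loss'])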
    print("TP = {}".format(TP))
    print("FP = {}".format(FP))
    print("FN = {}".format(FN))
    f1 = 2. * TP / (2. * TP + FP + FN)
    print("F1 : ", f1)
    return {'loss': 1 - f1, 'status': STATUS_OK}

space = {
    'n_estimators': hp.choice('n_estimators', np.arange(200, 501, 25, dtype=int)),
    'max_depth': hp.choice('max_depth', np.arange(15, 20, dtype=int)),
    'max_features': hp.choice('max_features', np.arange(15, 30, dtype=int)),
    'mss': hp.choice('mss', np.arange(2, 40, 1, dtype=int)),  # presumably min_samples_split
    'cw': hp.uniform('cw', 1, 5),                             # presumably a class weight
    'msl': hp.choice('msl', np.arange(1, 11, dtype=int)),     # presumably min_samples_leaf
}

trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest,
            max_evals=100, trials=trials)

# space_eval is a top-level hyperopt function, not part of hyperopt.hp
pprint(space_eval(space, best))
best_pars = space_eval(space, best)
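# A hypothetical objective matching this space (the mss/cw/msl abbreviations
# are assumed to map to the RandomForest parameters named below; X and y are
# assumed to exist in the surrounding code):
from hyperopt import STATUS_OK
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def make_objective(X, y):
    def objective(params):
        clf = RandomForestClassifier(
            n_estimators=params['n_estimators'],
            max_depth=params['max_depth'],
            max_features=params['max_features'],
            min_samples_split=params['mss'],
            min_samples_leaf=params['msl'],
            class_weight={0: 1.0, 1: params['cw']},  # assumed binary weighting
            n_jobs=-1)
        f1 = cross_val_score(clf, X, y, scoring='f1', cv=3).mean()
        return {'loss': 1 - f1, 'status': STATUS_OK}
    return objective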
# (fragment: the enclosing function definition is not shown)
mlflow.set_experiment(experiment_name)
raw_data = (spark.read.format("csv")
            .option("header", "true")
            .option("sep", ";")
            .load(input_data_path))
features = engineer_features(raw_data)
data = rename_columns(features).toPandas()
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(["quality"], axis=1),
    data[["quality"]].values.ravel(),
    random_state=42)

search_space = {
    "n_estimators": hp.uniform("n_estimators", 10, 500),
    "min_samples_leaf": hp.uniform("min_samples_leaf", 1, 20),
    "max_depth": hp.uniform("max_depth", 2, 10),
}
spark_trials = SparkTrials(parallelism=4)
with mlflow.start_run(run_name=parent_run_name):
    fmin(
        fn=evaluate_hyperparams_wrapper(X_train, X_test, y_train, y_test),
        space=search_space,
        algo=tpe.suggest,
        max_evals=10,
        trials=spark_trials,
    )
from hyperopt import hp, fmin, rand, tpe, space_eval
# (the original also had "import hyperopt as hp", which this line immediately
# shadowed; it is dropped here)

# Define the objective function
def q(args):
    x, y = args
    return x ** 2 + y ** 2

# Define the configuration space
space = [hp.uniform('x', -1, 1), hp.normal('y', -1, 1)]

# Choose a search algorithm
best = fmin(q, space, algo=tpe.suggest, max_evals=100)
print(best)
print(space_eval(space, best))

import pickle
import time
from hyperopt import STATUS_OK

def objective(x):
    return {'loss': x ** 2, 'status': STATUS_OK}

best = fmin(objective,
            space=hp.uniform('x', -10, 10),
            algo=tpe.suggest,
            max_evals=100)
print(best)
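# The snippet imports pickle and time without using them; a common pattern
# they suggest is checkpointing the Trials object so a long search can be
# resumed. A minimal sketch (the file name is hypothetical):
from hyperopt import Trials

trials = Trials()
best = fmin(objective, space=hp.uniform('x', -10, 10),
            algo=tpe.suggest, max_evals=100, trials=trials)
with open('trials.pkl', 'wb') as f:
    pickle.dump(trials, f)

# later: reload and continue for another 50 evaluations
with open('trials.pkl', 'rb') as f:
    trials = pickle.load(f)
best = fmin(objective, space=hp.uniform('x', -10, 10),
            algo=tpe.suggest, max_evals=150, trials=trials)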
    plt.figure()
    plt.plot(test_AUC_list, label='test_AUC')
    plt.show()
    return -best_AUC

# ========= use the "hyperopt" library to fine-tune the hyperparameters =========
from functools import partial  # partial comes from functools, not hyperopt
from hyperopt import fmin, tpe, hp

batch_list = [32, 64, 128]
for dist in [5.]:
    space = {
        "lr_rate": hp.uniform("lr_rate", 0.0005, 0.01),
        "dp_out": hp.uniform("dp_out", 0.5, 1),
        "bt_size": hp.choice("bt_size", batch_list),
        "distance": hp.choice("distance", [dist]),
    }
    # algo = partial(tpe.suggest, n_startup_jobs=10)
    try:
        best = fmin(main, space, algo=tpe.suggest, max_evals=50)
        # hp.choice returns an index, so map it back to the actual value
        best["bt_size"] = batch_list[best["bt_size"]]
        best["distance"] = dist
        best_AUC = -main(best)
        with open('finetune.txt', 'a') as f:
            f.write(
                "At distance {}, the best AUC is {}, its lr_rate is {}, "
                "drop_out is {}, batch_size is {}\n\n".format(
                    dist, best_AUC, best["lr_rate"],
                    best["dp_out"], best["bt_size"]))
    except Exception as exc:  # the except clause is missing from the original (truncated)
        print(exc)
    # (fragment: the head of the objective is not shown)
    if booster == "gbtree":
        pred_test = model.predict(X_test)
    elif booster == "dart":
        pred_test = model.predict(X_test, ntree_limit=num_round)
    error = MSE(y_test, pred_test)
    r2 = -r2_score(y_train, model.predict(X_train))  # computed but unused
    return float(error)

# DEFINING THE SEARCH SPACE
# scope.int casts quniform draws to int at sample time; the original wrapped it
# in sample(), which would freeze a single value at definition time, so that is
# dropped here.
from hyperopt.pyll import scope

search_space = {
    'booster': hp.choice('booster', ['gbtree', 'dart']),
    'n_estimators': hp.quniform('n_estimators', 50, 3000, 1),
    'eta': hp.uniform('eta', 0, 1),
    'gamma': hp.uniform('gamma', 1, 500),
    'max_depth': hp.quniform('max_depth', 3, 100, 1),
    'min_child_weight': hp.uniform('min_child_weight', 0, 100),
    'random_state': scope.int(hp.quniform('random_state', 4, 8, 1)),
    'subsample': hp.uniform('subsample', 0, 1),
    'alpha': hp.uniform('alpha', 1, 8),
    'colsample_bytree': hp.uniform('colsample_bytree', 0, 1),
    'sample_type': hp.choice('sample_type', ['uniform', 'weighted']),
    'normalize_type': hp.choice('normalize_type', ['tree', 'forest']),
    'grow_policy': hp.choice('grow_policy', ['depthwise', 'lossguide']),
    'rate_drop': hp.uniform('rate_drop', 0, 1),
    'skip_drop': hp.uniform('skip_drop', 0, 1),
    'colsample_bylevel': hp.uniform('colsample_bylevel', 0, 1),
    'colsample_bynode': hp.uniform('colsample_bynode', 0, 1),
    'reg_lambda': hp.uniform('reg_lambda', 1, 8),
}
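# A minimal hedged sketch of driving this space with fmin (XGBRegressor
# parameter names are real XGBoost names; the train/test arrays are assumed
# to exist in the surrounding code):
import xgboost as xgb
from hyperopt import fmin, tpe, Trials
from sklearn.metrics import mean_squared_error as MSE

def objective(params):
    params = dict(params)  # copy; avoid mutating hyperopt's dict
    num_round = int(params.pop('n_estimators'))
    # quniform draws are floats; cast the integer-valued parameters
    params['max_depth'] = int(params['max_depth'])
    model = xgb.XGBRegressor(n_estimators=num_round, **params)
    model.fit(X_train, y_train)
    return float(MSE(y_test, model.predict(X_test)))

best = fmin(fn=objective, space=search_space, algo=tpe.suggest,
            max_evals=100, trials=Trials())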
import csv
import lightgbm as lgb
import numpy as np
from timeit import default_timer as timer
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.model_selection import GroupKFold

def lgb_tuning(lgb_cv, N_FOLDS=5, MAX_EVALS=100, output_file='bayes_test.csv',
               metric='auc', objection='binary', groups=None):

    def objective(hyperparameters, groups=groups):
        # Keep track of evals across calls (the original reset this to 0 on
        # every call, so the logged iteration was always 0)
        nonlocal ITERATION
        ITERATION += 1

        # Use early stopping to find the number of trees trained
        if 'n_estimators' in hyperparameters:
            del hyperparameters['n_estimators']

        # Retrieve the subsample
        subsample = hyperparameters['boosting_type'].get('subsample', 1.0)

        # Extract the boosting type and subsample to top-level keys
        hyperparameters['boosting_type'] = hyperparameters['boosting_type']['boosting_type']
        hyperparameters['subsample'] = subsample

        # Make sure parameters that need to be integers are integers
        # (guarded: subsample_for_bin is commented out of the space below)
        for parameter_name in ['num_leaves', 'subsample_for_bin',
                               'min_child_samples', 'max_depth']:
            if parameter_name in hyperparameters:
                hyperparameters[parameter_name] = int(hyperparameters[parameter_name])

        hyperparameters['objective'] = objection
        # hyperparameters['verbose'] = -1

        start = timer()

        # Perform n-fold cross-validation
        if groups is not None:
            groups = lgb_cv.get_group()
            folds = GroupKFold().split(lgb_cv.get_label(), groups=groups)
        else:
            folds = None
        if metric.lower() == 'map':
            hyperparameters['eval_at'] = 1
        cv_results = lgb.cv(hyperparameters, lgb_cv, num_boost_round=4000,
                            nfold=N_FOLDS, folds=folds,
                            early_stopping_rounds=300, metrics=metric)
        run_time = timer() - start

        # sorted() puts '<metric>-mean' ahead of '<metric>-stdv'
        score_key = sorted(cv_results.keys())[0]

        # Extract the best score
        best_score = cv_results[score_key][-1]

        # Loss must be minimized
        if metric == 'binary_error':
            loss = best_score
        else:
            loss = 1 - best_score

        # Boosting rounds that returned the highest cv score
        n_estimators = len(cv_results[score_key])

        # Add the number of estimators to the hyperparameters
        hyperparameters['n_estimators'] = n_estimators

        # Write to the csv file ('a' means append)
        of_connection = open(OUT_FILE, 'a')
        writer = csv.writer(of_connection)
        writer.writerow([loss, hyperparameters, ITERATION, run_time, best_score])
        of_connection.close()

        # Dictionary with information for evaluation
        return {'loss': loss, 'hyperparameters': hyperparameters,
                'iteration': ITERATION, 'train_time': run_time,
                'status': STATUS_OK}

    # Define the search space
    space = {
        'boosting_type': hp.choice('boosting_type', [
            {'boosting_type': 'gbdt', 'subsample': hp.uniform('gdbt_subsample', 0.5, 1)},
            {'boosting_type': 'dart', 'subsample': hp.uniform('dart_subsample', 0.5, 1)},
            {'boosting_type': 'goss', 'subsample': 1.0}]),
        'num_leaves': hp.quniform('num_leaves', 20, 200, 4),
        'learning_rate': hp.loguniform('learning_rate', np.log(0.005), np.log(0.5)),
        # 'subsample_for_bin': hp.quniform('subsample_for_bin', 20000, 300000, 20000),
        'min_child_samples': hp.quniform('min_child_samples', 20, 300, 5),
        'reg_alpha': hp.uniform('reg_alpha', 0.0, 0.2),
        'reg_lambda': hp.uniform('reg_lambda', 0.0, 0.2),
        # 'colsample_bytree': hp.uniform('colsample_by_tree', 0.6, 1.0),
        'is_unbalance': hp.choice('is_unbalance', [True, False]),
        'max_depth': hp.quniform('max_depth', 4, 8, 1),
    }

    # Record results
    trials = Trials()

    # Create the output file and write the column names
    OUT_FILE = output_file
    of_connection = open(OUT_FILE, 'w')
    writer = csv.writer(of_connection)
    headers = ['loss', 'hyperparameters', 'iteration', 'runtime', 'score']
    writer.writerow(headers)
    of_connection.close()

    ITERATION = 0

    # Run the optimization
    best = fmin(fn=objective, space=space, algo=tpe.suggest,
                trials=trials, max_evals=MAX_EVALS)
    return best
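# A hypothetical usage sketch: build a lightgbm Dataset and run the tuner
# (the input file and column names are assumptions, not from the original):
import lightgbm as lgb
import pandas as pd

train_df = pd.read_csv('train.csv')  # hypothetical input file
lgb_cv = lgb.Dataset(train_df.drop(columns=['target']),
                     label=train_df['target'])
best = lgb_tuning(lgb_cv, N_FOLDS=5, MAX_EVALS=50,
                  output_file='bayes_test.csv', metric='auc')
print(best)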