def test_deadline_stopper():
    deadline = DeadlineStopper(0.0001)
    gp_minimize(bench3, [(-1.0, 1.0)],
                callback=deadline, n_calls=10, random_state=1)
    assert len(deadline.iter_time) == 1
    assert np.sum(deadline.iter_time) > deadline.total_time

    deadline = DeadlineStopper(60)
    gp_minimize(bench3, [(-1.0, 1.0)],
                callback=deadline, n_calls=10, random_state=1)
    assert len(deadline.iter_time) == 10
    assert np.sum(deadline.iter_time) < deadline.total_time
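# A minimal sketch of the behaviour the test above exercises: DeadlineStopper
# records per-call durations in `iter_time` and halts the optimization once
# their sum exceeds `total_time`. The toy objective is an assumption; any
# cheap 1-D function works.
import numpy as np
from skopt import gp_minimize
from skopt.callbacks import DeadlineStopper

deadline = DeadlineStopper(total_time=60)  # generous deadline: all 10 calls run
gp_minimize(lambda x: (x[0] - 0.3) ** 2, [(-1.0, 1.0)],
            callback=deadline, n_calls=10, random_state=1)
assert len(deadline.iter_time) == 10
assert np.sum(deadline.iter_time) < deadline.total_time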
def main():
    hyperparameters = {'kernelSize1': np.arange(2, 10),
                       'stride1': np.arange(1, 5),
                       'dropout1': np.linspace(0.0, 0.8),
                       'kernelSize2': np.arange(2, 10),
                       'stride2': np.arange(1, 5),
                       'dropout2': np.linspace(0.0, 0.8),
                       'learningRate': np.linspace(0.001, 0.1)}

    hyperspace = HyperSpace(hyperparameters)
    all_intervals = hyperspace.fold_space()
    hyperspaces = hyperspace.hyper_permute(all_intervals)
    subspace_keys, subspace_boundaries = hyperspace.format_hyperspace(hyperspaces)

    space = subspace_boundaries[0]
    deadline = DeadlineStopper(18000)

    # Gaussian process minimization (see scikit-optimize skopt module for other optimizers)
    res_gp = gp_minimize(objective, space, n_calls=50,
                         callback=deadline, random_state=0, verbose=True)
def main():
    if rank == 0:
        hyperparameters = {'kernelSize1': np.arange(2, 10),
                           'stride1': np.arange(1, 5),
                           'dropout1': np.linspace(0.0, 0.8),
                           'kernelSize2': np.arange(2, 10),
                           'stride2': np.arange(1, 5),
                           'dropout2': np.linspace(0.0, 0.8),
                           'learningRate': np.linspace(0.001, 0.1)}

        hyperspace = HyperSpace(hyperparameters)
        all_intervals = hyperspace.fold_space()
        hyperspaces = hyperspace.hyper_permute(all_intervals)
        subspace_keys, subspace_boundaries = hyperspace.format_hyperspace(hyperspaces)
    else:
        subspace_keys, subspace_boundaries = None, None

    space = comm.scatter(subspace_boundaries, root=0)
    deadline = DeadlineStopper(18000)

    # Gaussian process minimization (see scikit-optimize skopt module for other optimizers)
    res_gp = gp_minimize(objective, space, n_calls=50,
                         callback=deadline, random_state=0, verbose=True)

    # Each worker will write their results to disk
    dump(res_gp, 'hyper_results/gp_subspace_' + str(rank))
def main():
    if rank == 0:
        hyperparameters = {'kernelSize1': np.arange(2, 12),
                           'stride1': np.arange(1, 10),
                           'kernelSize2': np.arange(2, 12),
                           'stride2': np.arange(1, 10),
                           'kernelSize3': np.arange(2, 12),
                           'kernelSize4': np.arange(1, 12),
                           'kernelSize5': np.arange(2, 12)}

        hyperspace = HyperSpace(hyperparameters)
        all_intervals = hyperspace.fold_space()
        hyperspaces = hyperspace.hyper_permute(all_intervals)
        subspace_keys, subspace_boundaries = hyperspace.format_hyperspace(hyperspaces)
    else:
        subspace_keys, subspace_boundaries = None, None

    space = comm.scatter(subspace_boundaries, root=0)
    deadline = DeadlineStopper(18000)

    # Gaussian process (see scikit-optimize skopt module for other optimizers)
    res_gp = gp_minimize(objective, space, n_calls=20,
                         callback=deadline, random_state=0, verbose=True)

    dump(res_gp, 'hyper_results/gp_subspace_' + str(rank))
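# The three main() variants above share one pattern: rank 0 builds every
# subspace, comm.scatter() hands exactly one subspace to each MPI rank, and
# each rank then runs its own optimization. A minimal sketch with mpi4py
# (the subspace values are illustrative; scatter requires exactly one list
# element per rank, so run with e.g. `mpiexec -n 4`):
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

subspaces = None
if rank == 0:
    subspaces = [[(-1.0, 0.0)], [(-0.5, 0.5)], [(0.0, 1.0)], [(0.5, 1.5)]]

space = comm.scatter(subspaces, root=0)
print("rank %d optimizes over %s" % (rank, space))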
def bayes_search(estimator, search_spaces, X, y, fit_params=None, scoring=None,
                 n_jobs=1, cv=None, n_points=1, n_iter=50, refit=False,
                 random_state=9527, verbose=0, deadline=60):
    optimizer = BayesSearchCV(estimator,
                              search_spaces,
                              scoring=scoring,
                              cv=cv,
                              n_points=n_points,
                              n_iter=n_iter,
                              n_jobs=n_jobs,
                              return_train_score=False,
                              refit=refit,
                              optimizer_kwargs={'base_estimator': 'GP'},
                              random_state=random_state)
    best_params = BatchTrainer.hyperopt_search(
        optimizer, X, y, fit_params=fit_params, title='BayesSearchCV',
        callbacks=[VerboseCallback(verbose), DeadlineStopper(deadline)])
    return best_params
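# The wrapper above delegates the actual fit to BatchTrainer.hyperopt_search;
# with plain scikit-optimize the same callbacks attach directly to
# BayesSearchCV.fit(). A self-contained sketch on a toy dataset (the search
# space and deadline are illustrative):
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, VerboseCallback
from skopt.space import Real
from sklearn.datasets import load_iris
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
opt = BayesSearchCV(SVC(), {'C': Real(1e-3, 1e3, prior='log-uniform')},
                    n_iter=15, cv=3, random_state=9527)
opt.fit(X, y, callback=[VerboseCallback(n_total=15), DeadlineStopper(60)])
print(opt.best_params_, opt.best_score_)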
def tune_with_bayes(X_train, y_train):
    roc_auc = make_scorer(roc_auc_score, greater_is_better=True,
                          needs_threshold=True)
    time_split = TimeSeriesSplit(n_splits=10)
    cboost = CatBoostClassifier(thread_count=2, od_type='Iter', verbose=False)

    search_spaces = {'iterations': Integer(10, 1000),
                     'depth': Integer(1, 8),
                     'learning_rate': Real(0.01, 1.0, 'log-uniform'),
                     'random_strength': Real(1e-9, 10, 'log-uniform'),
                     'bagging_temperature': Real(0.0, 1.0),
                     'border_count': Integer(1, 255),
                     'l2_leaf_reg': Integer(2, 30),
                     'scale_pos_weight': Real(0.01, 1.0, 'uniform')}

    opt = BayesSearchCV(cboost,
                        search_spaces,
                        scoring=roc_auc,
                        cv=time_split,
                        n_iter=100,
                        n_jobs=1,
                        return_train_score=False,
                        refit=True,
                        optimizer_kwargs={'base_estimator': 'GP'},
                        random_state=17)

    best_params = report_performance(opt, X_train, y_train, 'CatBoost',
                                     callbacks=[VerboseCallback(100),
                                                DeadlineStopper(60 * 10)])
    best_params['iterations'] = 1000
    tuned_model = CatBoostClassifier(**best_params, od_type='Iter',
                                     one_hot_max_size=10)
    # tuned_model = CatBoostClassifier(**best_params, task_type="GPU",
    #                                  od_type='Iter', one_hot_max_size=10)
    tuned_model.fit(X_train, y_train)
    return tuned_model
def xgboost_grid(x, y):
    try:
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=0.2, random_state=42)
    except Exception:
        x = list(x)
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=0.2, random_state=42)

    params = {"objective": ['reg:squarederror'],
              "colsample_bytree": [0.25, 0.5, 0.75],
              "learning_rate": [0.01, 0.1, 0.2, 0.3],
              "max_depth": [10, 20, 50],
              "gamma": [i * 0.05 for i in range(0, 5)],
              "lambda": [i * 0.05 for i in range(0, 4)],
              "alpha": [i * 0.05 for i in range(0, 4)],
              "eta": [i * 0.05 for i in range(0, 4)],
              "n_estimators": [400, 4000],
              "tree_method": ["gpu_hist"]}

    xgb_temp = xgb.XGBRegressor()
    reg = GridSearchCV(xgb_temp, params, verbose=5, cv=3)

    # NOTE: DeadlineStopper and CheckpointSaver are scikit-optimize callbacks.
    # sklearn's GridSearchCV.fit() has no `callback` argument (anything extra
    # is forwarded to the estimator as a fit parameter), so they only apply to
    # skopt optimizers such as BayesSearchCV.
    time_to_stop = 60 * 60
    ckpt_loc = "../data/train/bayes/ckpt_bayes_xgboost.pkl"
    checkpoint_callback = CheckpointSaver(ckpt_loc)

    reg.fit(x_train, y_train)
    print(reg.best_params_)
    print(reg.best_score_)
    return reg
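# CheckpointSaver, like DeadlineStopper, is built for skopt optimizers: it
# dumps the intermediate OptimizeResult to disk after every objective call.
# A minimal sketch with gp_minimize (objective and path are illustrative):
from skopt import gp_minimize
from skopt.callbacks import CheckpointSaver

saver = CheckpointSaver("checkpoint.pkl", compress=9)  # kwargs go to skopt.dump
gp_minimize(lambda x: x[0] ** 2, [(-2.0, 2.0)],
            n_calls=10, callback=[saver], random_state=0)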
def get_params_SKopt(model, X, Y, space, cv_search, alg='catboost',
                     cat_features=None, eval_dataset=None, UBM=False,
                     opt_method='gbrt_minimize', verbose=True, multi=False,
                     scoring='neg_mean_squared_error', n_best=50,
                     total_time=7200):
    """Performs parameter tuning of an algorithm using the scikit-optimize library.

    Parameters:
    1.
    2.
    3. multi - boolean, used when a multioutput algorithm is tuned

    UPDATES:
    1. This version adds support for the catboost algorithms.
    """
    if alg == 'catboost':
        fitparam = {'eval_set': eval_dataset,
                    'use_best_model': UBM,
                    'cat_features': cat_features,
                    'early_stopping_rounds': 20}
    else:
        fitparam = {}

    @use_named_args(space)
    def objective(**params):
        model.set_params(**params)
        return -np.mean(cross_val_score(model, X, Y, cv=cv_search,
                                        scoring=scoring, fit_params=fitparam))

    if opt_method == 'gbrt_minimize':
        HPO_PARAMS = {'n_calls': 1000, 'n_random_starts': 20, 'acq_func': 'EI'}
        reg_gp = gbrt_minimize(objective, space, n_jobs=-1, verbose=verbose,
                               callback=[DeltaYStopper(delta=0.01, n_best=5),
                                         RepeatedMinStopper(n_best=n_best),
                                         DeadlineStopper(total_time=total_time)],
                               **HPO_PARAMS, random_state=RANDOM_STATE)
    elif opt_method == 'forest_minimize':
        HPO_PARAMS = {'n_calls': 1000, 'n_random_starts': 20, 'acq_func': 'EI'}
        reg_gp = forest_minimize(objective, space, n_jobs=-1, verbose=verbose,
                                 callback=[RepeatedMinStopper(n_best=n_best),
                                           DeadlineStopper(total_time=total_time)],
                                 **HPO_PARAMS, random_state=RANDOM_STATE)
    elif opt_method == 'gp_minimize':
        HPO_PARAMS = {'n_calls': 1000, 'n_random_starts': 20,
                      'acq_func': 'gp_hedge'}
        reg_gp = gp_minimize(objective, space, n_jobs=-1, verbose=verbose,
                             callback=[RepeatedMinStopper(n_best=n_best),
                                       DeadlineStopper(total_time=total_time)],
                             **HPO_PARAMS, random_state=RANDOM_STATE)

    TUNED_PARAMS = {}
    for i, item in enumerate(space):
        if multi:
            TUNED_PARAMS[item.name.split('__')[1]] = reg_gp.x[i]
        else:
            TUNED_PARAMS[item.name] = reg_gp.x[i]

    return [TUNED_PARAMS, reg_gp]
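# A hypothetical call to get_params_SKopt(): the dimensions must be skopt
# space objects carrying a `name`, since tuned values are read back by
# position into TUNED_PARAMS. X, Y and the estimator are placeholders, and
# RANDOM_STATE is assumed to be a module-level constant.
from skopt.space import Integer, Real
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold

space = [Integer(2, 8, name='max_depth'),
         Real(0.01, 0.3, prior='log-uniform', name='learning_rate')]
tuned, result = get_params_SKopt(GradientBoostingRegressor(), X, Y, space,
                                 cv_search=KFold(n_splits=5), alg='gbr',
                                 opt_method='gp_minimize', total_time=600)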
def _check_parameters(self):
    """Check the validity of the input parameters."""
    if self.mapping is None:
        self.mapping = {str(v): v for v in sorted(self.y.unique())}

    if self.scaled is None:
        self.scaled = check_scaling(self.X)

    # Create model subclasses ================================== >>

    models = []
    for m in self._models:
        if isinstance(m, str):
            acronym = get_acronym(m, must_be_equal=False)

            # Check if packages for non-sklearn models are available
            if acronym in OPTIONAL_PACKAGES:
                try:
                    importlib.import_module(OPTIONAL_PACKAGES[acronym])
                except ImportError:
                    raise ValueError(
                        f"Unable to import the {OPTIONAL_PACKAGES[acronym]} "
                        "package. Make sure it is installed.")

            # Check for regression/classification-only models
            if self.goal.startswith("class") and acronym in ONLY_REG:
                raise ValueError(
                    f"The {acronym} model can't perform classification tasks!")
            elif self.goal.startswith("reg") and acronym in ONLY_CLASS:
                raise ValueError(
                    f"The {acronym} model can't perform regression tasks!")

            models.append(MODEL_LIST[acronym](self, acronym + m[len(acronym):]))

        elif not isinstance(m, BaseModel):  # Model is a custom estimator
            models.append(CustomModel(self, estimator=m))

        else:  # Model is already a model subclass (can happen with reruns)
            models.append(m)

    self._models = CustomDict({m.name: m for m in models})

    # Check validity metric ==================================== >>

    if None in self._metric:
        self._metric = CustomDict(get_default_metric(self.task))

    # Ignore if it's the same metric as previous call
    elif not all([hasattr(m, "name") for m in self._metric]):
        self._metric = self._prepare_metric(
            metric=self._metric,
            greater_is_better=self.greater_is_better,
            needs_proba=self.needs_proba,
            needs_threshold=self.needs_threshold,
        )

    # Check validity sequential parameters ===================== >>

    for param in ["n_calls", "n_initial_points", "bagging"]:
        p = lst(getattr(self, param))
        if len(p) != 1 and len(p) != len(self._models):
            raise ValueError(
                f"Invalid value for the {param} parameter. Length "
                "should be equal to the number of models, got len"
                f"(models)={len(self._models)} and len({param})={len(p)}.")

        for i, model in enumerate(self._models):
            if param in ("n_calls", "bagging") and p[i % len(p)] < 0:
                raise ValueError(
                    f"Invalid value for the {param} parameter. "
                    f"Value should be >=0, got {p[i % len(p)]}.")
            elif param == "n_initial_points" and p[i % len(p)] <= 0:
                raise ValueError(
                    f"Invalid value for the {param} parameter. "
                    f"Value should be >0, got {p[i % len(p)]}.")

            setattr(model, "_" + param, p[i % len(p)])

    # Prepare bo parameters ==================================== >>

    # Choose a base estimator (GP is chosen as default)
    self._base_estimator = self.bo_params.get("base_estimator", "GP")
    if isinstance(self._base_estimator, str):
        if self._base_estimator.lower() not in ("gp", "et", "rf", "gbrt"):
            raise ValueError(
                f"Invalid value for the base_estimator parameter, got "
                f"{self._base_estimator}. Value should be one of: 'GP', "
                f"'ET', 'RF', 'GBRT'.")

    if self.bo_params.get("callbacks"):
        self._callbacks = lst(self.bo_params["callbacks"])

    if "max_time" in self.bo_params:
        if self.bo_params["max_time"] <= 0:
            raise ValueError(
                "Invalid value for the max_time parameter. "
                f"Value should be >0, got {self.bo_params['max_time']}.")
        self._callbacks.append(DeadlineStopper(self.bo_params["max_time"]))

    if "delta_x" in self.bo_params:
        if self.bo_params["delta_x"] < 0:
            raise ValueError(
                "Invalid value for the delta_x parameter. "
                f"Value should be >=0, got {self.bo_params['delta_x']}.")
        self._callbacks.append(DeltaXStopper(self.bo_params["delta_x"]))

    if "delta_y" in self.bo_params:
        if self.bo_params["delta_y"] < 0:
            raise ValueError(
                "Invalid value for the delta_y parameter. "
                f"Value should be >=0, got {self.bo_params['delta_y']}.")
        self._callbacks.append(
            DeltaYStopper(self.bo_params["delta_y"], n_best=5))

    if self.bo_params.get("plot"):
        self._callbacks.append(PlotCallback(self))

    if "cv" in self.bo_params:
        if self.bo_params["cv"] <= 0:
            raise ValueError(
                "Invalid value for the cv parameter. "
                f"Value should be >0, got {self.bo_params['cv']}.")
        self._cv = self.bo_params["cv"]

    if "early_stopping" in self.bo_params:
        if self.bo_params["early_stopping"] <= 0:
            raise ValueError(
                "Invalid value for the early_stopping parameter. "
                f"Value should be >0, got {self.bo_params['early_stopping']}.")
        self._early_stopping = self.bo_params["early_stopping"]

    # Add custom dimensions to every model subclass
    if self.bo_params.get("dimensions"):
        for name, model in self._models.items():
            # If not dict, the dimensions are for all models
            if not isinstance(self.bo_params["dimensions"], dict):
                model._dimensions = self.bo_params["dimensions"]
            else:
                # Dimensions for every specific model
                for key, value in self.bo_params["dimensions"].items():
                    # Parameters for this model only
                    if key.lower() == name:
                        model._dimensions = value
                        break

    kwargs = ["base_estimator", "max_time", "delta_x", "delta_y",
              "early_stopping", "cv", "callbacks", "dimensions", "plot"]

    # The remaining bo_params are added as kwargs to the optimizer
    self._bo_kwargs = {k: v for k, v in self.bo_params.items() if k not in kwargs}

    # Prepare est_params ======================================= >>

    if self.est_params:
        for name, model in self._models.items():
            params = {}
            for key, value in self.est_params.items():
                # Parameters for this model only
                if key.lower() == name:
                    params.update(value)
                # Parameters for all models
                elif key.lower() not in self._models.keys():
                    params.update({key: value})

            for key, value in params.items():
                if key.endswith("_fit"):
                    model._est_params_fit[key[:-4]] = value
                else:
                    model._est_params[key] = value
def xgboost_bayes_basic(x, y, csv_loc="../data/train/bayes.csv"):
    global csv_loc_bayes
    csv_loc_bayes = csv_loc

    try:
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=0.2, random_state=42)
    except Exception:
        x = list(x)
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=0.2, random_state=42)

    xgb_temp = xgb.XGBRegressor()
    reg = BayesSearchCV(xgb_temp,
                        {"colsample_bytree": Real(0.5, 0.99),
                         "max_depth": Integer(5, 25),
                         "lambda": Real(0, 0.25),
                         "learning_rate": Real(0.1, 0.25),
                         "alpha": Real(0, 0.2),
                         "eta": Real(0, 0.1),
                         "gamma": Real(0, 0.1),
                         "n_estimators": Integer(500, 5000),
                         "objective": ["reg:squarederror"],
                         "tree_method": ["gpu_hist"]},
                        n_iter=10000, verbose=4, cv=3)

    time_to_stop = 60 * 60 * 47

    now = datetime.now()
    year = now.strftime("%Y")
    month = now.strftime("%m")
    day = now.strftime("%d")
    hour = now.strftime("%H")
    minute = now.strftime("%M")
    sec = now.strftime("%S")

    # ckpt_loc = "../data/train/bayes/ckpt_bayes_xgboost" + str(year) + "_" + \
    #     str(month) + "_" + str(day) + "_" + str(hour) + "_" + \
    #     str(minute) + "_" + str(sec) + ".pkl"
    # checkpoint_callback = CheckpointSaver(ckpt_loc)
    # reg.fit(x_train, y_train,
    #         callback=[DeadlineStopper(time_to_stop), checkpoint_callback])

    custom_scorer = custom_skopt_scorer
    reg.fit(x_train, y_train,
            callback=[DeadlineStopper(time_to_stop), custom_scorer(x, y)])
    # reg.fit(x_train, y_train, callback=[DeadlineStopper(time_to_stop)])

    # Note: sklearn metrics take (y_true, y_pred); r2_score is not symmetric
    score = str(mean_squared_error(y_test, reg.predict(x_test)))
    print("MSE score: " + str(score))
    score = str(mean_absolute_error(y_test, reg.predict(x_test)))
    print("MAE score: " + str(score))
    score = str(r2_score(y_test, reg.predict(x_test)))
    print("r2 score: " + str(score))
    return reg
def hyperdrive(objective, hyperparameters, results_path, model="GP",
               n_iterations=50, verbose=False, checkpoints_path=None,
               deadline=None, sampler=None, n_samples=None, random_state=0):
    """
    Distributed optimization - one optimization per node.

    Parameters
    ----------
    * `objective` [function]:
        User defined function which calls a learner
        and returns a metric of interest.

    * `hyperparameters` [list, shape=(n_hyperparameters,)]:

    * `results_path` [string]
        Path to save optimization results.

    * `checkpoints_path` [string]
        Path to previously saved results. Used to resume optimization.

    * `model` [string, default="GP"]
        Probabilistic learner used to model our objective function.
        Options:
        - "GP": Gaussian process
        - "RF": Random forest
        - "GBRT": Gradient boosted regression trees
        - "RAND": Random search

    * `n_iterations` [int, default=50]
        Number of optimization iterations.

    * `verbose` [bool, default=False]
        Verbosity of optimization.

    * `deadline` [int, optional]
        Deadline (seconds) for the optimization to finish within.

    * `sampler` [str, default=None]
        Random sampling scheme for optimizer's initial runs.
        Options:
        - "lhs": latin hypercube sampling

    * `n_samples` [int, default=None]
        Number of random samples to be drawn from the `sampler`.
        - Required if you would like to use `sampler`.
        - Must be <= the number of elements in the smallest
          hyperparameter bound's set.

    * `random_state` [int, default=0]
        Random state for reproducibility.
    """
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()

    if checkpoints_path and sampler:
        raise ValueError('Cannot use both a restart from a previous run and '
                         'use latin hypercube sampling for initial search points!')

    # Setup savefile
    if rank < 10:
        # Ensure results are sorted by rank
        filename = 'hyperspace' + str(0) + str(rank)
    else:
        filename = 'hyperspace' + str(rank)

    savefile = os.path.join(results_path, filename)

    # Create hyperspaces, and either sampling bounds or checkpoints
    hyperspace = create_hyperspace(hyperparameters)
    space = hyperspace[rank]

    # Latin hypercube sampling
    if sampler and not n_samples:
        raise ValueError(f'Sampler requires n_samples > 0. Got {n_samples}')
    elif sampler and n_samples:
        hyperbounds = create_hyperbounds(hyperparameters)
        bounds = hyperbounds[rank]
        # Get initial points in domain via latin hypercube sampling
        init_points = lhs_start(bounds, n_samples)
        init_response = None
        n_rand = 10 - len(init_points)
    else:
        init_points = None
        init_response = None
        n_rand = 10

    # Resuming from checkpoint
    if checkpoints_path:
        checkpoint = _load_checkpoint(checkpoints_path, rank)
        try:
            init_points = checkpoint.x_iters
            init_response = checkpoint.func_vals
            n_rand = 10 - len(init_points)
        except AttributeError:
            # Missing saves won't have initial values.
            init_points = None
            init_response = None
            n_rand = 10

    callbacks = []
    if deadline:
        deadline = DeadlineStopper(deadline)
        callbacks.append(deadline)

    if checkpoints_path:
        checkpoint_callback = CheckpointSaver(checkpoints_path, filename)
        callbacks.append(checkpoint_callback)

    # Thanks Guido for refusing to believe in switch statements.
    # Case 0
    if model == "GP":
        # Verbose mode should only run on node 0.
        if verbose and rank == 0:
            result = gp_minimize(objective, space, n_calls=n_iterations,
                                 verbose=verbose, callback=callbacks,
                                 x0=init_points, y0=init_response,
                                 n_random_starts=n_rand,
                                 random_state=random_state)
        else:
            result = gp_minimize(objective, space, n_calls=n_iterations,
                                 callback=callbacks, x0=init_points,
                                 y0=init_response, n_random_starts=n_rand,
                                 random_state=random_state)
    # Case 1
    elif model == "RF":
        if verbose and rank == 0:
            result = forest_minimize(objective, space, n_calls=n_iterations,
                                     verbose=verbose, callback=callbacks,
                                     x0=init_points, y0=init_response,
                                     n_random_starts=n_rand,
                                     random_state=random_state)
        else:
            result = forest_minimize(objective, space, n_calls=n_iterations,
                                     callback=callbacks, x0=init_points,
                                     y0=init_response, n_random_starts=n_rand,
                                     random_state=random_state)
    # Case 2
    elif model == "GBRT":
        if verbose and rank == 0:
            result = gbrt_minimize(objective, space, n_calls=n_iterations,
                                   verbose=verbose, callback=callbacks,
                                   x0=init_points, y0=init_response,
                                   n_random_starts=n_rand,
                                   random_state=random_state)
        else:
            result = gbrt_minimize(objective, space, n_calls=n_iterations,
                                   callback=callbacks, x0=init_points,
                                   y0=init_response, n_random_starts=n_rand,
                                   random_state=random_state)
    # Case 3
    elif model == "RAND":
        if verbose and rank == 0:
            result = dummy_minimize(objective, space, n_calls=n_iterations,
                                    verbose=verbose, callback=callbacks,
                                    x0=init_points, y0=init_response,
                                    random_state=random_state)
        else:
            result = dummy_minimize(objective, space, n_calls=n_iterations,
                                    callback=callbacks, x0=init_points,
                                    y0=init_response, random_state=random_state)
    else:
        raise ValueError("Invalid model {}. Read the documentation for "
                         "supported models.".format(model))

    # Each worker will independently write their results to disk
    dump(result, savefile)
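# Resuming mirrors what hyperdrive() does with checkpoints_path: load the last
# OptimizeResult and feed its evaluated points back in as x0/y0. A sketch,
# assuming "checkpoint.pkl" was written by a CheckpointSaver as shown earlier:
from skopt import gp_minimize, load

checkpoint = load("checkpoint.pkl")
result = gp_minimize(lambda x: x[0] ** 2, [(-2.0, 2.0)],
                     x0=checkpoint.x_iters, y0=list(checkpoint.func_vals),
                     n_calls=10, random_state=0)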
def optimise_step(self, df_train, df_target, npoints=1, nrandom=1, n_iter=50,
                  set_callbacks=True):
    """Evaluates the data and builds the pipeline. If no parameters are set,
    the default configuration for each step is used.

    Parameters
    ----------
    df_train : pandas dataframe of shape = (n_train, n_features)
        The train dataset with numerical features.

    df_target : pandas series of shape = (n_train,)
        The numerically encoded target for classification tasks.

    n_iter : int, default = 50
        Maximum number of evaluations.

    set_callbacks : bool, default = True
        If callable, then callback(res) is called after each call to func.
        If a list of callables, then each callable in the list is called.

    Returns
    -------
    result : dict
        - result['best_score']      : best score after tuning
        - result['best_score_std']  : standard deviation of the best score
        - result['best_parmas']     : best parameters
        - result['params']          : all parameters (# = checked candidates)
        - result['time_cost(s)']    : total time to find the best parameters
        - result['all_cv_results']  : all cv results
        - result['mean_score_time'] : time for each cv result
    """
    # Check parallel strategy
    ce = Categorical_encoder()
    X = ce.fit_transform(df_train, df_target)

    if len(df_train.dtypes[df_train.dtypes == 'float'].index) != 0:
        scal = Scaler()
        X = scal.fit_transform(X, df_target)
        self.perform_scaling = True  # was `self.perform_scaling is True`, a no-op
    else:
        pass

    mid_result = {}
    tuning_result = {}

    if len(pd.DataFrame(X).columns) > 20:
        search_space_LGB = Classifier(strategy="LightGBM").get_search_spaces(
            need_feature_selection=True)
        search_space_SVC = Classifier(strategy="SVC").get_search_spaces(
            need_feature_selection=True)
        search_spaces = [search_space_SVC, search_space_LGB]
    else:
        search_space_LGB = Classifier(strategy="LightGBM").get_search_spaces(
            need_feature_selection=False)
        search_space_SVC = Classifier(strategy="SVC").get_search_spaces(
            need_feature_selection=False)
        search_spaces = [search_space_SVC, search_space_LGB]

    # Initialize a pipeline
    fs = None
    for i in range(len(search_spaces)):
        if isinstance(search_spaces, tuple):
            for p in search_spaces[i][0].keys():
                if p.startswith("fs__"):
                    fs = feature_selector()
                else:
                    print(">> Number of Features < 20, ignore feature selection")
                    pass
        else:
            for p in search_spaces[i].keys():
                if p.startswith("fs__"):
                    fs = feature_selector()
                else:
                    pass

    # Do we need to cache transformers?
    cache = False
    if fs is not None:
        if "fs__strategy" in search_spaces:
            if search_spaces["fs__strategy"] != "variance":
                cache = True
            else:
                pass
    else:
        pass

    mprint('Start tuning Hyperparameters ....')
    print("")
    print(">>> Categorical Features have been encoded with: " +
          str({'strategy': ce.strategy}))
    print("")

    if self.perform_scaling is True:
        print(">>> Numerical Features have been scaled with: " +
              scal.__class__.__name__)
        print("")

    for baseestimator in self.baseEstimator:
        # Pipeline creation
        lgb = Classifier(strategy="LightGBM").get_estimator()
        # rf = Classifier(strategy="RandomForest").get_estimator()
        # svc = Classifier(strategy="SVC").get_estimator()

        if fs is not None:
            if cache:
                pipe = Pipeline([('fs', fs), ('model', lgb)],
                                memory=self.to_path)
            else:
                pipe = Pipeline([('fs', fs), ('model', lgb)])
        else:
            if cache:
                pipe = Pipeline([('model', lgb)], memory=self.to_path)
            else:
                pipe = Pipeline([('model', lgb)])

        # Note: skopt's BayesSearchCV takes `n_points`; the number of initial
        # random draws goes through optimizer_kwargs['n_initial_points'] (the
        # original passed unsupported `npoints=`/`nrandom=` keywords).
        if self.parallel_strategy is True:
            opt = BayesSearchCV(pipe,
                                search_spaces=search_spaces,
                                scoring=self.scoring,
                                cv=self.cv,
                                n_points=npoints,
                                n_jobs=-1,
                                n_iter=n_iter,
                                return_train_score=False,
                                optimizer_kwargs={
                                    'base_estimator': baseestimator,
                                    'n_initial_points': nrandom,
                                    'acq_func': 'EI'
                                },
                                random_state=self.random_state,
                                verbose=self.verbose,
                                refit=self.refit)
        else:
            opt = BayesSearchCV(pipe,
                                search_spaces=search_spaces,
                                scoring=self.scoring,
                                cv=self.cv,
                                n_points=npoints,
                                n_jobs=1,
                                n_iter=n_iter,
                                return_train_score=False,
                                optimizer_kwargs={
                                    'base_estimator': baseestimator,
                                    'n_initial_points': nrandom,
                                    'acq_func': 'EI'
                                },
                                random_state=self.random_state,
                                verbose=self.verbose,
                                refit=self.refit)

        if not isinstance(baseestimator, GaussianProcessRegressor):
            if set_callbacks is True:
                mid_result = self.report_perf(
                    opt, X, df_target,
                    ' with Surrogate Model: ' + baseestimator,
                    callbacks=[self.on_step,
                               DeadlineStopper(60 * 60)
                               # , DeltaYStopper(0.000001)
                               ])
            else:
                mid_result = self.report_perf(
                    opt, X, df_target,
                    ' with Surrogate Model: ' + baseestimator)
            tuning_result[baseestimator] = mid_result
        else:
            if set_callbacks is True:
                mid_result = self.report_perf(
                    opt, X, df_target,
                    ' with Surrogate Model: ' + baseestimator.__class__.__name__,
                    callbacks=[self.on_step,
                               DeadlineStopper(60 * 60)
                               # , DeltaYStopper(0.000001)
                               ])
            else:
                mid_result = self.report_perf(
                    opt, X, df_target,
                    ' with Surrogate Model: ' + baseestimator.__class__.__name__)
            tuning_result[baseestimator.__class__.__name__] = mid_result

    bests = pd.DataFrame()
    for key in tuning_result.keys():
        if tuning_result[key]['best_score'] == max(
                d['best_score'] for d in tuning_result.values()):
            bests = bests.append({'best_score': tuning_result[key]['best_score'],
                                  'best_SM': key,
                                  'time': tuning_result[key]['Time_cost']},
                                 ignore_index=True)

    bests = bests.sort_values(by=['time'], ascending=True).reset_index(drop=True)
    best_base_estimator = bests['best_SM'][0]
    best_param = tuning_result[best_base_estimator]['best_parmas']

    print("")
    print('######## Congratulations! Here are the Best Parameters: #######')
    print('Best Score is:', tuning_result[best_base_estimator]['best_score'])
    try:
        print('with Surrogate Model ' + best_base_estimator)
    except Exception:
        print('with Surrogate Model ' + best_base_estimator.__class__.__name__)
    pprint.pprint(best_param)

    self.best_param_ = best_param
    return best_param, tuning_result
def main():
    # N_u = 50
    # N_f = 10
    # N_u2 = 25
    # N_f2 = 500
    # typen = 'N_f'
    # trialn = 0
    # m = .1
    # lambdas = [0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01]
    # lam = lambdas[0]

    args_parser = argparser_raissi.Parser()
    args = args_parser.parse_args_verified()

    layers = [2, 100, 100, 100, 100, 2]
    burgers_layers = [2, 20, 20, 20, 20, 20, 20, 20, 20, 1]
    input_seed = 1234

    N_u = args.N_u
    N_f = args.N_f
    N_u2 = args.N_u2
    N_f2 = args.N_f2
    m = args.m
    nsec = args.time
    typen = 'N_f'  # NOTE: typen is used below but was only set in the
                   # commented-out defaults above; restored here.

    # Preparing actual solution u
    data = scipy.io.loadmat('burgers_shock.mat')
    t = data['t'].flatten()[:, None]
    x = data['x'].flatten()[:, None]
    X, T = np.meshgrid(x, t)
    udata = np.real(data['usol'])
    u = udata.T.flatten()[:, None]
    X_star = np.hstack((X.flatten()[:, None], T.flatten()[:, None]))
    ub = X_star.max()
    lb = X_star.min()

    # Preparing training/input data
    # with open('train_d.p', 'rb') as fp:
    #     inputs = pickle.load(fp)[trialn]
    # X_u_train = inputs.X_u_train[:N_u]
    # X_f_train = inputs.X_f_train[:2**N_f]
    # u_train = inputs.u_train[:N_u]

    inputs = interiorburgerslambda.prepare_nn_inputs_burgers(
        'burgers_shock.mat', N_u, N_f, N_u2, N_f2, m, typen, debugging=False)
    u_input = inputs.u_train
    # model = burgersraissi.PhysicsInformedNN(inputs.X_u_train, u_input,
    #     inputs.X_f_train, burgers_layers, inputs.lb, inputs.ub, inputs.nu,
    #     inputs.X_star, N_u, N_f)

    errors = []

    def function(lam):
        # Declare and train the model
        model = burgersraissilambda.PhysicsInformedNN(
            lam, inputs.X_u_train, inputs.u_train, inputs.X_f_train,
            burgers_layers, lb, ub, inputs.nu, X_star, N_u, N_f, N_u2, N_f2,
            m, typen)
        start_time = time.time()
        # if N_f > 0:
        #     model.load_weights_and_biases('wab/weights_and_biases_%s_%s_%s_%s.npz'
        #                                   % (N_u, N_f - 1, typen, args.epochs))
        losses = model.train(args.epochs, args.data_loc, N_u, N_f, N_u2, N_f2,
                             m, typen, args.base_plot_dir)
        # print('Training time: %.4f' % (time.time() - start_time))
        # print(losses)
        # plt.close()
        # fig, ax = plt.subplots(1, 1, figsize=(10, 10))
        # pd.Series(losses).plot(logy=True, ax=ax)
        # lpl = os.path.expanduser(args.lossplot_loc)
        # plt.savefig(lpl)
        # print("saved loss plot to {}".format(lpl))

        u_pred, f_pred = model.predict(inputs.X_star)  # X_star = tf.convert_to_tensor(X_star) ?
        error = np.linalg.norm(u - u_pred, 2) / 25000
        return error

    # start = time()
    # t = (0, 0.1, "prior")
    print(type(t))

    l = skopt.gp_minimize(function, [(.01, 1.1)],
                          callback=DeadlineStopper(nsec))
    print(m)
    print(N_u2)
    print(N_f2)
    print(type(l))
    print(l)

    with open('final_lambda.p', 'wb') as fp:
        pickle.dump({'lambda': l}, fp, protocol=2)
def hyperdrive(objective, hyperparameters, results_path, model="GP",
               n_iterations=50, verbose=False, deadline=None, sampler=None,
               n_samples=None, random_state=0):
    """
    Distributed optimization - one optimization per node.

    Parameters
    ----------
    * `objective` [function]:
        User defined function which calls a learner
        and returns a metric of interest.

    * `hyperparameters` [list, shape=(n_hyperparameters,)]:

    * `results_path` [string]
        Path to save optimization results.

    * `model` [string, default="GP"]
        Probabilistic learner used to model our objective function.
        Options:
        - "GP": Gaussian process
        - "RF": Random forest
        - "GBRT": Gradient boosted regression trees
        - "RAND": Random search

    * `n_iterations` [int, default=50]
        Number of optimization iterations.

    * `verbose` [bool, default=False]
        Verbosity of optimization.

    * `deadline` [int, optional]
        Deadline (seconds) for the optimization to finish within.

    * `sampler` [str, default=None]
        Random sampling scheme for optimizer's initial runs.
        Options:
        - "lhs": latin hypercube sampling

    * `n_samples` [int, default=None]
        Number of random samples to be drawn from the `sampler`.
        - Required if you would like to use `sampler`.
        - Must be <= the number of elements in the smallest
          hyperparameter bound's set.

    * `random_state` [int, default=0]
        Random state for reproducibility.
    """
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()

    if rank == 0:
        hyperspace = create_hyperspace(hyperparameters)
        if sampler and not n_samples:
            raise ValueError(
                'Sampler requires n_samples > 0. Got {}'.format(n_samples))
        elif sampler and n_samples:
            hyperbounds = create_hyperbounds(hyperparameters)
    else:
        hyperspace = None
        if sampler is not None:
            hyperbounds = None

    space = comm.scatter(hyperspace, root=0)

    if sampler:
        bounds = comm.scatter(hyperbounds, root=0)
        # Get initial points in the obj. function domain via latin hypercube sampling
        init_points = lhs_start(bounds, n_samples)
        n_rand = 10 - len(init_points)
    else:
        init_points = None
        n_rand = 10

    if deadline:
        deadline = DeadlineStopper(deadline)

    # Thanks Guido for refusing to believe in switch statements.
    # Case 0
    if model == "GP":
        # Verbose mode should only run on node 0.
        if verbose and rank == 0:
            result = gp_minimize(objective, space, n_calls=n_iterations,
                                 verbose=verbose, callback=deadline,
                                 x0=init_points, n_random_starts=n_rand,
                                 random_state=random_state)
        else:
            result = gp_minimize(objective, space, n_calls=n_iterations,
                                 callback=deadline, x0=init_points,
                                 n_random_starts=n_rand,
                                 random_state=random_state)
    # Case 1
    elif model == "RF":
        if verbose and rank == 0:
            result = forest_minimize(objective, space, n_calls=n_iterations,
                                     verbose=verbose, callback=deadline,
                                     x0=init_points, n_random_starts=n_rand,
                                     random_state=random_state)
        else:
            result = forest_minimize(objective, space, n_calls=n_iterations,
                                     callback=deadline, x0=init_points,
                                     n_random_starts=n_rand,
                                     random_state=random_state)
    # Case 2
    elif model == "GBRT":  # was "GRBRT", which could never match the documented "GBRT"
        if verbose and rank == 0:
            result = gbrt_minimize(objective, space, n_calls=n_iterations,
                                   verbose=verbose, callback=deadline,
                                   x0=init_points, n_random_starts=n_rand,
                                   random_state=random_state)
        else:
            result = gbrt_minimize(objective, space, n_calls=n_iterations,
                                   callback=deadline, x0=init_points,
                                   n_random_starts=n_rand,
                                   random_state=random_state)
    # Case 3
    elif model == "RAND":
        # dummy_minimize is pure random search and takes no n_random_starts
        if verbose and rank == 0:
            result = dummy_minimize(objective, space, n_calls=n_iterations,
                                    verbose=verbose, callback=deadline,
                                    x0=init_points, random_state=random_state)
        else:
            result = dummy_minimize(objective, space, n_calls=n_iterations,
                                    callback=deadline, x0=init_points,
                                    random_state=random_state)
    else:
        raise ValueError("Invalid model {}. Read the documentation for "
                         "supported models.".format(model))

    # Each worker will independently write their results to disk
    dump(result, results_path + '/hyperspace' + str(rank))
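# Collecting the per-rank results written above: each rank dumps one
# OptimizeResult, and the global optimum is simply the best across ranks.
# A sketch (results_path and the rank count are assumptions):
from skopt import load

results = [load('results/hyperspace' + str(rank)) for rank in range(4)]
best = min(results, key=lambda r: r.fun)
print(best.x, best.fun)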
#                  'boosting_type': ['Ordered'],
#                  'learning_rate': Real(0.05, 1.0, 'uniform'),
#                  'border_count': Integer(1, 25),
#                  'fold_len_multiplier': Real(1.1, 1.16, prior='uniform')}

# Setting up BayesSearchCV
opt = BayesSearchCV(clf,
                    search_spaces,
                    scoring=roc_auc,
                    cv=skf,
                    n_iter=5,
                    n_points=100,
                    n_jobs=1,  # use just 1 job with CatBoost in order to avoid segmentation fault
                    return_train_score=False,
                    refit=True,
                    optimizer_kwargs={'base_estimator': 'ET'},  # 'GP', 'RF', 'ET'
                    random_state=57)

# Running the optimization
best_params = report_perf(opt, X, y, 'CatBoost',
                          callbacks=[VerboseCallback(20),
                                     DeadlineStopper(60 * 30)])

print("Notebook Runtime: %0.2f Minutes" % ((time.time() - notebookstart) / 60))
                    cv=skf,
                    n_iter=10,
                    n_jobs=1,  # use just 1 job with CatBoost in order to avoid segmentation fault
                    return_train_score=False,
                    refit=True,
                    optimizer_kwargs={'base_estimator': 'GP'},
                    random_state=42)

# COMMAND ----------

# Execute Bayesian optimization
best_params = report_perf(optimizer=opt,
                          X=X,
                          y=y,
                          title='CatBoost',
                          callbacks=[VerboseCallback(100),
                                     DeadlineStopper(60 * 10)])

# COMMAND ----------

# Convert ordered dictionary to dictionary
import json
best_params_bayesian = json.loads(json.dumps(best_params))

# COMMAND ----------

# Manually chosen parameters
best_params = {'bagging_temperature': 0.41010395885331385,
               'border_count': 186,
               'depth': 8,
    print((title + " took %.2f seconds, candidates checked: %d, "
           "best CV score: %.3f " + u"\u00B1" + " %.3f")
          % (time() - start,
             len(optimizer.cv_results_['params']),
             best_score,
             best_score_std))
    print('Best parameters:')
    pprint.pprint(best_params)
    print()
    return best_params


# Converting average precision score into a scorer suitable for model selection
avg_prec = make_scorer(average_precision_score, greater_is_better=True,
                       needs_proba=True)

# Setting a 5-fold stratified cross-validation (note: shuffle=True)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

opt = BayesSearchCV(clf,
                    search_spaces,
                    scoring=avg_prec,
                    cv=skf,
                    n_iter=40,
                    n_jobs=-1,
                    refit=True,
                    optimizer_kwargs={'base_estimator': 'GP'},
                    random_state=22,
                    return_train_score=False)

best_params = report_perf(opt, X, y, 'LightGBM',
                          callbacks=[DeltaXStopper(0.001),
                                     DeadlineStopper(60 * 5)])
bayes_search = BayesSearchCV(estimator=clf_lgb,
                             search_spaces=bayes_space,
                             n_iter=100,
                             cv=rskf,
                             scoring='roc_auc',
                             optimizer_kwargs={'base_estimator': 'GP'},
                             verbose=-1,
                             n_jobs=-1,
                             random_state=1337)

start_time = time.time()
bayes_search = bayes_search.fit(X_train, y_train,
                                callback=[DeltaXStopper(0.0001),
                                          DeadlineStopper(60 * 60)])
print('Training time: {} (hh:mm:ss)'.format(
    time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))))

bayes_search.best_params_, bayes_search.best_score_

# last step
clf_lgb_bayes = bayes_search.best_estimator_
y_pred = clf_lgb_bayes.predict(X_test)
print(classification_report(y_test, y_pred))

y_pred = clf_lgb_bayes.predict_proba(X_test)[:, 1]
print('LGB_BAYES AUC_ROC: %.3f' % roc_auc_score(y_test, y_pred))
# not the best params ==========================================================
# [grid_search, random_search, bayes_search]
                    random_state=42)

# COMMAND ----------

opt

# COMMAND ----------

# Execute Bayesian optimization
best_params = report_perf(optimizer=opt,
                          X=X,
                          y=y,
                          title='CatBoost',
                          callbacks=[VerboseCallback(100),
                                     DeadlineStopper(60 * 10)])

# COMMAND ----------

best_params.values()

# COMMAND ----------

best_params = {'bagging_temperature': 0.41010395885331385,
               'border_count': 186,
               'depth': 8,
               'iterations': 323,
               'l2_leaf_reg': 21,
               'learning_rate': 0.0673344419215237,
               'random_strength': 3.230824361824754e-06,
X_test = scaler.transform(X_test)[:100]
y_train = y_train[:400]
y_test = y_test[:100]
joblib.dump(scaler, "dataset/%s.scalar" % name)
print("Data loaded successfully!")

# Test the baseline model
test_baseline()

allmodels = ["MLP", "GBT", "RF", "SVR"]
for mname in allmodels:
    # Bayesian optimization of the selected model
    opt = get_model(mname, 30)
    print("training start")
    callbacks = [DeadlineStopper(60 * 60 * 3), report_callback,
                 VerboseCallback(50)]
    res = opt.fit(X_train, y_train, callback=callbacks)
    print("best params: %s" % opt.best_params_)
    print("best val. score: %s" % opt.best_score_)
    print("test r2: %s" % r2_score(y_test, opt.predict(X_test)))

    result = pd.DataFrame(opt.cv_results_)
    result.to_csv("result/{name}+{mtype}.csv".format(name=name, mtype=mname))

    best_model = opt.best_estimator_
    y_pred = best_model.predict(X_test)
    score = scoring(y_test, y_pred)
    print("test score: ", score)
    metric = "_".join([str(v) for v in score.values()])
    joblib.dump(best_model,
                "result/{}_{}_best_{}.model".format(name, mname, metric))