def fit(self, df: pd.DataFrame):
    df_features = self._to_feature_df(df, True)
    df_features = df_features.dropna()
    df_features = df_features.sample(frac=1, random_state=42)
    X = self._get_X(df_features)
    y = self._get_y(df_features)

    if self.optimize_hyperparams:
        def scorer(estimator, X, y):
            y_pred = np.clip(np.squeeze(estimator.predict(X)), 0.0, 1.0)
            return -mean_absolute_error(y, y_pred)

        print('IOUEstimator: optimizing hyperparams with Bayesian optimization')
        opt = BayesSearchCV(
            LGBMRegressor(),
            {
                'num_leaves': Integer(2, 128, prior='log-uniform', base=2),
                'min_child_samples': Integer(2, 512, prior='log-uniform', base=2),
                'max_bin': Integer(2, 8192, prior='log-uniform', base=2),
            },
            n_iter=60,
            optimizer_kwargs={
                'n_initial_points': 20,
                'base_estimator': 'GP',
            },
            scoring=scorer,
            cv=3,
            refit=False,
            random_state=42,
            return_train_score=True,
        )
        opt.fit(X, y)
        print(f'Found hyperparams {opt.best_params_}')
        print(f"Train score: {opt.cv_results_['mean_train_score'][opt.best_index_]}")
        print(f'Test score: {opt.best_score_}')
        estimator = LGBMRegressor(**opt.best_params_)
    elif self.hyperparams is not None:
        print(f'IOUEstimator: using hyperparams {self.hyperparams}')
        estimator = LGBMRegressor(**self.hyperparams)
    else:
        print(f'IOUEstimator: using default hyperparams {self.DEFAULT_HYPERPARAMS}')
        estimator = LGBMRegressor(**self.DEFAULT_HYPERPARAMS)

    self.estimator_ = estimator.fit(X, y)
    return self
def load_search_space(search_space):
    """
    Build the scikit-optimize search space from a JSON-serializable dictionary

    :param search_space: dictionary describing the search space (storable in a json file)
    :type search_space: dict
    :return: dictionary of the search space (for scikit-optimize)
    :rtype: dict
    """
    from skopt.space.space import Real, Categorical, Integer

    ss = dict()
    for key in list(search_space.keys()):
        if search_space[key][0] == 'Real':
            ss[key] = Real(low=search_space[key][1][0], high=search_space[key][1][1],
                           prior=search_space[key][2])
        elif search_space[key][0] == 'Integer':
            ss[key] = Integer(low=search_space[key][1][0], high=search_space[key][1][1],
                              prior=search_space[key][2])
        elif search_space[key][0] == 'Categorical':
            ss[key] = Categorical(categories=search_space[key][1])
    return ss
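# Illustrative usage sketch (an assumption, not from the original source): the input
# format implied by the loop above is {name: [dimension_type, bounds_or_categories, prior]},
# as it would be read back from a JSON config file.
example_space = {
    'learning_rate': ['Real', [1e-4, 1e-1], 'log-uniform'],
    'num_layers': ['Integer', [1, 8], 'uniform'],
    'activation': ['Categorical', ['relu', 'tanh']],
}
skopt_space = load_search_space(example_space)
# skopt_space maps each name to a Real, Integer or Categorical dimension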
def extract_search_space(flat_search_config):
    """
    Find the variable dimensions and convert them to a skopt search space.
    """
    search_space = OrderedDict()
    for k, v in flat_search_config.items():
        # Lists with more than one value are search dimensions
        if isinstance(v, list) and len(v) > 1:
            force_categorical = len(v) > 2
            # Dedupe the list, escaping specials, and sort smallest to largest
            ds = sorted({escape_special(u) for u in v})
            prior = flat_search_config.get(f'{k}__PRIOR', None)
            base = flat_search_config.get(f'{k}__BASE', 10)
            if force_categorical or isinstance(ds[0], str):
                transform = flat_search_config.get(f'{k}__TRANSFORM', 'onehot')
                dim = Categorical(ds, prior=prior, transform=transform, name=k)
            elif isinstance(ds[0], int):
                transform = flat_search_config.get(f'{k}__TRANSFORM', 'normalize')
                dim = Integer(*tuple(ds), prior=prior, transform=transform, base=base, name=k)
            elif isinstance(ds[0], float):
                transform = flat_search_config.get(f'{k}__TRANSFORM', 'normalize')
                dim = Real(*tuple(ds), prior=prior, transform=transform, base=base, name=k)
            search_space[k] = dim
    return search_space
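# Hypothetical flat config (illustrative assumption, not from the original source):
# two-element numeric lists become Integer/Real ranges, longer lists become Categorical,
# and `<key>__PRIOR` / `<key>__BASE` / `<key>__TRANSFORM` entries tune the dimension.
flat_config = {
    'learning_rate': [1e-4, 1e-1],
    'learning_rate__PRIOR': 'log-uniform',
    'hidden_units': [32, 512],
    'optimizer': ['adam', 'sgd', 'rmsprop'],
}
space = extract_search_space(flat_config)
# -> OrderedDict with Real('learning_rate'), Integer('hidden_units'), Categorical('optimizer')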
def test_custom_dimensions_per_model():
    """Assert that the custom dimensions are distributed over the models."""
    trainer = DirectClassifier(
        models=["LR1", "LR2"],
        n_calls=2,
        n_initial_points=2,
        bo_params={
            "dimensions": {
                "lr1": [Integer(100, 200, name="max_iter")],
                "lr2": [Integer(300, 400, name="max_iter")],
            },
        },
        random_state=1,
    )
    trainer.run(bin_train, bin_test)
    assert 100 <= trainer.lr1.best_params["max_iter"] <= 200
    assert 300 <= trainer.lr2.best_params["max_iter"] <= 400
def test_custom_dimensions_all_models():
    """Assert that custom dimensions apply to all models when not provided as a dict."""
    trainer = DirectClassifier(
        models=["LR1", "LR2"],
        n_calls=2,
        n_initial_points=2,
        bo_params={"dimensions": [Integer(100, 1000, name="max_iter")]},
        random_state=1,
    )
    trainer.run(bin_train, bin_test)
    assert list(trainer.lr1.best_params.keys()) == ["max_iter"]
    assert list(trainer.lr2.best_params.keys()) == ["max_iter"]
def map_dim(values):
    if isinstance(values, tuple):
        # linear subspace
        low, high, n_steps, value_type = values
        if value_type == 'i':
            return Integer(low, high)
        elif value_type == 'f':
            return Real(low, high)
        else:
            raise ValueError(f'Unknown value type "{value_type}"')
    else:
        # exhaustive list of options
        return Categorical(values)
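# Illustrative calls (assumptions, not from the original source): a 4-tuple maps to an
# integer or real range (the n_steps element is unpacked but unused here), while any
# other iterable is treated as an explicit list of choices.
depth_dim = map_dim((2, 10, 9, 'i'))        # -> Integer(2, 10)
dropout_dim = map_dim((0.0, 0.5, 6, 'f'))   # -> Real(0.0, 0.5)
optimizer_dim = map_dim(['adam', 'sgd'])    # -> Categorical(['adam', 'sgd'])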
def opt_space(self, groups):
    """Internal - generates the optimization space for the given groups of the board"""
    space = []
    for g in groups:
        gp = self.board.params.group_params(g)
        for p in gp:
            if not p.no_opt:
                ##min_v = p.min_v
                ##max_v = p.max_v
                min_v = p.opt_minv if p.opt_minv is not None else p.min_v
                max_v = p.opt_maxv if p.opt_maxv is not None else p.max_v
                space.extend([Integer(min_v, max_v, name=p.key)])
    return space
def main(arguments):
    global train_mode

    EVALS = 50
    use_mp = True
    run_all = False
    selected_exp = []
    selected_datasets = []

    if '--build_datasets' in arguments:
        print('Building all necessary datasets required for the experiments. Disregarding other arguments! ' +
              'You will need to run this script again without --build_datasets in order to run experiments!')
        # Make all datasets
        for d in all_datasets:
            load_URMs(d, dataset_kwargs)
        return

    if '--no_mp' in arguments:
        print('No multiprocessing requested! Falling back to serial execution of experiments!')
        use_mp = False
        arguments.remove('--no_mp')

    if '--run_all' in arguments:
        print('All datasets selected for each algorithm!')
        selected_datasets = all_datasets
        run_all = True

    # user-based training
    if '--user' in arguments:
        train_mode = 'user'

    # item-based training
    if '--item' in arguments:
        train_mode = 'item'

    for arg in arguments:
        if not run_all and arg in name_datasets:
            selected_datasets.append(all_datasets[name_datasets.index(arg)])
        if arg in all_recommenders:
            selected_exp.append(arg)

    dict_rec_classes = {}
    dict_dimensions = {}
    dict_fit_params = {}
    dict_init_configs = {}

    # Experiment parameters

    # PureSVD hyperparameters
    puresvd_dimensions = [
        Integer(1, 250, name='num_factors', dtype=int)
    ]
    puresvd_fit_params = [d.name for d in puresvd_dimensions]

    # ALS hyperparameters
    ials_dimensions = [
        Integer(1, 250, name='num_factors', dtype=int),
        Categorical(["linear", "log"], name='confidence_scaling'),
        Real(low=1e-3, high=50, prior='log-uniform', name='alpha', dtype=float),
        Real(low=1e-5, high=1e-2, prior='log-uniform', name='reg', dtype=float),
        Real(low=1e-3, high=10.0, prior='log-uniform', name='epsilon', dtype=float)
    ]
    ials_fit_params = [d.name for d in ials_dimensions]

    # BPR hyperparameters, 150 epochs
    bpr_dimensions = [
        Categorical([150], name='epochs'),
        Integer(1, 250, name='num_factors', dtype=int),
        Categorical([128, 256, 512, 1024], name='batch_size'),
        Categorical(["adagrad", "adam"], name='sgd_mode'),
        Real(low=1e-12, high=1e-3, prior='log-uniform', name='positive_reg'),
        Real(low=1e-12, high=1e-3, prior='log-uniform', name='negative_reg'),
        Real(low=1e-6, high=1e-2, prior='log-uniform', name='learning_rate'),
    ]
    bpr_fit_params = [d.name for d in bpr_dimensions]

    # NMF hyperparameters
    nmf_dimensions = [
        Integer(1, 500, name='num_factors', dtype=int),
        Real(low=1e-5, high=1, prior='log-uniform', name='l1_ratio', dtype=float),
        Categorical(['coordinate_descent', 'multiplicative_update'], name='solver'),
        Categorical(['nndsvda'], name='init_type'),
        Categorical(['frobenius', 'kullback-leibler'], name='beta_loss')
    ]
    nmf_fit_params = [d.name for d in nmf_dimensions]

    # SLIM-BPR hyperparameters, 150 epochs
    slimbpr_dimensions = [
        Integer(low=5, high=1000, prior='uniform', name='topK', dtype=int),
        Categorical([150], name='epochs'),
        Categorical([True, False], name='symmetric'),
        Categorical(["sgd", "adagrad", "adam"], name='sgd_mode'),
        Real(low=1e-9, high=1e-3, prior='log-uniform', name='lambda_i', dtype=float),
        Real(low=1e-9, high=1e-3, prior='log-uniform', name='lambda_j', dtype=float),
        Real(low=1e-4, high=1e-1, prior='log-uniform', name='learning_rate', dtype=float)
    ]
    slimbpr_fit_names = [d.name for d in slimbpr_dimensions]

    # CFGAN hyperparameters
    cfgan_dimensions = [
        Categorical([300], name='epochs'),
        Integer(1, 5, prior='uniform', name='d_steps', dtype=int),
        Integer(1, 5, prior='uniform', name='g_steps', dtype=int),
        Integer(1, 5, prior='uniform', name='d_layers', dtype=int),
        Integer(1, 5, prior='uniform', name='g_layers', dtype=int),
        Categorical(['linear', 'tanh', 'sigmoid'], name='d_hidden_act'),
        Categorical(['linear', 'tanh', 'sigmoid'], name='g_hidden_act'),
        Categorical(['ZR', 'PM', 'ZP'], name='scheme'),
        Categorical([64, 128, 256, 512, 1024], name='d_batch_size'),
        Categorical([64, 128, 256, 512, 1024], name='g_batch_size'),
        Real(low=0, high=1, prior='uniform', name='zr_ratio', dtype=float),
        Real(low=0, high=1, prior='uniform', name='zp_ratio', dtype=float),
        Real(low=0, high=1, prior='uniform', name='zr_coefficient', dtype=float),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='d_lr', dtype=float),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='g_lr', dtype=float),
        Real(low=1e-6, high=1e-4, prior='log-uniform', name='d_reg', dtype=float),
        Real(low=1e-6, high=1e-4, prior='log-uniform', name='g_reg', dtype=float),
    ]
    cfgan_fit_params = [d.name for d in cfgan_dimensions]

    # GANMF hyperparameters
    ganmf_dimensions = [
        Categorical([300], name='epochs'),
        Integer(low=1, high=250, name='num_factors', dtype=int),
        Categorical([64, 128, 256, 512, 1024], name='batch_size'),
        Integer(low=1, high=10, name='m', dtype=int),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='d_lr', dtype=float),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='g_lr', dtype=float),
        Real(low=1e-6, high=1e-4, prior='log-uniform', name='d_reg', dtype=float),
        Real(low=1e-2, high=0.5, prior='uniform', name='recon_coefficient', dtype=float),
        # Integer(5, 400, name='emb_dim', dtype=int),
        # Integer(1, 10, name='d_steps', dtype=int),
        # Integer(1, 10, name='g_steps', dtype=int),
        # Real(low=1e-6, high=1e-4, prior='log-uniform', name='g_reg', dtype=float),
    ]
    ganmf_fit_params = [d.name for d in ganmf_dimensions]

    # DisGANMF hyperparameters
    disgan_dimensions = [
        Categorical([300], name='epochs'),
        Categorical(['linear', 'tanh', 'relu', 'sigmoid'], name='d_hidden_act'),
        Integer(low=1, high=5, prior='uniform', name='d_layers', dtype=int),
        Integer(low=1, high=250, name='num_factors', dtype=int),
        Categorical([64, 128, 256, 512, 1024], name='batch_size'),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='d_lr', dtype=float),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='g_lr', dtype=float),
        Real(low=1e-6, high=1e-4, prior='log-uniform', name='d_reg', dtype=float),
        Real(low=1e-2, high=0.5, prior='uniform', name='recon_coefficient', dtype=float)
    ]
    disgan_fit_params = [d.name for d in disgan_dimensions]

    # DeepGANMF hyperparameters
    deepganmf_dimensions = [
        Categorical([300], name='epochs'),
        Categorical(['linear', 'tanh', 'relu', 'sigmoid'], name='d_hidden_act'),
        Categorical(['linear', 'tanh', 'relu', 'sigmoid'], name='g_hidden_act'),
        Categorical(['linear', 'tanh', 'relu', 'sigmoid'], name='g_output_act'),
        Categorical([1, 3, 5], name='d_layers'),
        Categorical([1, 2, 3, 4, 5], name='g_layers'),
        Categorical([64, 128, 256, 512, 1024], name='batch_size'),
        Integer(low=1, high=10, name='m', dtype=int),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='d_lr', dtype=float),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='g_lr', dtype=float),
        Real(low=1e-6, high=1e-4, prior='log-uniform', name='d_reg', dtype=float),
        Real(low=1e-2, high=0.5, prior='uniform', name='recon_coefficient', dtype=float),
    ]
    deepganmf_fit_params = [d.name for d in deepganmf_dimensions]

    dict_rec_classes['TopPop'] = TopPop
    dict_rec_classes['Random'] = Random
    dict_rec_classes['PureSVD'] = PureSVDRecommender
    dict_rec_classes['BPR'] = MatrixFactorization_BPR_Cython
    dict_rec_classes['ALS'] = IALSRecommender
    dict_rec_classes['NMF'] = NMFRecommender
    dict_rec_classes['GANMF'] = GANMF
    dict_rec_classes['CFGAN'] = CFGAN
    dict_rec_classes['DisGANMF'] = DisGANMF
    dict_rec_classes['SLIMBPR'] = SLIM_BPR_Cython
    dict_rec_classes['DeepGANMF'] = DeepGANMF

    dict_dimensions['TopPop'] = []
    dict_dimensions['Random'] = []
    dict_dimensions['PureSVD'] = puresvd_dimensions
    dict_dimensions['BPR'] = bpr_dimensions
    dict_dimensions['ALS'] = ials_dimensions
    dict_dimensions['NMF'] = nmf_dimensions
    dict_dimensions['GANMF'] = ganmf_dimensions
    dict_dimensions['CFGAN'] = cfgan_dimensions
    dict_dimensions['DisGANMF'] = disgan_dimensions
    dict_dimensions['SLIMBPR'] = slimbpr_dimensions
    dict_dimensions['DeepGANMF'] = deepganmf_dimensions

    dict_fit_params['TopPop'] = []
    dict_fit_params['Random'] = []
    dict_fit_params['PureSVD'] = puresvd_fit_params
    dict_fit_params['BPR'] = bpr_fit_params
    dict_fit_params['ALS'] = ials_fit_params
    dict_fit_params['NMF'] = nmf_fit_params
    dict_fit_params['GANMF'] = ganmf_fit_params
    dict_fit_params['CFGAN'] = cfgan_fit_params
    dict_fit_params['DisGANMF'] = disgan_fit_params
    dict_fit_params['SLIMBPR'] = slimbpr_fit_names
    dict_fit_params['DeepGANMF'] = deepganmf_fit_params

    pool_list_experiments = []
    pool_list_dimensions = []

    for exp in selected_exp:
        for d in selected_datasets:
            new_exp = RecSysExp(dict_rec_classes[exp], dataset=d,
                                fit_param_names=dict_fit_params[exp],
                                method='bayesian', seed=seed)
            if use_mp:
                pool_list_experiments.append(new_exp)
                pool_list_dimensions.append(dict_dimensions[exp])
            else:
                new_exp.tune(dict_dimensions[exp], evals=EVALS,
                             init_config=dict_init_configs[exp] if exp in dict_init_configs else None)

    if use_mp:
        # Need to turn off MKL's own threading mechanism in order to use MP
        # https://github.com/joblib/joblib/issues/138
        os.environ['MKL_NUM_THREADS'] = '1'
        os.environ['OMP_NUM_THREADS'] = '1'
        os.environ['MKL_DYNAMIC'] = 'FALSE'

        pool = mp.Pool(initializer=set_affinity_on_worker)
        pool.starmap_async(run_exp, zip(pool_list_experiments, pool_list_dimensions,
                                        [EVALS] * len(pool_list_experiments)))
        pool.close()
        pool.join()
def tune(self, params, evals=10, init_config=None, seed=None):
    """
    Runs the hyperparameter search using a Gaussian Process as surrogate model
    or Random Search, saves the results of the trials and prints the best
    parameters found.

    Parameters
    ----------
    params: list
        List of skopt.space.space.Dimension objects to be searched.
    evals: int
        Number of evaluations to perform.
    init_config: list, default None
        An initial parameter configuration for seeding the Gaussian Process.
    seed: int, default None
        Seed for random_state of `gp_minimize` or `dummy_minimize`.
        Set to a fixed integer for reproducibility.
    """
    msg = 'Started ' + self.recommender_class.RECOMMENDER_NAME + ' ' + self.dataset_name
    subprocess.run(['telegram-send', msg])

    # Shape of the URM_test CSR matrix
    U, I = self.URM_test.shape

    if self.recommender_class == GANMF:
        params.append(Integer(4, int(I * 0.75) if I <= 1024 else 1024, name='emb_dim', dtype=int))
        self.fit_param_names.append('emb_dim')

    if self.recommender_class == CFGAN or self.recommender_class == DeepGANMF:
        params.append(Integer(4, int(I * 0.75) if I <= 1024 else 1024, name='d_nodes', dtype=int))
        params.append(Integer(4, int(I * 0.75) if I <= 1024 else 1024, name='g_nodes', dtype=int))
        self.fit_param_names.append('d_nodes')
        self.fit_param_names.append('g_nodes')

    if self.recommender_class == DisGANMF:
        params.append(Integer(4, int(I * 0.75) if I <= 1024 else 1024, name='d_nodes', dtype=int))
        self.fit_param_names.append('d_nodes')

    self.dimension_names = [p.name for p in params]

    # Make sure that the max. value of the `num_factors` parameter is not larger than min(U, I)
    try:
        idx = self.dimension_names.index('num_factors')
        maxval = params[idx].bounds[1]
        if maxval > min(U, I):
            params[idx] = Integer(1, min(U, I), name='num_factors', dtype=int)
    except ValueError:
        pass

    if len(params) > 0:
        # Check if there is already a checkpoint for this experiment
        checkpoint_path = os.path.join(self.logsdir, 'checkpoint.pkl')
        checkpoint_exists = True if os.path.exists(checkpoint_path) else False
        checkpoint_saver = CheckpointSaver(os.path.join(self.logsdir, 'checkpoint.pkl'), compress=3)

        if seed is None:
            seed = self.seed

        t_start = int(time.time())

        if checkpoint_exists:
            previous_run = skopt.load(checkpoint_path)
            if self.method == 'bayesian':
                results = gp_minimize(self.obj_func, params,
                                      n_calls=evals - len(previous_run.func_vals),
                                      x0=previous_run.x_iters, y0=previous_run.func_vals,
                                      n_random_starts=0, random_state=seed, verbose=True,
                                      callback=[checkpoint_saver])
            else:
                results = dummy_minimize(self.obj_func, params,
                                         n_calls=evals - len(previous_run.func_vals),
                                         x0=previous_run.x_iters, y0=previous_run.func_vals,
                                         random_state=seed, verbose=True,
                                         callback=[checkpoint_saver])
        else:
            # Hyperparameter optimization
            if self.method == 'bayesian':
                results = gp_minimize(self.obj_func, params, n_calls=evals, random_state=seed,
                                      verbose=True, callback=[checkpoint_saver])
            else:
                results = dummy_minimize(self.obj_func, params, n_calls=evals, random_state=seed,
                                         verbose=True, callback=[checkpoint_saver])

        t_end = int(time.time())

        # Save best parameters of this experiment
        # best_params = dict(zip(self.dimension_names, results.x))
        # with open(os.path.join(self.logsdir, 'best_params.pkl'), 'wb') as f:
        #     pickle.dump(best_params, f, pickle.HIGHEST_PROTOCOL)
        best_params = self.load_best_params()

        with open(os.path.join(self.logsdir, 'results.txt'), 'a') as f:
            f.write('Experiment ran for {}\n'.format(str(datetime.timedelta(seconds=t_end - t_start))))
            f.write('Best {} score: {}. Best result found at: {}\n'.format(self.metric, results.fun, best_params))

        if self.recommender_class in [IALSRecommender, MatrixFactorization_BPR_Cython]:
            self.dimension_names.append('epochs')

        self.build_fit_params(best_params.values())

    # Retrain with all training data
    set_seed(seed)
    if self.isGAN:
        model = self.recommender_class(self.URM_train, mode=train_mode, is_experiment=True)
        model.logsdir = self.logsdir
        model.fit(**self.fit_params)
        # load_models(model, save_dir='best_model', all_in_folder=True)
    else:
        model = self.recommender_class(self.URM_train)
        model.fit(**self.fit_params)
        # model.loadModel(os.path.join(self.logsdir, 'best_model'))

    _, results_run_string = self.evaluatorTest.evaluateRecommender(model)

    print('\n\nResults on test set:')
    print(results_run_string)
    print('\n\n')

    with open(os.path.join(self.logsdir, 'result_test.txt'), 'w') as f:
        f.write(results_run_string)

    msg = 'Finished ' + self.recommender_class.RECOMMENDER_NAME + ' ' + self.dataset_name
    subprocess.run(['telegram-send', msg])
    fig_1, ax_1, fig_2, ax_2 = plotStats(stats, keys)
    plt.show()

    validAccs = stats[:, -1]
    length10percent = len(validAccs) // 10
    best10percent = np.sort(validAccs)[-length10percent:]

    # We want to maximise the MEAN validation accuracy,
    # i.e. minimise its negative
    return -np.mean(best10percent)


# In[14]:

inputKeepProbSpace = Real(0.5, 1.0, "uniform")
hiddenKeepProbSpace = Real(0.5, 1.0, "uniform")
hiddenDimSpace = Integer(20, 2000)
lamda2Space = Real(1e-3, 10, "log-uniform")

space = [inputKeepProbSpace, hiddenKeepProbSpace, hiddenDimSpace, lamda2Space]

# TARGET IS 58% as the original Deep Neural Net


# In[15]:

if jupyterNotebookEnabled:
    get_ipython().magic(u'%time')

# this might crash so you need to run it outside as a python file (file -> save as python)
if not os.path.isfile(res_gp_save_filename):
    if os.path.isfile(statsCollectionFilename):
        os.remove(statsCollectionFilename)  # erase before executing
def startExperiment(parameters):
    """
    Starts an experiment with the given parameters

    :param parameters: parameters of the experiment
    :type parameters: Dict
    """
    optimizationPath = str(os.path.join(parameters["path"], parameters["experimentId"]))
    json_file = str(os.path.join(optimizationPath, parameters["experimentId"] + ".json"))
    if os.path.isfile(json_file):
        Optimizer = importOptimizer()
        optimizer = Optimizer()
        optimizer.resume_optimization(json_file)
    else:
        # Import dataset class and initialize an instance with the chosen dataset
        dataset_class = importDataset()
        dataset = dataset_class()
        dataset_path = str(os.path.join(pathDataset, "preprocessed_datasets", parameters["dataset"]))
        dataset.load_custom_dataset_from_folder(dataset_path)

        model_class = importModel(parameters["model"]["name"])
        model = model_class()

        model.hyperparameters.update(parameters["model"]["parameters"])
        model.partitioning(parameters["partitioning"])

        search_space = {}
        for key, value in parameters["optimization"]["search_spaces"].items():
            if "low" in value:
                if isinstance(value["low"], float) or isinstance(value["high"], float):
                    search_space[key] = Real(low=value["low"], high=value["high"])
                else:
                    search_space[key] = Integer(low=value["low"], high=value["high"])
            else:
                search_space[key] = Categorical(value)

        metric_parameters = parameters["optimize_metrics"][0]["parameters"]
        for key in metric_parameters:
            if metric_parameters[key] == "use dataset texts":
                metric_parameters[key] = dataset.get_corpus()
            elif metric_parameters[key] == "use selected dataset":
                metric_parameters[key] = dataset
            elif os.path.isdir(str(metric_parameters[key])):
                metricDataset = dataset_class()
                metricDataset.load_custom_dataset_from_folder(metric_parameters[key])
                metric_parameters[key] = metricDataset.get_corpus()

        metric_class = importMetric(parameters["optimize_metrics"][0]["name"])
        metric = metric_class(**metric_parameters)

        metrics_to_track = []
        for single_metric in parameters["track_metrics"]:
            metric_class = importMetric(single_metric["name"])
            single_metric_parameters = single_metric["parameters"]
            for key in single_metric_parameters:
                if single_metric_parameters[key] == "use dataset texts":
                    single_metric_parameters[key] = dataset.get_corpus()
                elif single_metric_parameters[key] == "use selected dataset":
                    single_metric_parameters[key] = dataset
            new_metric = metric_class(**single_metric_parameters)
            metrics_to_track.append(new_metric)

        vocabulary_path = str(os.path.join(parameters["path"], parameters["experimentId"], "models"))
        Path(vocabulary_path).mkdir(parents=True, exist_ok=True)
        vocabulary_path = str(os.path.join(vocabulary_path, "vocabulary.json"))

        file = open(vocabulary_path, "w")
        json.dump(dict(corpora.Dictionary(dataset.get_corpus())), file)
        file.close()

        Optimizer = importOptimizer()
        optimizer = Optimizer()
        optimizer.optimize(
            model, dataset, metric, search_space, metrics_to_track,
            random_state=True,
            initial_point_generator="random",
            surrogate_model=parameters["optimization"]["surrogate_model"],
            model_runs=parameters["optimization"]["model_runs"],
            n_random_starts=parameters["optimization"]["n_random_starts"],
            acq_func=parameters["optimization"]["acquisition_function"],
            number_of_call=parameters["optimization"]["iterations"],
            save_models=True,
            save_name=parameters["experimentId"],
            save_path=optimizationPath)
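# Illustrative sketch (an assumption, not from the original source): the shape of
# parameters["optimization"]["search_spaces"] that the loop above expects. Dicts with
# "low"/"high" become Real or Integer dimensions depending on the bound types, while
# plain lists become Categorical dimensions.
example_search_spaces = {
    "num_topics": {"low": 5, "high": 50},    # -> Integer(low=5, high=50)
    "alpha": {"low": 0.01, "high": 1.0},     # -> Real(low=0.01, high=1.0)
    "activation": ["softplus", "relu"],      # -> Categorical(["softplus", "relu"])
}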
    filename = stats_coll_filename
    statsCollection = np.load(filename)[()] if os.path.isfile(filename) else dict()
    statsCollection[(state_size, num_steps, learning_rate)] = stats
    np.save(filename, statsCollection)

    if plotting:
        fig_1, ax_1, fig_2, ax_2 = plotStats(stats, DynStats.keys)
        plt.show()

    # We want to minimize the number of epochs required to reach 23% accuracy
    return metric


# In[13]:

stateSizeSpace = Integer(15, 1000)
numStepSpace = Categorical(numLens)
learningRateSpace = Real(1e-6, 1e-1, prior="log-uniform")

space = [stateSizeSpace, numStepSpace, learningRateSpace]


# In[14]:

if jupyterNotebookEnabled:
    get_ipython().magic(u'%time')

if not os.path.isfile(best_params_filename):
    if os.path.isfile(stats_coll_filename):
        os.remove(stats_coll_filename)

    res_gp = gp_minimize(
# numerical pipeline
numeric_pipeline = Pipeline([('select_numeric', TypeSelector(dtype='number'))])

# processing pipeline
cat_num_featun = FeatureUnion([('categorical', categorical_pipeline),
                               ('numerical', numeric_pipeline)])

# combined pipeline
estimator_pipeline = Pipeline([('Features', feature_pipeline),
                               ('Categorical_Numeric', cat_num_featun),
                               ('Estimator', RandomForestClassifier())])

# search space
search_space = {
    "Estimator__n_estimators": Integer(35, 45),
    "Estimator__min_samples_split": Integer(95, 105),
}

# scorer
metric = make_scorer(score_func=log_loss, greater_is_better=False, needs_proba=True,
                     labels=train['Category'].unique())

# cv
kfold_cv = KFold(n_splits=10, shuffle=True, random_state=42)

# bayessearch cv
bayes_tuned_pipeline = BayesSearchCV(estimator=estimator_pipeline,
                                     search_spaces=search_space,