Example #1
def fmin_thread_fn(space, trials, max_evals, seed):
    fmin(fn=passthrough,
         space=space,
         algo=rand.suggest,
         trials=trials,
         rstate=np.random.RandomState(seed),
         max_evals=max_evals,
         return_argmin=False)
Example #2
def fmin_thread_fn(space, trials, max_evals, seed):
    fmin(
        fn=passthrough,
        space=space,
        algo=rand.suggest,
        trials=trials,
        rstate=np.random.RandomState(seed),
        max_evals=max_evals,
        return_argmin=False)
Example #3
def fmin_thread_fn(space, mongo_trials: MongoTrials, max_evals: int):
    fmin(
        fn=objective_with_attachments,
        space=space,
        algo=rand.suggest,
        trials=mongo_trials,
        rstate=np.random.RandomState(),
        max_evals=max_evals,
        return_argmin=False,
    )
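
Example #3 receives its MongoTrials instance from elsewhere. A minimal sketch of how such an object is typically wired up; the connection string, database name, and exp_key below are illustrative, not from the original:

from hyperopt.mongoexp import MongoTrials

# Jobs are queued in MongoDB; one or more worker processes evaluate them.
mongo_trials = MongoTrials('mongo://localhost:27017/hyperopt_db/jobs',
                           exp_key='demo_experiment')

# In a separate shell, attach a worker to the same database:
#   hyperopt-mongo-worker --mongo=localhost:27017/hyperopt_db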
Example #4
    def test_basic(self):
        bandit = self._bandit_cls()
        # print('bandit params', bandit.params, bandit)
        # print('algo params', algo.vh.params)
        trials = Trials()
        fmin(lambda x: x, bandit.expr,
             trials=trials,
             algo=suggest,
             max_evals=self._n_steps)
        assert trials.average_best_error(bandit) - bandit.loss_target < .2
Example #5
    def fit(self, X_train, y_train):

        if self.best_score_ is None:
            self.best_score_ = -float('inf')

        self.X_train = X_train
        self.y_train = y_train

        self.estimator = sklearn.base.clone(self.estimator)

        best_params = fmin(fn=self.__objective,
                           space=self.search_space,
                           algo=tpe.suggest,
                           max_evals=self.n_iter,
                           rstate=RandomState(42))
        evaluated = space_eval(self.search_space, best_params)
        if self.verbose:
            print('best params: ', evaluated)

        casted_best_params = self.__cast_params(evaluated)
        self.best_params_ = casted_best_params

        self.estimator.set_params(**casted_best_params)
        self.estimator.fit(self.X_train, self.y_train)

        return self
Example #6
    def fit(self, X, y, indicator):
        '''
        indicator=1 means we sample and fit once to evaluate a fixed set of
        hyper-parameters; 0 means run hyperopt to search the neighborhood of
        the seed hyper-parameters and check whether model quality improves.
        '''

        XFull = X
        yFull = y
        self.Xe_train, self.Xe_test, self.ys_train, self.ys_test = \
            train_test_split(XFull, yFull.ravel(), test_size=self.test_size,
                             random_state=self.seed, shuffle=True)

        if indicator == 1:
            ## just fit lightgbm once to obtain the AUC w.r.t a fixed set of hyper-parameters ##
            model = LGBMClassifier(random_state=self.seed,
                                   min_data=1,
                                   min_data_in_bin=1)
            model.set_params(**self.param_space)
            model.fit(self.Xe_train, self.ys_train)
            mypreds = model.predict_proba(self.Xe_test)[:, 1]
            auc = auc_metric(self.ys_test.reshape(-1, 1),
                             mypreds.reshape(-1, 1))
            return auc
        else:
            trials = Trials()
            best = fmin(fn=self.gbc_objective,
                        space=self.param_space,
                        algo=tpe.suggest,
                        trials=trials,
                        max_evals=self.max_evaluations)
            params = space_eval(self.param_space, best)
            self.best_params = params
            return params, 1 - np.min([x['loss'] for x in trials.results])
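
A usage sketch for the fit(X, y, indicator) API above. The class name AutoHyperOptimizer and its constructor arguments are assumptions for illustration, not part of the original:

# Hypothetical wiring; the class name and constructor arguments are assumed.
opt = AutoHyperOptimizer(test_size=0.2, seed=42, max_evaluations=30)

# indicator=1: one LightGBM fit with the seed hyper-parameters, returns AUC
auc = opt.fit(X, y, indicator=1)

# any other indicator: hyperopt search around the seed hyper-parameters,
# returns (best_params, best AUC seen during the search)
params, best_auc = opt.fit(X, y, indicator=0)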
Example #7
    def search(self):
        """
        Search the parameter grid
        """
        self.__start_now = datetime.datetime.now()
        self.__reset()

        if self.verbose > 0:
            print("Starting HyperOpt Parameter Search")

        best = fmin(fn=self.__objective,
                    space=self.__space,
                    algo=self.__algo,
                    max_evals=self.iters,
                    trials=self.__trial,
                    verbose=self.verbose,
                    timeout=self.time_to_search)

        if self.verbose > 0:
            print("Finished HyperOpt Parameter Search")

        if self.best_score > self.__init_score:
            self.best_params = self.__trial.best_trial["result"]["params"]
        else:
            self.best_params = self.__initparams
Example #8
def nu_simple_fmin(hpo_project_key, objective, rseed=1337, full_model_string=None, notebook_name=None, verbose=True, stack=3, keep_temp=False, data_args=None):
    
    # fetch the HPO project info from the DB
    db_info = asyncio.run(Requests().get_action(parameter1 = hpo_project_key, parameter2 = "null", url = hpo_url))[0]
    hpo_project_id = db_info["hpoProjectId"]
    algo, space = __transform_db_to_function(method = db_info["method"], config = db_info["config"])

    trials = Trials()
    best = fmin(objective, space, algo=algo, max_evals=50, trials=trials, rstate=np.random.RandomState(rseed), return_argmin=True)
    importances = calculate_importance(trials)
 
    # save via the API
    all_info = dict()
    
    all_info["best_result"] = trials.best_trial['result'] 
    all_info["best_hp"] = best
    all_info["trial_result"] = trials.results
    all_info["trial_hp"] = trials.vals
    # convert numpy int64 values so they serialize to JSON
    all_info = __to_int(all_info)

    all_info["hpo_project_key"] = hpo_project_key

    tmp_importance = list()
    for i in range(len(importances)):
        for key1, value1 in importances[i].items():
            for key2, value2 in value1.items(): 
                tmp_importance.append(value2)
                break
    all_info["importances"] = tmp_importance

    asyncio.run(Requests().post_action(request_datas = all_info, url = hpo_url))

    return best, trials
Example #9
def nu_fmin(hpo_project_key, objective, space, algo, max_evals, trials, rseed=1337, full_model_string=None, notebook_name=None, verbose=True, stack=3, keep_temp=False, data_args=None):
        
    best = fmin(objective, space, algo=algo, max_evals=max_evals, trials=trials, rstate=np.random.RandomState(rseed), return_argmin=True)
    importances = calculate_importance(trials)

    all_info = dict()
    
    all_info["best_result"] = trials.best_trial['result'] 
    all_info["best_hp"] = best
    all_info["trial_result"] = trials.results
    all_info["trial_hp"] = trials.vals
    # convert numpy int64 values so they serialize to JSON
    all_info = __to_int(all_info)
    
    all_info["hpo_project_key"] = hpo_project_key

    method, config = __transform_function_to_db(algo, space)
    all_info["method"] = method
    all_info["config"] = config

    tmp_importance = list()
    for i in range(len(importances)):
        for key1, value1 in importances[i].items():
            for key2, value2 in value1.items(): 
                tmp_importance.append(value2)
                break
    all_info["importances"] = tmp_importance

    asyncio.run(Requests().post_action(request_datas = all_info, url = hpo_url))

    return best
Example #10
    def optimize(self, trials):
        space = {
            'n_estimators': 500,
            'boosting_type': 'dart',
            'num_leaves': hp.choice('num_leaves', np.arange(20, 60, dtype=int)),
            'max_depth': hp.choice('max_depth', np.arange(1, 30, dtype=int)),
            'min_data': hp.choice('min_data', np.arange(5, 40, dtype=int)),
            'max_bin': hp.choice('max_bin', np.arange(200, 300, dtype=int)),
            'rate_drop': hp.uniform('rate_drop', 0, 1),
            'skip_drop': hp.uniform('skip_drop', 0, 1),
            # 'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
            'eta': hp.loguniform('eta', -3.5, -1.5),
            'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
            # 'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
            'subsample': hp.uniform('subsample', 0.5, 1),
            'alpha': hp.uniform('alpha', 0, 0.5),
            'lambda': hp.uniform('lambda', 1e-4, 1),
            'scale_pos_weight': hp.choice('scale_pos_weight',
                                          [0.1, 1., 10, 100, 1e3, 1e4, 1e5, 1e6]),
            'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
            # 'num_class': 2,
            'objective': 'binary',
            'metric': 'auc',
            'nthread': 0,
            'verbose': 1
        }

        best = fmin(self.scorer,
                    space,
                    algo=tpe.suggest,
                    trials=trials,
                    max_evals=100)
        best = space_eval(space, best)

        for k, v in best.items():
            if k in ['max_depth', 'num_leaves', 'min_data', 'max_bin']:
                v = int(v)
            space[k] = v

        return space
Example #11
    def optimize(self, trials):
        space = {
            'n_estimators': 1000,
            'booster': 'dart',
            'sample_type': hp.choice('sample_type', ['uniform', 'weighted']),
            'normalize_type': hp.choice('normalize_type', ['tree', 'forest']),
            'rate_drop': hp.uniform('rate_drop', 0, 1),
            'skip_drop': hp.uniform('skip_drop', 0, 1),
            # 'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
            'eta': hp.loguniform('eta', -3.5, -1.5),
            'max_depth': hp.choice('max_depth', np.arange(1, 30, dtype=int)),
            'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
            # 'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
            'subsample': hp.uniform('subsample', 0.5, 1),
            'alpha': hp.uniform('alpha', 0, 0.5),
            'lambda': hp.uniform('lambda', 1e-4, 1),
            'scale_pos_weight_multiplier': hp.choice(
                'scale_pos_weight_multiplier',
                [0, 0.1, 1., 10, 100, 1e3, 1e4, 1e5, 1e6]),
            'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
            'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
            # 'num_class': 2,
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'nthread': 0,
            'silent': 1
        }

        best = fmin(self.scorer,
                    space,
                    algo=tpe.suggest,
                    trials=trials,
                    max_evals=100)
        best = space_eval(space, best)

        for k, v in best.items():
            if k == 'max_depth':
                v = int(v)
            space[k] = v

        return space
Example #12
    def search(self):
        best = fmin(fn=self.objective_model,
                    space=self.space,
                    algo=tpe.suggest,
                    max_evals=self.max_evals)

        # cast dict values back to their declared Python types
        # (a safer variant of this cast follows below)
        datatypes = {x[1]['label']: x[2] for x in self.params}
        for key, value in best.items():
            best[key] = eval(datatypes[key])(value)
        return self.cv_params['estimator'].set_params(**best)
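
A hedged alternative to the eval() cast in Example #12, assuming datatypes maps parameter labels to type names such as 'int' or 'float': a lookup table of callables avoids evaluating arbitrary strings.

# Sketch of a safer cast; assumes datatypes values are names like 'int'.
_casts = {'int': int, 'float': float, 'str': str, 'bool': bool}
for key, value in best.items():
    best[key] = _casts[datatypes[key]](value)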
Example #13
    def tpe_xi(self):
        """
        Tree-structured Parzen Estimator (Bayesian optimisation) used to
        find the optimal value of xi.
        """
        best = fmin(
            fn=lambda xi: dixons_coles(df_past).optimise_xi(xi),
            # a bare hp.uniform expression, not a dict; see the check below
            space=hp.uniform('xi', 0.0, 1.0),
            algo=tpe.suggest,
            max_evals=1)
        return best
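
To the question flagged in Example #13: a bare hp.uniform expression is a valid space. The objective then receives a plain float rather than a dict, while fmin still returns a dict keyed by the label. A minimal, self-contained check:

from hyperopt import fmin, tpe, hp

best = fmin(fn=lambda xi: (xi - 0.3) ** 2,  # xi arrives as a plain float
            space=hp.uniform('xi', 0.0, 1.0),
            algo=tpe.suggest,
            max_evals=25)
print(best)  # e.g. {'xi': 0.301...}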
Example #14
def main():

    space = {
        'ltr': hp.choice('ltr', [True]),
        'shuffle': hp.choice('shuffle', [False]),
        'num_leaves': hp.choice('num_leaves',
                                list(np.arange(8, 256, 2, dtype=int))),
        'max_depth': hp.choice('max_depth',
                               list(np.arange(4, 64, 2, dtype=int))),
        'max_bin': hp.choice('max_bin',
                             list(np.arange(255, 255 * 4, 5, dtype=int))),
        'min_data_in_leaf': hp.choice('min_data_in_leaf',
                                      list(np.arange(5, 100, 5, dtype=int))),
        'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.2, 1.0),
        'feature_fraction': hp.uniform('feature_fraction', 0.2, 1.0),
        # the label passed to hp.uniform must match the dict key
        # (the original mistakenly used the label 'test_size' here)
        'early_stopping': hp.uniform('early_stopping', 100, 1000),
    }

    trials_step = 1  # how many additional trials to do after loading saved trials. 1 = save after iteration
    max_trials = 1  # initial max_trials. put something small to not have to wait

    try:  # try to load an already saved trials object, and increase the max
        with open(BASE_PATH + SET + TRAILKEY + '.hyperopt', "rb") as f:
            trials = pickle.load(f)
        print("Found saved Trials! Loading...")
        max_trials = len(trials.trials) + trials_step
        print("Rerunning from {} trials to {} (+{}) trials".format(
            len(trials.trials), max_trials, trials_step))
    except FileNotFoundError:  # no saved trials yet: start a fresh object
        trials = Trials()

    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest,
                trials=trials,
                max_evals=max_trials)

    print("Best:", best)
    print("Num:", max_trials)

    # save the trials object
    with open(BASE_PATH + SET + TRAILKEY + ".hyperopt", "wb") as f:
        pickle.dump(trials, f)
Example #15
    def fit(self, X, y, indicator):
        '''
        indicator=1 means we sample and fit once to evaluate a fixed set of
        hyper-parameters; 0 means run hyperopt to search the neighborhood of
        the seed hyper-parameters and check whether model quality improves.
        '''
        num_samples = len(X)
        print('AutoGBT[AutoHyperOptimizer]: Total samples passed for '
              'hyperparameter tuning:', num_samples)
        if num_samples > self.max_samples:
            removeperc = 1.0 - (float(self.max_samples) / num_samples)
            print('AutoGBT[AutoHyperOptimizer]: Need to downsample to manage '
                  'time; removing data percentage:', removeperc)
            XFull, yFull = Utils.random_sample_in_order(
                X, y.reshape(-1, 1), removeperc)
            print('AutoGBT[AutoHyperOptimizer]:downsampled data length',
                  len(XFull))
        else:
            XFull = X
            yFull = y

        self.Xe_train, self.Xe_test, self.ys_train, self.ys_test = \
            train_test_split(XFull, yFull.ravel(), test_size=self.test_size,
                             random_state=self.seed, shuffle=True)

        if indicator == 1:
            ## just fit lightgbm once to obtain the AUC w.r.t a fixed set of hyper-parameters ##
            model = lgbm.LGBMClassifier(random_state=self.seed,
                                        min_data=1,
                                        min_data_in_bin=1)
            model.set_params(**self.param_space)
            model.fit(self.Xe_train, self.ys_train)
            mypreds = model.predict_proba(self.Xe_test)[:, 1]
            auc = auc_metric(self.ys_test.reshape(-1, 1),
                             mypreds.reshape(-1, 1))
            return auc
        else:
            trials = Trials()
            best = fmin(fn=self.gbc_objective,
                        space=self.param_space,
                        algo=tpe.suggest,
                        trials=trials,
                        max_evals=self.max_evaluations)
            params = space_eval(self.param_space, best)
            print('AutoGBT[AutoHyperOptimizer]:Best hyper-parameters', params)
            self.best_params = params
            return params, 1 - np.min([x['loss'] for x in trials.results])
Example #16
    def fit(self, X, y, indicator):
        num_samples = len(X)
        print('Total samples passed for '
              'hyperparameter tuning:', num_samples)
        if num_samples > self.max_samples:
            removeperc = 1.0 - (float(self.max_samples) / num_samples)
            print('Need to downsample to manage time; '
                  'removing data percentage:', removeperc)
            XFull, yFull = self.random_sample_in_order(X, y.reshape(-1, 1), removeperc)
            print('downsampled data length', len(XFull))
        else:
            XFull = X
            yFull = y

        Xe_train, self.Xe_test, ys_train, self.ys_test = \
            train_test_split(XFull, yFull.ravel(), test_size=self.test_size, random_state=self.seed, shuffle=True)

        self.lgb_train = lgb.Dataset(Xe_train, ys_train)#, free_raw_data=True)
        self.lgb_val = lgb.Dataset(self.Xe_test, self.ys_test)#, free_raw_data=True, reference=self.lgb_train)
        del X
        del y
        del Xe_train, ys_train
        gc.collect()
        if indicator == 1:
            ## just fit lightgbm once to obtain the AUC w.r.t a fixed set of hyper-parameters ##
            model = lgb.train(self.param_space, self.lgb_train, valid_sets=self.lgb_val, valid_names='eval',
                              verbose_eval=False, early_stopping_rounds=30, num_boost_round=100)
            fpr, tpr, thresholds = metrics.roc_curve(
                self.ys_test, model.predict(self.Xe_test, num_iteration=model.best_iteration))
            auc = metrics.auc(fpr, tpr)
            print("FIX AUC is", auc)
            return auc
        else:
            trials = Trials()
            best = fmin(fn=self.gbc_objective, space=self.param_space, algo=tpe.suggest, trials=trials,
                        max_evals=self.max_evaluations)
            params = space_eval(self.param_space, best)
            print('Best hyper-parameters', params)
            self.best_params = params
            model = lgb.train(params, self.lgb_train, valid_sets=self.lgb_val, valid_names='eval',
                              verbose_eval=False, early_stopping_rounds=30, num_boost_round=100)
            fpr, tpr, thresholds = metrics.roc_curve(
                self.ys_test, model.predict(self.Xe_test, num_iteration=model.best_iteration))
            auc = metrics.auc(fpr, tpr)
            print("Tune AUC is", auc)
            return params, auc
Example #17
    def hyperopt_lightgbm(self, df, drop_name, params):
        #X = df[]
        df = df[df['label'].notnull()][:20000]
        feature_name = [i for i in df.columns if i not in (drop_name + ['label']) and 'target' not in i]
        X = df[feature_name]
        y = df[['label']]
        X_train, X_val, y_train, y_val = self.data_split(X, y, test_size=0.5)
        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_val, label=y_val)

        space = {
            #"learning_rate": hp.loguniform("learning_rate", np.log(0.03), np.log(0.1)),
            "max_depth": hp.choice("max_depth", [-1, 4, 5, 6]),
            "num_leaves": hp.choice("num_leaves", np.linspace(32, 128, 32, dtype=int)),
            "feature_fraction": hp.quniform("feature_fraction", 0.5, 0.8, 0.1),
            "bagging_fraction": hp.quniform("bagging_fraction", 0.5, 0.8, 0.1),
            #"bagging_freq": hp.choice("bagging_freq", np.linspace(0, 50, 10, dtype=int)),
            "reg_alpha": hp.uniform("reg_alpha", 0, 2),
            "reg_lambda": hp.uniform("reg_lambda", 0, 2),
            #"min_child_weight": hp.uniform('min_child_weight', 0.5, 10),
        }

        def objective(hyperparams):
            model = lgb.train({**params, **hyperparams}, train_data, 30,
                              valid_data, early_stopping_rounds=30,
                              verbose_eval=0)

            score = model.best_score["valid_0"][params["metric"]]

            # fmin minimizes the loss, so negate the validation score
            return {'loss': -score, 'status': STATUS_OK}

        trials = Trials()
        best = fmin(fn=objective, space=space, trials=trials,
                            algo=tpe.suggest, max_evals=20, verbose=1,
                            rstate=np.random.RandomState(1))

        hyperparams = space_eval(space, best)
        print(f"auc = {-trials.best_trial['result']['loss']:0.4f} {hyperparams}")
        return hyperparams
Example #18
    def hypersearch(self, params, **kwargs):

        # set additional parameters: the ones already tuned, excluding those
        # that are about to be tuned
        add_params = self.params.copy()
        for k, v in params.items():
            add_params.pop(k)

        # overwrite if something is passed as kwargs (only applicable to
        # add_params)
        for k, v in params.items():
            if k in kwargs.keys(): add_params[k] = kwargs[k]

        partial_objective = lambda params: lgb_objective(
            params, self.data, additional_params=add_params)

        lgb_objective.i = 0
        best = fmin(fn=partial_objective,
                    space=params,
                    algo=tpe.suggest,
                    max_evals=self.n_evals)

        return best
Example #19
def search_hyperparams():
    start_t = time.time()

    print('\n')
    print('=' * 50)

    space = {
        'alpha': hp.uniform('alpha', 0.01, 1.0),
        'epsilon': hp.uniform('epsilon', 0.01, 1.0),
    }

    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest,
                max_evals=N_ITER_SEARCH)

    run_t_min = (time.time() - start_t) / 60.0

    print(f'\nRunning time: {run_t_min:.2f} min.')
    print(f'\nBest sln: {best}.')

    print('=' * 50)
    print('\n')
Example #20
def train(time_allowed=20, trials=None, output="model.pickle"):
    import xgboost as xgb
    import numpy
    from hyperopt import hp, tpe, Trials
    from hyperopt.fmin import fmin

    Y, *R, _ = get_files()
    X, tf, u = gen_features(*R)
    del R

    dtrain = xgb.DMatrix(X, Y)

    if trials is None:
        trials = Trials()

    param = {
        'max_depth': 2,
        'eta': .3,
        'verbosity': 0,
        'objective': 'binary:logistic',
        'tree_method': 'gpu_hist',
        "predictor": 'gpu_predictor'
    }
    num_round = 100
    nfold = 7

    def objective(params):
        params['max_depth'] = int(params['max_depth'])

        params.update(param)

        result = xgb.cv(params,
                        dtrain,
                        num_round,
                        nfold=nfold,
                        metrics={'auc'},
                        seed=0)

        score = max(result['test-auc-mean'] - result['test-auc-std'] / 2)

        return -score

    space = {
        'max_depth': hp.quniform('max_depth', 2, 12, 1),
        'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
        'gamma': hp.uniform('gamma', 0.01, 0.5),
        'subsample': hp.uniform('subsample', .3, 1),
        'scale_pos_weight': hp.uniform('scale_pos_weight', .8, 20.0),
        'eta': hp.uniform('eta', .01, .4),
    }

    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest,
                trials=trials,
                max_evals=len(trials.trials) + 25)

    param.update(best)
    param['max_depth'] = int(param['max_depth'])  #fixme

    print("re-cv to find early stopping")
    cv = xgb.cv(param, dtrain, num_round, nfold=nfold, metrics={'auc'}, seed=0)

    #num_round = (r['test-auc-mean'] - r['test-auc-std']/2).idxmax()
    num_round = (
        cv['test-auc-mean'] - cv['test-auc-std'] / 2 -
        numpy.linspace(0, cv.shape[0] * .0003, num=cv.shape[0])).idxmax()

    print("final train")
    model = xgb.train(param, dtrain, num_round)

    MODEL = (model, param, trials, tf, u)

    print("pickle to s3")
    client = boto3.client('s3')
    client.put_object(Body=gzip.compress(pickle.dumps(MODEL)),
                      Bucket=BUCKET,
                      Key=output,
                      ContentType='application/python-pickle',
                      ContentEncoding='gzip')

    #with open("model.pickle", "wb") as f:
    #    pickle.dump(MODEL, f)

    return MODEL
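
The round selection in Example #20 subtracts a small linear penalty from the CV curve so that slightly worse but earlier rounds win ties. A self-contained illustration of the same idxmax pattern; the CV numbers below are made up:

import numpy
import pandas

cv = pandas.DataFrame({'test-auc-mean': [0.80, 0.83, 0.840, 0.841],
                       'test-auc-std': [0.02, 0.02, 0.020, 0.030]})
penalty = numpy.linspace(0, cv.shape[0] * .0003, num=cv.shape[0])
score = cv['test-auc-mean'] - cv['test-auc-std'] / 2 - penalty
num_round = score.idxmax()  # index of the best penalised round
print(num_round)  # 2 for these numbers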
Example #21
def find_goodModel(train, target):
    train = preProcessData(train)

    def objective(space):
        # clf = xgb.XGBRegressor(n_estimators = space['n_estimators'],
        #                        max_depth = space['max_depth'],
        #                        min_child_weight = space['min_child_weight'],
        #                        subsample = space['subsample'],
        #                        learning_rate = space['learning_rate'],
        #                        gamma = space['gamma'],
        #                        colsample_bytree = space['colsample_bytree'],
        #                        objective='reg:linear'
        #                        )

        clf = RandomForestClassifier(
            max_depth=space['max_depth'],
            min_samples_split=space['min_samples_split'],
            min_samples_leaf=space['min_samples_leaf'],
            bootstrap=space['bootstrap'],
            criterion=space['criterion'])

        # score =  A*SameLabel + B*Features +
        clf.fit(train, target)
        cross_mean_score = cross_val_score(estimator=clf,
                                           X=train,
                                           y=target,
                                           scoring='precision_macro',
                                           cv=3,
                                           n_jobs=-1).mean()

        result = {'loss': cross_mean_score, 'status': STATUS_OK}
        print " result is ", result
        return result

    col_train = train.columns
    bootStrapArr = [True, False]
    criterionArr = ["gini", "entropy"]
    space = {
        'max_depth': hp.choice('max_depth', np.arange(10, 30, dtype=int)),
        'min_samples_split': hp.choice('min_samples_split',
                                       np.arange(8, 15, dtype=int)),
        'min_samples_leaf': hp.choice('min_samples_leaf',
                                      np.arange(5, 15, dtype=int)),
        'bootstrap': hp.choice('bootstrap', bootStrapArr),
        'criterion': hp.choice('criterion', criterionArr)
    }

    trials = Trials()
    best = fmin(
        fn=objective,
        space=space,
        algo=tpe.suggest,
        max_evals=3,  # change
        trials=trials)

    print(best)
    best['bootstrap'] = bootStrapArr[best['bootstrap']]
    best['criterion'] = criterionArr[best['criterion']]

    obj = {
        'predictions': makePredictions(best, train, train, target),
        'params': best,
        'STATUS': 'OK'
    }
    return obj
Example #22
def lgb_hyperopt(data, labels, num_evals=1000, n_folds=5, diagnostic=False):
    """
    Function to turn parameters for Lightgbm
    """
    LGBM_MAX_LEAVES = 2**11  #maximum number of leaves per tree for LightGBM
    LGBM_MAX_DEPTH = 25  #maximum tree depth for LightGBM
    EVAL_METRIC_LGBM_CLASS = 'f1'

    def lgb_f1_score(y_hat, data):
        y_true = data.get_label()
        y_hat = np.round(y_hat)
        return 'f1', f1_score(y_true, y_hat), True

    print('Running {} rounds of LightGBM parameter optimisation:'.format(
        num_evals))
    #clear space

    integer_params = [
        'max_depth', 'num_leaves', 'max_bin', 'min_data_in_leaf',
        'min_data_in_bin'
    ]

    def objective(space_params):

        #cast integer params from float to int
        for param in integer_params:
            space_params[param] = int(space_params[param])

        #extract nested conditional parameters
        if space_params['boosting']['boosting'] == 'goss':
            top_rate = space_params['boosting'].get('top_rate')
            other_rate = space_params['boosting'].get('other_rate')
            #0 <= top_rate + other_rate <= 1
            top_rate = max(top_rate, 0)
            top_rate = min(top_rate, 0.5)
            other_rate = max(other_rate, 0)
            other_rate = min(other_rate, 0.5)
            space_params['top_rate'] = top_rate
            space_params['other_rate'] = other_rate

        subsample = space_params['boosting'].get('subsample', 1.0)
        space_params['boosting'] = space_params['boosting']['boosting']
        space_params['subsample'] = subsample

        cv_results = lgb.cv(space_params,
                            train,
                            nfold=n_folds,
                            stratified=True,
                            early_stopping_rounds=100,
                            seed=42,
                            feval=lgb_f1_score)

        best_loss = -cv_results['f1-mean'][-1]

        return {'loss': best_loss, 'status': STATUS_OK}

    train = lgb.Dataset(data, labels)

    #integer and string parameters, used with hp.choice()
    boosting_list = [{
        'boosting': 'gbdt',
        'subsample': hp.uniform('subsample', 0.5, 1)
    }, {
        'boosting': 'goss',
        'subsample': 1.0,
        'top_rate': hp.uniform('top_rate', 0, 0.5),
        'other_rate': hp.uniform('other_rate', 0, 0.5)
    }]  #if including 'dart', make sure to set 'n_estimators'

    objective_list_reg = ['huber', 'gamma', 'fair', 'tweedie']
    objective_list_class = ['binary', 'cross_entropy']
    objective_list = objective_list_class
    is_unbalance_list = [True]

    space = {
        'boosting': hp.choice('boosting', boosting_list),
        'num_leaves': hp.quniform('num_leaves', 2, LGBM_MAX_LEAVES, 1),
        'max_depth': hp.quniform('max_depth', 2, LGBM_MAX_DEPTH, 1),
        'max_bin': hp.quniform('max_bin', 32, 255, 1),
        'min_data_in_leaf': hp.quniform('min_data_in_leaf', 1, 256, 1),
        'min_data_in_bin': hp.quniform('min_data_in_bin', 1, 256, 1),
        'min_gain_to_split': hp.quniform('min_gain_to_split', 0.1, 5, 0.01),
        'lambda_l1': hp.uniform('lambda_l1', 0, 5),
        'lambda_l2': hp.uniform('lambda_l2', 0, 5),
        'learning_rate': hp.loguniform('learning_rate', np.log(0.005),
                                       np.log(0.2)),
        'metric': None,
        'objective': hp.choice('objective', objective_list),
        'feature_fraction': hp.quniform('feature_fraction', 0.5, 1, 0.01),
        'bagging_fraction': hp.quniform('bagging_fraction', 0.5, 1, 0.01),
        'is_unbalance': hp.choice('is_unbalance', is_unbalance_list)
    }

    trials = Trials()
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest,
                max_evals=num_evals,
                trials=trials)

    #fmin() will return the index of values chosen from the lists/arrays in 'space'
    #to obtain actual values, index values are used to subset the original lists/arrays
    #extract nested conditional parameters
    try:
        if best['boosting']['boosting'] == 'goss':
            top_rate = best['boosting'].get('top_rate')
            other_rate = best['boosting'].get('other_rate')
            #0 <= top_rate + other_rate <= 1
            top_rate = max(top_rate, 0)
            top_rate = min(top_rate, 0.5)
            other_rate = max(other_rate, 0)
            other_rate = min(other_rate, 0.5)
            best['top_rate'] = top_rate
            best['other_rate'] = other_rate
    except TypeError:  # best['boosting'] is an hp.choice index, not a dict
        if boosting_list[best['boosting']]['boosting'] == 'goss':
            top_rate = best['top_rate']
            other_rate = best['other_rate']
            #0 <= top_rate + other_rate <= 1
            top_rate = max(top_rate, 0)
            top_rate = min(top_rate, 0.5)
            other_rate = max(other_rate, 0)
            other_rate = min(other_rate, 0.5)
            best['top_rate'] = top_rate
            best['other_rate'] = other_rate
    best['boosting'] = boosting_list[best['boosting']][
        'boosting']  #nested dict, index twice
    # 'metric' was fixed to None in the space, so fmin's result has no index
    # for it (and `metric_list` was never defined); fall back to the f1
    # metric declared at the top of this function
    best['metric'] = EVAL_METRIC_LGBM_CLASS
    best['objective'] = objective_list[best['objective']]
    best['is_unbalance'] = is_unbalance_list[best['is_unbalance']]

    #cast floats of integer params to int
    for param in integer_params:
        best[param] = int(best[param])

    print('{' + '\n'.join('{}: {}'.format(k, v)
                          for k, v in best.items()) + '}')
    if diagnostic:
        return (best, trials)
    else:
        return (best)
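
As the comment in Example #22 says, fmin() returns positional indices for every hp.choice; space_eval() resolves them back into actual values, including nested conditional spaces like boosting_list above. A minimal demonstration:

from hyperopt import fmin, tpe, hp, space_eval

space = {'boosting': hp.choice('boosting', [{'boosting': 'gbdt'},
                                            {'boosting': 'goss'}]),
         'objective': hp.choice('objective', ['binary', 'cross_entropy'])}
best = fmin(fn=lambda p: 0.0, space=space, algo=tpe.suggest, max_evals=5)
print(best)                     # e.g. {'boosting': 1, 'objective': 0}
print(space_eval(space, best))  # e.g. {'boosting': {'boosting': 'goss'}, 'objective': 'binary'}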
Example #23
from sklearn import datasets
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVR
from hyperopt import hp, tpe
from hyperopt.fmin import fmin

X, y = datasets.load_diabetes(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(X, y)


def objective(space):
    clf = SVR(C=space['C'], epsilon=space['epsilon'])
    # neg_mean_squared_error is negative, so flip the sign to give fmin a
    # loss to minimize
    return -1.0 * cross_val_score(clf, X, y, scoring='neg_mean_squared_error',
                                  cv=5).mean()


space = {
    'C': hp.quniform('C', 10.0, 5000.0, 10.0),
    'epsilon': hp.quniform('epsilon', 0.1, 10.0, 0.5),
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=5,
            verbose=3)

print(best)
Example #24
def hyper_optimize(data_x,data_y,val_x=None,val_y=None,cat_labels=None,space=None,max_evals=20):
	from hyperopt import tpe, hp, space_eval
	from hyperopt.fmin import fmin
	from hyperopt.pyll.base import scope
	from sklearn.metrics import make_scorer, f1_score
	from sklearn.model_selection import cross_val_score, StratifiedKFold

	average="binary"
	if space is not None:
		if "average" in space:
			average = "micro"

	def f1_sklearn(truth, predictions):
		# average is "micro" when the space requests it, "binary" otherwise
		return -f1_score(truth, predictions, average=average)

	f1_scorer = make_scorer(f1_sklearn)

	def objective(in_params):
		clf = in_params['clf']
		if clf == "rf":
			clf = RandomForestClassifier(n_jobs=4,random_state=42)
		elif clf == "gbm":
			clf = GradientBoostingClassifier(random_state=42)
		elif clf == "xgb":
			from xgboost import XGBClassifier
			clf = XGBClassifier(random_state=42,nthread=4)
		elif clf == "cat":
			from catboost import CatBoostClassifier
			clf = CatBoostClassifier(random_state=42)
		else:
			clf = ExtraTreesClassifier(n_jobs=4,random_state=42)

		if clf.__class__.__name__ == "XGBClassifier":
			params = {
				'n_estimators': int(in_params['n_estimators']),
				'max_depth': int(in_params['max_depth']),
				'eta': float(in_params['eta']),
				'gamma': float(in_params['gamma']),
				'colsample_bytree': float(in_params['colsample_bytree']),
				'subsample': float(in_params['subsample'])
			}
			if "average" in in_params:
				params["average"] = in_params["average"]
		else:
			params = {
				'n_estimators': int(in_params['n_estimators']),
				'max_depth': int(in_params['max_depth']),
				'min_samples_split': int(in_params['min_samples_split']),
				'min_samples_leaf': int(in_params['min_samples_leaf']),
				'max_features': in_params['max_features']
			}

		clf.set_params(**params)
		if val_x is None:  # no fixed validation set given; cross-validate on train
			score = cross_val_score(clf, data_x, data_y, scoring=f1_scorer, cv=StratifiedKFold(n_splits=3), n_jobs=3).mean()
		else:  # score on the fixed validation set
			clf.fit(data_x,data_y)
			pred_y = clf.predict(val_x)
			score = -f1_score(val_y,pred_y)
		if "Forest" in clf.__class__.__name__:
			shortname = "RF"
		elif "Cat" in clf.__class__.__name__:
			shortname = "CAT"
		elif "XG" in clf.__class__.__name__:
			shortname = "XGB"
		else:
			shortname = "ET" if "Extra" in clf.__class__.__name__ else "GBM"
		print("F1 {:.3f} params {} {}".format(-score, params, shortname))
		return score

	# For large corpora, consider raising max n_estimators up to 350
	if space is None:
		space = {
			'n_estimators': scope.int(hp.quniform('n_estimators', 75, 250, 10)),
			'max_depth': scope.int(hp.quniform('max_depth', 5, 40, 1)),
			'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
			'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 10, 1)),
			'max_features': hp.choice('max_features', ["sqrt", None, 0.5, 0.6, 0.7, 0.8]),
			'clf': hp.choice('clf', ["rf","et","gbm"])
		}

	sys.stderr.write("o Using "+str(data_x.shape[0])+" tokens to choose hyperparameters\n")
	if val_x is not None:
		sys.stderr.write("o Using "+str(val_x.shape[0])+" held out tokens as fixed validation data\n")
	else:
		sys.stderr.write("o No validation data provided, using cross-validation on train set to score\n")

	best_params = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=max_evals)

	best_params = space_eval(space,best_params)
	sys.stderr.write(str(best_params) + "\n")

	best_clf = best_params['clf']
	if best_clf == "rf":
		best_clf = RandomForestClassifier(n_jobs=4, random_state=42)
	elif best_clf == "gbm":
		best_clf = GradientBoostingClassifier(random_state=42)
	elif best_clf == "xgb":
		from xgboost import XGBClassifier
		best_clf = XGBClassifier(random_state=42, nthread=4)
	elif best_clf == "cat":  # mirror the 'cat' branch from the objective
		from catboost import CatBoostClassifier
		best_clf = CatBoostClassifier(random_state=42)
	else:
		best_clf = ExtraTreesClassifier(n_jobs=4, random_state=42)
	del best_params['clf']
	best_clf.set_params(**best_params)

	return best_clf, best_params
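
The scope.int wrapper in the default space of Example #24 is the alternative to the manual int() casts seen in other examples: hyperopt evaluates it while sampling, so the objective receives integers directly. A minimal sketch:

from hyperopt import fmin, tpe, hp
from hyperopt.pyll.base import scope

space = {'n_estimators': scope.int(hp.quniform('n_estimators', 75, 250, 10))}

# Inside the objective, p['n_estimators'] is already an int; note that the
# dict returned by fmin still holds the raw (float) quniform sample.
best = fmin(fn=lambda p: 0.0 * p['n_estimators'],
            space=space, algo=tpe.suggest, max_evals=5)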
Example #25
def quick_hyperopt(data, labels, package='lgbm', num_evals=NUM_EVALS, eval_metric='mae', diagnostic=False):
    
    # LightGBM
    if package=='lgbm':
        print(f'Running {num_evals} rounds of LightGBM parameter optimisation:')
        #clear space
        gc.collect()
        
        integer_params = ['max_depth', 'num_leaves', 'min_data_in_leaf', 'bagging_freq']
        if eval_metric=='mae':
            metric = 'mae'
        elif eval_metric=='mse':
            metric = 'mse'
        else:
            print(f'Metric {eval_metric} not found. Falling back to mae.')
            metric = 'mae'
            
        def objective(space_params):
            
            #cast integer params from float to int
            for param in integer_params:
                space_params[param] = int(space_params[param])
            
            cv_results = lgb.cv(space_params,
                                train,
                                nfold=N_FOLDS,
                                stratified=False,
                                early_stopping_rounds=200,
                                metrics=metric,
                                seed=42)
            if metric=='mae':
                best_loss = cv_results['l1-mean'][-1]
            elif metric=='mse':
                best_loss = cv_results['l2-mean'][-1]

            return{'loss':best_loss, 'status': STATUS_OK }
        
        train = lgb.Dataset(data, labels)
                
        #integer and string parameters, used with hp.choice()
        objective_list = ['huber', 'gamma', 'fair']
        space ={
                'boosting' : 'gbdt',
                'num_leaves' : hp.quniform('num_leaves', 8, 92, 4),
                'max_depth': hp.quniform('max_depth', -1, 16, 1),
                'min_data_in_leaf': hp.quniform('min_data_in_leaf', 1, 100, 1),
                'reg_alpha' : hp.uniform('reg_alpha', 0.1, 0.95),
                'reg_lambda' : hp.uniform('reg_lambda', 0.1, 0.95),
                'learning_rate' : hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
                'metric' : metric,  # use the metric resolved from eval_metric above
                'objective' : hp.choice('objective', objective_list),
                'bagging_fraction' : hp.uniform('bagging_fraction', 0.5, 0.95),
                'bagging_freq': hp.quniform('bagging_freq', 3, 7, 1)
            }
        
        #optional: activate GPU for LightGBM
        #follow compilation steps here:
        #https://www.kaggle.com/vinhnguyen/gpu-acceleration-for-lightgbm/
        #then uncomment lines below:
        #space['device'] = 'gpu'
        #space['gpu_platform_id'] = 0,
        #space['gpu_device_id'] =  0

        trials = Trials()
        best = fmin(fn=objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=num_evals, 
                    trials=trials)
                
        #fmin() will return the index of values chosen from the lists/arrays in 'space'
        #to obtain actual values, index values are used to subset the original lists/arrays
        best['objective'] = objective_list[best['objective']]
                
        #cast floats of integer params to int
        for param in integer_params:
            best[param] = int(best[param])
        
        print('{' + '\n'.join('{}: {}'.format(k, v) for k, v in best.items()) + '}')
        if diagnostic:
            return(best, trials)
        else:
            return(best)
    
    else:
        print('Package not recognised. Please use "lgbm" for LightGBM, "xgb" for XGBoost or "cb" for CatBoost.')             
Example #26
        for k in params:
            parameters[k] = int(params[k])
        mdl = GradientBoostingRegressor(random_state=0, **parameters)
        score = cross_val_score(mdl, X_train, y_train, scoring=gini_scorer, cv=5).mean()
        print("Gini {:.3f} params {}".format(score, parameters))
        mlflow.end_run()
        return score

    # need to match estimator
    space = {
        'n_estimators': hp.quniform('n_estimators', 10, 100, 5),  # low, high, quantization step
        'max_depth': hp.quniform('max_depth', 2, 4, 2)
    }

    best_params = fmin(fn=objective_gbr,
                space=space,
                algo=tpe.suggest,
                max_evals=5)
    
    for key in best_params.keys():
        if int(best_params[key]) == best_params[key]:
           best_params[key] = int(best_params[key])

    print("Hyperopt estimated optimum {}".format(best_params))
        
else:
    best_params = {
        'n_estimators': 25, 
        'max_depth': 2
    }

reg = GradientBoostingRegressor(random_state=0, **best_params)
Example #27
                              learning_rate=0.05,
                              n_estimators=900,
                              random_state=9,
                              **params)
        print(X_sub_check.shape)
        sc = cv_lgboost(m, X_base=X_sub_check, y=y)
        print("Score {:.3f} params {}".format(sc, params))
        return sc

    space = {
        # 'max_depth': hp.quniform('max_depth', 2, 8, 1),
        'num_leaves': hp.quniform('num_leaves', 4, 50, 1),
        'feature_fraction': hp.uniform('feature_fraction', 0.005, 0.9),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.1, 1.0),
        'lambda_l1': hp.uniform('lambda_l1', 0.1, 80)
        # "min_data_in_leaf": hp.loguniform("min_data_in_leaf",1,8)
    }
    best_lgbm = fmin(fn=objective,
                     space=space,
                     algo=tpe.suggest,
                     max_evals=80,
                     trials=trials)
    losses = [
        trials.trials[i]['result']['loss'] for i in range(len(trials.trials))
    ]
    params = pd.DataFrame(trials.vals)
    params['loss'] = losses
    params.sort_values('loss', inplace=True)
    params.to_csv(fpath_csv)
    with open(fpath, 'wb') as f:
        pickle.dump(trials, f)
Example #28
def _build_lgb_model(X: pd.DataFrame, y: pd.Series, is_discrete: bool,
                     num_class: int, n_jobs: int,
                     opts: Dict[str, str]) -> Tuple[Any, float]:
    import lightgbm as lgb  # type: ignore[import]

    def _get_option_value(*args) -> Any:  # type: ignore
        return get_option_value(opts, *args)

    if is_discrete:
        objective = "binary" if num_class <= 2 else "multiclass"
    else:
        objective = "regression"

    fixed_params = {
        "boosting_type": _get_option_value(*_opt_boosting_type),
        "objective": objective,
        "class_weight": _get_option_value(*_opt_class_weight),
        "learning_rate": _get_option_value(*_opt_learning_rate),
        "max_depth": _get_option_value(*_opt_max_depth),
        "max_bin": _get_option_value(*_opt_max_bin),
        "reg_alpha": _get_option_value(*_opt_reg_alpha),
        "min_split_gain": _get_option_value(*_opt_min_split_gain),
        "n_estimators": _get_option_value(*_opt_n_estimators),
        "importance_type": _get_option_value(*_opt_importance_type),
        "random_state": 42,
        "n_jobs": n_jobs
    }

    # Set `num_class` only in the `multiclass` mode
    if objective == "multiclass":
        fixed_params["num_class"] = num_class

    model_class = lgb.LGBMClassifier if is_discrete \
        else lgb.LGBMRegressor

    def _create_model(params: Dict[str, Any]) -> Any:
        # Some params must be int
        for k in ["num_leaves", "subsample_freq", "min_child_samples"]:
            if k in params:
                params[k] = int(params[k])
        p = copy.deepcopy(fixed_params)
        p.update(params)
        return model_class(**p)

    from hyperopt import hp, tpe, Trials  # type: ignore[import]
    from hyperopt.early_stop import no_progress_loss  # type: ignore[import]
    from hyperopt.fmin import fmin  # type: ignore[import]
    from sklearn.model_selection import (  # type: ignore[import]
        cross_val_score, KFold, StratifiedKFold)

    # TODO: Temporarily suppress `sklearn.model_selection` user warnings
    import warnings
    warnings.simplefilter("ignore", UserWarning)

    # Forcibly disable INFO-level logging in the `hyperopt` module
    from logging import getLogger, WARN
    getLogger("hyperopt").setLevel(WARN)

    param_space = {
        "num_leaves": hp.quniform("num_leaves", 2, 100, 1),
        "subsample": hp.uniform("subsample", 0.5, 1.0),
        "subsample_freq": hp.quniform("subsample_freq", 1, 20, 1),
        "colsample_bytree": hp.uniform("colsample_bytree", 0.01, 1.0),
        "min_child_samples": hp.quniform("min_child_samples", 1, 50, 1),
        "min_child_weight": hp.loguniform("min_child_weight", -3, 1),
        "reg_lambda": hp.loguniform("reg_lambda", -2, 3)
    }

    scorer = "f1_macro" if is_discrete else "neg_mean_squared_error"
    n_splits = int(_get_option_value(*_opt_n_splits))
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True) if is_discrete \
        else KFold(n_splits=n_splits, shuffle=True)

    def _objective(params: Dict[str, Any]) -> float:
        model = _create_model(params)
        fit_params: Dict[str, str] = {
            # TODO: Raises an error if a single regressor is used
            # "categorical_feature": "auto",
        }
        try:
            # TODO: Replace with `lgb.cv` to remove the `sklearn` dependency
            scores = cross_val_score(model,
                                     X,
                                     y,
                                     scoring=scorer,
                                     cv=cv,
                                     fit_params=fit_params,
                                     n_jobs=n_jobs)
            return -scores.mean()

        # it might throw an exception because `y` contains
        # previously unseen labels.
        except Exception as e:
            _logger.warning(f"{e.__class__}: {e}")
            return 0.0

    def _early_stop_fn() -> Any:
        no_progress_loss_fn = no_progress_loss(
            int(_get_option_value(*_opt_no_progress_loss)))
        timeout = int(_get_option_value(*_opt_timeout))
        if timeout <= 0:
            return no_progress_loss_fn

        # Set base time for budget mechanism
        start_time = time.time()

        def timeout_fn(trials,
                       best_loss=None,
                       iteration_no_progress=0):  # type: ignore
            # avoid shadowing the imported `no_progress_loss` helper
            stop, meta = no_progress_loss_fn(
                trials, best_loss, iteration_no_progress)
            timed_out = time.time() - start_time > timeout
            return stop or timed_out, meta

        return timeout_fn

    try:
        trials = Trials()
        max_evals = int(_get_option_value(*_opt_max_evals))
        best_params = fmin(fn=_objective,
                           space=param_space,
                           algo=tpe.suggest,
                           trials=trials,
                           max_evals=max_evals,
                           early_stop_fn=_early_stop_fn(),
                           rstate=np.random.RandomState(42),
                           show_progressbar=False,
                           verbose=False)

        _logger.info("hyperopt: #eval={}/{}".format(len(trials.trials),
                                                    max_evals))

        # Builds a model with `best_params`
        # TODO: Could we extract constraint rules (e.g., FD and CFD) from built statistical models?
        model = _create_model(best_params)
        model.fit(X, y)

        def _feature_importances() -> List[Any]:
            f = filter(lambda x: x[1] > 0.0,
                       zip(model.feature_name_, model.feature_importances_))
            return list(sorted(f, key=lambda x: x[1], reverse=True))

        _logger.debug(
            f"lightgbm: feature_importances={_feature_importances()}")

        sorted_lst = sorted(trials.trials, key=lambda x: x['result']['loss'])
        min_loss = sorted_lst[0]['result']['loss']
        return model, -min_loss
    except Exception as e:
        _logger.warning(f"Failed to build a stat model because: {e}")
        return None, 0.0
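
The _early_stop_fn in Example #28 combines hyperopt's no_progress_loss plateau check with a wall-clock budget. A stripped-down, standalone sketch of the same combination; the 10-iteration window and 60-second budget below are illustrative:

import time
from hyperopt import fmin, tpe, hp
from hyperopt.early_stop import no_progress_loss

plateau_fn = no_progress_loss(10)
start_time = time.time()

def early_stop(trials, best_loss=None, iteration_no_progress=0):
    # stop when the loss plateaus for 10 iterations or the budget expires
    stop, meta = plateau_fn(trials, best_loss, iteration_no_progress)
    return stop or (time.time() - start_time > 60), meta

best = fmin(fn=lambda x: x ** 2,
            space=hp.uniform('x', -1, 1),
            algo=tpe.suggest,
            max_evals=1000,
            early_stop_fn=early_stop)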
Example #29
        results.append(loss)

    error = np.mean(results)
    print("INFO: iteration {} error {:.3f}".format(xl_objective.i, error))

    return error

# LINEAR
partial_objective = lambda params: xl_objective(
    params,
    method="linear")
xl_objective.i = 0
start = time()
best_linear = fmin(
    fn=partial_objective,
    space=xl_parameter_space,
    algo=tpe.suggest,
    max_evals=10
    )
end = time()-start
print("{} min".format(round(end/60,3)))
pickle.dump(best_linear, open(os.path.join(XLEARN_DIR,'best_linear.p'), "wb"))

# FM
partial_objective = lambda params: xl_objective(
    params,
    method="fm")
xl_objective.i = 0
start = time()
best_fm = fmin(
    fn=partial_objective,
    space=xl_parameter_space,
Example #30
    def tune(self, decoy_peaks, target_peaks, use_main_score=True):
        def objective(params):
            params = {
                'eta': "{:.3f}".format(params['eta']),
                'gamma': "{:.3f}".format(params['gamma']),
                'max_depth': int(params['max_depth']),
                'min_child_weight': int(params['min_child_weight']),
                'subsample': "{:.3f}".format(params['subsample']),
                'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
                'colsample_bylevel': '{:.3f}'.format(params['colsample_bylevel']),
                'colsample_bynode': '{:.3f}'.format(params['colsample_bynode']),
                'lambda': "{:.3f}".format(params['lambda']),
                'alpha': "{:.3f}".format(params['alpha']),
                'scale_pos_weight': "{:.3f}".format(params['scale_pos_weight']),
            }
            
            clf = xgb.XGBClassifier(random_state=42, silent=1, objective='binary:logitraw', eval_metric='auc', **params)

            score = cross_val_score(clf, X, y, scoring='roc_auc', n_jobs=self.threads, cv=KFold(n_splits=3, shuffle=True, random_state=np.random.RandomState(42))).mean()
            # click.echo("Info: AUC: {:.3f} hyperparameters: {}".format(score, params))
            return 1 - score  # fmin minimizes, so convert the AUC into a loss

        click.echo("Info: Autotuning of XGB hyperparameters.")

        assert isinstance(decoy_peaks, Experiment)
        assert isinstance(target_peaks, Experiment)

        X0 = decoy_peaks.get_feature_matrix(use_main_score)
        X1 = target_peaks.get_feature_matrix(use_main_score)
        X = np.vstack((X0, X1))
        y = np.zeros((X.shape[0],))
        y[X0.shape[0]:] = 1.0

        # Tune complexity hyperparameters
        xgb_params_complexity = self.xgb_params_tuned
        xgb_params_complexity.update({k: self.xgb_params_space[k] for k in ('max_depth', 'min_child_weight')})

        best_complexity = fmin(fn=objective, space=xgb_params_complexity, algo=tpe.suggest, max_evals=self.xgb_hyperparams['autotune_num_rounds'], rstate=np.random.RandomState(42))
        best_complexity['max_depth'] = int(best_complexity['max_depth'])
        best_complexity['min_child_weight'] = int(best_complexity['min_child_weight'])

        self.xgb_params_tuned.update(best_complexity)

        # Tune gamma hyperparameter
        xgb_params_gamma = self.xgb_params_tuned
        xgb_params_gamma['gamma'] = self.xgb_params_space['gamma']

        best_gamma = fmin(fn=objective, space=xgb_params_gamma, algo=tpe.suggest, max_evals=self.xgb_hyperparams['autotune_num_rounds'], rstate=np.random.RandomState(42))

        self.xgb_params_tuned.update(best_gamma)

        # Tune subsampling hyperparameters
        xgb_params_subsampling = self.xgb_params_tuned
        xgb_params_subsampling.update({k: self.xgb_params_space[k] for k in ('subsample', 'colsample_bytree', 'colsample_bylevel', 'colsample_bynode')})

        best_subsampling = fmin(fn=objective, space=xgb_params_subsampling, algo=tpe.suggest, max_evals=self.xgb_hyperparams['autotune_num_rounds'], rstate=np.random.RandomState(42))

        self.xgb_params_tuned.update(best_subsampling)

        # Tune regularization hyperparameters
        xgb_params_regularization = self.xgb_params_tuned
        xgb_params_regularization.update({k: self.xgb_params_space[k] for k in ('lambda', 'alpha')})

        best_regularization = fmin(fn=objective, space=xgb_params_regularization, algo=tpe.suggest, max_evals=self.xgb_hyperparams['autotune_num_rounds'], rstate=np.random.RandomState(42))

        self.xgb_params_tuned.update(best_regularization)

        # Tune learning rate
        xgb_params_learning = self.xgb_params_tuned
        xgb_params_learning['eta'] = self.xgb_params_space['eta']

        best_learning = fmin(fn=objective, space=xgb_params_learning, algo=tpe.suggest, max_evals=self.xgb_hyperparams['autotune_num_rounds'], rstate=np.random.RandomState(42))

        self.xgb_params_tuned.update(best_learning)
        click.echo("Info: Optimal hyperparameters: {}".format(self.xgb_params_tuned))

        self.xgb_params = self.xgb_params_tuned

        return self
Example #31
    'subsample_for_bin': hp.quniform('subsample_for_bin', 20000, 300000,
                                     20000),
    'min_child_samples': hp.quniform('min_child_samples', 20, 500, 5),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0)  # label fixed to match the key
}

global ITERATION
ITERATION = 0

df = pd.read_pickle('feature.pkl')

train_df = df[df['TARGET'].notnull()]
test_df = df[df['TARGET'].isnull()]
print("Starting LightGBM. Train shape: {}, test shape: {}".format(
    train_df.shape, test_df.shape))
feats = [
    f for f in train_df.columns if f not in
    ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']
]

bayes_trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=10000,
            trials=bayes_trials,
            rstate=np.random.RandomState(50))
bayes_trials_results = sorted(bayes_trials.results, key=lambda x: x['loss'])
print(bayes_trials_results[:2])
Example #32
	return 1-score

lgb_parameter_space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.5),
    'num_boost_round': hp.quniform('num_boost_round', 20, 100, 5),
    'num_leaves': hp.quniform('num_leaves', 32, 256, 4),
    'min_child_weight': hp.quniform('min_child_weight', 1, 50, 2),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.),
    'subsample': hp.uniform('subsample', 0.5, 1.),
    'reg_alpha': hp.uniform('reg_alpha', 0.01, 1.),
    'reg_lambda': hp.uniform('reg_lambda', 0.01, 1.),
}

lgb_objective.i = 0
best = fmin(fn=lgb_objective,
            space=lgb_parameter_space,
            algo=tpe.suggest,
            max_evals=10)

best['num_boost_round'] = int(best['num_boost_round'])
best['num_leaves'] = int(best['num_leaves'])
best['verbose'] = -1

# Read the validation coupon features
df_coupons_valid_feat = pd.read_pickle(os.path.join(inp_dir, 'valid', 'df_coupons_valid_feat.p'))
df_coupons_valid_cat_feat = df_coupons_valid_feat.drop(drop_cols, axis=1)

# Read the interactions during validation
interactions_valid_dict = pickle.load(
    open("../datasets/Ponpare/data_processed/valid/interactions_valid_dict.p", "rb"))

# Take the 358 validation coupons and the 6070 users seen in training and during