示例#1
0
 def fit(self, X, y):
     """Train a new ``XGBModel`` on the columns listed in ``self.features``.

     Args:
         X: training data; indexed with ``self.features`` before fitting.
         y: training target, passed unchanged to ``XGBModel.fit``.

     Returns:
         ``self``, so calls can be chained.
     """
     estimator = XGBModel(n_estimators=self.n_best, **self.params)
     # Store the estimator before training so ``self.model`` is set even
     # when ``fit`` raises.
     self.model = estimator
     estimator.fit(X=X[self.features], y=y, eval_metric="mae", verbose=False)
     return self
示例#2
0
        def objective(hyperparams):
            """Fit one candidate model and return its signed validation score.

            Args:
                hyperparams (dict): extra keyword arguments for ``XGBModel``,
                    merged with ``self.params``.

            Returns:
                (dict): ``loss`` (score at the best iteration multiplied by
                ``self.loss_sign``), ``status`` and the fitted ``model``.
            """
            candidate = XGBModel(n_estimators=self.n_est,
                                 **self.params,
                                 **hyperparams)
            candidate.fit(X=X_trn,
                          y=y_trn,
                          eval_set=[(X_val, y_val)],
                          eval_metric=self.metric,
                          early_stopping_rounds=self.n_stop,
                          verbose=False)

            # Per-iteration metric values recorded for the single eval set.
            history = candidate.evals_result()['validation_0'][self.metric]
            signed_score = history[candidate.best_iteration] * self.loss_sign

            return {'loss': signed_score, 'status': STATUS_OK, 'model': candidate}
示例#3
0
        def objective(hyperparams):
            """Train a candidate ``XGBModel`` and score it on the validation split.

            Args:
                hyperparams (dict): keyword arguments added on top of
                    ``self.params`` when building the model.

            Returns:
                (dict): the signed score under ``"loss"``, ``STATUS_OK`` under
                ``"status"`` and the fitted estimator under ``"model"``.
            """
            trial_model = XGBModel(n_estimators=self.n_est, **self.params, **hyperparams)
            trial_model.fit(X=X_trn, y=y_trn,
                            eval_set=[(X_val, y_val)],
                            eval_metric=self.metric,
                            early_stopping_rounds=self.n_stop,
                            verbose=False)

            # Metric trace for the only evaluation set passed to ``fit``.
            metric_trace = trial_model.evals_result()["validation_0"][self.metric]
            # ``loss_sign`` presumably flips metrics that should be maximized
            # -- it is set outside this snippet, confirm there.
            loss = metric_trace[trial_model.best_iteration] * self.loss_sign

            return {"loss": loss, "status": STATUS_OK, "model": trial_model}
示例#4
0
    def _train_test(
        x: pd.DataFrame,
        y: pd.Series,
        model: xgb.XGBModel,
        scorer: Callable,
        test_samples: Optional[int] = None,
        test_ratio: Optional[float] = None,
        time_series: bool = False,
        random_state: Optional[int] = None,
    ):
        """Split the data, fit ``model`` on the train part and score the test part.

        Args:
            x: feature table.
            y: target values.
            model: estimator that is fitted in place.
            scorer: callable invoked as ``scorer(model, x_test, y_test)``.
            test_samples: forwarded to ``split``.
            test_ratio: forwarded to ``split``.
            time_series: forwarded to ``split``.
            random_state: forwarded to ``split``.

        Returns:
            Whatever ``scorer`` returns for the fitted model on the test part.
        """
        x_train, x_test, y_train, y_test = split(
            x, y, test_samples, test_ratio, time_series, random_state
        )

        model.fit(x_train, y_train)
        test_score = scorer(model, x_test, y_test)
        return test_score
示例#5
0
def create_name_for_xgbmodel(run_cfg: Dict[str, Any],
                             outer_split_num: int,
                             model: XGBModel,
                             inner_split_num: int,
                             prefix_location='logs/',
                             suffix='.pkl') -> str:
    """Build the file path under which a fitted XGBoost model is stored.

    The path is ``prefix_location`` followed by the target variable, dataset
    type, outer/inner split numbers, a compact encoding of the model's
    hyperparameters, the number of nodes and the connectivity type, joined
    with ``'_'``, and finally ``suffix``.

    Args:
        run_cfg: run configuration; reads ``analysis_type``, ``target_var``,
            ``dataset_type``, ``num_nodes`` and ``param_conn_type``.
        outer_split_num: index of the outer split.
        model: model whose ``get_params()`` values are encoded in the name.
        inner_split_num: index of the inner split.
        prefix_location: string prepended to the name.
        suffix: string appended to the name.

    Returns:
        The assembled path as a string.

    Raises:
        ValueError: if ``run_cfg['analysis_type']`` is not
            ``AnalysisType.FLATTEN_CORRS``. Previously this case failed with
            an ``UnboundLocalError`` because no model representation was
            ever built for it.
    """
    encoded_keys = (
        'colsample_bylevel', 'colsample_bynode', 'colsample_bytree',
        'gamma', 'learning_rate', 'max_depth', 'min_child_weight',
        'n_estimators', 'subsample'
    )

    if run_cfg['analysis_type'] == AnalysisType.FLATTEN_CORRS:
        # Fetch the parameters once instead of once per key.
        model_params = model.get_params()
        # Each parameter contributes "<last 3 chars of its name>_<value>".
        # The pieces are concatenated without a separator on purpose: this
        # keeps generated names identical to already-saved files.
        model_str_representation = run_cfg['analysis_type'].value + ''.join(
            key[-3:] + '_' + str(model_params[key]) for key in encoded_keys)
    else:
        raise ValueError(
            'Cannot create a model name for analysis type {!r}'.format(
                run_cfg['analysis_type']))

    return prefix_location + '_'.join([
        run_cfg['target_var'], run_cfg['dataset_type'].value,
        str(outer_split_num),
        str(inner_split_num), model_str_representation,
        str(run_cfg['num_nodes']), run_cfg['param_conn_type'].value
    ]) + suffix
示例#6
0
class AutoXGB(BaseAutoML):
    """XGBoost implementation of ``BaseAutoML``.

    Provides default ``XGBModel`` parameters, a default hyperopt search
    space, and the hyperparameter search, fit and predict steps.
    """

    # Default XGBModel keyword arguments. __init__ copies this dict per
    # instance, so it is never modified at class level.
    params = {"random_state": RANDOM_SEED, "n_jobs": -1}

    # Default hyperopt search space.
    # NOTE(review): "max_depth" is registered under the hyperopt label
    # "num_leaves". The label is kept as is because it appears in recorded
    # trials; rename it only together with any code that reads those labels.
    default_xgb_space = {
        "learning_rate": hp.loguniform("learning_rate", np.log(0.01),
                                       np.log(0.3)),
        "max_depth": hp.choice("num_leaves", [6, 8, 10]),
        "colsample_bytree": hp.quniform("colsample_bytree", 0.5, 0.9, 0.1),
        "subsample": hp.quniform("subsample", 0.5, 0.9, 0.1),
        "min_child_weight": hp.choice("min_child_weight", [10, 25, 100]),
    }

    def __init__(
        self,
        objective="reg:squarederror",
        metric="rmse",
        boosting="gbtree",
        params=params,
        space=default_xgb_space,
        n_est=500,
        n_stop=10,
        sample_size=SAMPLE_SIZE,
        feature_selection=True,
        n_fs=10,
        fs_th=1e-5,
        fs_pct=0.1,
        hyperparam_opt=True,
        n_hpopt=100,
        n_random_col=10,
        random_state=RANDOM_SEED,
        shuffle=True,
    ):
        """Validate the metric, assemble model parameters and init the base class.

        Args:
            objective (str): stored as ``params["objective"]``.
            metric (str): evaluation metric; must be one of the names accepted
                by ``_get_metric_alias_minimize``.
            boosting (str): stored as ``params["booster"]``.
            params (dict): extra ``XGBModel`` keyword arguments layered on top
                of the class defaults.
            space (dict): hyperopt search space.

        All remaining arguments are forwarded unchanged to
        ``BaseAutoML.__init__``.
        """

        self.metric, minimize = self._get_metric_alias_minimize(metric)

        # Build a per-instance copy. Updating ``self.params`` directly would
        # mutate the class-level dict and leak objective/booster/custom
        # parameters into every other AutoXGB instance.
        merged_params = dict(self.params)
        merged_params.update(params)
        merged_params.update({"objective": objective, "booster": boosting})
        self.params = merged_params

        super(AutoXGB, self).__init__(
            params=self.params,
            space=space,
            n_est=n_est,
            n_stop=n_stop,
            sample_size=sample_size,
            feature_selection=feature_selection,
            n_fs=n_fs,
            fs_th=fs_th,
            fs_pct=fs_pct,
            hyperparam_opt=hyperparam_opt,
            n_hpopt=n_hpopt,
            minimize=minimize,
            n_random_col=n_random_col,
            random_state=random_state,
            shuffle=shuffle,
        )

    @staticmethod
    def _get_metric_alias_minimize(metric):
        """Get XGBoost metric alias.

        As defined at https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters

        Args:
            metric (str): a metric name

        Returns:
            (tuple):

                - (str): the standard metric name for XGBoost
                - (bool): a flag whether to minimize or maximize the metric

        Raises:
            AssertionError: if ``metric`` is not one of the supported names.
        """

        assert metric in [
            "rmse",
            "rmsle",
            "mae",
            "logloss",
            "error",
            "merror",
            "mlogloss",
            "auc",
            "aucpr",
            "ndcg",
            "map",
            "poisson-nloglik",
            "gamma-nloglik",
            "cox-nloglik",
            "gamma-deviance",
            "tweedie-nloglik",
        ], "Invalid metric: {}".format(metric)

        # These four metrics are maximized; every other one is minimized.
        minimize = metric not in ["auc", "aucpr", "ndcg", "map"]

        return metric, minimize

    @staticmethod
    def get_feature_importance(model):
        """Return ``model.feature_importances_`` for the given model."""
        return model.feature_importances_

    def feature_importance(self):
        """Return ``feature_importances_`` of the model stored on this instance."""
        return self.model.feature_importances_

    def optimize_hyperparam(self, X, y, test_size=0.2, n_eval=100):
        """Search ``self.space`` with hyperopt TPE on a single hold-out split.

        Args:
            X: training features.
            y: training target.
            test_size (float): hold-out share passed to ``train_test_split``.
            n_eval (int): maximum number of hyperopt evaluations.

        Returns:
            (tuple):

                - (dict): the best hyperparameters resolved with ``space_eval``
                - (hyperopt.Trials): the trials object with every evaluation
        """
        X_trn, X_val, y_trn, y_val = train_test_split(X,
                                                      y,
                                                      test_size=test_size,
                                                      shuffle=self.shuffle)

        def objective(hyperparams):
            """Fit one candidate and return its signed validation score."""
            model = XGBModel(n_estimators=self.n_est,
                             **self.params,
                             **hyperparams)
            model.fit(
                X=X_trn,
                y=y_trn,
                eval_set=[(X_val, y_val)],
                eval_metric=self.metric,
                early_stopping_rounds=self.n_stop,
                verbose=False,
            )
            # ``loss_sign`` is not set in this class; presumably the base
            # class derives it from ``minimize`` -- confirm there.
            score = (model.evals_result()["validation_0"][self.metric][
                model.best_iteration] * self.loss_sign)

            return {"loss": score, "status": STATUS_OK, "model": model}

        trials = Trials()
        best = hyperopt.fmin(
            fn=objective,
            space=self.space,
            trials=trials,
            algo=tpe.suggest,
            max_evals=n_eval,
            verbose=1,
        )

        hyperparams = space_eval(self.space, best)
        return hyperparams, trials

    def fit(self, X, y):
        """Train a new ``XGBModel`` on the columns in ``self.features``.

        ``self.n_best`` and ``self.features`` are not assigned in this class
        and must already be set when this is called.

        Returns:
            ``self``.
        """
        self.model = XGBModel(n_estimators=self.n_best, **self.params)
        self.model.fit(X=X[self.features],
                       y=y,
                       eval_metric="mae",
                       verbose=False)
        return self

    def predict(self, X):
        """Predict with the fitted model using the columns in ``self.features``."""
        return self.model.predict(X[self.features])
示例#7
0
class AutoXGB(BaseAutoML):
    """XGBoost implementation of ``BaseAutoML``.

    Provides default ``XGBModel`` parameters, a default hyperopt search
    space, and the hyperparameter search, fit and predict steps.
    """

    # Default XGBModel keyword arguments. __init__ copies this dict per
    # instance, so it is never modified at class level.
    params = {'random_state': RANDOM_SEED, 'n_jobs': -1}

    # Default hyperopt search space.
    # NOTE(review): "max_depth" is registered under the hyperopt label
    # "num_leaves". The label is kept as is because it appears in recorded
    # trials; rename it only together with any code that reads those labels.
    space = {
        "learning_rate": hp.loguniform("learning_rate", np.log(0.01),
                                       np.log(0.3)),
        "max_depth": hp.choice("num_leaves", [6, 8, 10]),
        "colsample_bytree": hp.quniform("colsample_bytree", .5, .9, 0.1),
        "subsample": hp.quniform("subsample", .5, .9, 0.1),
        "min_child_weight": hp.choice('min_child_weight', [10, 25, 100]),
    }

    def __init__(self,
                 objective='reg:linear',
                 metric='rmse',
                 boosting='gbtree',
                 params=params,
                 space=space,
                 n_est=500,
                 n_stop=10,
                 sample_size=SAMPLE_SIZE,
                 feature_selection=True,
                 n_fs=10,
                 fs_th=1e-5,
                 hyperparam_opt=True,
                 n_hpopt=100,
                 n_random_col=10,
                 random_state=RANDOM_SEED,
                 shuffle=True):
        """Validate the metric, assemble model parameters and init the base class.

        Args:
            objective (str): stored as ``params['objective']``.
            metric (str): evaluation metric; must be one of the names accepted
                by ``_get_metric_alias_minimize``.
            boosting (str): stored as ``params['booster']``.
            params (dict): extra ``XGBModel`` keyword arguments layered on top
                of the class defaults.
            space (dict): hyperopt search space.
            fs_th (float): forwarded to ``BaseAutoML.__init__`` as ``fs_th``.

        All remaining arguments are forwarded unchanged to
        ``BaseAutoML.__init__``.
        """

        self.metric, minimize = self._get_metric_alias_minimize(metric)

        # Build a per-instance copy. Updating ``self.params`` directly would
        # mutate the class-level dict and leak objective/booster/custom
        # parameters into every other AutoXGB instance.
        merged_params = dict(self.params)
        merged_params.update(params)
        merged_params.update({'objective': objective, 'booster': boosting})
        self.params = merged_params

        # Pass the caller's ``fs_th`` through; it used to be replaced by a
        # hard-coded 1e-5 here, so the argument had no effect.
        super(AutoXGB, self).__init__(params=self.params,
                                      space=space,
                                      n_est=n_est,
                                      n_stop=n_stop,
                                      sample_size=sample_size,
                                      feature_selection=feature_selection,
                                      n_fs=n_fs,
                                      fs_th=fs_th,
                                      hyperparam_opt=hyperparam_opt,
                                      n_hpopt=n_hpopt,
                                      minimize=minimize,
                                      n_random_col=n_random_col,
                                      random_state=random_state,
                                      shuffle=shuffle)

    @staticmethod
    def _get_metric_alias_minimize(metric):
        """Get XGBoost metric alias.

        As defined at https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters

        Args:
            metric (str): a metric name

        Returns:
            (tuple):

                - (str): the standard metric name for XGBoost
                - (bool): a flag whether to minimize or maximize the metric

        Raises:
            AssertionError: if ``metric`` is not one of the supported names.
        """

        assert metric in [
            'rmse', 'rmsle', 'mae', 'logloss', 'error', 'merror', 'mlogloss',
            'auc', 'aucpr', 'ndcg', 'map', 'poisson-nloglik', 'gamma-nloglik',
            'cox-nloglik', 'gamma-deviance', 'tweedie-nloglik'
        ], 'Invalid metric: {}'.format(metric)

        # These four metrics are maximized; every other one is minimized.
        minimize = metric not in ['auc', 'aucpr', 'ndcg', 'map']

        return metric, minimize

    @staticmethod
    def get_feature_importance(model):
        """Return ``model.feature_importances_`` for the given model."""
        return model.feature_importances_

    def feature_importance(self):
        """Return ``feature_importances_`` of the model stored on this instance."""
        return self.model.feature_importances_

    def optimize_hyperparam(self, X, y, test_size=.2, n_eval=100):
        """Search ``self.space`` with hyperopt TPE on a single hold-out split.

        Args:
            X: training features.
            y: training target.
            test_size (float): hold-out share passed to ``train_test_split``.
            n_eval (int): maximum number of hyperopt evaluations.

        Returns:
            (tuple):

                - (dict): the best hyperparameters resolved with ``space_eval``
                - (hyperopt.Trials): the trials object with every evaluation
        """
        X_trn, X_val, y_trn, y_val = train_test_split(X,
                                                      y,
                                                      test_size=test_size,
                                                      shuffle=self.shuffle)

        def objective(hyperparams):
            """Fit one candidate and return its signed validation score."""
            model = XGBModel(n_estimators=self.n_est,
                             **self.params,
                             **hyperparams)
            model.fit(X=X_trn,
                      y=y_trn,
                      eval_set=[(X_val, y_val)],
                      eval_metric=self.metric,
                      early_stopping_rounds=self.n_stop,
                      verbose=False)
            # ``loss_sign`` is not set in this class; presumably the base
            # class derives it from ``minimize`` -- confirm there.
            score = model.evals_result()['validation_0'][self.metric][
                model.best_iteration] * self.loss_sign

            return {'loss': score, 'status': STATUS_OK, 'model': model}

        trials = Trials()
        best = hyperopt.fmin(fn=objective,
                             space=self.space,
                             trials=trials,
                             algo=tpe.suggest,
                             max_evals=n_eval,
                             verbose=1,
                             rstate=self.random_state)

        hyperparams = space_eval(self.space, best)
        return hyperparams, trials

    def fit(self, X, y):
        """Train a new ``XGBModel`` on the columns in ``self.features``.

        ``self.n_best`` and ``self.features`` are not assigned in this class
        and must already be set when this is called.

        Returns:
            ``self``.
        """
        self.model = XGBModel(n_estimators=self.n_best, **self.params)
        self.model.fit(X=X[self.features],
                       y=y,
                       eval_metric='mae',
                       verbose=False)
        return self

    def predict(self, X):
        """Predict with the fitted model using the columns in ``self.features``."""
        return self.model.predict(X[self.features])
示例#8
0
 def build_model(self):
     """Return a new ``XGBModel`` built from ``self.model_params``."""
     return XGBModel(**self.model_params)
def fit_xgb_model(out_fold_num: int, in_fold_num: int, run_cfg: Dict[str, Any],
                  model: XGBModel, X_train_in: FlattenCorrsDataset,
                  X_val_in: FlattenCorrsDataset) -> Dict:
    """Fit ``model`` on one inner fold, pickle it, and log train/val metrics.

    Features are taken from ``data.x`` of every dataset item. The target is
    ``data.sex`` when ``run_cfg['target_var']`` is ``'gender'`` (scored with
    classifier metrics) and ``data.age`` when it is ``'age'`` (scored with
    regressor metrics). Metrics are printed and sent to ``wandb``.

    Args:
        out_fold_num: outer fold index, used in the file name and the printout.
        in_fold_num: inner fold index, used in the file name, the printout and
            as a suffix of the logged metric names.
        run_cfg: run configuration; ``target_var`` is read here and the whole
            dict is passed to ``create_name_for_xgbmodel``.
        model: model to fit; it is fitted in place and pickled.
        X_train_in: training items.
        X_val_in: validation items.

    Returns:
        The validation metrics dict.

    Raises:
        ValueError: if ``run_cfg['target_var']`` is neither ``'gender'`` nor
            ``'age'``. Previously this case failed inside ``model.fit`` with
            an ``UnboundLocalError`` because no target was ever built.
    """
    model_saving_path = create_name_for_xgbmodel(model=model,
                                                 outer_split_num=out_fold_num,
                                                 inner_split_num=in_fold_num,
                                                 run_cfg=run_cfg)

    train_arr = np.array([data.x.numpy() for data in X_train_in])
    val_arr = np.array([data.x.numpy() for data in X_val_in])

    target_var = run_cfg['target_var']
    if target_var == 'gender':
        y_train = [int(data.sex.item()) for data in X_train_in]
        y_val = [int(data.sex.item()) for data in X_val_in]
    elif target_var == 'age':
        # np.array() because of printing calls in the regressor_metrics function
        y_train = np.array([float(data.age.item()) for data in X_train_in])
        y_val = np.array([float(data.age.item()) for data in X_val_in])
    else:
        raise ValueError(
            "Unsupported target_var {!r}; expected 'gender' or 'age'".format(
                target_var))

    model.fit(train_arr, y_train, callbacks=[wandb.xgboost.wandb_callback()])

    # Use a context manager so the file handle is closed (and flushed) even
    # if pickling fails.
    with open(model_saving_path, "wb") as model_file:
        pickle.dump(model, model_file)

    if target_var == 'gender':
        train_metrics = return_classifier_metrics(
            y_train,
            pred_prob=model.predict_proba(train_arr)[:, 1],
            pred_binary=model.predict(train_arr),
            flatten_approach=True)
        val_metrics = return_classifier_metrics(
            y_val,
            pred_prob=model.predict_proba(val_arr)[:, 1],
            pred_binary=model.predict(val_arr),
            flatten_approach=True)

        print(
            '{:1d}-{:1d}: Auc: {:.4f} / {:.4f}, Acc: {:.4f} / {:.4f}, F1: {:.4f} /'
            ' {:.4f} '.format(out_fold_num, in_fold_num, train_metrics['auc'],
                              val_metrics['auc'], train_metrics['acc'],
                              val_metrics['acc'], train_metrics['f1'],
                              val_metrics['f1']))
        wandb.log({
            f'train_auc{in_fold_num}': train_metrics['auc'],
            f'val_auc{in_fold_num}': val_metrics['auc'],
            f'train_acc{in_fold_num}': train_metrics['acc'],
            f'val_acc{in_fold_num}': val_metrics['acc'],
            f'train_sens{in_fold_num}': train_metrics['sensitivity'],
            f'val_sens{in_fold_num}': val_metrics['sensitivity'],
            f'train_spec{in_fold_num}': train_metrics['specificity'],
            f'val_spec{in_fold_num}': val_metrics['specificity'],
            f'train_f1{in_fold_num}': train_metrics['f1'],
            f'val_f1{in_fold_num}': val_metrics['f1']
        })
    else:
        train_metrics = return_regressor_metrics(
            y_train, pred_prob=model.predict(train_arr))
        val_metrics = return_regressor_metrics(
            y_val, pred_prob=model.predict(val_arr))

        print('{:1d}-{:1d}: R2: {:.4f} / {:.4f}, R: {:.4f} / {:.4f}'.format(
            out_fold_num, in_fold_num, train_metrics['r2'], val_metrics['r2'],
            train_metrics['r'], val_metrics['r']))
        wandb.log({
            f'train_r2{in_fold_num}': train_metrics['r2'],
            f'val_r2{in_fold_num}': val_metrics['r2'],
            f'train_r{in_fold_num}': train_metrics['r'],
            f'val_r{in_fold_num}': val_metrics['r']
        })

    return val_metrics
示例#10
0
def to_mls(xgboost_model: xgboost.XGBModel, **kwargs):
    """Serialize an XGBoost model's hyperparameters as an MLS ``Run``.

    The model's ``get_params()`` output is converted recursively into
    JSON-compatible values, wrapped in ``HyperParameter`` /
    ``HyperParameterSetting`` objects and dumped through ``RunSchema``.

    Args:
        xgboost_model: model whose parameters are exported.
        **kwargs: when ``EVALUATION_MEASURE_KEY`` is present, its value is
            indexed at ``[0]`` and ``[1]`` and passed to
            ``evaluation_measure`` to produce the single output value.

    Returns:
        The result of ``RunSchema().dumps(...)`` for the assembled run.

    Raises:
        NotImplementedError: if a parameter value has no ``get_params`` and
            cannot be encoded by ``json.dumps``.
    """

    def _to_plain(value):
        # Arrays become lists of normalized floats, floats are normalized,
        # callables are replaced by their string form.
        if isinstance(value, np.ndarray):
            return [normalize_float(item) for item in value.tolist()]
        if isinstance(value, float):
            return normalize_float(value)
        if callable(value):
            return str(value)  # TODO
        return value

    def _convert(node):
        # Containers are walked recursively; leaves are handled below.
        if isinstance(node, (list, tuple)):
            return [_convert(item) for item in node]
        if isinstance(node, dict):
            return {name: _convert(child) for name, child in node.items()}

        leaf = _to_plain(node)
        try:
            # Objects exposing ``get_params`` are expanded into a typed,
            # nested parameter description.
            nested = leaf.get_params()
            type_path = type(leaf).__module__ + "." + type(leaf).__name__
            return {"@value": {"type": type_path, "params": _convert(nested)}}
        except AttributeError:
            try:
                # Everything else must already be JSON-encodable.
                json.dumps(leaf)
                return leaf
            except TypeError as e:
                raise NotImplementedError(
                    "can't convert sklearn model of type {} to mls: {}".format(
                        type(xgboost_model), e
                    )
                )

    params = _convert(xgboost_model.get_params())
    model_hash = xgboost_model.__hash__()
    model_type = type(xgboost_model)
    algo = Algorithm(_id=f"{model_type.__module__}.{model_type.__name__}")

    implementation_id = generate_unique_id(
        "http://www.w3.org/ns/mls#Implementation"
    )
    declared_params = [
        HyperParameter(name, model_hash=model_hash) for name in params
    ]
    implementation = Implementation(
        _id=implementation_id,
        parameters=declared_params,
        implements=algo,
        version=xgboost.__version__,
    )

    # Parameters whose value is None are left out of the settings.
    input_values = []
    for name, setting in params.items():
        if setting is None:
            continue
        input_values.append(
            HyperParameterSetting(
                value=setting,
                specified_by=HyperParameter(name, model_hash=model_hash),
                model_hash=model_hash,
            )
        )

    output_values = []
    if EVALUATION_MEASURE_KEY in kwargs:
        measure = kwargs[EVALUATION_MEASURE_KEY]
        output_values.append(evaluation_measure(measure[0], measure[1]))

    run = Run(model_hash, implementation, input_values, output_values, algo)
    return RunSchema().dumps(run)