Example #1
0
    def _train_sklgb(self, data):
        """Fit a LightGBM model through the sklearn-style API.

        Args:
            data (tuple of DataFrame): training and testing data, unpacked
                as (X_train, X_test, y_train, y_test).

        Returns:
            dict: summary of the trained model with keys:
                - "model": the fitted estimator,
                - "importances" (DataFrame): per-feature importance table,
                - "eval_score" (float or None): score from `self.eval_func`,
                  or None when no eval function is configured.
        """

        logging.info("using lightgbm sklearn api...")

        X_train, X_test, y_train, y_test = data
        # Restrict both splits to the configured feature columns.
        X_train = X_train[self.features]
        X_test = X_test[self.features]

        if self.task == "regression":
            model = LGBMRegressor(**self.trn_params, n_jobs=-1)
        elif self.task == "classification":
            model = LGBMClassifier(**self.trn_params, n_jobs=-1)

        # Fit with early stopping monitored on both the train and test sets.
        model.fit(X_train, y_train,
                  feature_name=self.features,
                  categorical_feature=self.cat_features,
                  eval_set=[(X_train, y_train), (X_test, y_test)],
                  early_stopping_rounds=self.early_stopping_rounds,
                  verbose=self.verbose_eval)

        # Build the feature-importance table in one shot.
        importances = pd.DataFrame({
            "feature": X_train.columns.values.tolist(),
            "importance": model.feature_importances_,
        })

        # Score on the test split with the configured eval function, if any.
        eval_score = None
        if self.eval_func:
            # Classification scores on per-class probabilities; regression
            # scores on point predictions.
            predict_fn = (model.predict_proba
                          if self.task == "classification" else model.predict)
            y_test_pred = predict_fn(X_test)
            eval_score = self.eval_func(y_test, y_test_pred)
            logging.info("score on validation is %f" % eval_score)

        return {
            "model": model,
            "importances": importances,
            "eval_score": eval_score
        }
Example #2
0
class Stacking(BaseEnsembleModel):
    """Stacked-generalization (stacking) ensemble.

    Each base model produces out-of-fold predictions over the training data;
    those predictions become the meta-features on which a second-level
    ``meta_learner`` is trained.
    """

    def __init__(self,
                 stats,
                 data_node,
                 ensemble_size: int,
                 task_type: int,
                 metric: _BaseScorer,
                 output_dir=None,
                 meta_learner='lightgbm',
                 kfold=5):
        """Build the stacking ensemble.

        Args:
            stats: per-algorithm records of (config, _, model_path) entries.
            data_node: training data node.
            ensemble_size (int): number of base models in the ensemble.
            task_type (int): task id; membership in CLS_TASKS selects
                classification vs. regression meta-learners.
            metric (_BaseScorer): evaluation metric.
            output_dir: directory where per-fold base models are persisted.
            meta_learner (str): 'linear', 'gb' or 'lightgbm'; falls back to
                'linear' when lightgbm is unavailable.
            kfold (int): number of CV folds used to build meta-features.
        """
        super().__init__(stats=stats,
                         data_node=data_node,
                         ensemble_method='stacking',
                         ensemble_size=ensemble_size,
                         task_type=task_type,
                         metric=metric,
                         output_dir=output_dir)

        self.kfold = kfold
        try:
            from lightgbm import LGBMClassifier
        except ImportError:
            # Catch only ImportError so unrelated failures still surface.
            warnings.warn(
                "Lightgbm is not imported! Stacking will use linear model instead!"
            )
            meta_learner = 'linear'

        self.meta_method = meta_learner

        # LightGBM is the default meta-learner.
        if self.task_type in CLS_TASKS:
            if meta_learner == 'linear':
                # Fixed: the private module sklearn.linear_model.logistic was
                # removed in scikit-learn 0.24; import from the public package.
                from sklearn.linear_model import LogisticRegression
                self.meta_learner = LogisticRegression(max_iter=1000)
            elif meta_learner == 'gb':
                # Fixed: likewise, sklearn.ensemble.gradient_boosting is gone.
                from sklearn.ensemble import GradientBoostingClassifier
                self.meta_learner = GradientBoostingClassifier(
                    learning_rate=0.05,
                    subsample=0.7,
                    max_depth=4,
                    n_estimators=250)
            elif meta_learner == 'lightgbm':
                from lightgbm import LGBMClassifier
                self.meta_learner = LGBMClassifier(max_depth=4,
                                                   learning_rate=0.05,
                                                   n_estimators=150,
                                                   n_jobs=1)
        else:
            if meta_learner == 'linear':
                from sklearn.linear_model import LinearRegression
                self.meta_learner = LinearRegression()
            elif meta_learner == 'lightgbm':
                from lightgbm import LGBMRegressor
                self.meta_learner = LGBMRegressor(max_depth=4,
                                                  learning_rate=0.05,
                                                  n_estimators=70,
                                                  n_jobs=1)

    def fit(self, data):
        """Fit per-fold base models, then train the meta-learner on the
        resulting out-of-fold prediction matrix.

        Args:
            data: training data node (provides `.copy_()` and `.data`).

        Returns:
            Stacking: self, for chaining.
        """
        # Split training data for phase 1 and phase 2
        if self.task_type in CLS_TASKS:
            kf = StratifiedKFold(n_splits=self.kfold)
        else:
            kf = KFold(n_splits=self.kfold)

        # Train basic models using a part of training data
        model_cnt = 0
        suc_cnt = 0
        feature_p2 = None
        for algo_id in self.stats.keys():
            model_to_eval = self.stats[algo_id]
            for idx, (config, _, path) in enumerate(model_to_eval):
                with open(path, 'rb') as f:
                    op_list, model, _ = pkl.load(f)
                _node = data.copy_()

                _node = construct_node(_node, op_list, mode='train')

                X, y = _node.data
                if self.base_model_mask[model_cnt] == 1:
                    for j, (train, test) in enumerate(kf.split(X, y)):
                        x_p1, x_p2, y_p1, _ = X[train], X[test], y[train], y[
                            test]
                        estimator = fetch_predict_estimator(
                            self.task_type,
                            algo_id,
                            config,
                            x_p1,
                            y_p1,
                            weight_balance=data.enable_balance,
                            data_balance=data.data_balance)
                        # Persist each fold's base model so get_feature()
                        # can reload it at prediction time.
                        with open(
                                os.path.join(
                                    self.output_dir, '%s-model%d_part%d' %
                                    (self.timestamp, model_cnt, j)),
                                'wb') as f:
                            pkl.dump(estimator, f)
                        if self.task_type in CLS_TASKS:
                            pred = estimator.predict_proba(x_p2)
                            n_dim = np.array(pred).shape[1]
                            if n_dim == 2:
                                # Binary classification: keep only the
                                # positive-class probability column.
                                n_dim = 1
                            # Initialize training matrix for phase 2
                            if feature_p2 is None:
                                num_samples = len(train) + len(test)
                                feature_p2 = np.zeros(
                                    (num_samples, self.ensemble_size * n_dim))
                            if n_dim == 1:
                                feature_p2[test,
                                           suc_cnt * n_dim:(suc_cnt + 1) *
                                           n_dim] = pred[:, 1:2]
                            else:
                                feature_p2[test, suc_cnt *
                                           n_dim:(suc_cnt + 1) * n_dim] = pred
                        else:
                            pred = estimator.predict(x_p2).reshape(-1, 1)
                            n_dim = 1
                            # Initialize training matrix for phase 2
                            if feature_p2 is None:
                                num_samples = len(train) + len(test)
                                feature_p2 = np.zeros(
                                    (num_samples, self.ensemble_size * n_dim))
                            feature_p2[test, suc_cnt * n_dim:(suc_cnt + 1) *
                                       n_dim] = pred
                    suc_cnt += 1
                model_cnt += 1
        # Train model for stacking using the other part of training data
        # NOTE(review): `y` is the label vector of the last node processed in
        # the loop above; this assumes all nodes share one target ordering and
        # raises NameError when `stats` is empty — TODO confirm upstream.
        self.meta_learner.fit(feature_p2, y)
        return self

    def get_feature(self, data):
        """Build the meta-feature matrix for `data` by averaging the per-fold
        base-model predictions saved during fit().

        Args:
            data: data node to transform.

        Returns:
            np.ndarray or None: (n_samples, ensemble_size * n_dim) matrix,
            or None if no base model is selected by `base_model_mask`.
        """
        # Predict the labels via stacking
        feature_p2 = None
        model_cnt = 0
        suc_cnt = 0
        for algo_id in self.stats.keys():
            model_to_eval = self.stats[algo_id]
            for idx, (config, _, path) in enumerate(model_to_eval):
                with open(path, 'rb') as f:
                    op_list, model, _ = pkl.load(f)
                _node = data.copy_()

                _node = construct_node(_node, op_list)

                if self.base_model_mask[model_cnt] == 1:
                    for j in range(self.kfold):
                        # Reload the model trained on fold j during fit().
                        with open(
                                os.path.join(
                                    self.output_dir, '%s-model%d_part%d' %
                                    (self.timestamp, model_cnt, j)),
                                'rb') as f:
                            estimator = pkl.load(f)
                        if self.task_type in CLS_TASKS:
                            pred = estimator.predict_proba(_node.data[0])
                            n_dim = np.array(pred).shape[1]
                            if n_dim == 2:
                                # Binary: keep positive-class column only.
                                n_dim = 1
                            if feature_p2 is None:
                                num_samples = len(_node.data[0])
                                feature_p2 = np.zeros(
                                    (num_samples, self.ensemble_size * n_dim))
                            # Get average predictions
                            if n_dim == 1:
                                feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = \
                                    feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] + pred[:,
                                                                                           1:2] / self.kfold
                            else:
                                feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = \
                                    feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] + pred / self.kfold
                        else:
                            pred = estimator.predict(_node.data[0]).reshape(
                                -1, 1)
                            n_dim = 1
                            # Initialize training matrix for phase 2
                            if feature_p2 is None:
                                num_samples = len(_node.data[0])
                                feature_p2 = np.zeros(
                                    (num_samples, self.ensemble_size * n_dim))
                            # Get average predictions
                            feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = \
                                feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] + pred / self.kfold
                    suc_cnt += 1
                model_cnt += 1
        return feature_p2

    def predict(self, data):
        """Predict via the meta-learner on the stacked meta-features.

        Returns class probabilities for classification tasks, point
        predictions otherwise.
        """
        feature_p2 = self.get_feature(data)
        # Get predictions from meta-learner
        if self.task_type in CLS_TASKS:
            final_pred = self.meta_learner.predict_proba(feature_p2)
        else:
            final_pred = self.meta_learner.predict(feature_p2)
        return final_pred

    def get_ens_model_info(self):
        """Return a summary dict of the ensemble configuration.

        Returns:
            dict: 'ensemble_method', 'config' (list of
            (algo_id, config, model_path) tuples) and 'meta_learner' name.
        """
        model_cnt = 0
        ens_info = {}
        ens_config = []
        for algo_id in self.stats:
            model_to_eval = self.stats[algo_id]
            for idx, (config, _, _) in enumerate(model_to_eval):
                # Include every model when no mask has been set yet.
                if not hasattr(self, 'base_model_mask'
                               ) or self.base_model_mask[model_cnt] == 1:
                    model_path = os.path.join(
                        self.output_dir,
                        '%s-stacking-model%d' % (self.timestamp, model_cnt))
                    ens_config.append((algo_id, config, model_path))
                model_cnt += 1
        ens_info['ensemble_method'] = 'stacking'
        ens_info['config'] = ens_config
        ens_info['meta_learner'] = self.meta_method
        return ens_info
Example #3
0
class LightGBM(BaseRegressionModel):
    """LightGBM regressor wrapper exposing the project's regression-model
    interface plus hyperparameter-search-space definitions."""

    def __init__(self, n_estimators, learning_rate, num_leaves,
                 min_child_weight, subsample, colsample_bytree, reg_alpha,
                 reg_lambda, random_state):
        # Hyperparameters are stored as-is; the estimator itself is built
        # lazily in fit() so the configuration can be inspected first.
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.num_leaves = num_leaves
        self.subsample = subsample
        self.reg_alpha = reg_alpha
        self.reg_lambda = reg_lambda
        self.min_child_weight = min_child_weight
        self.colsample_bytree = colsample_bytree

        self.n_jobs = -1
        self.random_state = random_state
        self.estimator = None

    def fit(self, X, y):
        """Build an LGBMRegressor from the stored hyperparameters and fit it.

        Returns self for chaining.
        """
        self.estimator = LGBMRegressor(num_leaves=self.num_leaves,
                                       learning_rate=self.learning_rate,
                                       n_estimators=self.n_estimators,
                                       min_child_weight=self.min_child_weight,
                                       subsample=self.subsample,
                                       colsample_bytree=self.colsample_bytree,
                                       reg_alpha=self.reg_alpha,
                                       reg_lambda=self.reg_lambda,
                                       random_state=self.random_state,
                                       n_jobs=self.n_jobs)
        self.estimator.fit(X, y)
        return self

    def predict(self, X):
        """Return point predictions; raises NotImplementedError before fit()."""
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Delegate to the estimator's predict_proba.

        NOTE(review): LGBMRegressor has no predict_proba, so for a fitted
        regressor this raises AttributeError — confirm whether this method
        is actually reachable for this (regression-only) component.
        """
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict_proba(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        """Describe the component's capabilities for the framework registry."""
        return {
            'shortname': 'LightGBM Regressor',
            'name': 'LightGBM Regressor',
            'handles_regression': True,
            'handles_classification': False,
            'handles_multiclass': False,
            'handles_multilabel': False,
            'is_deterministic': False,
            'input': (SPARSE, DENSE, UNSIGNED_DATA),
            'output': (PREDICTIONS, )
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None,
                                        optimizer='smac'):
        """Return the hyperparameter search space for the given optimizer.

        For 'smac' a ConfigSpace.ConfigurationSpace is returned; for 'tpe' a
        hyperopt space dict. Any other optimizer value returns None implicitly.
        """
        if optimizer == 'smac':
            cs = ConfigurationSpace()
            n_estimators = UniformIntegerHyperparameter("n_estimators",
                                                        100,
                                                        1000,
                                                        default_value=500)
            num_leaves = UniformIntegerHyperparameter("num_leaves",
                                                      31,
                                                      1023,
                                                      default_value=31)
            learning_rate = UniformFloatHyperparameter("learning_rate",
                                                       0.025,
                                                       0.3,
                                                       default_value=0.1,
                                                       log=True)
            min_child_weight = UniformIntegerHyperparameter("min_child_weight",
                                                            1,
                                                            10,
                                                            default_value=1)
            subsample = UniformFloatHyperparameter("subsample",
                                                   0.5,
                                                   1,
                                                   default_value=1)
            colsample_bytree = UniformFloatHyperparameter("colsample_bytree",
                                                          0.5,
                                                          1,
                                                          default_value=1)
            reg_alpha = UniformFloatHyperparameter('reg_alpha',
                                                   1e-10,
                                                   10,
                                                   log=True,
                                                   default_value=1e-10)
            reg_lambda = UniformFloatHyperparameter("reg_lambda",
                                                    1e-10,
                                                    10,
                                                    log=True,
                                                    default_value=1e-10)
            cs.add_hyperparameters([
                n_estimators, num_leaves, learning_rate, min_child_weight,
                subsample, colsample_bytree, reg_alpha, reg_lambda
            ])
            return cs
        elif optimizer == 'tpe':
            from hyperopt import hp
            # Ranges mirror the smac space: randint(k) + lo gives [lo, lo+k-1].
            space = {
                'n_estimators':
                hp.randint('lgb_n_estimators', 901) + 100,
                'num_leaves':
                hp.randint('lgb_num_leaves', 993) + 31,
                'learning_rate':
                hp.loguniform('lgb_learning_rate', np.log(0.025), np.log(0.3)),
                'min_child_weight':
                hp.randint('lgb_min_child_weight', 10) + 1,
                'subsample':
                hp.uniform('lgb_subsample', 0.5, 1),
                'colsample_bytree':
                hp.uniform('lgb_colsample_bytree', 0.5, 1),
                'reg_alpha':
                hp.loguniform('lgb_reg_alpha', np.log(1e-10), np.log(10)),
                'reg_lambda':
                hp.loguniform('lgb_reg_lambda', np.log(1e-10), np.log(10))
            }

            # NOTE(review): init_trial is built but never used or returned —
            # presumably intended as a warm-start trial; confirm or remove.
            init_trial = {
                'n_estimators': 500,
                'num_leaves': 31,
                'learning_rate': 0.1,
                'min_child_weight': 1,
                'subsample': 1,
                'colsample_bytree': 1,
                'reg_alpha': 1e-10,
                'reg_lambda': 1e-10
            }

            return space
    yt = y_train.loc[Xt.index, "target"]
    Xt = Xt.drop(drop_cols, axis=1)

    Xv = X_train[X_train.codmes == mes]
    yv = y_train.loc[Xv.index, "target"]

    learner = LGBMClassifier(n_estimators=5000)
    learner.fit(Xt,
                yt,
                early_stopping_rounds=50,
                eval_metric="auc",
                eval_set=[(Xt, yt), (Xv.drop(drop_cols, axis=1), yv)],
                verbose=50)
    gc.collect()
    test_probs.append(
        pd.Series(learner.predict_proba(X_test.drop(drop_cols, axis=1))[:, -1],
                  index=X_test.index,
                  name="fold_" + str(mes)))
    train_probs.append(
        pd.Series(learner.predict_proba(Xv.drop(drop_cols, axis=1))[:, -1],
                  index=Xv.index,
                  name="probs"))
    gc.collect()

test_probs = pd.concat(test_probs, axis=1).mean(axis=1)
train_probs = pd.concat(train_probs)

test = pd.concat([test_probs.rename("probs"),
                  test_preds.rename("preds")],
                 axis=1)
train = pd.concat([train_probs.rename("probs"),
Example #5
0
def kfold_lightgbm(df, num_folds, stratified = False, debug= False):
    """Train LightGBM on the premium dataset and write submission files.

    Phase 1 fits a single LGBMRegressor on a simple hold-out split and
    writes a raw and a policy-aggregated submission. Phase 2 runs a K-fold
    loop over the same features.

    Args:
        df (DataFrame): combined data; rows with non-null 'Next_Premium'
            are training data, null rows are test data.
        num_folds (int): number of folds for the CV loop.
        stratified (bool): use StratifiedKFold instead of KFold.
        debug (bool): when True, skip writing the phase-2 submission file.

    Returns:
        DataFrame: concatenated per-fold feature-importance table.
    """
    # Divide in training/validation and test data
    train_df = df[df['Next_Premium'].notnull()]
    test_df = df[df['Next_Premium'].isnull()]

    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=47)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=47)
    # Create arrays and dataframes to store results

    feature_importance_df = pd.DataFrame()
    # Exclude identifier / leakage-prone columns from the feature set.
    feats = [f for f in train_df.columns if f not in ['Policy_Number','Next_Premium',"Insured's_ID",
 'Prior_Policy_Number',
 'Cancellation',
 'Vehicle_identifier_number',
 'Vehicle_Make_and_Model1',
 'Vehicle_Make_and_Model2',
 'Imported_or_Domestic_Car',
 'Coding_of_Vehicle_Branding_&_Type',
 'qpt',
 'fpt',
 'Main_Insurance_Coverage_Group',
 'Insurance_Coverage',
 'Distribution_Channel',
 'pdmg_acc',
 'fassured',
 'ibirth',
 'fsex',
 'fmarriage',
 'aassured_zip',
 'iply_area',
 'dbirth',
 'fequipment1',
 'fequipment2',
 'fequipment3',
 'fequipment4',
 'fequipment5',
 'fequipment6',
 'fequipment9',
 'nequipment9',
 'Claim_Number',
 'Nature_of_the_claim',
 "Driver's_Gender",
 "Driver's_Relationship_with_Insured",
 'DOB_of_Driver',
 'Marital_Status_of_Driver',
 'Accident_Date',
 'Cause_of_Loss',
 'Coverage',
 'Vehicle_identifier_claim',
 'Claim_Status_(close,_open,_reopen_etc)',
 'Accident_area',
 'number_of_claimants',
 'Accident_Time']]

    seed = 7
    test_size = 0.3
    n_fold = 1
    submission_file_name = "submission_kernel01.csv"
    submission_file_name_agg = "submission_kernel_agg.csv"
    # Fixed: sklearn.cross_validation was removed in scikit-learn 0.20;
    # use the model_selection equivalent.
    from sklearn.model_selection import train_test_split
    train_x, valid_x, train_y, valid_y = train_test_split(train_df[feats], train_df['Next_Premium'], test_size=test_size, random_state=seed)

    clf = LGBMRegressor(
        nthread=4,
        n_estimators=10000,
        learning_rate=0.03,
        num_leaves=32,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.04,
        reg_lambda=0.073,
        min_split_gain=0.0222415,
        min_child_weight=40,
        silent=-1,
        verbose=-1)

    clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
        eval_metric= 'mae', verbose= 100, early_stopping_rounds= 200)

    oof_preds = clf.predict(valid_x, num_iteration=clf.best_iteration_)
    sub_preds = clf.predict(test_df[feats], num_iteration=clf.best_iteration_)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = feats
    fold_importance_df["importance"] = clf.feature_importances_
    fold_importance_df["fold"] = n_fold + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    # NOTE(review): the label says AUC but the value printed is the hold-out MAE.
    print('Fold %2d AUC : %.6f' % (n_fold + 1, mean_absolute_error(valid_y, oof_preds)))
    test_df['Next_Premium'] = sub_preds
    #est_df[['Policy_Number', 'Next_Premium']].to_csv(submission_file_name, index= False)
    test_df.to_csv(submission_file_name)

    # Aggregate predictions per policy for the second submission file.
    NP_aggregations = {'Next_Premium': ['mean']}
    test_df_agg = test_df.groupby('Policy_Number').agg(NP_aggregations)

    test_df_submit = pd.read_csv('../data/testing-set.csv')
    test_df_submit = test_df_submit.join(test_df_agg, how='left', on='Policy_Number',  rsuffix='_agg')
    test_df_submit.to_csv(submission_file_name_agg, index= False)

    del clf, train_x, train_y, valid_x, valid_y

    # Fixed: oof_preds/sub_preds previously entered the fold loop still
    # holding the phase-1 hold-out predictions, so `oof_preds[valid_idx]`
    # indexed a too-short array and sub_preds accumulated probabilities on
    # top of regression outputs. Re-initialize full-length accumulators.
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])

    # NOTE(review): the loop below reads a 'TARGET' column and trains a
    # binary classifier with AUC — it appears copied from a classification
    # kernel; train_df only carries 'Next_Premium' here. Confirm intent.
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['Next_Premium'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=4,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=32,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.04,
            reg_lambda=0.073,
            min_split_gain=0.0222415,
            min_child_weight=40,
            silent=-1,
            verbose=-1)

        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
            eval_metric= 'auc', verbose= 100, early_stopping_rounds= 200)

        # Out-of-fold positive-class probability; fold-averaged test preds.
        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    # Write submission file and plot feature importance
    if not debug:
        test_df['TARGET'] = sub_preds
        test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index= False)
    display_importances(feature_importance_df)
    return feature_importance_df
Example #6
0
File: lgbm.py  Project: ozanzgur/AIRepo
    def fit(self, x):
        """Run K-fold CV training of LightGBM models.

        Args:
            x (dict): {'train_data': (X, y), 'val_data': (X_val, y_val)}.
                X can be pd.DataFrame, np.ndarray or scipy sparse;
                y may be a pd.Series, array, or sparse matrix.

        Returns:
            dict: 'feature_importances' (DataFrame), 'val_preds'
            (fold-averaged predictions on X_val or None), 'oof_preds'
            (out-of-fold predictions on the training data) and 'metric'
            (validation metric value or None).
        """
        train_data = x['train_data']
        val_data = x['val_data']

        # One fitted model per fold is kept for later prediction/averaging.
        self.models = []

        # For CV
        oof_preds = np.zeros(len(train_data[0]))
        X_data = train_data[0]
        y_data = train_data[1]

        # Validate after CV
        X_val = val_data[0]
        try:
            # Sparse labels: densify and flatten. NOTE(review): bare except
            # also swallows unrelated errors — presumably only AttributeError
            # (no .todense) is expected here; confirm.
            y_val = np.array(val_data[1].todense()).ravel()
        except:
            y_val = np.array(val_data[1]).ravel()

        is_sparse = scipy.sparse.issparse(X_data)

        # Create dataframe to keep feature importances for each fold
        feature_importances = pd.DataFrame()
        if not is_sparse:
            # Dense input is assumed to be a DataFrame with named columns.
            self.features = X_data.columns

        if self.features is not None:
            if not len(self.features) == X_data.shape[1]:
                raise ValueError(
                    'Number of features must be the same as n_columns in X.')

            # Create column for features
            feature_importances['feature'] = self.features

        cv_metrics = list()

        n_folds = 0
        folds = None
        val_preds = None

        # self.folds is either a splitter object or a pre-built list of
        # (train_idx, val_idx) pairs.
        if not isinstance(self.folds, list):
            folds = self.folds.split(X_data, y_data)
        else:
            folds = self.folds

        for i_fold, (trn_idx, val_idx) in enumerate(folds):
            n_folds += 1
            # Sparse matrices index by row directly; DataFrames need .iloc.
            X_trn_fold = X_data[trn_idx] if is_sparse else X_data.iloc[trn_idx]
            X_val_fold = X_data[val_idx] if is_sparse else X_data.iloc[val_idx]

            y_val_fold = None
            y_trn_fold = None
            if isinstance(y_data, pd.Series):
                y_trn_fold = y_data.iloc[trn_idx]
                y_val_fold = y_data.iloc[val_idx]
            else:
                y_trn_fold = y_data[trn_idx]
                y_val_fold = y_data[val_idx]
                try:
                    # Sparse label slices: densify and flatten.
                    y_trn_fold = np.array(y_trn_fold.todense()).ravel()
                    y_val_fold = np.array(y_val_fold.todense()).ravel()
                except:
                    y_trn_fold = np.array(y_trn_fold).ravel()
                    y_val_fold = np.array(y_val_fold).ravel()

            logger.info('Training on fold {}'.format(i_fold))
            """trn_data = lgb.Dataset(X_trn_fold, label = y_trn_fold)
            val_data = lgb.Dataset(X_val_fold, label = y_val_fold)"""
            # This is validation in CV, not validation set

            # Training for this fold
            #print(self.lgbm_hparams)
            clf = LGBMRegressor(
                **self.lgbm_hparams
            ) if self.objective == 'regression' else LGBMClassifier(
                **self.lgbm_hparams)
            clf = clf.fit(X=X_trn_fold,
                          y=y_trn_fold,
                          eval_set=[(X_trn_fold, y_trn_fold),
                                    (X_val_fold, y_val_fold)],
                          early_stopping_rounds=250,
                          verbose=200)

            # Keep models of each fold
            self.models.append(clf)

            feature_importances['fold_{}'.format(
                i_fold)] = clf.feature_importances_

            # NOTE(review): for classifiers predict_proba returns a 2-D
            # (n, n_classes) array which cannot be assigned into the 1-D
            # oof_preds slice; the resulting error is swallowed and predicted
            # labels are stored instead — confirm this is intended.
            try:
                oof_preds[val_idx] = clf.predict_proba(X_val_fold)
            except:
                oof_preds[val_idx] = clf.predict(X_val_fold)

            # Validation for this fold
            if X_val is not None:
                # Accumulate per-fold predictions; averaged after the loop.
                if val_preds is None:
                    try:
                        val_preds = clf.predict_proba(X_val)
                    except:
                        val_preds = clf.predict(X_val)
                else:
                    try:
                        val_preds += clf.predict_proba(X_val)
                    except:
                        val_preds += clf.predict(X_val)

        logger.info('Training has finished.')
        #logger.info(f'Mean CV {params["metric"]}: {np.mean(cv_metrics)}')

        # Validation
        val_metric = None
        if X_val is not None:
            # Average the accumulated fold predictions.
            val_preds /= n_folds

            logger.info('Calculating validation metric...')
            val_metric = self.get_metric(y_val, val_preds)

            logger.info(f'Validation {self.metric}: {val_metric}')

        # Total importance is the sum over the per-fold columns.
        feature_importances['importance'] = \
            feature_importances[[f'fold_{i}' for i in range(n_folds)]].sum(axis = 1)

        cols_to_keep = [
            col for col in feature_importances.columns if 'fold' not in col
        ]
        self.feature_importances = feature_importances[cols_to_keep]

        if 'feature' in self.feature_importances.columns:
            self.feature_importances.sort_values(by='importance',
                                                 ascending=False,
                                                 inplace=True)
        return {
            #'cv_metrics': cv_metrics,
            'feature_importances': feature_importances,
            'val_preds': val_preds,
            'oof_preds': oof_preds,
            'metric': val_metric
        }
Example #7
0
class Blending(BaseEnsembleModel):
    """Blending ensemble.

    Base models are trained on one split of the training data (phase 1); a
    meta-learner is then fitted on their held-out predictions, which form the
    phase-2 feature matrix. Fitted base models are pickled to ``output_dir``
    so prediction can reload them later.
    """

    def __init__(self, stats,
                 ensemble_size: int,
                 task_type: int,
                 metric: _BaseScorer,
                 output_dir=None,
                 meta_learner='lightgbm'):
        """
        Args:
            stats: search statistics holding, per algorithm id, the candidate
                (node, config) pairs under ``'model_to_eval'``.
            ensemble_size: number of base models blended together.
            task_type: task identifier; classification iff it is in CLS_TASKS.
            metric: scorer used for evaluation.
            output_dir: directory where fitted base models are pickled.
            meta_learner: second-stage model: 'lightgbm', 'gb' or 'linear'.
        """
        super().__init__(stats=stats,
                         ensemble_method='blending',
                         ensemble_size=ensemble_size,
                         task_type=task_type,
                         metric=metric,
                         output_dir=output_dir)
        try:
            # Availability probe only; the class is imported again below.
            from lightgbm import LGBMClassifier  # noqa: F401
        except ImportError:
            warnings.warn("Lightgbm is not imported! Blending will use linear model instead!")
            meta_learner = 'linear'
        self.meta_method = meta_learner
        # Build the meta-learner (LightGBM is the default second-stage model).
        # NOTE: an unrecognized `meta_learner` string leaves self.meta_learner
        # unset, matching the original behavior.
        if self.task_type in CLS_TASKS:
            if meta_learner == 'linear':
                # Public import path; `sklearn.linear_model.logistic` was a
                # private module removed in scikit-learn 0.24.
                from sklearn.linear_model import LogisticRegression
                self.meta_learner = LogisticRegression(max_iter=1000)
            elif meta_learner == 'gb':
                # Public import path; `sklearn.ensemble.gradient_boosting`
                # was a private module removed in scikit-learn 0.24.
                from sklearn.ensemble import GradientBoostingClassifier
                self.meta_learner = GradientBoostingClassifier(learning_rate=0.05, subsample=0.7, max_depth=4,
                                                               n_estimators=250)
            elif meta_learner == 'lightgbm':
                from lightgbm import LGBMClassifier
                self.meta_learner = LGBMClassifier(max_depth=4, learning_rate=0.05, n_estimators=150)
        else:
            if meta_learner == 'linear':
                from sklearn.linear_model import LinearRegression
                self.meta_learner = LinearRegression()
            elif meta_learner == 'lightgbm':
                from lightgbm import LGBMRegressor
                self.meta_learner = LGBMRegressor(max_depth=4, learning_rate=0.05, n_estimators=70)

    def _stack_predictions(self, estimator, X, feature_p2, suc_cnt, num_samples):
        """Write one base model's predictions into the phase-2 feature matrix.

        Allocates the matrix lazily on first use (its width depends on the
        number of prediction columns per model). For binary classification
        only the positive-class probability column is kept.

        Args:
            estimator: fitted base model.
            X: samples to predict on.
            feature_p2: phase-2 matrix so far, or None before first model.
            suc_cnt: index of this model among the successfully added ones.
            num_samples: number of rows the matrix must have.

        Returns:
            The (possibly newly allocated) phase-2 feature matrix.
        """
        if self.task_type in CLS_TASKS:
            pred = estimator.predict_proba(X)
            n_dim = np.array(pred).shape[1]
            if n_dim == 2:
                # Binary classification: keep a single probability column.
                n_dim = 1
            if feature_p2 is None:
                feature_p2 = np.zeros((num_samples, self.ensemble_size * n_dim))
            if n_dim == 1:
                feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred[:, 1:2]
            else:
                feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred
        else:
            pred = estimator.predict(X).reshape(-1, 1)
            n_dim = 1
            if feature_p2 is None:
                feature_p2 = np.zeros((num_samples, self.ensemble_size * n_dim))
            feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred
        return feature_p2

    def fit(self, data):
        """Train base models on phase-1 data, then the meta-learner on phase 2.

        Returns:
            self, for call chaining.
        """
        # Fraction of training data held out for the meta-learner (phase 2).
        test_size = 0.2

        model_cnt = 0
        suc_cnt = 0
        feature_p2 = None
        for algo_id in self.stats["include_algorithms"]:
            model_to_eval = self.stats[algo_id]['model_to_eval']
            for idx, (node, config) in enumerate(model_to_eval):
                X, y = node.data
                if self.task_type in CLS_TASKS:
                    # NOTE(review): stratify uses data.data[1] while X/y come
                    # from node.data -- presumably these labels coincide for
                    # every node; confirm against the caller.
                    x_p1, x_p2, y_p1, y_p2 = train_test_split(X, y, test_size=test_size,
                                                              stratify=data.data[1], random_state=self.seed)
                else:
                    x_p1, x_p2, y_p1, y_p2 = train_test_split(X, y, test_size=test_size,
                                                              random_state=self.seed)

                if self.base_model_mask[model_cnt] == 1:
                    estimator = fetch_predict_estimator(self.task_type, config, x_p1, y_p1,
                                                        weight_balance=node.enable_balance,
                                                        data_balance=node.data_balance
                                                        )
                    # Persist the fitted base model for later prediction.
                    with open(os.path.join(self.output_dir, '%s-blending-model%d' % (self.timestamp, model_cnt)),
                              'wb') as f:
                        pkl.dump(estimator, f)
                    feature_p2 = self._stack_predictions(estimator, x_p2, feature_p2,
                                                         suc_cnt, len(x_p2))
                    suc_cnt += 1
                model_cnt += 1
        # Phase 2: fit the meta-learner on the stacked base-model predictions
        # (y_p2 from the last split; all splits share seed and test_size).
        self.meta_learner.fit(feature_p2, y_p2)

        return self

    def get_feature(self, data, solvers):
        """Build the phase-2 feature matrix for new data from pickled models.

        Args:
            data: data node to predict on.
            solvers: per-algorithm solvers whose feature-engineering optimizer
                transforms `data` the same way as during training.

        Returns:
            Matrix of stacked base-model predictions, or None if no base
            model is selected by the mask.
        """
        feature_p2 = None
        model_cnt = 0
        suc_cnt = 0
        for algo_id in self.stats["include_algorithms"]:
            model_to_eval = self.stats[algo_id]['model_to_eval']
            for idx, (node, config) in enumerate(model_to_eval):
                # Re-apply the training-time feature engineering to `data`.
                test_node = solvers[algo_id].optimizer['fe'].apply(data, node)
                if self.base_model_mask[model_cnt] == 1:
                    with open(os.path.join(self.output_dir, '%s-blending-model%d' % (self.timestamp, model_cnt)),
                              'rb') as f:
                        estimator = pkl.load(f)
                    feature_p2 = self._stack_predictions(estimator, test_node.data[0], feature_p2,
                                                         suc_cnt, len(data.data[0]))
                    suc_cnt += 1
                model_cnt += 1

        return feature_p2

    def predict(self, data, solvers):
        """Predict with the meta-learner on stacked base-model features.

        Returns class probabilities for classification tasks and raw
        predictions for regression.
        """
        feature_p2 = self.get_feature(data, solvers)
        # Get predictions from meta-learner
        if self.task_type in CLS_TASKS:
            final_pred = self.meta_learner.predict_proba(feature_p2)
        else:
            final_pred = self.meta_learner.predict(feature_p2)
        return final_pred

    def get_ens_model_info(self):
        """Describe the fitted ensemble: member configs, paths and meta-learner.

        Returns:
            dict with keys 'ensemble_method', 'config' (list of
            (algo_id, node_config, config, model_path) tuples),
            'meta_learner' (method name) and 'meta_model' (fitted object).
        """
        model_cnt = 0
        ens_info = {}
        ens_config = []
        for algo_id in self.stats["include_algorithms"]:
            model_to_eval = self.stats[algo_id]['model_to_eval']
            for idx, (node, config) in enumerate(model_to_eval):
                # Include every candidate if the mask was never built.
                if not hasattr(self, 'base_model_mask') or self.base_model_mask[model_cnt] == 1:
                    model_path = os.path.join(self.output_dir, '%s-blending-model%d' % (self.timestamp, model_cnt))
                    ens_config.append((algo_id, node.config, config, model_path))
                model_cnt += 1
        ens_info['ensemble_method'] = 'blending'
        ens_info['config'] = ens_config
        ens_info['meta_learner'] = self.meta_method
        ens_info['meta_model'] = self.meta_learner
        return ens_info