Example #1
    def xgb_evaluate(self, **Tunparams):
        warnings.filterwarnings("ignore")

        Tunparams['max_depth'] = int(Tunparams['max_depth'])
        Tunparams['min_child_weight'] = int(Tunparams['min_child_weight'])
        Tunparams['colsample_bytree'] = max(
            min(Tunparams['colsample_bytree'], 1), 0)
        Tunparams['subsample'] = max(min(Tunparams['subsample'], 1), 0)
        Tunparams['gamma'] = max(Tunparams['gamma'], 0)
        Tunparams['alpha'] = max(Tunparams['alpha'], 0)
        folds = data_prepare.get_folds(
            df=self.x_train[['totals.pageviews']].reset_index(),
            n_splits=2)

        if 'fullVisitorId' in self.x_train.columns:
            self.x_train.drop('fullVisitorId', axis=1, inplace=True)
        if 'fullVisitorId' in self.x_test.columns:
            self.x_test.drop('fullVisitorId', axis=1, inplace=True)
        if 'fullVisitorId' in self.y_train.columns:
            self.y_train.drop('fullVisitorId', axis=1, inplace=True)

        oof_preds = np.zeros(self.x_train.shape[0])

        for n_fold, (train_idx, valid_idx) in enumerate(folds):
            train_x, train_y = self.x_train.iloc[train_idx], self.y_train.iloc[
                train_idx]
            valid_x, valid_y = self.x_train.iloc[valid_idx], self.y_train.iloc[
                valid_idx]

            # apply the candidate hyperparameters before fitting; without this,
            # the tuned values would never reach the estimator (depending on the
            # xgboost version, 'alpha'/'lambda' may first need renaming to
            # 'reg_alpha'/'reg_lambda')
            self.clf.set_params(**Tunparams)
            self.clf.fit(train_x,
                         np.log1p(train_y),
                         eval_set=[(valid_x, np.log1p(valid_y))],
                         eval_metric='rmse',
                         verbose=100,
                         early_stopping_rounds=200)

            oof_preds[valid_idx] = self.clf.predict(
                valid_x, ntree_limit=self.clf.best_ntree_limit)

            # clip negative predictions (targets are log1p-transformed)
            oof_preds[oof_preds < 0] = 0
            del train_x, train_y, valid_x, valid_y
            gc.collect()
        # return negative RMSE (log space) so BayesianOptimization can maximize it
        return -mean_squared_error(np.log1p(self.y_train), oof_preds)**.5
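
Because xgb_evaluate returns the negative RMSE, it plugs straight into a maximizing optimizer. A minimal usage sketch, assuming the pre-1.0 bayes_opt API used in Example #4 and a hypothetical instance named trainer:

from bayes_opt import BayesianOptimization

# bounds mirror the Tunparams dict defined in Example #4
bounds = {
    'colsample_bytree': (0.5, 1),
    'learning_rate': (.01, .05),
    'subsample': (0.8, 1),
    'max_depth': (1, 10),
    'alpha': (.02, .10),
    'lambda': (.02, .08),
    'gamma': (0, 10),
    'min_child_weight': (1, 40),
}
bo = BayesianOptimization(trainer.xgb_evaluate, bounds)
bo.maximize(init_points=5, n_iter=50, acq='ei')
best_params = bo.res['max']['max_params']  # maximizing -RMSE == minimizing RMSE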
Example #2
    def cv(self, nfolds=5, submission=True):
        self.regressors.clear()
        self.feature_importance_df = pd.DataFrame()

        if not submission:
            folds = data_prepare.get_folds(df=self.x_train, n_splits=nfolds)
        else:
            folds = data_prepare.get_folds(
                df=self.x_train[['totals.pageviews']].reset_index(),
                n_splits=nfolds)

        if 'fullVisitorId' in self.x_train.columns:
            self.x_train.drop('fullVisitorId', axis=1, inplace=True)
        if 'fullVisitorId' in self.x_test.columns:
            self.x_test.drop('fullVisitorId', axis=1, inplace=True)
        # if 'fullVisitorId' in self.y_train.columns:
        #     self.y_train.drop('fullVisitorId', axis=1, inplace=True)

        oof_preds = np.zeros(self.x_train.shape[0])
        preds_test = np.empty((nfolds, self.x_test.shape[0]))

        self.logfile.write('param: {}\n'.format(self.param))
        self.logfile.write('fold: {}\n'.format(nfolds))
        self.logfile.write('data shape: {}\n'.format(self.x_train.shape))
        self.logfile.write('features: {}\n'.format(
            self.x_train.columns.tolist()))

        if self.comment is not None:
            self.logfile.write('comment: {}\n'.format(self.comment))

        self.logfile.write('output: ../output/{}.csv\n'.format(self.name))
        self.logfile.flush()

        for n_fold, (train_idx, valid_idx) in enumerate(folds):
            fstart = time.time()
            train_x, train_y = self.x_train.iloc[train_idx], self.y_train.iloc[
                train_idx]
            valid_x, valid_y = self.x_train.iloc[valid_idx], self.y_train.iloc[
                valid_idx]

            # LGBMRegressor parameters found by Bayesian optimization
            clf = LGBMRegressor(**self.param)
            clf.fit(train_x,
                    np.log1p(train_y),
                    eval_set=[(valid_x, np.log1p(valid_y))],
                    eval_metric='rmse',
                    verbose=100,
                    early_stopping_rounds=200)

            oof_preds[valid_idx] = clf.predict(
                valid_x, num_iteration=clf.best_iteration_)
            preds_test[n_fold, :] = clf.predict(
                self.x_test, num_iteration=clf.best_iteration_)

            # clip negative predictions (targets are log1p-transformed)
            oof_preds[oof_preds < 0] = 0
            preds_test[preds_test < 0] = 0

            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = self.x_train.columns.tolist()
            fold_importance_df["importance"] = clf.feature_importances_
            fold_importance_df["fold"] = n_fold + 1
            self.feature_importance_df = pd.concat(
                [self.feature_importance_df, fold_importance_df], axis=0)

            strlog = '[{}][{:.1f} sec] Fold {} RMSE : {:.6f}'.format(
                str(datetime.now()),
                time.time() - fstart, n_fold + 1,
                mean_squared_error(np.log1p(valid_y),
                                   oof_preds[valid_idx])**.5)
            print(strlog)
            self.logfile.write(strlog + '\n')
            self.logfile.flush()

            self.regressors.append(clf)
            del clf, train_x, train_y, valid_x, valid_y
            gc.collect()

        full_rmse = mean_squared_error(np.log1p(self.y_train), oof_preds)**.5
        strlog = 'Full RMSE score {:.6f}'.format(full_rmse)
        print(strlog)
        self.logfile.write(strlog + '\n')

        preds = preds_test.mean(axis=0)

        if submission:
            # preds is a NumPy array, so wrap it in a DataFrame indexed like
            # x_test before writing the submission file
            sub = pd.DataFrame({'PredictedLogRevenue': preds},
                               index=self.x_test.index)
            sub.to_csv('../output/submission/{}.csv'.format(self.name),
                       index=True)

            cols = self.feature_importance_df[[
                "feature", "importance"
            ]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:100].index
            self.logfile.write('top features:\n')
            for c in cols:
                self.logfile.write('{}\n'.format(c))

            self.logfile.flush()

            self.display_importances(self.feature_importance_df, self.name)

        # save out-of-fold and test predictions as stacking features
        np.save('../output/feats/{}_trn_prd_feats'.format(self.name),
                oof_preds)
        np.save('../output/feats/{}_tes_prd_feats'.format(self.name), preds)
        return self.feature_importance_df, full_rmse, oof_preds, preds
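
data_prepare.get_folds is a project helper that is not shown on this page. A minimal sketch of what it plausibly does, assuming a visitor-level GroupKFold so that sessions from one fullVisitorId never land in both the training and validation split (the grouping column is an assumption, based on the reset_index() calls above):

from sklearn.model_selection import GroupKFold

def get_folds(df, n_splits=5):
    # assumption: after reset_index() the former index ('fullVisitorId')
    # is available as a column and defines the groups
    groups = df['fullVisitorId'].values
    return list(GroupKFold(n_splits=n_splits).split(df, groups=groups))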
Example #3
    def __init__(self,
                 name,
                 clf,
                 comment=None,
                 remove_columns=None,
                 param=None,
                 lm_seed=None,
                 log=None,
                 predict_feats=False,
                 debug=True,
                 BASE_X_PATH=None,
                 TRN_PRED_FEATS=None,
                 TES_PRED_FEATS=None,
                 nfolds=5,
                 scale=False):

        self.name = name
        self.comment = comment

        if log is None:
            self.logfile = open('../output/log/{}.txt'.format(name), 'w')
        else:
            self.logfile = open('../output/log/{}.txt'.format(log), 'w')

        if param is None:
            # default regularization strength for a linear model (e.g. Ridge)
            self.param = {
                'alpha': 0.5,
            }
        else:
            self.param = param

        if lm_seed is not None:
            self.param['random_state'] = lm_seed
        self.feature_importance_df = None
        self.Tunparams = None
        if BASE_X_PATH is not None:
            self.x = feather.read_dataframe(BASE_X_PATH)
        if TRN_PRED_FEATS is not None:
            self.trn_preds_feats = np.load(TRN_PRED_FEATS)
        if TES_PRED_FEATS is not None:
            self.tes_preds_feats = np.load(TES_PRED_FEATS)

        if remove_columns is not None:
            drop_features = [
                _f for _f in self.x.columns if _f in remove_columns
            ]
            self.x.drop(drop_features, axis=1, inplace=True)
            del drop_features
            gc.collect()

        self.clf = clf(**self.param)

        #read & prepare datasets
        print('read & prepare datasets shape: {}'.format(self.x.shape))

        #split train & test sets
        self.x_train, self.y_train, self.x_test, self.y_train_ag = prep_and_split(
            self.x)

        # debug: train on a 30% sample for faster iteration
        if debug:
            x_train_s = self.x_train.sample(frac=0.3)
            x_test_s = self.x_test.sample(frac=0.3)
            y_train_s = self.y_train_ag.loc[self.y_train_ag.index.isin(
                x_train_s.index)]
        else:
            x_train_s = self.x_train.sample(frac=1)
            x_test_s = self.x_test.sample(frac=1)
            y_train_s = self.y_train_ag.loc[self.y_train_ag.index.isin(
                x_train_s.index)]

        if predict_feats:
            self.x_train, trn_feats = add_pred_feats(x_train_s,
                                                     self.trn_preds_feats,
                                                     None)
            self.x_test, _ = add_pred_feats(x_test_s, self.tes_preds_feats,
                                            trn_feats)
            self.y_train = y_train_s.groupby('fullVisitorId').sum()
            self.x_train.fillna(0, inplace=True)
            self.x_test.fillna(0, inplace=True)
            self.x_train.replace(np.inf, 99999, inplace=True)
            self.x_train.replace(-np.inf, -99999, inplace=True)
            self.x_test.replace(-np.inf, -99999, inplace=True)
            self.x_test.replace(np.inf, 99999, inplace=True)
            del x_train_s, x_test_s, y_train_s
            gc.collect()
        else:
            self.x_train = x_train_s.groupby('fullVisitorId').mean()
            self.x_test = x_test_s.groupby('fullVisitorId').mean()
            self.y_train = y_train_s.groupby('fullVisitorId').sum()

            self.x_train.fillna(0, inplace=True)
            self.x_test.fillna(0, inplace=True)
            self.x_train.replace(np.inf, 99999, inplace=True)
            self.x_train.replace(-np.inf, -99999, inplace=True)
            self.x_test.replace(-np.inf, -99999, inplace=True)
            self.x_test.replace(np.inf, 99999, inplace=True)

        self.folds = data_prepare.get_folds(
            df=self.x_train[['totals.pageviews']].reset_index(),
            n_splits=nfolds)

        if scale:
            print("scale...")
            scaler = MinMaxScaler(feature_range=(0, 1))
            # fit the scaler on train only, then reuse it on test to avoid leakage
            self.x_train[self.x_train.columns] = scaler.fit_transform(
                self.x_train)
            self.x_test[self.x_test.columns] = scaler.transform(self.x_test)
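
A usage sketch for this constructor; the wrapper class name (LMTrainer) and the feather path are placeholders, and Ridge is one linear model that accepts both the default alpha and the random_state injected via lm_seed:

from sklearn.linear_model import Ridge

# LMTrainer and BASE_X_PATH are hypothetical; the keywords match the signature above
model = LMTrainer(name='ridge_v1',
                  clf=Ridge,
                  param={'alpha': 0.5},
                  lm_seed=42,
                  BASE_X_PATH='../input/base_x.ftr',
                  nfolds=5,
                  scale=True)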
Example #4
    def cv(self, nfolds=5, submission=True, tune=True, nthread=16):
        self.feature_importance_df = pd.DataFrame()
        self.Tunparams = {
            'colsample_bytree': (0.5, 1),
            'learning_rate': (.01, .05),
            'subsample': (0.8, 1),
            'max_depth': (1, 10),
            'alpha': (.02, .10),
            'lambda': (.02, .08),
            'gamma': (0, 10),
            'min_child_weight': (1, 40)
        }

        folds = data_prepare.get_folds(
            df=self.x_train[['totals.pageviews']].reset_index(),
            n_splits=nfolds)

        if 'fullVisitorId' in self.x_train.columns:
            self.x_train.drop('fullVisitorId', axis=1, inplace=True)
        if 'fullVisitorId' in self.x_test.columns:
            self.x_test.drop('fullVisitorId', axis=1, inplace=True)
        if 'fullVisitorId' in self.y_train.columns:
            self.y_train.drop('fullVisitorId', axis=1, inplace=True)

        print('y_train columns:', self.y_train.columns)
        print('y_train shape after dropping id:', self.y_train.shape)
        # sanity check: report any cells that still hold raw Python objects
        print(np.where(self.x_test.applymap(type) == object))

        oof_preds = np.zeros(self.x_train.shape[0])
        preds_test = np.empty((nfolds, self.x_test.shape[0]))

        # tune hyperparameters with Bayesian optimization
        if tune:
            bo = BayesianOptimization(self.xgb_evaluate, self.Tunparams)
            # 'rnd' is not a valid bayes_opt acquisition function; use expected improvement
            bo.maximize(init_points=5, n_iter=50, acq='ei')

            best_params = bo.res['max']['max_params']
            # cast the integer-valued hyperparameters, which the optimizer
            # returns as floats ('num_leaves' is not tuned here, so casting it
            # would raise a KeyError)
            best_params['max_depth'] = int(best_params['max_depth'])
            best_params['min_child_weight'] = int(
                best_params['min_child_weight'])
            self.logfile.write('best score: {}\n'.format(
                bo.res['max']['max_val']))
            self.param.update(best_params)
            # push the tuned values into the already-built estimator
            self.clf.set_params(**best_params)
            self.logfile.write('best param: {}\n'.format(self.param))

        self.logfile.write('param: {}\n'.format(self.param))
        self.logfile.write('fold: {}\n'.format(nfolds))
        self.logfile.write('data shape: {}\n'.format(self.x_train.shape))
        self.logfile.write('features: {}\n'.format(
            self.x_train.columns.tolist()))

        if self.comment is not None:
            self.logfile.write('comment: {}\n'.format(self.comment))

        self.logfile.write('output: ../output/{}.csv\n'.format(self.name))
        self.logfile.flush()

        for n_fold, (train_idx, valid_idx) in enumerate(folds):
            fstart = time.time()
            train_x, train_y = self.x_train.iloc[train_idx], self.y_train.iloc[
                train_idx]
            valid_x, valid_y = self.x_train.iloc[valid_idx], self.y_train.iloc[
                valid_idx]

            # XGBRegressor parameters found by Bayesian optimization
            self.clf.fit(train_x,
                         np.log1p(train_y),
                         eval_set=[(valid_x, np.log1p(valid_y))],
                         eval_metric='rmse',
                         verbose=100,
                         early_stopping_rounds=200)

            oof_preds[valid_idx] = self.clf.predict(
                valid_x, ntree_limit=self.clf.best_ntree_limit)
            preds_test[n_fold, :] = self.clf.predict(
                self.x_test[self.x_train.columns],
                ntree_limit=self.clf.best_ntree_limit)

            # clip negative predictions (targets are log1p-transformed)
            oof_preds[oof_preds < 0] = 0
            preds_test[preds_test < 0] = 0

            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = self.x_train.columns.tolist()
            fold_importance_df["importance"] = self.clf.feature_importances_
            fold_importance_df["fold"] = n_fold + 1
            self.feature_importance_df = pd.concat(
                [self.feature_importance_df, fold_importance_df], axis=0)

            strlog = '[{}][{:.1f} sec] Fold {} RMSE : {:.6f}'.format(
                str(datetime.now()),
                time.time() - fstart, n_fold + 1,
                mean_squared_error(np.log1p(valid_y),
                                   oof_preds[valid_idx])**.5)
            print(strlog)
            self.logfile.write(strlog + '\n')
            self.logfile.flush()

            del train_x, train_y, valid_x, valid_y
            gc.collect()

        full_rmse = mean_squared_error(np.log1p(self.y_train), oof_preds)**.5
        strlog = 'Full RMSE score {:.6f}'.format(full_rmse)
        print(strlog)
        self.logfile.write(strlog + '\n')

        preds = preds_test.mean(axis=0)
        if submission:
            #sub = pd.read_csv('./input/sample_submission.csv')
            #sub['PredictedLogRevenue'] = preds
            self.x_test['PredictedLogRevenue'] = preds
            self.x_test[['PredictedLogRevenue']].to_csv(
                '../output/submission/{}.csv'.format(self.name), index=True)
            #preds.to_csv('../output/{}.csv'.format(self.name), index=True)

            cols = self.feature_importance_df[[
                "feature", "importance"
            ]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:20].index
            self.logfile.write('top features:\n')
            for c in cols:
                self.logfile.write('{}\n'.format(c))

            self.logfile.flush()

            self.display_importances(self.feature_importance_df, self.name)

        # save out-of-fold and test predictions as stacking features
        np.save('../output/log/{}_oof_train'.format(self.name), oof_preds)
        np.save('../output/log/{}_oof_test'.format(self.name), preds)
        return self.feature_importance_df, full_rmse, oof_preds, preds
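
The arrays saved above are intended as level-two features. A minimal sketch of a stacking step that could consume them; the file names are placeholders and the linear meta-learner is an assumption:

import numpy as np
from sklearn.linear_model import LinearRegression

def stack_predict(oof_files, test_files, y_log):
    # column-stack each base model's out-of-fold / test predictions,
    # then fit a linear meta-model on the out-of-fold matrix
    stack_train = np.column_stack([np.load(f) for f in oof_files])
    stack_test = np.column_stack([np.load(f) for f in test_files])
    meta = LinearRegression().fit(stack_train, y_log)
    return meta.predict(stack_test)

# cv() saves '<name>_oof_train.npy' and '<name>_oof_test.npy'; y_log would be
# np.log1p(y_train) from the wrapper class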
Example #5
    def __init__(self,
                 name,
                 clf,
                 comment=None,
                 remove_columns=None,
                 param=None,
                 lgb_seed=None,
                 n_estimators=1000,
                 log=None,
                 predict_feats=False,
                 debug=True,
                 BASE_X_PATH=None,
                 TRN_PRED_FEATS=None,
                 TES_PRED_FEATS=None,
                 nfolds=5):

        self.name = name
        self.comment = comment

        if log is None:
            self.logfile = open('../output/log/{}.txt'.format(name), 'w')
        else:
            self.logfile = open('../output/log/{}.txt'.format(log), 'w')

        if param is None:
            self.param = {
                'objective': 'regression',
                'metric': 'rmse',
                'num_leaves': 32,
                'learning_rate': 0.03,
                'colsample_bytree': 0.9,
                'n_estimators': 1000,
                'boosting_type': 'goss'
            }
        else:
            self.param = param

        if lgb_seed is not None:
            self.param['seed'] = lgb_seed
        self.param['n_estimators'] = n_estimators
        self.feature_importance_df = None
        self.Tunparams = None
        if BASE_X_PATH is not None:
            self.x = feather.read_dataframe(BASE_X_PATH)
        if TRN_PRED_FEATS is not None:
            self.trn_preds_feats = np.load(TRN_PRED_FEATS)
        if TES_PRED_FEATS is not None:
            self.tes_preds_feats = np.load(TES_PRED_FEATS)

        if remove_columns is not None:
            drop_features = [
                _f for _f in self.x.columns if _f in remove_columns
            ]
            self.x.drop(drop_features, axis=1, inplace=True)
            del drop_features
            gc.collect()

        self.clf = clf(**self.param)

        #read & prepare datasets
        print('read & prepare datasets shape: {}'.format(self.x.shape))

        #split train & test sets
        self.x_train, self.y_train, self.x_test, self.y_train_ag = prep_and_split(
            self.x)

        # debug: train on a 30% sample for faster iteration
        if debug:
            x_train_s = self.x_train.sample(frac=0.3)
            x_test_s = self.x_test.sample(frac=0.3)
            y_train_s = self.y_train_ag.loc[self.y_train_ag.index.isin(
                x_train_s.index)]
        else:
            x_train_s = self.x_train.sample(frac=1)
            x_test_s = self.x_test.sample(frac=1)
            y_train_s = self.y_train_ag.loc[self.y_train_ag.index.isin(
                x_train_s.index)]

        if predict_feats:
            self.x_train, trn_feats = add_pred_feats(x_train_s,
                                                     self.trn_preds_feats,
                                                     None)
            self.x_test, _ = add_pred_feats(x_test_s, self.tes_preds_feats,
                                            trn_feats)
            self.y_train = y_train_s.groupby('fullVisitorId').sum()
            del x_train_s, x_test_s, y_train_s
            gc.collect()
        else:
            self.x_train.reset_index(drop=True, inplace=True)
            self.x_test.reset_index(drop=True, inplace=True)

        self.folds = data_prepare.get_folds(
            df=self.x_train[['totals.pageviews']].reset_index(),
            n_splits=nfolds)
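
A usage sketch for this constructor, assuming it belongs to the LightGBM wrapper whose cv() appears in Example #2 (the class name LGBTrainer and the feather path are placeholders):

from lightgbm import LGBMRegressor

# LGBTrainer and BASE_X_PATH are hypothetical; the keywords match the signature above
model = LGBTrainer(name='lgb_v1',
                   clf=LGBMRegressor,
                   lgb_seed=42,
                   n_estimators=1000,
                   BASE_X_PATH='../input/base_x.ftr',
                   debug=False,
                   nfolds=5)
feat_imp_df, rmse, oof_preds, preds = model.cv(nfolds=5, submission=False)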