Example #1
0
def main():
    """Fit a two-level ensemble of out-of-fold regressors on Boston and print metrics."""
    features, target = load_boston(return_X_y=True)
    train_df = pd.DataFrame(features)

    entry = BostonProcessFeature(name='boston_base', root_dir='./boston_ens')

    # Four base models, all children of the same entry-point feature.
    base_models = [
        RidgeOutOfFold(name='ridge', parent=entry),
        XGBoostRegressorOutOfFold(name='xgb', parent=entry),
        LGBMRegressorOutOfFold(name='lgbm', parent=entry),
        RFRegressorFeatureOutOfFold(name='rf', parent=entry),
    ]

    # First-level ensemble over the base models, then a second level that
    # also consumes the first ensemble's output as an input.
    first_ensemble = EnsembleFeature(parent=base_models, name='ensumble', agg='mean')
    second_ensemble = EnsembleFeature(parent=[first_ensemble, *base_models], name='ens2', agg='mean')

    oof_df = pd.DataFrame()
    fitted_df = second_ensemble.fit(train_df, target)
    oof_df = pd.concat([oof_df, fitted_df], axis=1)

    # One column per model: score each against the true target.
    for _, column in oof_df.T.iterrows():
        print(column.name, regression_metrics(target, column.values))

    first_ensemble.predict(train_df)
Example #2
0
    def fit(self, train_df, y, test_df) -> "tuple[pd.DataFrame, dict]":
        """Fit every registered model, evaluate it out-of-fold, and predict the test set.

        Args:
            train_df: training features.
            y: training target values.
            test_df: test-set features to predict on.

        Returns:
            metric dataframe and predicts dict to test set.

            metric_df:
                index: model_name
                columns: metric_name (in regression metrics)

            predicts:
                key: model_name
                value: np.ndarray. shape = (n_test,)
        """
        metric_parts = []  # one single-column metric frame per model
        oof_all_df = pd.DataFrame()

        predict = {}

        for model in self.models:
            oof_df = model.fit(train_df, y)
            pred_df = model.predict(test_df)

            # Persist a submission file when the model is set up to record runs.
            if model.is_recording:
                sub_df = read_sample_submit()
                sub_df['age'] = pred_df.values[:, 0]
                sub_df.to_csv(os.path.join(model.output_dir, 'predict.csv'), index=False)

            predict[model.name] = pred_df.values[:, 0]

            oof_all_df = pd.concat([oof_all_df, oof_df], axis=1)

            # Score out-of-fold predictions; rename the 'score' column so each
            # model contributes a column labeled with its own name.
            metric_parts.append(
                regression_metrics(y, oof_df.values[:, 0]).rename(columns={'score': model.name})
            )

        # Fix: the former `metric_df = None` + incremental-concat pattern raised
        # AttributeError on an empty model list; a single concat of the collected
        # parts produces the same frame and fails with a clearer ValueError instead.
        metric_df = pd.concat(metric_parts, axis=1).T.sort_values('rmse')
        metric_df.to_csv(os.path.join(self.root_dir, 'metrics.csv'))
        oof_all_df.to_csv(os.path.join(self.root_dir, 'out_of_fold.csv'), index=False)

        self.out_of_fold_df_ = oof_all_df
        self.metric_df_ = metric_df

        # Best-effort visualization of out-of-fold correlations; plotting
        # failures (e.g. headless environment) must not break training.
        try:
            g = sns.clustermap(self.out_of_fold_df_.corr(), cmap='viridis')
            g.fig.tight_layout()
            g.fig.savefig(os.path.join(self.root_dir, 'out-of-fold-cluster.png'), dpi=120)
        except Exception as e:
            print(e)

        return metric_df, predict
Example #3
0
def main():
    """Train a single XGBoost out-of-fold feature on Boston and print its scores."""
    features, target = load_boston(return_X_y=True)
    train_df = pd.DataFrame(features)

    # every output lands under `./boston_simple`
    entry = BostonProcessFeature(name='boston_base', root_dir='./boston_simple')

    # plain XGBoost model; force=True presumably bypasses cached results — confirm upstream
    xgb_feature = XGBoostRegressorOutOfFold(name='xgb_simple', parent=entry)
    oof_df = xgb_feature.fit(train_df, target, force=True)

    # one column per model output: score each against the true target
    for _, column in oof_df.T.iterrows():
        print(column.name, regression_metrics(target, column.values))
Example #4
0
    def call(self, feature_instance: BaseOutOfFoldFeature, source_df: pd.DataFrame, y: np.ndarray, oof: np.ndarray):
        """Compute train metrics for *feature_instance*, log them, and return files to persist."""
        # Pick the metric family from the model type.
        scorer = regression_metrics if feature_instance.is_regression_model else binary_metrics
        metric_df = scorer(y, oof)

        feature_instance.exp_backend.mark('train_metrics', metric_df['score'])

        # Log the table one line at a time so each row gets its own log record.
        for line in tabulate(metric_df.T, headers='keys').split('\n'):
            feature_instance.logger.info(line)

        return [('metrics.csv', metric_df)]
Example #5
0
    def call(self, env: EvaluationEnv):
        """Score the whole-oof predictions of *env*'s block and optionally log them."""
        # Blocks without this attribute are not models; nothing to score.
        if not hasattr(env.block, 'is_regression_model'):
            return

        target = env.y
        predictions = env.output_df.values
        experiment = env.experiment

        # Choose the metric set by task type: regression, binary, or multiclass.
        if env.block.is_regression_model:
            score = regression_metrics(target, predictions)
        elif env.block._output_dim == 1:
            score = binary_metrics(target, predictions)
        else:
            score = multiclass_metrics(target, predictions)
        experiment.mark('train_metrics', score)

        if not self.show_to_log:
            return

        experiment.logger.info('=' * 20 + ' whole oof score ' + '=' * 20)
        for line in to_pretty_lines(score):
            experiment.logger.info(line)
Example #6
0
from vivid.model_selection import ContinuousStratifiedFold

if __name__ == '__main__':
    # Hold out 30% of Boston as a test split.
    X, y = load_boston(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.7)
    train_df = pd.DataFrame(X_train)

    # Stratify the continuous target by binning it into q quantiles.
    cv = ContinuousStratifiedFold(n_splits=5, q=20, shuffle=True, random_state=71)

    # Compare stratified CV against a plain 5-fold split.
    xgb = XGBRegressorBlock('xgb_stratified', cv=cv)
    xgb_simple = XGBRegressorBlock('xgb_simple', cv=5)

    runner = create_runner([xgb, xgb_simple])
    runner.fit(train_df, y_train)

    test_df = pd.DataFrame(X_test)

    # Score every block's test-set predictions.
    eval_scores = []
    for result in runner.predict(test_df):
        metrics = regression_metrics(y_test, result.out_df.values[:, 0])
        eval_scores.append(pd.Series(metrics, name=result.block.name))

    eval_df = pd.DataFrame(eval_scores)

    from tabulate import tabulate

    print(tabulate(eval_df, headers='keys'))
Example #7
0
def test_regression_metrics():
    """Smoke test: regression_metrics accepts matched arrays and yields a result."""
    rng = np.random.RandomState(0)  # seeded so the test is deterministic
    y_true = rng.uniform(size=(100,))
    y_pred = y_true + 1.
    result = regression_metrics(y_true, y_pred)
    # Fix: the original called the function but asserted nothing, so a broken
    # return value would still pass. At minimum the call must yield something.
    assert result is not None