def main():
    X, y = load_boston(return_X_y=True)
    df_x = pd.DataFrame(X)

    entry = BostonProcessFeature(name='boston_base', root_dir='./boston_ens')

    # first-level single models, all fed from the same entry-point feature
    singles = [
        RidgeOutOfFold(name='ridge', parent=entry),
        XGBoostRegressorOutOfFold(name='xgb', parent=entry),
        LGBMRegressorOutOfFold(name='lgbm', parent=entry),
        RFRegressorFeatureOutOfFold(name='rf', parent=entry),
    ]

    # average the single models' out-of-fold predictions
    ens = EnsembleFeature(parent=singles, name='ensemble', agg='mean')
    # second-level ensemble over the first ensemble plus the single models
    ens2 = EnsembleFeature(parent=[ens, *singles], name='ens2', agg='mean')

    df = pd.DataFrame()
    f_df = ens2.fit(df_x, y)
    df = pd.concat([df, f_df], axis=1)

    # score each model's out-of-fold prediction column against the target
    for i, cols in df.T.iterrows():
        score = regression_metrics(y, cols.values)
        print(cols.name, score)

    ens.predict(df_x)
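Here `agg='mean'` averages the parents' predictions row-wise. A minimal sketch of that aggregation (an assumed illustration, not EnsembleFeature's actual code):

import numpy as np

def mean_ensemble(parent_oofs):
    # row-wise mean over each parent model's out-of-fold prediction array
    # (assumed behavior of agg='mean'; the library may differ in detail)
    return np.mean(np.stack(parent_oofs, axis=1), axis=1)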
def fit(self, train_df, y, test_df) -> Tuple[pd.DataFrame, dict]:
    """Run training.

    Args:
        train_df: training features.
        y: target values.
        test_df: test features.

    Returns:
        metric dataframe and predicts dict for the test set.
        metric_df:
            index: model_name
            columns: metric_name (in regression metrics)
        predicts:
            key: model_name
            value: np.ndarray, shape = (n_test,)
    """
    metric_df = None
    oof_all_df = pd.DataFrame()
    predict = {}

    for model in self.models:
        oof_df = model.fit(train_df, y)
        pred_df = model.predict(test_df)

        if model.is_recording:
            # write the test prediction in submission format
            sub_df = read_sample_submit()
            sub_df['age'] = pred_df.values[:, 0]
            sub_df.to_csv(os.path.join(model.output_dir, 'predict.csv'), index=False)

        predict[model.name] = pred_df.values[:, 0]
        oof_all_df = pd.concat([oof_all_df, oof_df], axis=1)

        # one metric column per model, merged into a single frame
        metric_i = regression_metrics(y, oof_df.values[:, 0]).rename(columns={'score': model.name})
        if metric_df is None:
            metric_df = metric_i
        else:
            metric_df = pd.concat([metric_df, metric_i], axis=1)

    # models as rows, sorted so the best (lowest rmse) comes first
    metric_df = metric_df.T.sort_values('rmse')
    metric_df.to_csv(os.path.join(self.root_dir, 'metrics.csv'))
    oof_all_df.to_csv(os.path.join(self.root_dir, 'out_of_fold.csv'), index=False)
    self.out_of_fold_df_ = oof_all_df
    self.metric_df_ = metric_df

    try:
        # visualize how correlated the models' out-of-fold predictions are
        g = sns.clustermap(self.out_of_fold_df_.corr(), cmap='viridis')
        g.fig.tight_layout()
        g.fig.savefig(os.path.join(self.root_dir, 'out-of-fold-cluster.png'), dpi=120)
    except Exception as e:
        print(e)

    return metric_df, predict
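The two return values are meant to be consumed together. A small self-contained sketch using fake values in the documented layout (the names and example data here are illustrative, not part of the library):

import numpy as np
import pandas as pd

# fake return values in the layout documented above
metric_df = pd.DataFrame({'rmse': [3.1, 3.4]}, index=['xgb', 'ridge'])  # index: model_name
predicts = {'xgb': np.zeros(10), 'ridge': np.zeros(10)}                 # value: shape (n_test,)

best_name = metric_df.index[0]   # rows are sorted by rmse, so the first row scores best
test_pred = predicts[best_name]  # the best model's prediction on the test set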
def main():
    X, y = load_boston(return_X_y=True)
    df_x = pd.DataFrame(X)

    # all artifacts are written under `./boston_simple`
    entry = BostonProcessFeature(name='boston_base', root_dir='./boston_simple')

    # a normal XGBoost model trained out-of-fold on the entry-point feature
    basic_xgb_feature = XGBoostRegressorOutOfFold(name='xgb_simple', parent=entry)
    df = basic_xgb_feature.fit(df_x, y, force=True)  # fit

    # calculate regression metrics for each prediction column
    for i, cols in df.T.iterrows():
        score = regression_metrics(y, cols.values)
        print(cols.name, score)
def call(self,
         feature_instance: BaseOutOfFoldFeature,
         source_df: pd.DataFrame,
         y: np.ndarray,
         oof: np.ndarray):
    # pick the metric set that matches the model's task type
    if feature_instance.is_regression_model:
        metric_df = regression_metrics(y, oof)
    else:
        metric_df = binary_metrics(y, oof)

    feature_instance.exp_backend.mark('train_metrics', metric_df['score'])

    # log the metric table one line at a time so it stays readable in the log
    s_metric = tabulate(metric_df.T, headers='keys')
    for s in s_metric.split('\n'):
        feature_instance.logger.info(s)

    # (filename, object) pairs for the caller to save
    return [
        ('metrics.csv', metric_df)
    ]
def call(self, env: EvaluationEnv):
    # skip blocks that are not models (e.g. plain feature blocks)
    if not hasattr(env.block, 'is_regression_model'):
        return

    y = env.y
    oof = env.output_df.values
    experiment = env.experiment

    # choose the metric set that matches the task type
    if env.block.is_regression_model:
        score = regression_metrics(y, oof)
    elif env.block._output_dim == 1:
        # a single output dimension means a binary task
        score = binary_metrics(y, oof)
    else:
        score = multiclass_metrics(y, oof)

    experiment.mark('train_metrics', score)

    if not self.show_to_log:
        return

    lines = to_pretty_lines(score)
    experiment.logger.info('=' * 20 + ' whole oof score ' + '=' * 20)
    for line in lines:
        experiment.logger.info(line)
from tabulate import tabulate

from vivid.model_selection import ContinuousStratifiedFold

if __name__ == '__main__':
    X, y = load_boston(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.7)
    train_df = pd.DataFrame(X_train)

    # stratify folds on the continuous target by binning it into q=20 groups
    cv = ContinuousStratifiedFold(n_splits=5, q=20, shuffle=True, random_state=71)
    xgb = XGBRegressorBlock('xgb_stratified', cv=cv)

    # baseline: the same model with a plain 5-fold split
    xgb_simple = XGBRegressorBlock('xgb_simple', cv=5)

    runner = create_runner([xgb, xgb_simple])
    runner.fit(train_df, y_train)

    test_df = pd.DataFrame(X_test)
    results = runner.predict(test_df)

    # evaluate each block's test prediction against the held-out target
    eval_scores = []
    for result in results:
        score = regression_metrics(y_test, result.out_df.values[:, 0])
        eval_scores.append(pd.Series(score, name=result.block.name))

    eval_df = pd.DataFrame(eval_scores)
    print(tabulate(eval_df, headers='keys'))
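The idea behind stratifying on a continuous target can be sketched with scikit-learn primitives: bin y into quantiles and stratify on the bin labels. This is an assumed illustration of the concept, not ContinuousStratifiedFold's actual implementation:

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold


def continuous_stratified_splits(X, y, n_splits=5, q=20, random_state=71):
    # assign each sample to one of q quantile bins of the target
    bins = pd.qcut(y, q=q, labels=False, duplicates='drop')
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # stratify on the bin labels so every fold sees the full range of y
    yield from skf.split(X, bins)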
def test_regression_metrics():
    # smoke test: computing the metrics should not raise
    y_true = np.random.uniform(size=(100,))
    y_pred = y_true + 1.
    regression_metrics(y_true, y_pred)
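This only checks that the call succeeds. A tighter variant could also assert the value, since a constant offset of 1.0 yields an RMSE of exactly 1.0. The sketch below assumes regression_metrics returns a frame indexed by metric name with a 'score' column, the layout implied by the fit method above:

import numpy as np


def test_regression_metrics_rmse_value():
    y_true = np.random.uniform(size=(100,))
    y_pred = y_true + 1.  # constant offset of 1.0 -> RMSE is exactly 1.0

    metric_df = regression_metrics(y_true, y_pred)
    # layout assumed from fit() above: index = metric name, column = 'score'
    assert np.isclose(metric_df.loc['rmse', 'score'], 1.0)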