Пример #1
0
 def performance_per_era(self, data, model_name):
     """Print the per-era metrics of a single model.

     The model name is printed first, followed by the table that
     ``metrics_per_era`` returns for that model, rounded for display.

     Parameters
     ----------
     data
         Passed unchanged to ``metrics_per_era``.
     model_name
         Key into ``self.df`` selecting the model to report on.
     """
     print(model_name)
     single_model = self.df[model_name].to_frame(model_name)
     per_era = metrics_per_era(data, Report(single_model))[model_name]
     precision = {'logloss': 6, 'auc': 4, 'acc': 4, 'ystd': 4}
     rounded = per_era.round(decimals=precision)
     # to_string() is called inside the context so the justify option applies
     with pd.option_context('display.colheader_justify', 'left'):
         print(rounded.to_string())
Пример #2
0
 def dominance(self, data, tournament=None, sort_by='corr'):
     """Mean (across eras) of fraction of models bested per era.

     For each metric, every pair is compared in every era against all
     other pairs. A pair bests another when its 'corr' is strictly higher
     or its 'mse' is strictly lower; ties count for neither side.

     Parameters
     ----------
     data
         Passed unchanged to ``metrics_per_era``.
     tournament : optional
         Passed unchanged to ``metrics_per_era``.
     sort_by : str
         Column used to sort the result in descending order.

     Returns
     -------
     DataFrame built with one row per pair and one dominance column per
     metric, passed through ``add_split_pairs`` and then sorted.

     Raises
     ------
     ValueError
         If only one pair is present, since there is nothing to compare.
     """
     columns = ['corr', 'mse']
     mpe, regions = metrics_per_era(data, self, tournament, columns=columns)
     dfs = []
     for col in columns:
         # rows are eras, columns are pairs
         pivot = mpe.pivot(index='era', columns='pair', values=col)
         pairs = pivot.columns.tolist()
         a = pivot.values
         # number of opponents each pair is compared against
         n = a.shape[1] - 1.0
         if n == 0:
             raise ValueError("Must have at least two pairs")
         m = []
         for j in range(a.shape[1]):
             # column vector so the comparison broadcasts against all pairs
             own = a[:, j].reshape(-1, 1)
             if col == 'corr':
                 # higher corr is better: count the pairs this one exceeds
                 z = (own > a).sum(axis=1) / n
             else:
                 # lower mse is better: count the pairs this one undercuts
                 z = (own < a).sum(axis=1) / n
             m.append(z.mean())
         df = pd.DataFrame(data=m, index=pairs, columns=[col])
         dfs.append(df)
     df = pd.concat(dfs, axis=1)
     df = add_split_pairs(df)
     df = df.sort_values([sort_by], ascending=[False])
     return df
Пример #3
0
 def metrics_per_era(self, data, metrics=None, era_as_str=True):
     """DataFrame containing given metrics versus era (as index).

     Parameters
     ----------
     data
         Passed unchanged to the module-level ``metrics_per_era``.
     metrics : list, optional
         Metric columns to request. Defaults to
         ``['logloss', 'auc', 'acc', 'ystd']`` when omitted or None.
     era_as_str : bool
         Forwarded to the module-level ``metrics_per_era``.

     Returns
     -------
     The frame returned by the module-level ``metrics_per_era`` with its
     'era' column moved into the index.
     """
     if metrics is None:
         # build a fresh list per call instead of sharing a mutable default
         metrics = ['logloss', 'auc', 'acc', 'ystd']
     per_era, regions = metrics_per_era(data, self, columns=metrics,
                                        era_as_str=era_as_str)
     per_era.index = per_era['era']
     per_era = per_era.drop(['era'], axis=1)
     return per_era
Пример #4
0
def test_metrics_per_era():
    "smoke test: metrics_per_era runs for several tournaments and joins"
    data = testing.micro_data()
    prediction = testing.micro_prediction()
    metrics_per_era(data, prediction, 1)
    for tournament, join in ((2, 'yhat'), (3, 'inner')):
        metrics_per_era(data, prediction, tournament, join=join)
    # the fourth positional argument 'outer' is expected to be rejected
    assert_raises(ValueError, metrics_per_era, data, prediction, 4, 'outer')
    with testing.HiddenPrints():
        metrics_per_era(data, prediction, tournament=5, era_as_str=True)
Пример #5
0
def test_metrics_per_era():
    "smoke test: metrics_per_era runs with its default and explicit joins"
    data = micro_data()
    prediction = micro_prediction()
    metrics_per_era(data, prediction)
    for join in ('yhat', 'inner'):
        metrics_per_era(data, prediction, join)
    # the third positional argument 'outer' is expected to be rejected
    assert_raises(ValueError, metrics_per_era, data, prediction, 'outer')
Пример #6
0
    def summary(self, data, tournament=None, round_output=True):
        """Summarize per-era performance of a prediction holding one pair.

        The result has rows 'mean', 'std', 'min' and 'max' taken over the
        per-era metrics, plus two trailing columns ('stats' and '') that
        carry the tournament, the regions, the row count of the per-era
        metrics (labelled 'eras') and the consistency.

        Parameters
        ----------
        data
            Passed unchanged to ``metrics_per_era``.
        tournament : optional
            Passed to ``metrics_per_era``; when None the label is taken from
            ``self.tournaments(as_str=True)[0]``, otherwise from
            ``nx.tournament_str(tournament)``.
        round_output : bool
            When true, round 'corr' to 6 and 'mse'/'ystd' to 4 decimals.

        Raises
        ------
        ValueError
            If ``self.shape[1]`` is not 1.
        """
        if self.shape[1] != 1:
            raise ValueError("prediction must contain a single pair")

        per_era, regions = metrics_per_era(data, self, tournament,
                                           region_as_str=True,
                                           split_pairs=False)
        per_era = per_era.drop(['era', 'pair'], axis=1)

        region_str = ', '.join(regions)
        nera = per_era.shape[0]
        # consistency: fraction of rows whose corr exceeds the benchmark
        consis = (per_era['corr'] > CORR_BENCHMARK).mean()

        if tournament is not None:
            t_str = nx.tournament_str(tournament)
        else:
            t_str = self.tournaments(as_str=True)[0]

        # each aggregate row is paired with one (label, value) side note
        aggregates = [per_era.mean, per_era.std, per_era.min, per_era.max]
        side_notes = [('tourn', t_str), ('region', region_str),
                      ('eras', nera), ('consis', consis)]
        rows = [aggregate(axis=0).tolist() + [label, value]
                for aggregate, (label, value) in zip(aggregates, side_notes)]

        result = pd.DataFrame(data=rows,
                              index=['mean', 'std', 'min', 'max'],
                              columns=per_era.columns.tolist() + ['stats', ''])

        if round_output:
            result = result.round(decimals={'corr': 6, 'mse': 4, 'ystd': 4})

        return result
Пример #7
0
 def performance(self, data):
     """Print mean/std/min/max of the per-era 'yhat' metrics.

     Each printed row carries one extra item after a '|' separator: the
     regions, the row count of the per-era metrics (labelled 'eras'), the
     fraction of rows with logloss below ln(2) ('consis'), and the 75th
     percentile of logloss.
     """
     per_era = metrics_per_era(data, self)['yhat']
     region_text = ', '.join(data.unique_region().tolist())

     row_fmt = "{:<4}  {:8.6f}  {:6.4f}  {:6.4f}  {:6.4f}{extra}"
     text_extra = "  |  {:<7}  {:<}"
     float_extra = "  |  {:<7}  {:<.4f}"

     print("      logloss   auc     acc     ystd")

     print(row_fmt.format('mean', *per_era.mean(axis=0),
                          extra=text_extra.format('region', region_text)))

     print(row_fmt.format('std', *per_era.std(axis=0),
                          extra=text_extra.format('eras', per_era.shape[0])))

     consistency = (per_era['logloss'] < np.log(2)).mean()
     print(row_fmt.format('min', *per_era.min(axis=0),
                          extra=float_extra.format('consis', consistency)))

     # silence RuntimeWarning raised while computing the percentile
     with warnings.catch_warnings():
         warnings.filterwarnings('ignore', '', RuntimeWarning)
         upper_quartile = np.percentile(per_era['logloss'], 75)
     print(row_fmt.format('max', *per_era.max(axis=0),
                          extra=float_extra.format('75th', upper_quartile)))
0
    def performance_df(self, data):
        """Build a DataFrame with one row of mean per-era metrics per model.

        Columns are 'logloss', 'auc', 'acc', 'ystd', 'consis' and a final
        column holding the model key; that last column's label lists the
        regions and the row count of the first model's per-era metrics.
        'consis' is the fraction of rows with logloss below ln(2).
        """
        per_model = metrics_per_era(data, self)
        region_names = data.unique_region().tolist()
        model_names = list(per_model.keys())
        era_count = per_model[model_names[0]].shape[0]
        header = '%s; %d eras' % (', '.join(region_names), era_count)

        table = pd.DataFrame(columns=['logloss', 'auc', 'acc', 'ystd',
                                      'consis', '(%s)' % header])
        # rows are added one at a time, labelled by model position
        for position, name in enumerate(model_names):
            era_metrics = per_model[name]
            means = era_metrics.mean(axis=0).tolist()
            consis = (era_metrics['logloss'] < np.log(2)).mean()
            table.loc[position] = means + [consis, name]

        return table
Пример #9
0
 def metrics_per_era(self,
                     data,
                     tournament=None,
                     metrics=None,
                     era_as_str=True,
                     split_pairs=True):
     """DataFrame containing given metrics versus era (as index).

     Parameters
     ----------
     data
         Passed unchanged to the module-level ``metrics_per_era``.
     tournament : optional
         Passed unchanged to the module-level ``metrics_per_era``.
     metrics : list, optional
         Metric columns to request. Defaults to ``['corr', 'mse', 'ystd']``
         when omitted or None.
     era_as_str : bool
         Forwarded to the module-level ``metrics_per_era``.
     split_pairs : bool
         When true the 'pair' column is kept and moved to the first
         position; when false it is dropped.

     Returns
     -------
     The frame returned by the module-level ``metrics_per_era`` with its
     'era' column moved into the index.
     """
     if metrics is None:
         # build a fresh list per call instead of sharing a mutable default
         metrics = ['corr', 'mse', 'ystd']
     per_era, regions = metrics_per_era(data,
                                        self,
                                        tournament,
                                        columns=metrics,
                                        era_as_str=era_as_str)
     per_era.index = per_era['era']
     per_era = per_era.drop(['era'], axis=1)
     # 'pair' is removed in both cases; it is re-inserted up front on request
     pair = per_era['pair']
     per_era = per_era.drop('pair', axis=1)
     if split_pairs:
         per_era.insert(0, 'pair', pair)
     return per_era
Пример #10
0
    def summary(self, data, round_output=True):
        """Summarize per-era performance of a prediction holding one name.

        The result has rows 'mean', 'std', 'min' and 'max' taken over the
        per-era metrics, plus two trailing columns ('stats' and '') that
        carry the regions, the row count of the per-era metrics (labelled
        'eras'), a sharpe figure and the consistency.

        Parameters
        ----------
        data
            Passed unchanged to ``metrics_per_era``.
        round_output : bool
            When true, round 'logloss' to 6 and 'auc'/'acc'/'ystd' to 4
            decimals.

        Raises
        ------
        ValueError
            If ``self.shape[1]`` is not 1.
        """
        if self.shape[1] != 1:
            raise ValueError("prediction must contain a single name")

        per_era, regions = metrics_per_era(data, self, region_as_str=True)
        per_era = per_era.drop(['era', 'name'], axis=1)

        region_str = ', '.join(regions)
        nera = per_era.shape[0]
        logloss = per_era['logloss']
        # consistency: fraction of rows whose logloss beats the benchmark
        consis = (logloss < LOGLOSS_BENCHMARK).mean()
        # mean margin under the benchmark divided by the logloss spread
        sharpe = (LOGLOSS_BENCHMARK - logloss).mean() / logloss.std()

        # each aggregate row is paired with one (label, value) side note
        aggregates = [per_era.mean, per_era.std, per_era.min, per_era.max]
        side_notes = [('region', region_str), ('eras', nera),
                      ('sharpe', sharpe), ('consis', consis)]
        rows = [aggregate(axis=0).tolist() + [label, value]
                for aggregate, (label, value) in zip(aggregates, side_notes)]

        result = pd.DataFrame(data=rows,
                              index=['mean', 'std', 'min', 'max'],
                              columns=per_era.columns.tolist() + ['stats', ''])

        if round_output:
            precision = {'logloss': 6, 'auc': 4, 'acc': 4, 'ystd': 4}
            result = result.round(decimals=precision)

        return result