Пример #1
0
    def evaluate(self, sales_pred, sales_true=None, mode='WSPL'):
        """Evaluate predictions without re-calculating weights and scales.

        The day columns of ``sales_pred`` are relabelled, in order, to the day
        columns of the referee's true sales before the metric is computed.

        Args:
            sales_pred: DataFrame of predictions. A copy is relabelled, so the
                caller's frame is left untouched.
            sales_true: optional frame that replaces ``self.ref.sales_true``
                before evaluating.
            mode: either 'WSPL' or 'WRMSSE'.

        Returns:
            The result of ``self.ref.evaluate_WSPL`` or
            ``self.ref.evaluate_WRMSSE`` on the relabelled predictions.

        Raises:
            ValueError: if ``mode`` is not 'WSPL' or 'WRMSSE'.
        """
        # Fail fast (before touching the referee) instead of silently
        # returning None for an unknown mode.
        if mode not in ('WSPL', 'WRMSSE'):
            raise ValueError(
                "mode must be 'WSPL' or 'WRMSSE', got {!r}".format(mode))

        # sales_true can be provided to update the true sales
        if sales_true is not None:
            self.ref.sales_true = sales_true

        # we will alter the df, so copy it
        sales_pred = sales_pred.copy()

        # rewrite columns to match the Referee's columns
        day_nums_pred = select_day_nums(sales_pred, as_int=False)
        day_nums_true = select_day_nums(self.ref.sales_true, as_int=False)

        # Rename all day columns in a single pass. Renaming one label at a
        # time (looking each source label up in the partially renamed list)
        # goes wrong when the two day ranges overlap: a column that already
        # received its target label would be matched and renamed again.
        rename_map = dict(zip(day_nums_pred, day_nums_true))
        sales_pred.columns = [
            rename_map.get(col, col) for col in sales_pred.columns
        ]

        if mode == 'WSPL':
            return self.ref.evaluate_WSPL(sales_pred)
        return self.ref.evaluate_WRMSSE(sales_pred)
Пример #2
0
    def calc_SPL(self,
                 quantiles_pred,
                 level=None,
                 groupby=None,
                 weights=None,
                 scale=None,
                 clip_zero=False):
        """Calculate the Scaled Pinball Loss for a given aggregation level.

        Args:
            quantiles_pred: DataFrame with 'level', 'id' and day columns; only
                the rows whose 'level' equals ``level`` are used.
            level: aggregation level. When truthy it supplies the defaults for
                ``groupby``, ``weights`` and ``scale``.
            groupby: resolved from ``self.aggregation_levels`` when missing,
                but not otherwise used by this method.
            weights: per-series weights; defaults to ``self.weights[level]``.
            scale: per-series scales; defaults to ``self.scales_WSPL[level]``.
            clip_zero: when True, negative predictions are clipped to 0.

        Returns:
            The weighted, scaled pinball loss summed over all series.

        Raises:
            AssertionError: if neither ``level`` nor ``weights``/``scale``
                is provided.

        NOTE(review): after sorting by 'id', each series is assumed to occupy
        ``len(self.quantiles)`` consecutive rows ordered like
        ``self.quantiles``, and ``weights``/``scale`` are assumed to follow
        the same series order -- confirm against the callers.
        """
        if level:
            if groupby is None:
                groupby = self.aggregation_levels[level]
            if weights is None:
                weights = self.weights[level]
            if scale is None:
                scale = self.scales_WSPL[level]
        assert weights is not None, "Provide level or weights"
        assert scale is not None, "Provide level or scale"

        # Select the correct predictions and true sales based on the input level
        predictions = quantiles_pred[quantiles_pred['level'] == level]
        true_sales = self.sales_true_quantiles[
            self.sales_true_quantiles['level'] == level]

        # Make sure that both the predictions and the true sales have the same
        # id order, otherwise rows would be compared against the wrong series
        predictions = predictions.sort_values('id')
        true_sales = true_sales.sort_values('id')

        d_cols = select_day_nums(predictions, as_int=False)

        # Clip on a derived frame rather than assigning back into the filtered
        # slice, which avoids pandas' chained-assignment warning.
        pred_frame = predictions[d_cols]
        if clip_zero:
            pred_frame = pred_frame.clip(lower=0)

        # Error, as a (rows, days) numpy array
        err = true_sales[d_cols].values - pred_frame.values

        # Group the rows per series: (series, quantile, day). Trailing rows
        # that do not fill a complete set of quantiles are ignored.
        n_quantiles = len(self.quantiles)
        n_series = err.shape[0] // n_quantiles
        n_days = err.shape[1]
        err = err[:n_series * n_quantiles].reshape(n_series, n_quantiles,
                                                   n_days)

        # Pinball loss max(q * e, (q - 1) * e), summed over the days and
        # averaged over the quantiles of each series.
        q = np.asarray(self.quantiles, dtype=float).reshape(1, n_quantiles, 1)
        pinball = np.maximum(q * err, (q - 1) * err)
        losses = pinball.sum(axis=2).mean(axis=1)

        loss = np.sum(
            np.array(losses * weights) /
            np.array(self.h * scale))  # Calculate SPL of aggregate level

        return loss
Пример #3
0
    def __init__(self,
                 df,
                 features,
                 labels,
                 window_in,
                 window_out,
                 dilation=1,
                 lag=0,
                 batch_size=32,
                 shuffle=True,
                 ensure_all_samples=False):
        """Store the data, the windowing settings and the usable start days.

        The first possible validation start day leaves room for
        ``1 + (window_in - 1) * dilation`` input days plus ``lag`` days; the
        last one leaves room for ``window_out`` output days.
        """
        # Data handles (df is kept by reference, not copied)
        self.df = df
        self.features = features
        self.labels = labels

        # Hyperparameters
        self.window_in = window_in
        self.window_out = window_out
        self.dilation = dilation
        self.lag = lag
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.ensure_all_samples = ensure_all_samples

        # Start days of the validation window; every other index is derived
        # from these.
        day_nums = select_day_nums(df, axis=0)
        input_span = 1 + (window_in - 1) * dilation
        first_start = min(day_nums) + input_span + lag
        last_start = max(day_nums) - window_out + 1
        self.list_start_val_days = np.arange(first_start, last_start + 1)

        # Indices are filled in by on_epoch_end
        self.indexes = None
        self.on_epoch_end()

        # Derived sizes; features may be a dict of named feature groups
        self.n = len(self.list_start_val_days)
        self.n_features = ({
            name: len(cols)
            for name, cols in features.items()
        } if isinstance(features, dict) else len(features))
        self.n_labels = len(labels)
Пример #4
0
def plot_some_confidence_intervals(df,
                                   val_batch_creator,
                                   level,
                                   quantiles,
                                   data_dir='data/',
                                   num=9,
                                   plot_shape=(3, 3)):
    """Plot predicted quantiles against the true sales for the first series.

    Args:
        df: quantile predictions with 'quantile', 'id' and day columns.
        val_batch_creator: object whose ``df`` has 'id' and 'demand' columns
            holding the (normalised) true sales.
        level: aggregation level; selects ``prep/norm_level_<level>.csv``
            under ``data_dir``.
        quantiles: non-empty sequence of quantile values to plot.
        data_dir: directory prefix of the norm file (joined by plain string
            concatenation, so it should end with a path separator).
        num: number of series to plot.
        plot_shape: (rows, columns) of the subplot grid; at most
            rows * columns series are drawn.

    NOTE(review): row ``idx`` of every per-quantile selection is assumed to
    belong to the same series -- confirm the ordering of ``df``.
    """
    norm = pd.read_csv(data_dir + 'prep/norm_level_{}.csv'.format(level))

    _, axes = plt.subplots(nrows=plot_shape[0],
                           ncols=plot_shape[1],
                           figsize=(18, 6 * plot_shape[0]))

    # Loop-invariant work: the day columns and the per-quantile row
    # selections do not depend on the series being plotted.
    d_cols = select_day_nums(df, as_int=False)
    rows_by_quantile = [(q, df.loc[df['quantile'] == q]) for q in quantiles]

    for idx, ax in zip(range(num), np.ravel(axes)):
        quantile_preds = {}

        for q, rows in rows_by_quantile:
            selected_series = rows.iloc[idx]
            quantile_preds[q] = selected_series[d_cols].values.astype(float)

        # Drop the trailing quantile and postfix fields of the id,
        # e.g. FOODS_1_010_X_0.995_evaluation -> FOODS_1_010_X
        series_id = "_".join(selected_series['id'].split('_')[0:-2])
        true_sales = val_batch_creator.df.loc[(
            val_batch_creator.df['id'] == series_id), 'demand']
        series_norm = norm.loc[norm['id'] == series_id].norm.values[0]
        # Scale the stored demand by the series norm before plotting
        quantile_preds['true'] = (true_sales * series_norm).values
        quantile_preds['label'] = series_id

        # plot
        plot_confidence_series(quantile_preds, quantiles=quantiles, ax=ax)

    plt.tight_layout()
    plt.show()
Пример #5
0
    def validate(self):
        """Score the agent on every fold.

        Returns:
            A tuple ``(mean score, list of per-fold scores)``, where each
            score comes from ``self.ref.calc_WRMSSE`` on store-aggregated
            sales using the level-3 weights and scales.
        """

        def score_fold(fold):
            # Split, then aggregate the held-out sales per store
            sales_train, sales_true = self.cv_generator.get_train_val_split(
                fold=fold, train_size=self.window_in)
            true_per_store = sales_true.groupby(['store_id']).sum()

            # The returned norm is not needed here
            train_df, _ = self.preprocess_func(sales_train,
                                               prices=self.prices,
                                               calendar=self.calendar,
                                               norm=self.train_norm)

            # Predict exactly the days present in the held-out sales
            days_to_predict = select_day_nums(true_per_store)
            sales_pred = self.agent.predict(train_df, days_to_predict)

            return self.ref.calc_WRMSSE(sales_true=true_per_store,
                                        sales_pred=sales_pred.T,
                                        groupby=None,
                                        weights=self.ref.weights[3],
                                        scale=self.ref.scales[3])

        fold_scores = [score_fold(fold) for fold in self.folds]
        return np.mean(fold_scores), fold_scores
Пример #6
0
def test_cv_generator(verbose=True):
    """Check that CrossValiDataGenerator folds cover distinct validation days.

    Ten folds are drawn; no validation day may show up in more than one fold,
    and together they must cover the final 28 * 10 days up to day 1913.
    """
    # Load data (only the sales frame is used)
    _, sales_train_validation, _ = load_data()

    # Set up generator
    cv_generator = CrossValiDataGenerator(sales_train_validation,
                                          train_size=28)
    # Requesting the split with train_size=-1 must also work
    _, _ = cv_generator.get_train_val_split(fold=10, train_size=-1)

    n_folds = 10
    seen_days = []
    for fold in range(1, n_folds + 1):
        _, val_df = cv_generator.get_train_val_split(fold=fold)
        fold_days = select_day_nums(val_df)

        # none of this fold's days may belong to an earlier fold
        repeated = [day for day in fold_days if day in seen_days]
        assert not repeated, "Validation day already seen"

        seen_days.extend(fold_days)

    if verbose:
        print("Validation days seen: ", seen_days)

    # the final 28*10 days should have been seen, not including the public leaderboard
    last_day = 1913
    first_day = last_day - 28 * n_folds + 1
    for day in range(first_day, last_day + 1):
        message = "Validation day {} should have been seen".format(day)
        assert day in seen_days, message
Пример #7
0
    def predict(self, sales_train, day_start=None, day_end=None):
        """Predict each day as the recent weekly mean times its weekday portion.

        ``day_start`` defaults to the day after the last training day and
        ``day_end`` to 28 days (inclusive) from ``day_start``.
        """
        last_train_day = select_final_day(sales_train)

        # Fill in the default prediction window
        if day_start is None:
            day_start = last_train_day + 1
        if day_end is None:
            day_end = day_start + 28 - 1  # inclusive range

        # Skeleton frame that receives one column per predicted day
        pred = get_empty_predictions()

        # Mean of the last k training days per row, scaled to a weekly total
        recent = select_dates(sales_train,
                              num_days=self.k,
                              day_end=last_train_day)
        recent_cols = select_day_nums(recent, as_int=False)
        weekly_mean = recent[recent_cols].mean(axis=1) * 7

        for day_num in range(day_start, day_end + 1):
            d_day = 'd_%d' % day_num
            # wday is 1-based, self.portions is 0-based
            is_day = self.calendar.d == d_day
            week_day = self.calendar.loc[is_day, 'wday'].values[0]
            pred[d_day] = weekly_mean * self.portions[week_day - 1]

        return pred
Пример #8
0
def _quantile_series_ids(level, vals, quantile, postfix):
    """Build the id strings of one aggregation level for a single quantile."""
    q_tag = "{:.3f}".format(quantile)
    if level == 1:
        return ["Total_X_" + q_tag + postfix for _ in vals]
    if level in (2, 3, 4, 5, 10):
        return [val + "_X_" + q_tag + postfix for val in vals]
    if level in (6, 7, 8, 9):
        return [val[0] + '_' + val[1] + "_" + q_tag + postfix for val in vals]
    if level == 11:
        # the two group keys are swapped in the id for this level
        return [val[1] + '_' + val[0] + "_" + q_tag + postfix for val in vals]
    if level == 12:
        return [
            val.replace('_validation', '') + "_" + q_tag + postfix
            for val in vals
        ]
    raise ValueError("Unsupported aggregation level: {!r}".format(level))


def convert_true_sales_to_quantiles(sales_true,
                                    aggregation_levels,
                                    postfix='_evaluation',
                                    verbose=1):
    """Replicate the aggregated true sales once per quantile.

    For every aggregation level the day columns of ``sales_true`` are summed
    per group (or taken as-is when the level has no group-by), and the result
    is repeated for each of the nine quantiles with matching 'quantile',
    'level' and 'id' columns.

    Args:
        sales_true: DataFrame of true sales with day columns and the columns
            named in ``aggregation_levels``. It is not modified.
        aggregation_levels: mapping ``level -> group-by column(s)``; a falsy
            value means no grouping. Levels 1 to 12 are supported.
        postfix: suffix appended to every generated id.
        verbose: 1 prints a message, 2 shows a tqdm progress bar, anything
            else is silent.

    Returns:
        DataFrame with the day columns as float64 plus 'quantile', 'level'
        and 'id'.

    Raises:
        ValueError: if a level outside 1..12 is requested.
    """
    # Quantiles
    quantiles = [0.005, 0.025, 0.165, 0.250, 0.500, 0.750, 0.835, 0.975, 0.995]
    d_cols = list(select_day_nums(sales_true, as_int=False))

    iterator = aggregation_levels.items()
    if verbose == 1:
        print("Converting true sales to quantile form")
    elif verbose == 2:
        iterator = tqdm(iterator, desc="True sales to quantiles")

    # Collect the pieces and concatenate once: DataFrame.append was removed
    # in pandas 2.0 and re-copied the accumulated frame on every call.
    frames = []
    for level, groupby in iterator:
        if groupby:
            # Sum the day columns only, consistent with the ungrouped branch;
            # newer pandas no longer silently drops non-numeric columns here.
            group = sales_true.groupby(groupby)[d_cols].sum()
        else:
            group = sales_true[d_cols]

        vals = group.index.values
        for quantile in quantiles:
            g_ = group.copy()
            g_['quantile'] = quantile
            g_['level'] = level
            g_['id'] = _quantile_series_ids(level, vals, quantile, postfix)
            frames.append(g_)

    sales_true_quantiles = pd.concat(frames) if frames else pd.DataFrame()

    # convert the day columns to float
    conv_dict = {d_col: 'float64' for d_col in d_cols}
    sales_true_quantiles = sales_true_quantiles.astype(conv_dict)

    return sales_true_quantiles