Exemplo n.º 1
0
def test_loading():
    """Check that ``load_data`` keeps the target columns out of the features.

    The columns ``'cnt'``, ``'registered'`` and ``'casual'`` must make up
    exactly the target frame, must not appear in the feature frame, and both
    frames must share the same index.
    """
    features, targets = load_data()
    target_names = ('cnt', 'registered', 'casual')

    # None of the targets may leak into the feature matrix.
    for name in target_names:
        assert name not in features.columns

    assert set(targets.columns) == set(target_names)
    assert (features.index == targets.index).all()
Exemplo n.º 2
0
def test_featuretransformer():
    """Check the output shape of ``FeatureTransformer`` for each option pair.

    For every combination of ``remove_year`` and ``categorical`` the
    transformed frame must have the expected number of columns while keeping
    the row count and the index of the input.
    """
    X, _ = load_data()

    # (remove_year, categorical, expected number of output columns)
    cases = (
        (True, False, 11),
        (False, False, 12),
        (True, True, 57),
        (False, True, 58),
    )
    for remove_year, categorical, n_columns in cases:
        transformer = FeatureTransformer(remove_year=remove_year,
                                         categorical=categorical)
        transformed = transformer.fit_transform(X)
        assert len(transformed.columns) == n_columns
        assert len(transformed) == len(X)
        assert (X.index == transformed.index).all()
Exemplo n.º 3
0
def get_periodical_testset():
    """Build a synthetic target series with a known trend and periodic part.

    The index and length are taken from the ``'cnt'`` column returned by
    ``load_data``; its values are replaced by ``trend * periodical``.

    Returns:
        tuple: ``(y_cnt, periodical, trend)`` where ``y_cnt`` is the synthetic
        series, ``periodical`` is the array ``2000 * (|sin(2*pi*t)| + 1)`` for
        ``t`` evenly spaced on [0, 1], and ``trend`` is an array rising
        linearly from 1 to 2.
    """
    _, y = load_data()
    # ``astype`` returns a new float Series, so the assignment below neither
    # writes through to the loaded DataFrame nor relies on pandas implicitly
    # upcasting the column (presumably integer counts) for fractional values.
    y_cnt = y['cnt'].astype(float)
    n_samples = len(y_cnt)

    trend = np.linspace(1, 2, n_samples)
    phase = np.linspace(0, 1, n_samples) * 2 * np.pi
    periodical = 2000 * (np.abs(np.sin(phase)) + 1)

    y_cnt[:] = trend * periodical
    return y_cnt, periodical, trend
Exemplo n.º 4
0
def test_trendremover_real():
    """Check that detrending the loaded ``'cnt'`` series halves its spread.

    The standard deviation of the transformed series must be below half the
    standard deviation of the original series.
    """
    _, targets = load_data()
    counts = targets['cnt']

    remover = TrendRemover(remove_trend=True)
    detrended = np.array(remover.fit_transform(counts))

    # For a visual check, plot ``counts`` and ``detrended`` against
    # ``np.linspace(1, 2, len(counts))``.
    std_after = np.std(detrended)
    std_before = np.std(counts)
    assert std_after < 0.5 * std_before
Exemplo n.º 5
0
def test_bikeshareregression():
    """Fit the default ``BikeshareRegression`` and bound its test-set error.

    The data is split 80/20 without shuffling. The feature transformer and
    the trend remover are fitted on the training part only, and the mean
    absolute error of the ``'cnt'`` prediction on the held-out part must stay
    below 45.
    """
    features, targets = load_data()
    split = train_test_split(features, targets, test_size=0.2, shuffle=False)
    train_features, test_features, train_targets, test_targets = split

    transformer = FeatureTransformer()
    train_features = transformer.fit_transform(train_features)

    train_cnt = train_targets['cnt']
    trend_remover = TrendRemover(remove_trend=True).fit(train_cnt)
    model = BikeshareRegression(trend_remover=trend_remover,
                                random_state=42).fit(train_features, train_cnt)

    test_features = transformer.transform(test_features)
    predictions = model.predict(test_features)

    # The default regressor is not tuned, so only a loose bound is asserted.
    error = mean_absolute_error(test_targets['cnt'], predictions)
    assert error < 45
Exemplo n.º 6
0
def plot_grouped_usage(best_clf_casual, best_clf_registered, data_dir,
                       plot_output_dir):
    """Plot mean rentals grouped by hour, weekday and month and save as PDF.

    Two families of figures are written to ``plot_output_dir``:
    ``mean_usage_alldata_<H|D|M>.pdf`` for the complete data set, and
    ``mean_usage_predicttest_<H|D|M>.pdf`` comparing the predictions (dashed)
    with the unshuffled 20 % test split (solid). Lines show the group mean,
    shaded bands the standard error of the mean. Each figure is closed after
    it has been saved.

    Args:
        best_clf_casual: Fitted estimator whose ``predict`` is called on the
            test features to obtain the ``'casual'`` column.
        best_clf_registered: Fitted estimator whose ``predict`` is called on
            the test features to obtain the ``'registered'`` column.
        data_dir: Forwarded to ``load_data``.
        plot_output_dir: Directory the PDF files are written to.
    """
    X, y = load_data(data_dir)
    _, X_test, _, y_test = train_test_split(X,
                                            y,
                                            test_size=0.2,
                                            shuffle=False)

    y_pred_casual = best_clf_casual.predict(X_test)
    y_pred_registered = best_clf_registered.predict(X_test)
    # Start from a copy of the test targets so index and columns line up.
    y_pred = y_test.copy()
    y_pred['registered'] = np.array(y_pred_registered)
    y_pred['casual'] = np.array(y_pred_casual)

    # Interval code -> attribute of the index used as grouping key.
    intervals = (('H', 'hour'), ('D', 'weekday'), ('M', 'month'))
    user_colors = (('registered', 'steelblue'), ('casual', 'crimson'))
    datasets = (
        ('alldata', ((y, 'all'),)),
        ('predicttest', ((y_pred, 'prediction'), (y_test, 'test set'))),
    )

    for dataset, series in datasets:
        for time_interval, index_attr in intervals:
            fig, ax = plt.subplots(1, 1)
            for y_plot, y_name in series:
                is_prediction = y_name == 'prediction'
                for users, color in user_colors:
                    y_cnt = y_plot[users]
                    group_key = getattr(y_cnt.index, index_attr)
                    x_grouped = np.unique(group_key)
                    y_grouped = y_cnt.groupby(group_key)
                    y_mean = y_grouped.mean()
                    y_sem = y_grouped.aggregate(lambda g: sem(g, axis=None))

                    ax.plot(x_grouped,
                            y_mean,
                            lw=2,
                            color=color,
                            linestyle='dashed' if is_prediction else 'solid',
                            label=f"pred. {users}" if is_prediction else users)
                    ax.fill_between(x_grouped,
                                    y_mean - y_sem,
                                    y_mean + y_sem,
                                    color=color,
                                    lw=0,
                                    alpha=0.2,
                                    label=None)

            # Decorate once per figure, using the x values of the series
            # plotted last (the same values the final pass of a per-series
            # decoration would have used).
            xticks = list(np.arange(x_grouped.min(), x_grouped.max() + 1, 1))
            if time_interval == 'H':
                # Label only every second hour to keep the axis readable.
                xticklabels = [
                    f"${xt}$" if i % 2 == 0 else ""
                    for i, xt in enumerate(xticks)
                ]
            else:
                xticklabels = [f"${xt}$" for xt in xticks]
            ax.set_xticks(xticks)
            ax.set_xticklabels(xticklabels)
            ax.set_xlabel(time_interval)
            ax.set_ylabel("Mean No. of rentals per hour")
            ax.legend(frameon=False)

            fig.savefig(os.path.join(
                plot_output_dir, f'mean_usage_{dataset}_{time_interval}.pdf'),
                        bbox_inches='tight')
            # pyplot keeps every figure alive until it is closed explicitly.
            plt.close(fig)
Exemplo n.º 7
0
def plot_grouped_usage_bias(best_clf_casual, best_clf_registered, data_dir,
                            plot_output_dir):
    """Plot prediction bias and mean absolute deviation per time group.

    For the unshuffled 20 % test split the error ``prediction - truth`` is
    grouped by hour, weekday and month. For each user type the mean error
    ("bias", dashed) and the mean absolute error ("mad", solid) are drawn
    with a shaded band of one standard error of the mean. One figure per
    grouping is written to ``plot_output_dir`` as ``bias_mad_<H|D|M>.pdf``
    and closed afterwards.

    Args:
        best_clf_casual: Fitted estimator whose ``predict`` is called on the
            test features to obtain the ``'casual'`` column.
        best_clf_registered: Fitted estimator whose ``predict`` is called on
            the test features to obtain the ``'registered'`` column.
        data_dir: Forwarded to ``load_data``.
        plot_output_dir: Directory the PDF files are written to.
    """
    X, y = load_data(data_dir)
    _, X_test, _, y_test = train_test_split(X,
                                            y,
                                            test_size=0.2,
                                            shuffle=False)

    y_pred_casual = best_clf_casual.predict(X_test)
    y_pred_registered = best_clf_registered.predict(X_test)
    # Start from a copy of the test targets so index and columns line up.
    y_pred = y_test.copy()
    y_pred['registered'] = np.array(y_pred_registered)
    y_pred['casual'] = np.array(y_pred_casual)

    # Interval code -> attribute of the index used as grouping key.
    intervals = (('H', 'hour'), ('D', 'weekday'), ('M', 'month'))
    user_colors = (('registered', 'steelblue'), ('casual', 'crimson'))
    evaluations = (('bias', 'dashed'), ('mad', 'solid'))

    for time_interval, index_attr in intervals:
        fig, ax = plt.subplots(1, 1)
        for user, color in user_colors:
            dy = y_pred[user] - y_test[user]
            for evl, ls in evaluations:
                # Derive the absolute error from the signed one without
                # overwriting it, so the result does not depend on the order
                # of ``evaluations``.
                err = dy.abs() if evl == 'mad' else dy
                group_key = getattr(err.index, index_attr)
                x_gr = np.unique(group_key)
                err_gr = err.groupby(group_key)

                err_mean = err_gr.mean()
                err_sem = err_gr.aggregate(lambda g: sem(g, axis=None))
                ax.plot(x_gr,
                        err_mean,
                        lw=2,
                        color=color,
                        linestyle=ls,
                        label=f"{evl} {user}")
                ax.fill_between(x_gr,
                                err_mean - err_sem,
                                err_mean + err_sem,
                                color=color,
                                lw=0,
                                alpha=0.2,
                                label=None)

        # Ticks follow the x values of the series plotted last.
        xticks = list(np.arange(x_gr.min(), x_gr.max() + 1, 1))
        if time_interval == 'H':
            # Label only every second hour to keep the axis readable.
            xticklabels = [
                f"${xt}$" if i % 2 == 0 else "" for i, xt in enumerate(xticks)
            ]
        else:
            xticklabels = [f"${xt}$" for xt in xticks]
        ax.axhline(y=0, c='k', ls=':', lw=1.5)
        ax.set_xticks(xticks)
        ax.set_xticklabels(xticklabels)
        ax.set_xlabel(time_interval)
        ax.set_ylabel("Mean bias / mad per hour")
        ax.set_ylim(-40, 100)
        ax.legend(loc=2, frameon=False)
        fig.savefig(os.path.join(plot_output_dir,
                                 f'bias_mad_{time_interval}.pdf'),
                    bbox_inches='tight')
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)