Пример #1
0
def test_empty_breaks():
    """Every formatter factory must turn an empty list of breaks into ``[]``.

    Each factory is called with no arguments to build a formatter, and the
    resulting formatter is applied to the same empty list.
    """
    no_breaks = []
    factories = (
        custom_format,
        comma_format,
        currency_format,
        percent_format,
        scientific_format,
        date_format,
        mpl_format,
        log_format,
        timedelta_format,
    )
    for make_formatter in factories:
        assert make_formatter()(no_breaks) == []
Пример #2
0
def test_empty_breaks():
    """Every formatter factory must turn an empty list of breaks into ``[]``.

    Each factory is called with no arguments to build a formatter, and the
    resulting formatter is applied to the same empty list.
    """
    no_breaks = []
    factories = (
        custom_format,
        comma_format,
        currency_format,
        percent_format,
        scientific_format,
        date_format,
        mpl_format,
        log_format,
        timedelta_format,
    )
    for make_formatter in factories:
        assert make_formatter()(no_breaks) == []
Пример #3
0
def test_percent_format():
    """Check the labels produced by a default ``percent_format()`` formatter.

    Each case pairs a list of fractions with the exact labels expected back.
    """
    to_percent = percent_format()
    cases = (
        # same/nearly same precision values
        ([.12, .23, .34, .45], ['12%', '23%', '34%', '45%']),
        ([.12, .23, .34, 4.5], ['12%', '23%', '34%', '450%']),
        # mixed precision values: the small entries are expected to come
        # back as 10%, 20%, 30% next to the much larger 4500%
        ([.12, .23, .34, 45], ['10%', '20%', '30%', '4500%']),
    )
    for fractions, expected_labels in cases:
        assert to_percent(fractions) == expected_labels
Пример #4
0
def test_percent_format():
    """Check the labels produced by a default ``percent_format()`` formatter.

    Each case pairs a list of fractions with the exact labels expected back.
    """
    to_percent = percent_format()
    cases = (
        # same/nearly same precision values
        ([.12, .23, .34, .45], ['12%', '23%', '34%', '45%']),
        ([.12, .23, .34, 4.5], ['12%', '23%', '34%', '450%']),
        # mixed precision values: the small entries are expected to come
        # back as 10%, 20%, 30% next to the much larger 4500%
        ([.12, .23, .34, 45], ['10%', '20%', '30%', '4500%']),
    )
    for fractions, expected_labels in cases:
        assert to_percent(fractions) == expected_labels
Пример #5
0
def plot_elos():
    """Build a line plot of win rate against Elo difference.

    The Elo difference runs from -1000 to +1000 (``np.linspace`` default
    sampling) and the win rate is ``1 / (1 + 10 ** (-diff / 400))``.

    :return: the composed plot (``pn.ggplot`` plus layers, scales, labels and
        the project's ``plot.IEEE()`` component)
    """
    elo_gap = np.linspace(-1000, +1000)
    win_prob = 1 / (1 + 10**(-elo_gap / 400))
    curve = pd.DataFrame({'elo': elo_gap, 'winrate': win_prob})

    win_curve = pn.geom_line(pn.aes(x='elo', y='winrate'))
    # Faint guides marking an even match-up: zero Elo gap, 50% win rate.
    even_elo = pn.geom_vline(xintercept=0, alpha=.1)
    even_odds = pn.geom_hline(yintercept=.5, alpha=.1)
    axis_titles = pn.labs(x="Own Elo relative to opponent's Elo",
                          y='Win rate v. opponent')
    percent_axis = pn.scale_y_continuous(labels=percent_format())
    tight_coords = pn.coord_cartesian(expand=False)

    return (pn.ggplot(curve) + win_curve + even_elo + even_odds +
            axis_titles + percent_axis + tight_coords + plot.IEEE())
Пример #6
0
def plot_calibrations():
    """Build a per-board-size interval plot from ``data.sample_calibrations()``.

    For every ``boardsize`` a rectangle half a unit wide spans ``lower`` to
    ``upper``, with a zero-height rectangle (drawn with ``size=2``) at ``mid``.
    The y axis is labelled in percent and clipped to the 0.4-0.6 range.

    :return: the composed plot, including the project's ``plot.IEEE()``
        component
    """
    params = data.sample_calibrations()

    # Aesthetics shared by both rectangle layers: half-unit-wide boxes
    # centred on each board size, filled by board size.
    shared_aes = pn.aes(xmin='boardsize-.25',
                        xmax='boardsize+.25',
                        group='boardsize',
                        fill='factor(boardsize)')

    half_line = pn.geom_hline(yintercept=.5, alpha=.2)
    interval_box = pn.geom_rect(pn.aes(ymin='lower', ymax='upper'),
                                show_legend=False,
                                color='k')
    # ymin == ymax gives a zero-height rectangle, i.e. a horizontal bar.
    mid_bar = pn.geom_rect(pn.aes(ymin='mid', ymax='mid'),
                           show_legend=False,
                           color='k',
                           size=2)
    percent_axis = pn.scale_y_continuous(labels=percent_format())
    fill_palette = pn.scale_fill_hue(l=.4)
    y_window = pn.coord_cartesian(ylim=(.4, .6))
    axis_titles = pn.labs(y='Win rate v. perfect play', x='Board size')

    return (pn.ggplot(params, shared_aes) + half_line + interval_box +
            mid_bar + percent_axis + fill_palette + y_window + axis_titles +
            plot.IEEE())
Пример #7
0
def plot_importance_lgb(importance):
    """Build a horizontal bar chart of feature importances.

    :param importance: DataFrame with a ``feature`` column and an
        ``importance`` column. ``importance`` is divided by 100 and shown on
        a percent axis, so it is presumably expressed in percent - confirm
        with the caller. The frame is NOT modified: all changes are made on
        a copy, so calling this twice on the same frame gives the same plot.
    :return: the plotnine plot object
    :side effect: sets the global ``pn.options.figure_size`` to 1.5x the
        6.4 x 4.8 size.
    """
    # Ugly but pip install problem on Airflow otherwise
    import plotnine as pn
    from plotnine import ggplot, aes  # noqa
    from mizani.formatters import percent_format  # noqa

    coef = 1.5
    pn.options.figure_size = (6.4 * coef, 4.8 * coef)

    # Work on a copy: rescaling in place would divide the caller's values by
    # 100 again on every call.
    importance = importance.copy()
    importance['importance'] = importance['importance'] / 100
    # Categories are the features in reversed row order; together with
    # coord_flip this controls the bar order on the flipped axis.
    importance['feature'] = pd.Categorical(importance['feature'],
                                           importance['feature'][::-1],
                                           ordered=True)
    plot = (ggplot(importance, aes('feature', 'importance')) +
            pn.geom_bar(stat='identity') + pn.coords.coord_flip() +
            pn.scales.scale_y_continuous(labels=percent_format()) +
            pn.labs(title='Feature importance', x='Feature', y='Gain'))
    return plot
Пример #8
0
# Relabel the x ticks of the last axis: each tick location is used as a
# base-10 exponent and the resulting value is rendered with `lf`
# (formatter defined elsewhere in the script).
xax = axes[-1].xaxis
xax.set_ticklabels(lf(10.0**ex) for ex in xax.get_ticklocs())
plt.xlabel("growth rate [1/h]")
plt.ylabel("tradeoff")
plt.savefig("figures/dists.svg")
plt.close()

# Fraction of rows with growth_rate > 1e-6 within each (id, tradeoff) group.
non_zero = (rates.groupby([
    "id", "tradeoff"
]).apply(lambda df: (df.growth_rate > 1e-6).sum() / df.shape[0]).reset_index(
    name="non_zero"))

pl = (ggplot(non_zero, aes(x="tradeoff", y="non_zero")) +
      geom_boxplot(outlier_color="none") +
      geom_jitter(width=0.15, height=0, alpha=0.5, stroke=0) +
      scale_y_continuous(labels=percent_format()) +
      labs(x="tradeoff", y="percent taxa growing"))
pl.save("figures/percent_growing.svg", width=5, height=5)

# Show some representative correlations
comp = both[(both.tradeoff == "0.5")]
w = within_samples[within_samples.tradeoff == "0.5"]
w.index = w.id
# Keep only ids with n >= 10. Copy explicitly: columns are assigned on
# `comp` below, and doing that on a filtered slice triggers pandas'
# SettingWithCopyWarning.
comp = comp[comp.id.isin(w.id[(w.n >= 10)])].copy()
comp["rho"] = w.loc[comp.id, "rho"].round(2).values
# Append the rounded correlation to the id so it shows in the facet titles.
comp["id"] = comp.id + " (r=" + comp.rho.astype(str) + ")"

pl = (ggplot(comp, aes(x="rate", y="growth_rate")) + geom_point() +
      facet_wrap("~ id", nrow=3) + scale_x_log10() + scale_y_log10() +
      labs(x="replication rate [a.u.]", y="predicted growth rate [1/h]"))
pl.save("figures/corr_examples.png", width=12, height=6, dpi=300)
Пример #9
0
gg_rep_act.save(os.path.join(dir_output, 'gg_rep_act.png'), width=8, height=4)

# Display labels for the raw values of the `notes` column.
di_notes = {
    'chi2': 'χ2-correction',
    'insig': 'Erroneous',
    'specification': 'Specification',
    'non-replicable': 'Inconsistent'
}
# (ii) Breakdown of counts
# Attach to every row of acc_tt the number of res_fisher rows sharing its
# `tt` value (column `n_lit`). rename_axis + reset_index(name=...) yields the
# columns `tt` / `n_lit` on every pandas version; the previous
# rename(columns={'index': 'tt', 'tt': 'n_lit'}) only did so before
# pandas 2.0, where value_counts() started naming its result differently.
tmp = acc_tt.merge(
    res_fisher.tt.value_counts().rename_axis('tt').reset_index(name='n_lit'))
# Map codes to display labels and compute each count's share of its `tt`.
tmp = tmp.assign(tt=lambda x: x.tt.map(di_tt),
                 notes=lambda x: x.notes.map(di_notes),
                 share=lambda x: x.n / x.n_lit)

gg_acc_notes = (
    pn.ggplot(tmp, pn.aes(x='notes', y='share', fill='tt')) + pn.theme_bw() +
    pn.scale_y_continuous(labels=percent_format(), limits=[0, 0.1]) +
    pn.scale_fill_discrete(name='Literature') +
    pn.geom_col(color='black', position=pn.position_dodge(0.5), width=0.5) +
    pn.labs(y='Percent', x='Investigation') +
    pn.theme(axis_text_x=pn.element_text(angle=45),
             axis_title_x=pn.element_blank()))
gg_acc_notes.save(os.path.join(dir_output, 'gg_acc_notes.png'),
                  width=7,
                  height=3)

print('~~~ End of 4_results_insig.py ~~~')
Пример #10
0
def density_plot(df_original, variable, target=False, no_outliers=False, title=None, mean=True, mode=True, q1=True, q3=True):
    '''
    Draw and show a scaled density plot of one column, optionally split by a
    categorical target column.

    :param df_original: source dataframe; it is copied, never modified
    :param variable: name of the column whose density is plotted
    :param target: name of the column used to split/fill the density;
        False (default) or None draws a single, unsplit density. Only two
        fill colors are supplied, so the target presumably has two
        categories - confirm with callers.
    :param no_outliers: if True, restrict the x axis to the range between the
        25% and 75% quantiles of ``variable``
    :param title: optional plot title (converted with ``str``)
    :param mean: draw a dashed vertical line at the mean
    :param mode: draw a solid vertical line at the 50% quantile
        (NOTE: despite the name this is the median, not the mode)
    :param q1: draw a solid vertical line at the 25% quantile
    :param q3: draw a solid vertical line at the 75% quantile
    :return: None; the figure is drawn and shown with ``plt.show()``
    :usage: after cleaning dataframe
    '''
    df = df_original.copy()
    # Previously `df[target]` was evaluated unconditionally, so the default
    # target=False raised KeyError before the unsplit branch could run.
    split_by_target = target is not None and target is not False

    x_label = variable.lower().replace("_", " ").title()

    if split_by_target:
        # target as str
        df[target] = df[target].apply(str)
        fill_label = str(target).lower().replace("_", " ").title()
        graph = (ggplot(df) + aes(x=variable, y='..scaled..', fill=target))
        graph += geom_density(alpha=.5)
        graph += scale_fill_manual(values=[colors.FIRST_COLOR, colors.SECOND_COLOR],
                                   name=fill_label)
    else:
        graph = (ggplot(df) + aes(x=variable, y='..scaled..'))
        graph += geom_density(fill=colors.FIRST_COLOR)

    graph += (theme_bw()
              + theme(
                axis_line_x=element_line(color='gray'),
                axis_line_y=element_line(color='gray'),
                line=element_line(color='white')
            )
              )
    # labels
    graph += xlab(x_label) + ylab("Density")
    graph += scale_y_continuous(labels=percent_format())

    line_args = {
        "color": colors.THIRD_COLOR,
        "size": .5
    }

    var_describe = df[variable].describe()

    # (enabled flag, statistic in describe(), line type), in drawing order
    reference_lines = (
        (mean, "mean", "dashed"),
        (mode, "50%", "solid"),
        (q1, "25%", "solid"),
        (q3, "75%", "solid"),
    )
    for enabled, stat, linetype in reference_lines:
        if enabled:
            graph += geom_vline(xintercept=var_describe.loc[stat],
                                linetype=linetype,
                                **line_args)

    # title
    if title is not None:
        graph += ggtitle(str(title))

    # no outliers: zoom the x axis to the interquartile range
    if no_outliers:
        graph += xlim(var_describe.loc['25%'], var_describe.loc['75%'])

    # show
    graph.draw()
    plt.show()
Пример #11
0
    scale_x_datetime(date_breaks='5 years', date_labels='%Y') +
    scale_color_discrete(name='HPI', labels=['CREA', 'Teranet']))
gg_save('gg_tera_crea_lvl.png', dir_figures, gg_tera_crea_lvl, 12, 5)

# (iii) CREA vs Teranet m/m
# Month-over-month change within each (city, hpi) series: value / previous
# value - 1. After reset_index the original row label ends up in `level_2`.
tmp = df_hpi_both.groupby(
    ['city',
     'hpi']).apply(lambda x: x.value / x.value.shift(1) - 1).reset_index()
# Sorting by `level_2` puts the changes back in original-label order before
# they are assigned positionally.
# NOTE(review): this assumes df_hpi_both's rows are already in ascending
# index order - confirm.
df_hpi_w = df_hpi_both.assign(mm=tmp.sort_values('level_2').value.values)
# One row per (date, city) with one `mm` column per hpi value.
df_hpi_w = df_hpi_w.pivot_table('mm', ['date', 'city'], 'hpi').reset_index()
# `drop` must be passed by keyword: reset_index(None, True) raises a
# TypeError on pandas >= 2.0.
df_hpi_w = df_hpi_w.sort_values(['city', 'date']).reset_index(drop=True)

gg_tera_crea_pct = (ggplot(df_hpi_w, aes(x='crea', y='tera')) +
                    geom_point(size=0.5) + theme_bw() +
                    theme(axis_text_x=element_text(angle=90)) +
                    scale_x_continuous(labels=percent_format()) +
                    scale_y_continuous(labels=percent_format()) +
                    labs(x='CREA', y='Teranet', title='month-over-month %') +
                    facet_wrap('~city', nrow=2))
gg_save('gg_tera_crea_pct.png', dir_figures, gg_tera_crea_pct, 12, 5)

# (iv) Find the optimal correlation
lag_seq = np.arange(13)
alpha = 0.1
n_bs = 1000
holder = []
for lag in lag_seq:
    print(lag)
    tmp_lag = df_hpi_w.assign(
        crea=df_hpi_w.groupby('city').crea.shift(lag)).dropna()
    tmp_lag_bs = tmp_lag.groupby('city').sample(frac=n_bs,