Exemplo n.º 1
0
def test_aesthetics():
    """Stack five ablines with different mapped aesthetics over a scatter.

    Every abline maps ``slope`` and ``intercept`` from ``df``; each successive
    layer shifts the intercept expression by a further 0.1, and layers two to
    five additionally map alpha, linetype, color and size respectively.
    """
    base_line = geom_abline(
        aes(slope='slope', intercept='intercept'), size=2)
    alpha_line = geom_abline(
        aes(slope='slope', intercept='intercept+.1', alpha='z'), size=2)
    linetype_line = geom_abline(
        aes(slope='slope', intercept='intercept+.2', linetype='factor(z)'),
        size=2)
    color_line = geom_abline(
        aes(slope='slope', intercept='intercept+.3', color='factor(z)'),
        size=2)
    # size is a mapped aesthetic on this layer, so no fixed size is passed
    size_line = geom_abline(
        aes(slope='slope', intercept='intercept+.4', size='z'))

    p = ggplot(df, aes('x', 'y')) + geom_point()
    for abline_layer in (base_line, alpha_line, linetype_line, color_line,
                         size_line):
        p = p + abline_layer

    # NOTE(review): equality against a string is presumably resolved by the
    # test harness (baseline lookup) -- confirm against the test setup.
    assert p + _theme == 'aesthetics'
def mixed_linear_factors_plot(df, x_axis, factor):
    """Plot MSE against ``x_axis``, faceted by ``factor``, with fitted lines.

    For every distinct value of ``df[factor]`` a mixed linear model
    ``mse ~ <x_axis>`` is fitted on the matching rows (the row index values
    are passed as the group labels).  The fitted intercept and slope of each
    model are drawn as an abline in the corresponding facet, and the figure
    is written to ``'<x_axis>_vs_<factor>_rmse.pdf'``.

    Side effects: sets the global ``plotnine.options.figure_size`` and
    replaces the ``percent_broken`` and ``percent_fail_runs`` columns of the
    passed ``df`` with their rounded integer values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``mse``, ``method``, ``percent_broken``,
        ``percent_fail_runs``, ``x_axis`` and ``factor``.
    x_axis : str
        Name of the column used as regressor and x axis.
    factor : str
        Name of the column whose distinct values define the facets.
    """
    plotnine.options.figure_size = (10, 10)
    factor_steps = df[factor].unique()
    # The coefficient columns are allocated as floats explicitly.  Using
    # zeros_like(factor_steps) would inherit the factor's dtype, so fitted
    # coefficients would be written into e.g. integer columns.
    reg_lines = pd.DataFrame({
        factor: factor_steps,
        'intercept': np.zeros(len(factor_steps)),
        'slope': np.zeros(len(factor_steps)),
    })
    for i, step in enumerate(factor_steps):
        factored_df = df[df[factor] == step]
        md = smf.mixedlm('mse ~ %s' % x_axis,
                         factored_df,
                         groups=factored_df.index.values)
        mdf = md.fit()
        reg_lines.iloc[i] = [step, mdf.params['Intercept'], mdf.params[x_axis]]

    # The builtin int replaces the np.int alias, which no longer exists in
    # current NumPy releases; the resulting dtype is the same.
    df['percent_broken'] = df['percent_broken'].round().astype(int)
    df['percent_fail_runs'] = df['percent_fail_runs'].round().astype(int)
    # Round the facet key the same way so that it still matches df[factor]
    # when factor is one of the two columns rounded above.
    reg_lines[factor] = reg_lines[factor].round().astype(int)
    gg = (
        plotnine.ggplot(df, plotnine.aes(x=x_axis, y='mse', color='method')) +
        plotnine.geom_jitter(width=2.5, show_legend=False) +
        plotnine.scale_color_manual(['#DB5F57'] * 4) +
        plotnine.facet_wrap(factor) + plotnine.geom_abline(
            plotnine.aes(intercept='intercept', slope='slope'),
            data=reg_lines) + plotnine.theme_classic(base_size=20))
    gg.save('%s_vs_%s_rmse.pdf' % (x_axis, factor))
Exemplo n.º 3
0
def scatter_plot(df,
                 xcol,
                 ycol,
                 domain,
                 xname=None,
                 yname=None,
                 log=False,
                 width=6,
                 height=6,
                 clamp=True,
                 tickCount=5):
    """Build a scatter plot of ``ycol`` against ``xcol`` with reference rules.

    The plot uses ``domain`` as limits on both axes and adds a dashed
    diagonal (y = x) plus dashed vertical and horizontal rules at the upper
    bound of ``domain``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; it is never modified (a deep copy is made when
        clamping).
    xcol, ycol : str
        Names of the columns plotted on the x and y axes.
    domain : sequence of two numbers
        ``(lower, upper)`` limits applied to both axes.
    xname, yname : str, optional
        Axis labels; default to ``xcol`` / ``ycol``.
    log : bool
        Use log10 scales instead of linear ones.
    width, height : number
        Figure size passed to the plotnine theme.
    clamp : bool
        Replace values above ``domain[1]`` by ``domain[1]`` so they sit on
        the upper rules.  Values below ``domain[0]`` are left as they are.
    tickCount : int
        Currently unused; kept for interface compatibility.

    Returns
    -------
    The assembled plotnine plot object.
    """
    assert len(domain) == 2

    POINT_SIZE = 0.5
    DASH_PATTERN = (0, (3, 1))

    # identity test: ``== None`` would dispatch to an overloaded __eq__
    if xname is None:
        xname = xcol
    if yname is None:
        yname = ycol

    # formatter for axes' labels
    ax_formatter = mizani.custom_format('{:n}')

    if clamp:  # clamp overflowing values if required
        df = df.copy(deep=True)
        df.loc[df[xcol] > domain[1], xcol] = domain[1]
        df.loc[df[ycol] > domain[1], ycol] = domain[1]

    # generate scatter plot
    scatter = p9.ggplot(df)
    scatter += p9.aes(x=xcol, y=ycol)
    scatter += p9.geom_point(size=POINT_SIZE, na_rm=True)
    scatter += p9.labs(x=xname, y=yname)

    if log:  # log scale
        scatter += p9.scale_x_log10(limits=domain, labels=ax_formatter)
        scatter += p9.scale_y_log10(limits=domain, labels=ax_formatter)
    else:
        scatter += p9.scale_x_continuous(limits=domain, labels=ax_formatter)
        scatter += p9.scale_y_continuous(limits=domain, labels=ax_formatter)

    scatter += p9.theme_bw()
    scatter += p9.theme(
        panel_grid_major=p9.element_line(color='#666666', alpha=0.5))
    scatter += p9.theme(figure_size=(width, height))

    # generate additional lines
    scatter += p9.geom_abline(intercept=0, slope=1,
                              linetype=DASH_PATTERN)  # diagonal
    scatter += p9.geom_vline(xintercept=domain[1],
                             linetype=DASH_PATTERN)  # vertical rule
    scatter += p9.geom_hline(yintercept=domain[1],
                             linetype=DASH_PATTERN)  # horizontal rule

    return scatter
Exemplo n.º 4
0
def test_aes_inheritance():
    """Check a plot whose slope/intercept are mapped at the plot level.

    ``geom_abline`` is added with only a fixed size and no mapping of its
    own; the result is compared with the 'aes_inheritance' reference.
    """
    plot_mapping = aes(
        'x', 'y', color='factor(z)', slope='slope', intercept='intercept')

    p = ggplot(df, plot_mapping)
    p = p + geom_point(size=10, show_legend=False)
    # A default line (intercept = 0, slope = 1)
    p = p + geom_abline(size=2)

    assert p == 'aes_inheritance'
Exemplo n.º 5
0
def test_aes_inheritance():
    """Build a scatter plus a bare ``geom_abline`` and compare to reference.

    ``slope`` and ``intercept`` are part of the plot-level mapping, while the
    abline layer itself receives nothing but a fixed size.
    """
    p = ggplot(df, aes('x', 'y', color='factor(z)',
                       slope='slope', intercept='intercept'))
    extra_layers = (
        geom_point(size=10, show_legend=False),
        # A default line (intercept = 0, slope = 1)
        geom_abline(size=2),
    )
    for extra_layer in extra_layers:
        p = p + extra_layer

    assert p == 'aes_inheritance'
def scatter_plot2(df1, df2, xcol, ycol, domain,
                  color1='black', color2='red',
                  xname=None, yname=None,
                  log=False, width=6, height=6,
                  clamp=True, tickCount=5):
    """Overlay two data sets in one scatter plot with rugs and reference rules.

    Both frames are drawn with the same ``xcol``/``ycol`` mapping, ``df1`` in
    ``color1`` and ``df2`` in ``color2``.  Rugs are added on the top and
    right sides, and a dashed diagonal plus dashed rules at ``domain[1]``
    are drawn last.  ``tickCount`` is accepted but not used.

    With ``clamp`` enabled, values above ``domain[1]`` are replaced by
    ``domain[1]`` in deep copies of the frames; the inputs stay untouched.

    Returns the assembled plotnine plot object.
    """
    assert len(domain) == 2

    POINT_SIZE = 1.5
    DASH_PATTERN = (0, (6, 2))

    xname = xcol if xname is None else xname
    yname = ycol if yname is None else yname

    # formatter for axes' labels
    ax_formatter = mizani.custom_format('{:n}')

    if clamp:
        # clamp overflowing values on private copies of both frames
        clamped_frames = []
        for frame in (df1, df2):
            frame = frame.copy(deep=True)
            for column in (xcol, ycol):
                frame.loc[frame[column] > domain[1], column] = domain[1]
            clamped_frames.append(frame)
        df1, df2 = clamped_frames

    # Components are collected in the exact order they are added to the plot.
    components = [
        p9.aes(x=xcol, y=ycol),
        p9.geom_point(size=POINT_SIZE, na_rm=True, color=color1, alpha=0.5),
        p9.geom_point(size=POINT_SIZE, na_rm=True, data=df2, color=color2,
                      alpha=0.5),
        p9.labs(x=xname, y=yname),
        # rug plots
        p9.geom_rug(na_rm=True, sides="tr", color=color1, alpha=0.05),
        p9.geom_rug(na_rm=True, sides="tr", data=df2, color=color2,
                    alpha=0.05),
    ]

    if log:
        x_scale, y_scale = p9.scale_x_log10, p9.scale_y_log10
    else:
        x_scale, y_scale = p9.scale_x_continuous, p9.scale_y_continuous
    components.append(x_scale(limits=domain, labels=ax_formatter))
    components.append(y_scale(limits=domain, labels=ax_formatter))

    components.extend([
        p9.theme_bw(),
        p9.theme(panel_grid_major=p9.element_line(color='#666666',
                                                  alpha=0.5)),
        p9.theme(panel_grid_minor=p9.element_blank()),
        p9.theme(figure_size=(width, height)),
        p9.theme(text=p9.element_text(size=24, color="black")),
        # reference lines: diagonal, vertical rule, horizontal rule
        p9.geom_abline(intercept=0, slope=1, linetype=DASH_PATTERN),
        p9.geom_vline(xintercept=domain[1], linetype=DASH_PATTERN),
        p9.geom_hline(yintercept=domain[1], linetype=DASH_PATTERN),
    ])

    scatter = p9.ggplot(df1)
    for component in components:
        scatter += component

    return scatter
Exemplo n.º 7
0
def plot_compare(stats,
                 variant,
                 variant_baseline,
                 metric,
                 mode="identity",
                 jitter=0.01):
    """Jitter-plot a metric of ``variant`` against the baseline's metric.

    The comparison table comes from ``compare_stats``; a summary produced by
    ``bsw_table2`` is displayed as a side effect.  The x axis is always
    ``<metric>_baseline``; ``mode`` selects what goes on the y axis:

    - ``"identity"``: the metric itself, with an abline of slope 1,
    - ``"ratio"``: metric / baseline, with ablines of slope 0 and -1,
    - ``"difference"``: metric - baseline, with an abline of slope 0.

    Both axes use log10 scales in every mode.
    """
    assert mode in ["identity", "ratio", "difference"]
    plotdata = compare_stats(stats, variant, variant_baseline)
    bsw = bsw_table2(plotdata, metric=metric, reltol=1.0)
    display(bsw)

    baseline_name = f"{metric}_baseline"
    ratio = plotdata[metric] / plotdata[baseline_name]
    difference = plotdata[metric] - plotdata[baseline_name]
    plotdata = plotdata[[metric, baseline_name, "dataset"]].assign(
        ratio=ratio,
        difference=difference,
    )

    if mode == "identity":
        y_column = metric
        reference_lines = [geom_abline(aes(slope=1, intercept=0))]
    elif mode == "ratio":
        y_column = "ratio"
        # ablines are drawn wrt the already log-transformed axes,
        # hence 0 = log(1) in scale
        reference_lines = [
            geom_abline(aes(slope=0, intercept=0.0)),
            geom_abline(aes(slope=-1, intercept=0.0)),  # max
        ]
    elif mode == "difference":
        y_column = "difference"
        reference_lines = [geom_abline(aes(slope=0, intercept=0))]
    else:
        assert False, "unknown mode"
        return None

    plot = ggplot(data=plotdata) + geom_jitter(
        aes(x=baseline_name, y=y_column, fill="dataset"),
        width=jitter,
        height=jitter,
    )
    plot = plot + scale_x_log10() + scale_y_log10()
    for reference_line in reference_lines:
        plot = plot + reference_line
    return plot
Exemplo n.º 8
0
def test_aesthetics():
    """Draw ablines that each map one extra aesthetic and compare to reference.

    All five abline layers read ``slope`` and ``intercept`` from ``df`` (the
    intercept expression grows by 0.1 per layer); alpha, linetype, color and
    size are mapped on layers two to five.
    """
    plain = aes(slope='slope', intercept='intercept')
    faded = aes(slope='slope', intercept='intercept+.1', alpha='z')
    dashed = aes(slope='slope', intercept='intercept+.2',
                 linetype='factor(z)')
    colored = aes(slope='slope', intercept='intercept+.3',
                  color='factor(z)')
    sized = aes(slope='slope', intercept='intercept+.4', size='z')

    p = ggplot(df, aes('x', 'y'))
    p = p + geom_point()
    p = p + geom_abline(plain, size=2)
    p = p + geom_abline(faded, size=2)
    p = p + geom_abline(dashed, size=2)
    p = p + geom_abline(colored, size=2)
    # size comes from the mapping on the last layer, so none is fixed
    p = p + geom_abline(sized)

    assert p + _theme == 'aesthetics'
Exemplo n.º 9
0
def qq_plot(df, limit=20000):
    """Return a QQ plot of observed against expected -log10 p-values.

    ``df`` must have a column ``P``.  Rows are sorted by ``P`` ascending;
    ``OBS`` is ``-log10(P)`` and ``EXP`` is ``-log10(rank / n)`` with ranks
    1..n over all rows.  Only the first ``limit`` rows of the sorted frame
    are plotted, together with the default ``geom_abline``.
    """
    ranked = df.sort_values('P')
    n_rows = len(ranked)
    expected_p = np.arange(1, n_rows + 1) / float(n_rows)

    plot_data = ranked.assign(
        OBS=-np.log10(ranked['P']),
        EXP=-np.log10(expected_p),
    )
    # truncate only after the ranks were computed over the full frame
    plot_data = plot_data.head(limit)

    plot = pn.ggplot(plot_data, pn.aes(x='EXP', y='OBS'))
    plot = plot + pn.geom_point()
    plot = plot + pn.geom_abline()
    plot = plot + pn.theme_bw()
    return plot
Exemplo n.º 10
0
def build_plot(ws: Points, i: int, j: int, p: int, q: int):
    """Plot the complex points ``ws`` and the line through ``ws[p]``, ``ws[q]``.

    Real parts go on the x axis and imaginary parts on the y axis.  The
    ``include`` column drives the color: 3 for the indices ``p`` and ``q``,
    -3 for the other indices within ``i..j`` (inclusive), 0 elsewhere.
    """
    highlight = []
    for k in range(len(ws)):
        if k == p or k == q:
            highlight.append(3)
        elif i <= k <= j:
            highlight.append(-3)
        else:
            highlight.append(0)

    df = DataFrame({
        'x': [w.real for w in ws],
        'y': [w.imag for w in ws],
        'include': highlight,
    })

    connecting_line = geom_abline(
        slope=slope(ws[p], ws[q]),
        intercept=intercept(ws[p], ws[q]))

    plot = ggplot(df, aes('x', 'y', color='include'))
    plot = plot + geom_point()
    plot = plot + connecting_line
    plot = plot + lims(x=(0, 1), y=(0, 3), color=(-3.0, 3.0))
    return plot
Exemplo n.º 11
0
def plot_qq(df, color_var, facet_var=None, title=''):
    """
    Build a QQ plot of observed against expected -log10 p-values.

    ``calculate_expected_pval`` is applied per ``color_var`` group (and per
    ``facet_var`` group when given); the result is plotted colored by
    ``color_var`` and, optionally, faceted by ``facet_var``.

    Inspired by https://www.cureffi.org/2012/08/15/qq-plots-with-matplotlib/
    """
    # Largest observed -log10 p-value; both axes run from 0 to this value.
    axis_max = max(df['pval_neglog10'])

    if facet_var is not None:
        group_keys = [color_var, facet_var]
        grouped = df.groupby(by=group_keys)
        pvals = grouped.apply(calculate_expected_pval).reset_index(
            level=group_keys, drop=True)
    else:
        grouped = df.groupby(by=color_var)
        pvals = grouped.apply(calculate_expected_pval).reset_index(
            level=color_var, drop=True)

    n_colors = pvals[color_var].nunique()

    # Components in the order in which they are added to the plot.
    components = [
        plt9.geom_point(size=0.1, alpha=0.25),
        plt9.geom_abline(
            slope=1, intercept=0, color='black', linetype='dashed'),
        plt9.theme_bw(),
    ]
    if n_colors < 9:
        components.append(
            plt9.scale_colour_brewer(palette='Dark2', type='qual'))
    components.append(
        plt9.labs(x='Expected (-log10 p-value)',
                  y='Observed (-log10 p-value)',
                  title=title,
                  color=''))
    components.append(plt9.lims(x=(0, axis_max), y=(0, axis_max)))
    if facet_var is not None:
        components.append(
            plt9.facet_wrap('~ {}'.format(facet_var), ncol=5))
    components.append(
        plt9.theme(strip_text=plt9.element_text(size=5),
                   axis_text_x=plt9.element_text(angle=-45, hjust=0)))
    # make the legend keys fully opaque and larger than the plotted points
    components.append(
        plt9.guides(color=plt9.guide_legend(
            override_aes={'size': 2.0, 'alpha': 1.0})))

    qqplot = plt9.ggplot(
        pvals,
        plt9.aes(x='expected_pval_neglog10',
                 y='pval_neglog10',
                 color=color_var))
    for component in components:
        qqplot = qqplot + component
    return qqplot
Exemplo n.º 12
0
def gene_log_HR_plot(inFile, pcaFile=None, model=None):
    """Project component-level log hazard ratios back to genes and plot them.

    The ``logHR`` means and standard deviations are read via
    ``get_params(inFile)``.  The first half of the rows is treated as the
    tumor components and the second half as the non-tumor components.  Means
    are mapped to gene space with ``pca.inverse_transform``; standard
    deviations are combined through the PCA loadings as
    ``sqrt(sum((loading * sd)**2))``.  Z scores and two-sided normal p-values
    are derived per gene.

    Parameters
    ----------
    inFile : str
        Parameter file handed to ``get_params``.
    pcaFile : str, optional
        Pickled PCA object; defaults to ``inFile`` with ``"_params.hdf5"``
        replaced by ``"_pca.pkl"``.
    model : optional
        If given, ``model.counts.index`` is used as the index of the
        returned frame; otherwise genes are numbered from "1".

    Returns
    -------
    (plot, logHR_df) : the plotnine scatter of tumor against non-tumor logHR
    and the per-gene data frame.
    """
    # get logHRs
    par = get_params(inFile)
    pca_components = par["means"]["logHR"].shape[0] >> 1  # half the rows
    components = range(pca_components)
    tf_components = slice(pca_components, 2 * pca_components)

    t_logHR = par["means"]["logHR"][components, 0]
    tf_logHR = par["means"]["logHR"][tf_components, 0]

    t_logHR_sd = par["stds"]["logHR"][components, 0]
    tf_logHR_sd = par["stds"]["logHR"][tf_components, 0]

    # get pca
    if pcaFile is None:
        pcaFile = inFile.replace("_params.hdf5", "_pca.pkl")
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # point this at PCA files from a trusted source.
    with open(pcaFile, "rb") as buff:
        pca = pickle.load(buff)

    # prep dataframe
    n_genes = pca.components_.shape[1]
    if model is None:
        logHR_df = pd.DataFrame(index=[f"{i+1}" for i in range(n_genes)])
    else:
        logHR_df = pd.DataFrame(index=model.counts.index)
    logHR_df["tumor logHR"] = pca.inverse_transform(t_logHR)
    logHR_df["non-tumor logHR"] = pca.inverse_transform(tf_logHR)
    logHR_df["tumor logHR sd"] = np.sqrt(
        np.sum((pca.components_ * t_logHR_sd[:, None])**2, axis=0))
    logHR_df["non-tumor logHR sd"] = np.sqrt(
        np.sum((pca.components_ * tf_logHR_sd[:, None])**2, axis=0))
    logHR_df["tumor Z"] = logHR_df["tumor logHR"] / logHR_df["tumor logHR sd"]
    # The non-tumor Z score is normalised by the non-tumor sd; it was
    # previously divided by the tumor sd.
    logHR_df["non-tumor Z"] = (logHR_df["non-tumor logHR"] /
                               logHR_df["non-tumor logHR sd"])
    logHR_df["tumor p-value"] = norm.sf(abs(logHR_df["tumor Z"])) * 2
    logHR_df["non-tumor p-value"] = norm.sf(abs(logHR_df["non-tumor Z"])) * 2

    # make plot; both axes share the same limits
    lb = min(logHR_df["non-tumor logHR"].min(), logHR_df["tumor logHR"].min())
    ub = max(logHR_df["non-tumor logHR"].max(), logHR_df["tumor logHR"].max())
    pl = (pn.ggplot(pn.aes("non-tumor logHR", "tumor logHR"), logHR_df) +
          pn.xlim(lb, ub) + pn.ylim(lb, ub) + pn.theme_minimal() +
          pn.geom_point(alpha=0.3, color="red") + pn.geom_abline())
    return pl, logHR_df
    def residuals_fitted(self, figure_size=(4, 4), sample_frac=1.0):
        """Scatter the residuals over the fitted values

        A dashed red horizontal line marks a residual of zero.

        Parameters
        ----------
        figure_size : tuple(int, int), optional default=(4, 4)
            Plot size (width, height)

        sample_frac : float, optional default=1.0
            Fraction of the rows of ``self.df`` that is sampled for plotting

        Returns
        -------
        plot : ggplot object
        """
        sampled = self.df.sample(frac=sample_frac)
        zero_line = geom_abline(
            slope=0, intercept=0, color="red", linetype="dashed")

        plot = ggplot(sampled, aes(x="yhat", y="residual"))
        plot = plot + geom_point(alpha=0.25)
        plot = plot + zero_line
        plot = plot + labs(
            title="Residuals vs fitted", x="Fitted", y="Residuals")
        plot = plot + theme(figure_size=figure_size)
        return plot
    def fitted_actual(self, figure_size=(4, 4), sample_frac=1.0):
        """Scatter the fitted values over the actual values

        A dashed red y = x line is drawn for reference and the title shows
        ``self.r2_score`` with three decimals.

        Parameters
        ----------
        figure_size : tuple(int, int), optional default=(4, 4)
            Plot size (width, height)

        sample_frac : float, optional default=1.0
            Fraction of the rows of ``self.df`` that is sampled for plotting

        Returns
        -------
        plot : ggplot object
        """
        sampled = self.df.sample(frac=sample_frac)
        title = "Fitted vs Actual (R2 = {:.3f})".format(self.r2_score)
        identity_line = geom_abline(
            slope=1, intercept=0, color="red", linetype="dashed")

        plot = ggplot(sampled, aes(x="y", y="yhat"))
        plot = plot + geom_point(alpha=0.25)
        plot = plot + identity_line
        plot = plot + labs(title=title, x="Actual", y="Fitted")
        plot = plot + theme(figure_size=figure_size)
        return plot
    def qq_plot(self, figure_size=(6, 4), sample_frac=1.0):
        """Normal QQ plot of the residuals

        The sorted residuals are paired with standard normal quantiles at
        the positions ``(k + 1) / (n + 1)``, ``n = len(self.y)``.  The dashed
        red reference line runs through the first and third quartiles of
        both distributions.

        Parameters
        ----------
        figure_size : tuple(int, int), optional default=(6, 4)
            Plot size (width, height)

        sample_frac : float, optional default=1.0
            Fraction of the quantile pairs that is sampled for plotting

        Returns
        -------
        plot : ggplot object
        """
        # Theoretical quantiles, one per observation
        n_obs = len(self.y)
        positions = [(rank + 1) / (n_obs + 1) for rank in range(n_obs)]
        q = stats.norm.ppf(positions)

        # Reference line through the quartile points of both distributions
        resid_q1, resid_q3 = np.quantile(self.df.residual, [0.25, 0.75])
        norm_q1, norm_q3 = stats.norm.ppf([0.25, 0.75])
        qq_grad = (resid_q3 - resid_q1) / (norm_q3 - norm_q1)
        qq_int = resid_q1 - qq_grad * norm_q1

        # Pair the ascending residuals with the theoretical quantiles
        ordered_residuals = self.df.residual.sort_values(ascending=True)
        qq = pd.DataFrame(zip(ordered_residuals, q),
                          columns=["x", "norm_q"])

        reference_line = geom_abline(intercept=qq_int,
                                     slope=qq_grad,
                                     color="red",
                                     linetype="dashed")

        plot = ggplot(qq.sample(frac=sample_frac), aes(x="norm_q", y="x"))
        plot = plot + geom_point(alpha=0.25)
        plot = plot + reference_line
        plot = plot + labs(
            title="QQ Plot", x="Normal Quantiles", y="Sample Quantiles")
        plot = plot + theme(figure_size=figure_size)
        return plot
Exemplo n.º 16
0
def log_HR_plot(inFile, label_unit=10, log_scale_color=True):
    """Plot component-level tumor against non-tumor log hazard ratios.

    The ``logHR`` means and standard deviations are read via
    ``get_params(inFile)``.  The first half of the rows is treated as the
    tumor components and the second half as the non-tumor components.  Z
    scores, two-sided normal p-values and their -log10 values are derived
    per component.

    Parameters
    ----------
    inFile : str
        Parameter file handed to ``get_params``.
    label_unit : int
        Components with zero-based position ``<= label_unit`` get a text
        label (their one-based number); all others get an empty label.
    log_scale_color : bool
        Use log-transformed color and fill scales in the logHR plot.

    Returns
    -------
    (pl, pl_p, logHR_df) : the logHR scatter, the -log10(p-value) scatter
    and the per-component data frame.
    """
    par = get_params(inFile)
    pca_components = par["means"]["logHR"].shape[0] >> 1  # half the rows
    components = range(pca_components)
    tf_components = slice(pca_components, 2 * pca_components)

    logHR_df = pd.DataFrame(index=[f"{i+1}" for i in components])
    logHR_df["tumor logHR"] = par["means"]["logHR"][components, 0]
    logHR_df["non-tumor logHR"] = par["means"]["logHR"][tf_components, 0]
    logHR_df["component"] = components
    logHR_df["label"] = [
        logHR_df.index[i] if i <= label_unit else "" for i in components
    ]
    logHR_df["tumor logHR sd"] = par["stds"]["logHR"][components, 0]
    logHR_df["non-tumor logHR sd"] = par["stds"]["logHR"][tf_components, 0]
    logHR_df["tumor Z"] = logHR_df["tumor logHR"] / logHR_df["tumor logHR sd"]
    # The non-tumor Z score is normalised by the non-tumor sd; it was
    # previously divided by the tumor sd.
    logHR_df["non-tumor Z"] = (logHR_df["non-tumor logHR"] /
                               logHR_df["non-tumor logHR sd"])
    logHR_df["tumor p-value"] = norm.sf(abs(logHR_df["tumor Z"])) * 2
    logHR_df["non-tumor p-value"] = norm.sf(abs(logHR_df["non-tumor Z"])) * 2
    logHR_df["tumor -log10(p-value)"] = -np.log10(logHR_df["tumor p-value"])
    logHR_df["non-tumor -log10(p-value)"] = -np.log10(
        logHR_df["non-tumor p-value"])

    # logHR plot; both axes share the same limits
    lb = min(logHR_df["non-tumor logHR"].min(), logHR_df["tumor logHR"].min())
    ub = max(logHR_df["non-tumor logHR"].max(), logHR_df["tumor logHR"].max())
    pl = (pn.ggplot(
        pn.aes(
            "non-tumor logHR",
            "tumor logHR",
            color="non-tumor p-value",
            fill="tumor p-value",
            label="label",
        ),
        logHR_df,
    ) + pn.xlim(lb, ub) + pn.ylim(lb, ub) + pn.geom_abline() +
          pn.geom_point() + pn.theme_minimal() +
          pn.geom_text(ha="left", va="bottom", color="black"))
    if log_scale_color:
        pl += pn.scale_color_cmap(trans="log")
        pl += pn.scale_fill_cmap(trans="log")

    # -log10(p-value) plot, again with shared axis limits
    lb = min(
        logHR_df["non-tumor -log10(p-value)"].min(),
        logHR_df["tumor -log10(p-value)"].min(),
    )
    ub = max(
        logHR_df["non-tumor -log10(p-value)"].max(),
        logHR_df["tumor -log10(p-value)"].max(),
    )
    pl_p = (pn.ggplot(
        pn.aes(
            "non-tumor -log10(p-value)",
            "tumor -log10(p-value)",
            color="component",
            label="label",
        ),
        logHR_df,
    ) + pn.geom_point() + pn.xlim(lb, ub) + pn.ylim(lb, ub) +
            pn.theme_minimal() +
            pn.geom_text(ha="left", va="bottom", color="black"))
    return pl, pl_p, logHR_df
Exemplo n.º 17
0
def main():
    """Generate and save the whole series of figures.

    Configures matplotlib's mathtext font, silences a few known warnings and
    then builds one plot per section below.  Every plot is handed to
    ``save_both`` together with its output name.
    """
    mpl.rc('mathtext', fontset='cm')

    warnings.filterwarnings('ignore',
                            r'(geom|position)_\w+ ?: Removed \d+ rows')
    warnings.filterwarnings('ignore', r'Saving .+ x .+ in image')
    warnings.filterwarnings('ignore', r'Filename: .+\.png')

    # --- P_f curves ---
    df = concat_map(Pf_Ob_Ol, 'P_f', np.linspace(0.1, 1, 10))
    save_both(my_plot(df, 'O_b', 'O_l', 'P_f')
              + titles('P_f(O_b, O_l)')
              + limits((1, 10))
              + gg.geom_abline(slope=1, intercept=0,
                               linetype='dashed', color='grey')
              + gg.geom_line()
              , 'Pf_Ob_Ol')

    df = concat_map(Pf_Ob_σ, 'P_f', np.linspace(0.1, 1, 10))
    save_both(my_plot(df, 'O_b', 'σ', 'P_f')
              + titles('P_f(O_b, σ)')
              + limits((1, 10), (0, 5))
              + gg.geom_line()
              , 'Pf_Ob_σ')

    # --- P_q curves ---
    df = concat_map(Pq_Ob_Ol, 'P_q', np.linspace(-0.9, 0, 10))
    save_both(my_plot(df, 'O_b', 'O_l', 'P_q')
              + titles('P_q(O_b, O_l)')
              + limits((1, 10))
              + gg.geom_abline(slope=1, intercept=0,
                               linetype='dashed', color='grey')
              + gg.geom_line()
              , 'Pq_Ob_Ol')

    df = concat_map(Pq_Ob_σ, 'P_q', np.linspace(-0.9, 0, 10))
    save_both(my_plot(df, 'O_b', 'σ', 'P_q')
              + titles('P_q(O_b, σ)')
              + limits((1, 10), (0, 5))
              + gg.geom_line()
              , 'Pq_Ob_σ')

    # --- O' curves ---
    df = concat_map(Opr_Ob_Ol, 'Opr', np.linspace(1, 5, 9))
    save_both(my_plot(df, 'O_b', 'O_l', 'Opr')
              + titles("O'(O_b, O_l)")
              + limits((1, 10), (1, 10))
              + gg.geom_line()
              + gg.geom_abline(slope=1, intercept=0,
                               linetype='dashed', color='grey')
              , 'Opr_Ob_Ol')

    df = concat_map(Opr_Ob_σ, 'Opr', np.linspace(1, 5, 9))
    save_both(my_plot(df, 'O_b', 'σ', 'Opr')
              + titles("O'(O_b, σ)")
              + limits((1, 10), (0, 5))
              + gg.geom_line()
              , 'Opr_Ob_σ')

    df = (pd.DataFrame({'Opr': np.linspace(1, 21, 101)})
            .assign(Pf=lambda x: Opr_Pf(x.Opr)))
    save_both(my_plot(df, 'Opr', 'Pf')
              + titles("P_f(O')")
              + labs("O'", 'P_f')
              + limits((1, 20), (0, 1),
                       xbreaks=np.linspace(2, 20, 10),
                       ybreaks=np.linspace(0, 1, 11))
              + gg.geom_line()
              + gg.geom_hline(yintercept=C, linetype='dashed', color='grey')
              , 'Pf_Opr')

    # --- σ' curves ---
    df = concat_map(σpr_Ob_σ, 'σpr', np.linspace(0, 5, 11))
    save_both(my_plot(df, 'O_b', 'σ', 'σpr')
              + titles("σ'(O_b, σ)")
              + limits((1, 10), (0, 5))
              + gg.geom_line()
              , 'σpr_Ob_σ')

    df = (pd.DataFrame({'σpr': np.linspace(0, 21, 106)})
            .assign(Pq=lambda x: σpr_Pq(x.σpr)))
    save_both(my_plot(df, 'σpr', 'Pq')
              + titles("P_q(σ')")
              + labs("σ'", 'P_q')
              + limits((0, 20), (-1, 0),
                       xbreaks=np.linspace(0, 20, 11),
                       ybreaks=np.linspace(-1, 0, 11))
              + gg.geom_line()
              , 'Pq_σpr')

    # --- liability curves, free bet ---
    df = concat_map(liab_Ob_Ol_free, 'liab', np.linspace(0, 10, 11))
    save_both(my_plot(df, 'O_b', 'O_l', 'liab', clab='-R_{bl}')
              + titles("-R_{bl}(O_b, O_l)", "S_b = 1, C_b = 0, C_l = 0.02",
                       mathrm('Free bet', dollars=False))
              + limits((1,20), (1, 10))
              + gg.geom_line()
              + gg.geom_abline(slope=1, intercept=0,
                               linetype='dashed', color='grey')
              , 'liab_Ob_Ol_free')

    # NOTE(review): this σ plot reuses liab_Ob_Ol_free as its data source --
    # confirm that this is intended.
    df = concat_map(liab_Ob_Ol_free, 'liab', np.linspace(0, 10, 11))
    save_both(my_plot(df, 'O_b', 'σ', 'liab', clab='-R_{bl}')
              + titles("-R_{bl}(O_b, σ)", "S_b = 1, C_b = 0, C_l = 0.02",
                       mathrm('Free bet', dollars=False))
              + limits((1,20), (1, 10))
              + gg.geom_line()
              , 'liab_Ob_σ_free')

    # --- liability curves, qualifying bet ---
    df = concat_map(liab_Ob_Ol_qual, 'liab', np.linspace(0, 10, 11))
    save_both(my_plot(df, 'O_b', 'O_l', 'liab', clab='-R_{bl}')
              + titles("-R_{bl}(O_b, O_l)", "S_b = 1, C_b = 0, C_l = 0.02",
                       mathrm('Qualifying bet', dollars=False))
              + limits((1,20), (1, 10))
              + gg.geom_line()
              + gg.geom_abline(slope=1, intercept=0,
                               linetype='dashed', color='grey')
              , 'liab_Ob_Ol_qual')

    # NOTE(review): this σ plot reuses liab_Ob_Ol_qual as its data source --
    # confirm that this is intended.
    df = concat_map(liab_Ob_Ol_qual, 'liab', np.linspace(0, 10, 11))
    save_both(my_plot(df, 'O_b', 'σ', 'liab', clab='-R_{bl}')
              + titles("-R_{bl}(O_b, σ)", "S_b = 1, C_b = 0, C_l = 0.02",
                       mathrm('Qualifying bet', dollars=False))
              + limits((1,20), (1, 10))
              + gg.geom_line()
              , 'liab_Ob_σ_qual')

    # --- shape of the "more profitable" space ---
    df_Pf = Pf_Ob_σ(0.6).assign(profit=dollars('P_f'))
    df_Pq = Pq_Ob_σ(-0.3).assign(profit=dollars('P_q'))
    df = pd.concat((df_Pf, df_Pq), ignore_index=True)
    df.drop_duplicates('O_b', inplace=True)

    # NOTE(review): [0] is a label lookup on the filtered rows; it relies on
    # the matching row carrying index label 0 -- confirm.
    Opr = df_Pf.query('σ==0').O_b[0]
    σpr = df_Pq.query('O_b==1').σ[0]

    labels = pd.DataFrame({
        'x': [Opr+0.1, 1, 9.8], 'y': [4.8, σpr, σpr + 0.3],
        'label': ["$O'$", "$σ'$", mathrm('More profit')]
    })
    lab_aes = gg.aes('x', 'y', label='label')

    # DataFrame.ix no longer exists in current pandas.  ``labels`` has the
    # default integer index, on which .ix sliced by label (end inclusive),
    # so .loc selects exactly the same rows.
    save_both(
        gg.ggplot(df, gg.aes(x='O_b', y='σ'))
        + gg.geom_area(gg.aes(fill='profit'), alpha=0.3)
        + gg.geom_vline(xintercept=Opr, linetype='dashed')
        + gg.geom_hline(yintercept=σpr, linetype='dashed')

        # text alignment can't be specified in an aes
        + gg.geom_text(lab_aes, data=labels.loc[:0], ha='left', va='top')
        + gg.geom_text(lab_aes, data=labels.loc[1:1], ha='left', va='bottom')
        + gg.geom_text(lab_aes, data=labels.loc[2:], ha='right', va='bottom')

        + gg.scale_fill_discrete(name=mathrm('Bet type'),
                                 labels=[mathrm('Free'), mathrm('Qualifying')])
        + limits((1, 10), (0, 5))
        + gg.ggtitle('%s "%s" %s' % (mathrm('Shape of the'),
                                     mathrm('more profitable'),
                                     mathrm('space')))
        + labs('O_b', 'σ')
        , 'Px_shapes')
Exemplo n.º 18
0
def histogram2d(
    data: pd.DataFrame,
    column1: str,
    column2: str,
    fig: plt.Figure = None,
    ax: plt.Axes = None,
    fig_width: int = 6,
    fig_height: int = 6,
    trend_line: str = "auto",
    lower_quantile1: float = 0,
    upper_quantile1: float = 1,
    lower_quantile2: float = 0,
    upper_quantile2: float = 1,
    transform1: str = "identity",
    transform2: str = "identity",
    equalize_axes: bool = False,
    reference_line: bool = False,
    plot_density: bool = False,
) -> Tuple[plt.Figure, plt.Axes, p9.ggplot]:
    """
    Draws a 2d-binned EDA plot for two continuous variables.

    Args:
        data: pandas DataFrame containing data to be plotted
        column1: name of column to plot on the x axis
        column2: name of column to plot on the y axis
        fig: matplotlib Figure to draw onto. Must be given together with ax;
            when both are omitted a new figure is drawn.
        ax: matplotlib Axes to draw onto. Must be given together with fig.
        fig_width: figure width in inches
        fig_height: figure height in inches (ignored when equalize_axes is set,
            in which case the figure is fig_width x fig_width)
        trend_line: Smoothing method passed to
            `geom_smooth <https://plotnine.readthedocs.io/en/stable/generated/plotnine.geoms.geom_smooth.html>`_.
            Defaults to 'auto'; pass 'none' to draw no trend line.
        lower_quantile1: Lower quantile passed to trim_quantiles for column1
        upper_quantile1: Upper quantile passed to trim_quantiles for column1
        lower_quantile2: Lower quantile passed to trim_quantiles for column2
        upper_quantile2: Upper quantile passed to trim_quantiles for column2
        transform1: Transformation to apply to the column1 data for plotting:

         - **'identity'**: no transformation
         - **'log'**: apply a logarithmic transformation with small constant added in case of zero values
         - **'log_exclude0'**: apply a logarithmic transformation with zero values removed
         - **'sqrt'**: apply a square root transformation
        transform2: Transformation to apply to the column2 data for plotting. Same options as for column1.
        equalize_axes: Square the aspect ratio and match the axis limits
        reference_line: Add a y = x reference line
        plot_density: Overlay a 2d density on the given plot

    Returns:
        Tuple containing matplotlib figure and axes along with the plotnine ggplot object

    Examples:
        .. plot::

            import pandas as pd
            import intedact
            data = pd.read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018/2018-09-11/cats_vs_dogs.csv")
            intedact.histogram2d(data, 'n_dog_households', 'n_cat_households', equalize_axes=True, reference_line=True);
    """
    axis_specs = (
        (column1, lower_quantile1, upper_quantile1, transform1),
        (column2, lower_quantile2, upper_quantile2, transform2),
    )

    # Trim both columns first, then apply the value transformations.
    for column, lower, upper, _ in axis_specs:
        data = trim_quantiles(data,
                              column,
                              lower_quantile=lower,
                              upper_quantile=upper)
    for column, _, _, transform in axis_specs:
        data = preprocess_transformations(data, column, transform=transform)

    # base layer: 2d bins of the two columns
    gg = p9.ggplot(data, p9.aes(x=column1, y=column2)) + p9.geom_bin2d()

    # optional overlays, in fixed order: density, reference line, trend line
    if plot_density:
        gg = gg + p9.geom_density_2d()
    if reference_line:
        gg = gg + p9.geom_abline(color="black")
    if trend_line != "none":
        gg = gg + p9.geom_smooth(method=trend_line, color="red")

    gg = gg + p9.labs(fill="")

    # handle axes transforms
    gg, xlabel = transform_axis(gg, column1, transform1, xaxis=True)
    gg, ylabel = transform_axis(gg, column2, transform2, xaxis=False)

    if fig is not None or ax is not None:
        # draw onto the figure/axes supplied by the caller
        _ = gg._draw_using_figure(fig, [ax])
    else:
        # draw a fresh figure and pick it up from matplotlib
        gg.draw()
        fig = plt.gcf()
        ax = fig.axes[0]

    if equalize_axes:
        fig, ax, gg = match_axes(fig, ax, gg)
    # a square figure when the axes are equalized
    fig.set_size_inches(fig_width,
                        fig_width if equalize_axes else fig_height)

    ax.set_ylabel(ylabel)
    ax.set_xlabel(xlabel)

    return fig, ax, gg
Exemplo n.º 19
0
# The same CSV file is loaded twice: `beauty` later receives the prediction
# and residual columns, `dane` is the frame the models are fitted on.
beauty=pd.read_csv("beauty.csv")

dane=pd.read_csv("beauty.csv")
print(len(dane))

# collects every figure created below
figures = []

# NOTE(review): not used in the visible part of the script
piekno = "btystdave"

# Simple OLS regression: courseevaluation ~ btystdave
results = smf.ols("courseevaluation" +"~btystdave", data=dane).fit()
wyn=results.params

# Jittered scatter of the data with the fitted regression line on top
fig1=(p9.ggplot(p9.aes(x="btystdave",y="courseevaluation"),data=dane)
      +p9.geom_jitter(width=0.1)
      +p9.geom_abline(p9.aes(intercept=wyn['Intercept'],slope=wyn["btystdave"])))
print(fig1)
figures.append(fig1)

# df2 is another name for the `beauty` frame (no copy is made), so the two
# columns added here end up in `beauty`, which is what fig2_res plots.
df2=beauty
df2['y_pred']=results.predict()
df2['residuals']=df2['courseevaluation']-df2['y_pred']
# Residuals of the simple model against btystdave
fig2_res=(p9.ggplot(p9.aes(x='btystdave',y='residuals'),data=beauty)
      +p9.geom_point())
print(fig2_res)
figures.append(fig2_res)

# Second model with the additional regressor btystdavepos; `results` and
# `wyn` are overwritten from here on.
results = smf.ols("courseevaluation" +"~btystdavepos + btystdave", data=dane).fit()
wyn=results.params

fig2=(p9.ggplot(p9.aes(x="btystdave",y="courseevaluation"),data=dane)
def _fit_mixedlm(df, response):
    """Fit ``response ~ percent_broken + percent_fail_runs`` on ``df``, grouped by its index values."""
    model = smf.mixedlm('%s ~ percent_broken + percent_fail_runs' % response,
                        df,
                        groups=df.index.values)
    return model.fit()


def _regression_scatter(df, x_axis, x_label, y, y_label, color, fit,
                        legend_name, legend_labels, hide_legend):
    """Build one jittered scatter of ``y`` over ``x_axis`` with the fitted line and a GLS smoother."""
    # Only pass show_legend when the legend must be suppressed, so the other
    # plots keep the library default for these layers.
    layer_kwargs = {'show_legend': False} if hide_legend else {}
    return (
        plotnine.ggplot(df, plotnine.aes(x=x_axis, y=y, color=color)) +
        plotnine.geom_jitter(width=2.5, **layer_kwargs) +
        # NOTE(review): the constants are deliberately kept inside aes();
        # this layer then inherits the colour mapping and appears to be what
        # produces the legend when the other layers hide theirs -- confirm
        # before switching to geom_abline(intercept=..., slope=...).
        plotnine.geom_abline(
            plotnine.aes(intercept=fit.params['Intercept'],
                         slope=fit.params[x_axis])) +
        plotnine.stat_smooth(method='gls', **layer_kwargs) +
        plotnine.xlab(x_label) + plotnine.ylab(y_label) +
        plotnine.scale_color_discrete(name=legend_name,
                                      labels=legend_labels) +
        plotnine.theme_classic(base_size=20))


def mixed_linear_plots(df, x_axis, x_label):
    """Fit mixed linear models for ``log_score`` and ``mse`` and save four scatter plots.

    Two models are fitted (``log_score`` and ``mse``, each regressed on
    ``percent_broken + percent_fail_runs``) and their summaries are printed.
    For each response, one plot coloured by ``method`` and one coloured by
    ``task`` is written to the working directory as
    ``<x_axis>_log_rul_by_method.pdf``, ``<x_axis>_log_rul_by_task.pdf``,
    ``<x_axis>_mse_by_method.pdf`` and ``<x_axis>_mse_by_task.pdf``.

    Args:
        df: DataFrame with the columns ``log_score``, ``mse``,
            ``percent_broken``, ``percent_fail_runs``, ``method`` and
            ``task``.  It is modified in place: both percentage columns are
            rounded and converted to integers.
        x_axis: Column shown on the x axis.  It must be one of the model
            predictors (``percent_broken`` or ``percent_fail_runs``) because
            the slope of the drawn line is looked up under this name in the
            fitted parameters.
        x_label: Axis label for the x axis.

    Returns:
        None.  Also sets the global ``plotnine.options.figure_size``.
    """
    plotnine.options.figure_size = (8, 10)

    mdf_rul = _fit_mixedlm(df, 'log_score')
    print('#' * 18 + 'Log RUL' + '#' * 18)
    print(mdf_rul.summary())

    mdf_mse = _fit_mixedlm(df, 'mse')
    print('#' * 18 + 'RMSE' + '#' * 18)
    print(mdf_mse.summary())

    # np.int was only an alias of the builtin int and no longer exists in
    # NumPy >= 1.24; the builtin gives the same dtype on every version.
    df['percent_broken'] = df['percent_broken'].round().astype(int)
    df['percent_fail_runs'] = df['percent_fail_runs'].round().astype(int)

    legends = (
        ('method', 'Method', ['DAAN', 'JAN']),
        ('task', 'Task',
         ['4→3', '4→2', '1→3', '1→2', '3→4', '3→1', '2→4', '2→1']),
    )
    # (response column, y label, fitted model, hide layer legends, file stem)
    responses = (
        ('log_score', 'Logarithmic RUL-Score', mdf_rul, True, 'log_rul'),
        ('mse', 'RMSE', mdf_mse, False, 'mse'),
    )
    for y, y_label, fit, hide_legend, stem in responses:
        for color, legend_name, legend_labels in legends:
            # Every plot gets the x label; the original mse-by-task plot
            # was the only one that left it out.
            gg = _regression_scatter(df, x_axis, x_label, y, y_label, color,
                                     fit, legend_name, legend_labels,
                                     hide_legend)
            gg.save('%s_%s_by_%s.pdf' % (x_axis, stem, color))
def plot_predictions_actual(pred_df, figsize):
    """Return a predicted-vs-actual scatter plot with an interval band.

    Args:
        pred_df: Data with the columns ``y`` (x axis), ``pred`` (y axis) and
            ``lb`` / ``ub`` (lower and upper edge of the ribbon).
        figsize: Figure size forwarded to ``pn.theme(figure_size=...)``.

    Returns:
        The assembled plot object; nothing is drawn or saved here.
    """
    plot = pn.ggplot(pred_df, pn.aes(x='y', y='pred'))
    components = (
        pn.geom_point(),
        pn.geom_ribbon(pn.aes(ymin='lb', ymax='ub'), alpha=0.3),
        # identity line: points on it are perfectly predicted
        pn.geom_abline(slope=1, intercept=0),
        pn.theme_bw(),
        pn.theme(figure_size=figsize),
    )
    for component in components:
        plot = plot + component
    return plot
Exemplo n.º 22
0

# Colours for the two calibration states, written as 0-255 RGBA components
# and scaled to the 0-1 range before conversion to a hex string.  Plain
# Python arithmetic is used instead of ``pd.np``, which was deprecated in
# pandas 1.0 and removed in pandas 2.0.
color_map = {
    "before": mcolors.to_hex(tuple(c / 255.0 for c in (178, 223, 138, 255))),
    "after": mcolors.to_hex(tuple(c / 255.0 for c in (31, 120, 180, 255))),
}


# In[14]:


# Calibration curves (predicted vs. actual) for the "before" and "after"
# states, one panel per relation.
g = (
    p9.ggplot(calibration_df, p9.aes(x="predicted", y="actual", color="model_calibration"))
    + p9.geom_point()
    + p9.geom_path()
    # y = x reference line.  Slope and intercept are constants, so they are
    # passed as parameters rather than through aes(), which would evaluate
    # them once per data row and draw the same line repeatedly.
    + p9.geom_abline(slope=1, intercept=0, linetype='dashed', color='black')
    # color_map already holds exactly the "before"/"after" entries needed.
    + p9.scale_color_manual(values=color_map)
    + p9.facet_wrap("relation")
    + p9.labs(
        x="Predicted",
        y="Actual"
    )
    + p9.guides(color=p9.guide_legend(title="Model Calibration"))
    + p9.theme_bw()
)
print(g)
g.save(filename="../model_calibration.png", dpi=300)
Exemplo n.º 23
0
def test_non_mapped_facetting():
    """A parameter-only abline (no aes mapping) combined with facetting on ``var1``."""
    identity_line = geom_abline(intercept=0, slope=1, size=1)
    facets = facet_wrap('var1')
    p = g + identity_line + facets
    # compared against the stored 'non_mapped_facetting' reference
    assert p == 'non_mapped_facetting'
Exemplo n.º 24
0
def test_aes_overwrite():
    """Giving ``intercept`` both as an aes mapping and as a parameter must warn."""
    with pytest.warns(PlotnineWarning):
        mapping = aes(intercept='y')
        geom_abline(mapping, intercept=2)
Exemplo n.º 25
0
def test_non_mapped_facetting():
    """Facet a plot whose abline is given by constants instead of an aes mapping."""
    layers = [
        geom_abline(intercept=0, slope=1, size=1),
        facet_wrap('var1'),
    ]
    p = g
    for layer in layers:
        p = p + layer
    # compared against the stored 'non_mapped_facetting' reference
    assert p == 'non_mapped_facetting'
Exemplo n.º 26
0
import pandas as pd
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import plotnine as p9


df = pd.read_csv('exercise.csv')

# OLS fit of y on x1, x2 and their interaction.
results = smf.ols('y ~ x1 + x2 + x1*x2', data=df).fit()

wyn=results.params

print(results.summary())


# Scatter of y against x1 with the fitted x1 line on top.
# - The x aesthetic was 'x1,x2', which is not a column of the data; the line
#   drawn uses the x1 coefficient, so x1 is the matching axis.
# - Intercept and slope are constants and are therefore passed as
#   geom_abline parameters instead of through aes().
fig1=(p9.ggplot(p9.aes(x='x1',y='y'),data=df)+p9.geom_jitter(width=0.1)
      +p9.geom_abline(intercept=wyn['Intercept'],slope=wyn['x1']))

# Building the ggplot object does not render anything; draw it so that
# plt.show() has a figure to display.
fig1.draw()

plt.show()
)


# In[14]:


from sklearn.calibration import calibration_curve

# Calibration curves (10 bins) of the curated labels against the
# uncalibrated ("uncal") and calibrated ("cal") confidence scores.
cnn_y, cnn_x = calibration_curve(
    confidence_score_df.curated_dsh, confidence_score_df.uncal, n_bins=10
)
all_cnn_y, all_cnn_x = calibration_curve(
    confidence_score_df.curated_dsh, confidence_score_df.cal, n_bins=10
)

# Long-format table: all "before" rows first, then all "after" rows.
calibration_df = pd.DataFrame.from_records([
    {"predicted": predicted, "actual": actual, "model_calibration": state}
    for state, xs, ys in (
        ('before', cnn_x, cnn_y),
        ('after', all_cnn_x, all_cnn_y),
    )
    for predicted, actual in zip(xs, ys)
])
calibration_df.to_csv("output/dag_calibration.tsv", sep="\t", index=False)


# In[15]:


# Calibration plot of predicted vs. actual values, coloured by calibration
# state.  Left as a bare expression (not assigned) so that a notebook cell
# displays it as the cell result.
(
    p9.ggplot(calibration_df, p9.aes(x="predicted", y="actual", color="model_calibration"))
    + p9.geom_point()
    # one connected line per calibration state
    + p9.geom_line(p9.aes(group="factor(model_calibration)"))
    # dashed y = x reference line
    + p9.geom_abline(intercept=0, slope=1, linetype='dashed')
    # both axes are limited to [0, 1]
    + p9.scale_y_continuous(limits=[0,1])
    + p9.scale_x_continuous(limits=[0,1])
    + p9.theme_bw()
)