def test_aesthetics():
    """Stack one abline layer per mapped aesthetic on a scatter plot.

    The finished plot (plus ``_theme``) is compared against the name
    'aesthetics'; presumably the test harness turns that comparison into an
    image-baseline check -- confirm against the test utilities.
    """
    # Each layer shifts its intercept expression so the lines stay apart.
    plain = geom_abline(aes(slope='slope', intercept='intercept'), size=2)
    by_alpha = geom_abline(
        aes(slope='slope', intercept='intercept+.1', alpha='z'), size=2)
    by_linetype = geom_abline(
        aes(slope='slope', intercept='intercept+.2', linetype='factor(z)'),
        size=2)
    by_color = geom_abline(
        aes(slope='slope', intercept='intercept+.3', color='factor(z)'),
        size=2)
    by_size = geom_abline(
        aes(slope='slope', intercept='intercept+.4', size='z'))

    p = (ggplot(df, aes('x', 'y'))
         + geom_point()
         + plain
         + by_alpha
         + by_linetype
         + by_color
         + by_size)

    assert p + _theme == 'aesthetics'
def mixed_linear_factors_plot(df, x_axis, factor):
    """Fit one mixed linear model per factor level and plot the fits in facets.

    For every unique value of ``df[factor]`` a ``mse ~ <x_axis>`` mixed model
    is fitted on the matching rows (grouped by the row index values), and its
    intercept/slope are drawn as a line in that level's facet.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'mse', 'method', 'percent_broken', 'percent_fail_runs',
        ``x_axis`` and ``factor`` columns.  NOTE: the two percent columns are
        rounded to integers **in place** on the caller's frame.
    x_axis : str
        Column used as regressor and as x aesthetic.
    factor : str
        Column whose unique values define the facets.

    Side effects
    ------------
    Sets ``plotnine.options.figure_size`` globally and writes
    '<x_axis>_vs_<factor>_rmse.pdf'.
    """
    plotnine.options.figure_size = (10, 10)

    factor_steps = df[factor].unique()
    # Coefficient columns are explicitly float: zeros_like(factor_steps) would
    # make them integer whenever the factor column is integer-typed.
    reg_lines = pd.DataFrame({
        factor: factor_steps,
        'intercept': np.zeros(len(factor_steps), dtype=float),
        'slope': np.zeros(len(factor_steps), dtype=float),
    })
    for i, step in enumerate(factor_steps):
        factored_df = df[df[factor] == step]
        md = smf.mixedlm('mse ~ %s' % x_axis, factored_df,
                         groups=factored_df.index.values)
        mdf = md.fit()
        reg_lines.iloc[i] = [step, mdf.params['Intercept'],
                             mdf.params[x_axis]]

    # np.int was removed in NumPy 1.24; the builtin int is the replacement.
    df['percent_broken'] = df['percent_broken'].round().astype(int)
    df['percent_fail_runs'] = df['percent_fail_runs'].round().astype(int)
    # Round the facet key the same way so reg_lines matches df's facets.
    reg_lines[factor] = reg_lines[factor].round().astype(int)

    gg = (
        plotnine.ggplot(df, plotnine.aes(x=x_axis, y='mse', color='method'))
        + plotnine.geom_jitter(width=2.5, show_legend=False)
        + plotnine.scale_color_manual(['#DB5F57'] * 4)
        + plotnine.facet_wrap(factor)
        + plotnine.geom_abline(
            plotnine.aes(intercept='intercept', slope='slope'),
            data=reg_lines)
        + plotnine.theme_classic(base_size=20))
    gg.save('%s_vs_%s_rmse.pdf' % (x_axis, factor))
def scatter_plot(df, xcol, ycol, domain, xname=None, yname=None, log=False,
                 width=6, height=6, clamp=True, tickCount=5):
    """Build a scatter plot of ``ycol`` against ``xcol`` with reference rules.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; never modified (clamping works on a deep copy).
    xcol, ycol : str
        Column names for the x and y aesthetics.
    domain : sequence of length 2
        Shared axis limits, passed to both scales.
    xname, yname : str, optional
        Axis labels; default to the column names.
    log : bool
        Use log10 scales instead of continuous ones.
    width, height : number
        Figure size handed to the theme.
    clamp : bool
        Replace values above ``domain[1]`` with ``domain[1]`` (upper bound
        only) so they are drawn on the border rules.
    tickCount : int
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    The assembled plotnine plot.
    """
    assert len(domain) == 2

    POINT_SIZE = 0.5
    DASH_PATTERN = (0, (3, 1))

    if xname is None:
        xname = xcol
    if yname is None:
        yname = ycol

    # formatter for the axes' labels
    ax_formatter = mizani.custom_format('{:n}')

    if clamp:
        # clamp overflowing values on a copy so the caller's frame is untouched
        df = df.copy(deep=True)
        df.loc[df[xcol] > domain[1], xcol] = domain[1]
        df.loc[df[ycol] > domain[1], ycol] = domain[1]

    # generate scatter plot
    scatter = p9.ggplot(df)
    scatter += p9.aes(x=xcol, y=ycol)
    scatter += p9.geom_point(size=POINT_SIZE, na_rm=True)
    scatter += p9.labs(x=xname, y=yname)

    if log:
        scatter += p9.scale_x_log10(limits=domain, labels=ax_formatter)
        scatter += p9.scale_y_log10(limits=domain, labels=ax_formatter)
    else:
        scatter += p9.scale_x_continuous(limits=domain, labels=ax_formatter)
        scatter += p9.scale_y_continuous(limits=domain, labels=ax_formatter)

    scatter += p9.theme_bw()
    scatter += p9.theme(
        panel_grid_major=p9.element_line(color='#666666', alpha=0.5))
    scatter += p9.theme(figure_size=(width, height))

    # reference lines: diagonal plus rules at the upper domain bound
    scatter += p9.geom_abline(intercept=0, slope=1, linetype=DASH_PATTERN)
    scatter += p9.geom_vline(xintercept=domain[1], linetype=DASH_PATTERN)
    scatter += p9.geom_hline(yintercept=domain[1], linetype=DASH_PATTERN)

    return scatter
def test_aes_inheritance():
    """Check that geom_abline can take slope/intercept from the plot mapping.

    The abline layer is given no mapping of its own; 'slope' and 'intercept'
    appear only in the plot-level aes.
    """
    plot_mapping = aes(
        'x', 'y', color='factor(z)', slope='slope', intercept='intercept')

    p = ggplot(df, plot_mapping)
    p = p + geom_point(size=10, show_legend=False)
    p = p + geom_abline(size=2)

    assert p == 'aes_inheritance'
def test_aes_inheritance():
    """Abline layer without its own mapping, on a plot that maps slope/intercept."""
    # slope and intercept are supplied only through the plot-level aes
    base = ggplot(
        df,
        aes('x', 'y', color='factor(z)',
            slope='slope', intercept='intercept'))
    layers = (
        geom_point(size=10, show_legend=False),
        geom_abline(size=2),
    )

    p = base
    for layer in layers:
        p = p + layer

    assert p == 'aes_inheritance'
def scatter_plot2(df1, df2, xcol, ycol, domain, color1='black', color2='red',
                  xname=None, yname=None, log=False, width=6, height=6,
                  clamp=True, tickCount=5):
    """Overlay two data frames in one scatter plot with rugs and reference rules.

    ``df1`` is the plot's base data and ``df2`` is passed to its layers through
    ``data=``.  Both use the same ``xcol``/``ycol`` columns and the shared
    ``domain`` limits.  With ``clamp`` set, values above ``domain[1]`` are
    replaced by ``domain[1]`` on deep copies of the inputs.  ``tickCount`` is
    accepted but not used here.  Returns the assembled plot.
    """
    assert len(domain) == 2

    POINT_SIZE = 1.5
    DASH_PATTERN = (0, (6, 2))
    upper = domain[1]

    xname = xcol if xname is None else xname
    yname = ycol if yname is None else yname

    # formatter for the axes' labels
    ax_formatter = mizani.custom_format('{:n}')

    def _clamp_upper(frame):
        # work on a deep copy so the caller's frame is left alone
        capped = frame.copy(deep=True)
        for col in (xcol, ycol):
            capped.loc[capped[col] > upper, col] = upper
        return capped

    if clamp:
        df1 = _clamp_upper(df1)
        df2 = _clamp_upper(df2)

    # points and axis labels
    components = [
        p9.aes(x=xcol, y=ycol),
        p9.geom_point(size=POINT_SIZE, na_rm=True, color=color1, alpha=0.5),
        p9.geom_point(size=POINT_SIZE, na_rm=True, data=df2, color=color2,
                      alpha=0.5),
        p9.labs(x=xname, y=yname),
        # rug plots on the top and right sides
        p9.geom_rug(na_rm=True, sides="tr", color=color1, alpha=0.05),
        p9.geom_rug(na_rm=True, sides="tr", data=df2, color=color2,
                    alpha=0.05),
    ]

    # axis scales
    if log:
        components.append(p9.scale_x_log10(limits=domain, labels=ax_formatter))
        components.append(p9.scale_y_log10(limits=domain, labels=ax_formatter))
    else:
        components.append(
            p9.scale_x_continuous(limits=domain, labels=ax_formatter))
        components.append(
            p9.scale_y_continuous(limits=domain, labels=ax_formatter))

    # theming
    components.extend([
        p9.theme_bw(),
        p9.theme(panel_grid_major=p9.element_line(color='#666666', alpha=0.5)),
        p9.theme(panel_grid_minor=p9.element_blank()),
        p9.theme(figure_size=(width, height)),
        p9.theme(text=p9.element_text(size=24, color="black")),
    ])

    # diagonal, vertical rule, horizontal rule
    components.extend([
        p9.geom_abline(intercept=0, slope=1, linetype=DASH_PATTERN),
        p9.geom_vline(xintercept=upper, linetype=DASH_PATTERN),
        p9.geom_hline(yintercept=upper, linetype=DASH_PATTERN),
    ])

    scatter = p9.ggplot(df1)
    for component in components:
        scatter += component
    return scatter
def plot_compare(stats, variant, variant_baseline, metric, mode="identity",
                 jitter=0.01):
    """Jitter-plot a metric of ``variant`` against the same metric of a baseline.

    The comparison table comes from ``compare_stats``; a ``bsw_table2``
    summary of it is passed to ``display`` before plotting.  ``mode`` selects
    the y value: the metric itself ("identity"), metric / baseline ("ratio")
    or metric - baseline ("difference").  Both axes use log10 scales and each
    mode adds its own reference line(s).  Returns the plot.
    """
    assert mode in ["identity", "ratio", "difference"]

    plotdata = compare_stats(stats, variant, variant_baseline)
    display(bsw_table2(plotdata, metric=metric, reltol=1.0))

    baseline_name = f"{metric}_baseline"
    ratio_values = plotdata[metric] / plotdata[baseline_name]
    difference_values = plotdata[metric] - plotdata[baseline_name]
    plotdata = plotdata[[metric, baseline_name, "dataset"]].assign(
        ratio=ratio_values,
        difference=difference_values,
    )

    # (slope, intercept) pairs for the reference lines of each mode
    if mode == "identity":
        y_column = metric
        reference_lines = [(1, 0)]
    elif mode == "ratio":
        y_column = "ratio"
        # ablines are drawn wrt the already log-transformed axes,
        # hence 0 = log(1) in scale; the second line marks the max
        reference_lines = [(0, 0.0), (-1, 0.0)]
    elif mode == "difference":
        y_column = "difference"
        reference_lines = [(0, 0)]
    else:
        assert False, "unknown mode"
        return None

    plot = (
        ggplot(data=plotdata)
        + geom_jitter(
            aes(x=baseline_name, y=y_column, fill="dataset"),
            width=jitter,
            height=jitter,
        )
        + scale_x_log10()
        + scale_y_log10()
    )
    for line_slope, line_intercept in reference_lines:
        plot = plot + geom_abline(
            aes(slope=line_slope, intercept=line_intercept))
    return plot
def test_aesthetics():
    """Add five abline layers, each mapping a different aesthetic, to a scatter."""
    # (intercept expression, extra mapped aesthetics, fixed layer parameters)
    layer_specs = [
        ('intercept', {}, {'size': 2}),
        ('intercept+.1', {'alpha': 'z'}, {'size': 2}),
        ('intercept+.2', {'linetype': 'factor(z)'}, {'size': 2}),
        ('intercept+.3', {'color': 'factor(z)'}, {'size': 2}),
        ('intercept+.4', {'size': 'z'}, {}),
    ]

    p = ggplot(df, aes('x', 'y')) + geom_point()
    for intercept_expr, mapped, fixed in layer_specs:
        mapping = aes(slope='slope', intercept=intercept_expr, **mapped)
        p = p + geom_abline(mapping, **fixed)

    assert p + _theme == 'aesthetics'
def qq_plot(df, limit=20000):
    """Return a QQ plot of observed vs. expected -log10 values of column 'P'.

    Rows are sorted by 'P'; OBS is -log10(P) and EXP is -log10(rank / n) with
    ranks 1..n.  Only the first ``limit`` rows after sorting are plotted.
    """
    ranked = df.sort_values('P')
    n_rows = len(ranked)

    observed = -np.log10(ranked['P'])
    expected = -np.log10(np.arange(1, n_rows + 1) / float(n_rows))
    plot_data = ranked.assign(OBS=observed).assign(EXP=expected).head(limit)

    plot = pn.ggplot(plot_data, pn.aes(x='EXP', y='OBS'))
    plot = plot + pn.geom_point()
    plot = plot + pn.geom_abline()
    plot = plot + pn.theme_bw()
    return plot
def build_plot(ws: Points, i: int, j: int, p: int, q: int):
    """Plot the points ``ws`` (real part on x, imaginary part on y).

    Points at index ``p`` or ``q`` get colour value 3, other points with index
    in ``[i, j]`` get -3, everything else 0.  A line through ``ws[p]`` and
    ``ws[q]`` is added using the external ``slope``/``intercept`` helpers.
    """
    def _include_value(k):
        if k == p or k == q:
            return 3
        if i <= k <= j:
            return -3
        return 0

    xs = []
    ys = []
    include = []
    for k, w in enumerate(ws):
        xs.append(w.real)
        ys.append(w.imag)
        include.append(_include_value(k))
    df = DataFrame({'x': xs, 'y': ys, 'include': include})

    connecting_line = geom_abline(
        slope=slope(ws[p], ws[q]),
        intercept=intercept(ws[p], ws[q]))

    return (ggplot(df, aes('x', 'y', color='include'))
            + geom_point()
            + connecting_line
            + lims(x=(0, 1), y=(0, 3), color=(-3.0, 3.0)))
def plot_qq(df, color_var, facet_var=None, title=''):
    """Build a QQ plot of observed vs. expected -log10 p-values.

    Expected values are computed per group by ``calculate_expected_pval``;
    groups are ``color_var`` alone, or ``color_var`` x ``facet_var`` when a
    facet variable is given.

    Inspired by
    https://www.cureffi.org/2012/08/15/qq-plots-with-matplotlib/
    """
    # The largest observed -log10 p-value sets the limit of both axes.
    axis_max = max(df['pval_neglog10'])

    group_keys = color_var if facet_var is None else [color_var, facet_var]
    pvals = (
        df.groupby(by=group_keys)
        .apply(calculate_expected_pval)
        .reset_index(level=group_keys, drop=True)
    )

    n_colors = pvals[color_var].nunique()

    # Collect the plot components in order, then fold them onto the base plot.
    components = [
        plt9.geom_point(size=0.1, alpha=0.25),
        plt9.geom_abline(
            slope=1, intercept=0, color='black', linetype='dashed'),
        plt9.theme_bw(),
    ]
    if n_colors < 9:
        components.append(
            plt9.scale_colour_brewer(palette='Dark2', type='qual'))
    components.append(plt9.labs(x='Expected (-log10 p-value)',
                                y='Observed (-log10 p-value)',
                                title=title,
                                color=''))
    components.append(plt9.lims(x=(0, axis_max), y=(0, axis_max)))
    if facet_var is not None:
        components.append(
            plt9.facet_wrap('~ {}'.format(facet_var), ncol=5))
        components.append(
            plt9.theme(strip_text=plt9.element_text(size=5),
                       axis_text_x=plt9.element_text(angle=-45, hjust=0)))
    # Legend keys are drawn larger and fully opaque.
    components.append(plt9.guides(color=plt9.guide_legend(override_aes={
        'size': 2.0,
        'alpha': 1.0
    })))

    qqplot = plt9.ggplot(
        pvals,
        plt9.aes(x='expected_pval_neglog10', y='pval_neglog10',
                 color=color_var))
    for component in components:
        qqplot = qqplot + component
    return qqplot
def gene_log_HR_plot(inFile, pcaFile=None, model=None):
    """Project component-level log hazard ratios to gene space and plot them.

    Parameters
    ----------
    inFile : str
        Passed to ``get_params``.  Also used to derive the default PCA pickle
        path by replacing "_params.hdf5" with "_pca.pkl".
    pcaFile : str, optional
        Path of the pickled PCA object; derived from ``inFile`` when omitted.
    model : object, optional
        When given, ``model.counts.index`` becomes the row index of the
        result; otherwise rows are labelled "1" .. "n_genes".

    Returns
    -------
    (plot, logHR_df)
        Scatter of tumor vs. non-tumor gene-level logHR with a reference
        line, and the data frame holding logHR, sd, Z and two-sided p-value
        columns for both.
    """
    # get logHRs: the first half of the rows is read as tumor components,
    # the second half as non-tumor components
    par = get_params(inFile)
    pca_components = par["means"]["logHR"].shape[0] >> 1
    components = range(pca_components)
    tf_components = slice(pca_components, 2 * pca_components)
    t_logHR = par["means"]["logHR"][components, 0]
    tf_logHR = par["means"]["logHR"][tf_components, 0]
    t_logHR_sd = par["stds"]["logHR"][components, 0]
    tf_logHR_sd = par["stds"]["logHR"][tf_components, 0]

    # get pca
    if pcaFile is None:
        pcaFile = inFile.replace("_params.hdf5", "_pca.pkl")
    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    with open(pcaFile, "rb") as buff:
        pca = pickle.load(buff)

    # prep dataframe
    n_genes = pca.components_.shape[1]
    if model is None:
        logHR_df = pd.DataFrame(index=[f"{i+1}" for i in range(n_genes)])
    else:
        logHR_df = pd.DataFrame(index=model.counts.index)

    logHR_df["tumor logHR"] = pca.inverse_transform(t_logHR)
    logHR_df["non-tumor logHR"] = pca.inverse_transform(tf_logHR)
    # Gene-level sd: root-sum-of-squares of the loading-weighted component
    # sds (no covariance terms between components).
    logHR_df["tumor logHR sd"] = np.sqrt(
        np.sum((pca.components_ * t_logHR_sd[:, None])**2, axis=0))
    logHR_df["non-tumor logHR sd"] = np.sqrt(
        np.sum((pca.components_ * tf_logHR_sd[:, None])**2, axis=0))

    logHR_df["tumor Z"] = logHR_df["tumor logHR"] / logHR_df["tumor logHR sd"]
    # Each estimate is standardised by its own sd (previously the non-tumor
    # estimate was divided by the tumor sd).
    logHR_df["non-tumor Z"] = (logHR_df["non-tumor logHR"] /
                               logHR_df["non-tumor logHR sd"])
    logHR_df["tumor p-value"] = norm.sf(abs(logHR_df["tumor Z"])) * 2
    logHR_df["non-tumor p-value"] = norm.sf(abs(logHR_df["non-tumor Z"])) * 2

    # make plot: both axes share the same limits
    lb = min(logHR_df["non-tumor logHR"].min(), logHR_df["tumor logHR"].min())
    ub = max(logHR_df["non-tumor logHR"].max(), logHR_df["tumor logHR"].max())
    pl = (pn.ggplot(data=logHR_df,
                    mapping=pn.aes("non-tumor logHR", "tumor logHR"))
          + pn.xlim(lb, ub)
          + pn.ylim(lb, ub)
          + pn.theme_minimal()
          + pn.geom_point(alpha=0.3, color="red")
          + pn.geom_abline())
    return pl, logHR_df
def residuals_fitted(self, figure_size=(4, 4), sample_frac=1.0):
    """Scatter the residuals over the fitted values.

    Parameters
    ----------
    figure_size : tuple(int, int), optional
        Plot size as (width, height). Defaults to (4, 4).
    sample_frac : float, optional
        Fraction of the rows of ``self.df`` to sample for plotting.
        Defaults to 1.0.

    Returns
    -------
    plot : ggplot object
    """
    sampled = self.df.sample(frac=sample_frac)
    # dashed red horizontal line at residual = 0
    zero_line = geom_abline(
        slope=0, intercept=0, color="red", linetype="dashed")

    plot = ggplot(sampled, aes(x="yhat", y="residual"))
    plot = plot + geom_point(alpha=0.25)
    plot = plot + zero_line
    plot = plot + labs(
        title="Residuals vs fitted", x="Fitted", y="Residuals")
    plot = plot + theme(figure_size=figure_size)
    return plot
def fitted_actual(self, figure_size=(4, 4), sample_frac=1.0):
    """Scatter the fitted values over the actual values.

    Parameters
    ----------
    figure_size : tuple(int, int), optional
        Plot size as (width, height). Defaults to (4, 4).
    sample_frac : float, optional
        Fraction of the rows of ``self.df`` to sample for plotting.
        Defaults to 1.0.

    Returns
    -------
    plot : ggplot object
    """
    sampled = self.df.sample(frac=sample_frac)
    plot_title = "Fitted vs Actual (R2 = {:.3f})".format(self.r2_score)
    # dashed red y = x line
    identity_line = geom_abline(
        slope=1, intercept=0, color="red", linetype="dashed")

    plot = ggplot(sampled, aes(x="y", y="yhat"))
    plot = plot + geom_point(alpha=0.25)
    plot = plot + identity_line
    plot = plot + labs(title=plot_title, x="Actual", y="Fitted")
    plot = plot + theme(figure_size=figure_size)
    return plot
def qq_plot(self, figure_size=(6, 4), sample_frac=1.0):
    """Plot sorted residuals against normal quantiles.

    Parameters
    ----------
    figure_size : tuple(int, int), optional
        Plot size as (width, height). Defaults to (6, 4).
    sample_frac : float, optional
        Fraction of the QQ points to sample for plotting. Defaults to 1.0.

    Returns
    -------
    plot : ggplot object
    """
    # Normal quantiles at plotting positions k / (n + 1), k = 1..n
    n_obs = len(self.y)
    positions = np.arange(1, n_obs + 1) / (n_obs + 1)
    q = stats.norm.ppf(positions)

    # The reference line passes through the first and third quartiles of
    # the residuals and of the standard normal.
    resid_lo, resid_hi = np.quantile(self.df.residual, [0.25, 0.75])
    norm_lo, norm_hi = stats.norm.ppf([0.25, 0.75])
    qq_grad = (resid_hi - resid_lo) / (norm_hi - norm_lo)
    qq_int = resid_lo - qq_grad * norm_lo

    # data frame to hold the plot data
    ordered_residuals = self.df.residual.sort_values(ascending=True)
    qq = pd.DataFrame(list(zip(ordered_residuals, q)),
                      columns=["x", "norm_q"])

    reference = geom_abline(
        intercept=qq_int, slope=qq_grad, color="red", linetype="dashed")

    plot = ggplot(qq.sample(frac=sample_frac), aes(x="norm_q", y="x"))
    plot = plot + geom_point(alpha=0.25)
    plot = plot + reference
    plot = plot + labs(
        title="QQ Plot", x="Normal Quantiles", y="Sample Quantiles")
    plot = plot + theme(figure_size=figure_size)
    return plot
def log_HR_plot(inFile, label_unit=10, log_scale_color=True):
    """Plot tumor vs. non-tumor log hazard ratios per PCA component.

    Parameters
    ----------
    inFile : str
        Passed to ``get_params`` to load the "means" and "stds" of "logHR".
    label_unit : int
        Components with zero-based position <= ``label_unit`` are labelled
        with their index; the remaining labels are empty strings.
    log_scale_color : bool
        Apply log-transformed colour and fill scales to the logHR plot.

    Returns
    -------
    (pl, pl_p, logHR_df)
        The logHR scatter, the -log10(p-value) scatter, and the data frame
        both are drawn from.
    """
    par = get_params(inFile)
    # first half of the rows: tumor components, second half: non-tumor
    pca_components = par["means"]["logHR"].shape[0] >> 1
    components = range(pca_components)
    tf_components = slice(pca_components, 2 * pca_components)

    logHR_df = pd.DataFrame(index=[f"{i+1}" for i in components])
    logHR_df["tumor logHR"] = par["means"]["logHR"][components, 0]
    logHR_df["non-tumor logHR"] = par["means"]["logHR"][tf_components, 0]
    logHR_df["component"] = components
    logHR_df["label"] = [
        logHR_df.index[i] if i <= label_unit else "" for i in components
    ]
    logHR_df["tumor logHR sd"] = par["stds"]["logHR"][components, 0]
    logHR_df["non-tumor logHR sd"] = par["stds"]["logHR"][tf_components, 0]

    logHR_df["tumor Z"] = logHR_df["tumor logHR"] / logHR_df["tumor logHR sd"]
    # Each estimate is standardised by its own sd (previously the non-tumor
    # estimate was divided by the tumor sd).
    logHR_df["non-tumor Z"] = (logHR_df["non-tumor logHR"] /
                               logHR_df["non-tumor logHR sd"])
    logHR_df["tumor p-value"] = norm.sf(abs(logHR_df["tumor Z"])) * 2
    logHR_df["non-tumor p-value"] = norm.sf(abs(logHR_df["non-tumor Z"])) * 2
    logHR_df["tumor -log10(p-value)"] = -np.log10(logHR_df["tumor p-value"])
    logHR_df["non-tumor -log10(p-value)"] = -np.log10(
        logHR_df["non-tumor p-value"])

    # logHR plot: both axes share the same limits
    lb = min(logHR_df["non-tumor logHR"].min(), logHR_df["tumor logHR"].min())
    ub = max(logHR_df["non-tumor logHR"].max(), logHR_df["tumor logHR"].max())
    pl = (pn.ggplot(
        data=logHR_df,
        mapping=pn.aes(
            "non-tumor logHR",
            "tumor logHR",
            color="non-tumor p-value",
            fill="tumor p-value",
            label="label",
        ),
    ) + pn.xlim(lb, ub) + pn.ylim(lb, ub) + pn.geom_abline() +
          pn.geom_point() + pn.theme_minimal() +
          pn.geom_text(ha="left", va="bottom", color="black"))
    if log_scale_color:
        pl += pn.scale_color_cmap(trans="log")
        pl += pn.scale_fill_cmap(trans="log")

    # -log10(p-value) plot, again with shared axis limits
    lb = min(
        logHR_df["non-tumor -log10(p-value)"].min(),
        logHR_df["tumor -log10(p-value)"].min(),
    )
    ub = max(
        logHR_df["non-tumor -log10(p-value)"].max(),
        logHR_df["tumor -log10(p-value)"].max(),
    )
    pl_p = (pn.ggplot(
        data=logHR_df,
        mapping=pn.aes(
            "non-tumor -log10(p-value)",
            "tumor -log10(p-value)",
            color="component",
            label="label",
        ),
    ) + pn.geom_point() + pn.xlim(lb, ub) + pn.ylim(lb, ub) +
            pn.theme_minimal() +
            pn.geom_text(ha="left", va="bottom", color="black"))
    return pl, pl_p, logHR_df
def main():
    """Generate every figure of the script and hand each to ``save_both``.

    Configures matplotlib's mathtext font, silences three families of
    warnings by message pattern, then builds one plot per data set (via the
    project helpers ``concat_map``, ``my_plot``, ``titles``, ``limits`` and
    ``labs``) and passes it to ``save_both`` together with an output name.
    """
    mpl.rc('mathtext', fontset='cm')

    warnings.filterwarnings('ignore', r'(geom|position)_\w+ ?: Removed \d+ rows')
    warnings.filterwarnings('ignore', r'Saving .+ x .+ in image')
    warnings.filterwarnings('ignore', r'Filename: .+\.png')

    def diagonal():
        # dashed grey y = x reference line; a fresh layer per plot
        return gg.geom_abline(slope=1, intercept=0, linetype='dashed',
                              color='grey')

    df = concat_map(Pf_Ob_Ol, 'P_f', np.linspace(0.1, 1, 10))
    save_both(my_plot(df, 'O_b', 'O_l', 'P_f')
              + titles('P_f(O_b, O_l)')
              + limits((1, 10))
              + diagonal()
              + gg.geom_line(),
              'Pf_Ob_Ol')

    df = concat_map(Pf_Ob_σ, 'P_f', np.linspace(0.1, 1, 10))
    save_both(my_plot(df, 'O_b', 'σ', 'P_f')
              + titles('P_f(O_b, σ)')
              + limits((1, 10), (0, 5))
              + gg.geom_line(),
              'Pf_Ob_σ')

    df = concat_map(Pq_Ob_Ol, 'P_q', np.linspace(-0.9, 0, 10))
    save_both(my_plot(df, 'O_b', 'O_l', 'P_q')
              + titles('P_q(O_b, O_l)')
              + limits((1, 10))
              + diagonal()
              + gg.geom_line(),
              'Pq_Ob_Ol')

    df = concat_map(Pq_Ob_σ, 'P_q', np.linspace(-0.9, 0, 10))
    save_both(my_plot(df, 'O_b', 'σ', 'P_q')
              + titles('P_q(O_b, σ)')
              + limits((1, 10), (0, 5))
              + gg.geom_line(),
              'Pq_Ob_σ')

    df = concat_map(Opr_Ob_Ol, 'Opr', np.linspace(1, 5, 9))
    save_both(my_plot(df, 'O_b', 'O_l', 'Opr')
              + titles("O'(O_b, O_l)")
              + limits((1, 10), (1, 10))
              + gg.geom_line()
              + diagonal(),
              'Opr_Ob_Ol')

    df = concat_map(Opr_Ob_σ, 'Opr', np.linspace(1, 5, 9))
    save_both(my_plot(df, 'O_b', 'σ', 'Opr')
              + titles("O'(O_b, σ)")
              + limits((1, 10), (0, 5))
              + gg.geom_line(),
              'Opr_Ob_σ')

    df = (pd.DataFrame({'Opr': np.linspace(1, 21, 101)})
          .assign(Pf=lambda x: Opr_Pf(x.Opr)))
    save_both(my_plot(df, 'Opr', 'Pf')
              + titles("P_f(O')")
              + labs("O'", 'P_f')
              + limits((1, 20), (0, 1),
                       xbreaks=np.linspace(2, 20, 10),
                       ybreaks=np.linspace(0, 1, 11))
              + gg.geom_line()
              + gg.geom_hline(yintercept=C, linetype='dashed', color='grey'),
              'Pf_Opr')

    df = concat_map(σpr_Ob_σ, 'σpr', np.linspace(0, 5, 11))
    save_both(my_plot(df, 'O_b', 'σ', 'σpr')
              + titles("σ'(O_b, σ)")
              + limits((1, 10), (0, 5))
              + gg.geom_line(),
              'σpr_Ob_σ')

    df = (pd.DataFrame({'σpr': np.linspace(0, 21, 106)})
          .assign(Pq=lambda x: σpr_Pq(x.σpr)))
    save_both(my_plot(df, 'σpr', 'Pq')
              + titles("P_q(σ')")
              + labs("σ'", 'P_q')
              + limits((0, 20), (-1, 0),
                       xbreaks=np.linspace(0, 20, 11),
                       ybreaks=np.linspace(-1, 0, 11))
              + gg.geom_line(),
              'Pq_σpr')

    df = concat_map(liab_Ob_Ol_free, 'liab', np.linspace(0, 10, 11))
    save_both(my_plot(df, 'O_b', 'O_l', 'liab', clab='-R_{bl}')
              + titles("-R_{bl}(O_b, O_l)",
                       "S_b = 1, C_b = 0, C_l = 0.02",
                       mathrm('Free bet', dollars=False))
              + limits((1, 20), (1, 10))
              + gg.geom_line()
              + diagonal(),
              'liab_Ob_Ol_free')

    # NOTE(review): the σ plots below reuse the *_Ob_Ol_* data sources and
    # the (1, 10) y-limits of the O_l plots -- confirm this is intended.
    df = concat_map(liab_Ob_Ol_free, 'liab', np.linspace(0, 10, 11))
    save_both(my_plot(df, 'O_b', 'σ', 'liab', clab='-R_{bl}')
              + titles("-R_{bl}(O_b, σ)",
                       "S_b = 1, C_b = 0, C_l = 0.02",
                       mathrm('Free bet', dollars=False))
              + limits((1, 20), (1, 10))
              + gg.geom_line(),
              'liab_Ob_σ_free')

    df = concat_map(liab_Ob_Ol_qual, 'liab', np.linspace(0, 10, 11))
    save_both(my_plot(df, 'O_b', 'O_l', 'liab', clab='-R_{bl}')
              + titles("-R_{bl}(O_b, O_l)",
                       "S_b = 1, C_b = 0, C_l = 0.02",
                       mathrm('Qualifying bet', dollars=False))
              + limits((1, 20), (1, 10))
              + gg.geom_line()
              + diagonal(),
              'liab_Ob_Ol_qual')

    df = concat_map(liab_Ob_Ol_qual, 'liab', np.linspace(0, 10, 11))
    save_both(my_plot(df, 'O_b', 'σ', 'liab', clab='-R_{bl}')
              + titles("-R_{bl}(O_b, σ)",
                       "S_b = 1, C_b = 0, C_l = 0.02",
                       mathrm('Qualifying bet', dollars=False))
              + limits((1, 20), (1, 10))
              + gg.geom_line(),
              'liab_Ob_σ_qual')

    df_Pf = Pf_Ob_σ(0.6).assign(profit=dollars('P_f'))
    df_Pq = Pq_Ob_σ(-0.3).assign(profit=dollars('P_q'))
    df = pd.concat((df_Pf, df_Pq), ignore_index=True)
    df.drop_duplicates('O_b', inplace=True)

    Opr = df_Pf.query('σ==0').O_b[0]
    σpr = df_Pq.query('O_b==1').σ[0]

    labels = pd.DataFrame({
        'x': [Opr+0.1, 1, 9.8],
        'y': [4.8, σpr, σpr + 0.3],
        'label': ["$O'$", "$σ'$", mathrm('More profit')]
    })
    lab_aes = gg.aes('x', 'y', label='label')

    # DataFrame.ix was removed in pandas 1.0.  `labels` has the default
    # integer index, so the label-based .loc slices (end-inclusive) select
    # the same rows: 0, 1 and 2 respectively.
    save_both(
        gg.ggplot(df, gg.aes(x='O_b', y='σ'))
        + gg.geom_area(gg.aes(fill='profit'), alpha=0.3)
        + gg.geom_vline(xintercept=Opr, linetype='dashed')
        + gg.geom_hline(yintercept=σpr, linetype='dashed')
        # text alignment can't be specified in an aes
        + gg.geom_text(lab_aes, data=labels.loc[:0], ha='left', va='top')
        + gg.geom_text(lab_aes, data=labels.loc[1:1], ha='left', va='bottom')
        + gg.geom_text(lab_aes, data=labels.loc[2:], ha='right', va='bottom')
        + gg.scale_fill_discrete(name=mathrm('Bet type'),
                                 labels=[mathrm('Free'), mathrm('Qualifying')])
        + limits((1, 10), (0, 5))
        + gg.ggtitle('%s "%s" %s' % (mathrm('Shape of the'),
                                     mathrm('more profitable'),
                                     mathrm('space')))
        + labs('O_b', 'σ'),
        'Px_shapes')
def histogram2d(
    data: pd.DataFrame,
    column1: str,
    column2: str,
    fig: plt.Figure = None,
    ax: plt.Axes = None,
    fig_width: int = 6,
    fig_height: int = 6,
    trend_line: str = "auto",
    lower_quantile1: float = 0,
    upper_quantile1: float = 1,
    lower_quantile2: float = 0,
    upper_quantile2: float = 1,
    transform1: str = "identity",
    transform2: str = "identity",
    equalize_axes: bool = False,
    reference_line: bool = False,
    plot_density: bool = False,
) -> Tuple[plt.Figure, plt.Axes, p9.ggplot]:
    """
    Creates an EDA plot for two continuous variables.

    Args:
        data: pandas DataFrame containing data to be plotted
        column1: name of column to plot on the x axis
        column2: name of column to plot on the y axis
        fig: matplotlib Figure generated from blank ggplot to plot onto. If specified, must also specify ax
        ax: matplotlib axes generated from blank ggplot to plot onto. If specified, must also specify fig
        fig_width: figure width in inches
        fig_height: figure height in inches (ignored when equalize_axes is set; the figure is then
          fig_width x fig_width)
        trend_line: Trend line to plot over data. 'none' plots no trend line; any other value
          (default 'auto') is passed as the method to
          `geom_smooth <https://plotnine.readthedocs.io/en/stable/generated/plotnine.geoms.geom_smooth.html>`_.
        lower_quantile1: Lower quantile of column1 data to remove before plotting for ignoring outliers
        upper_quantile1: Upper quantile of column1 data to remove before plotting for ignoring outliers
        lower_quantile2: Lower quantile of column2 data to remove before plotting for ignoring outliers
        upper_quantile2: Upper quantile of column2 data to remove before plotting for ignoring outliers
        transform1: Transformation to apply to the column1 data for plotting:

         - **'identity'**: no transformation
         - **'log'**: apply a logarithmic transformation with small constant added in case of zero values
         - **'log_exclude0'**: apply a logarithmic transformation with zero values removed
         - **'sqrt'**: apply a square root transformation
        transform2: Transformation to apply to the column2 data for plotting. Same options as for column1.
        equalize_axes: Square the aspect ratio and match the axis limits
        reference_line: Add a y = x reference line
        plot_density: Overlay a 2d density on the given plot

    Returns:
        Tuple containing matplotlib figure and axes along with the plotnine ggplot object

    Raises:
        ValueError: if only one of fig and ax is provided

    Examples:
        .. plot::

            import pandas as pd
            import intedact
            data = pd.read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018/2018-09-11/cats_vs_dogs.csv")
            intedact.histogram2d(data, 'n_dog_households', 'n_cat_households', equalize_axes=True, reference_line=True);
    """
    # fig and ax only make sense as a pair; fail early with a clear message
    # instead of passing a None into the drawing call below.
    if (fig is None) != (ax is None):
        raise ValueError("fig and ax must be specified together")

    data = trim_quantiles(data, column1, lower_quantile=lower_quantile1, upper_quantile=upper_quantile1)
    data = trim_quantiles(data, column2, lower_quantile=lower_quantile2, upper_quantile=upper_quantile2)
    data = preprocess_transformations(data, column1, transform=transform1)
    data = preprocess_transformations(data, column2, transform=transform2)

    # draw the binned 2d histogram
    gg = p9.ggplot(data, p9.aes(x=column1, y=column2)) + p9.geom_bin2d()

    # overlay density
    if plot_density:
        gg += p9.geom_density_2d()

    # add reference line
    if reference_line:
        gg += p9.geom_abline(color="black")

    # add trend line
    if trend_line != "none":
        gg += p9.geom_smooth(method=trend_line, color="red")
    gg += p9.labs(fill="")

    # handle axes transforms
    gg, xlabel = transform_axis(gg, column1, transform1, xaxis=True)
    gg, ylabel = transform_axis(gg, column2, transform2, xaxis=False)

    if fig is None and ax is None:
        gg.draw()
        fig = plt.gcf()
        ax = fig.axes[0]
    else:
        _ = gg._draw_using_figure(fig, [ax])

    if equalize_axes:
        fig, ax, gg = match_axes(fig, ax, gg)
        # square figure: the width is used for both dimensions
        fig.set_size_inches(fig_width, fig_width)
    else:
        fig.set_size_inches(fig_width, fig_height)

    ax.set_ylabel(ylabel)
    ax.set_xlabel(xlabel)

    return fig, ax, gg
beauty=pd.read_csv("beauty.csv") dane=pd.read_csv("beauty.csv") print(len(dane)) figures = [] piekno = "btystdave" results = smf.ols("courseevaluation" +"~btystdave", data=dane).fit() wyn=results.params fig1=(p9.ggplot(p9.aes(x="btystdave",y="courseevaluation"),data=dane) +p9.geom_jitter(width=0.1) +p9.geom_abline(p9.aes(intercept=wyn['Intercept'],slope=wyn["btystdave"]))) print(fig1) figures.append(fig1) df2=beauty df2['y_pred']=results.predict() df2['residuals']=df2['courseevaluation']-df2['y_pred'] fig2_res=(p9.ggplot(p9.aes(x='btystdave',y='residuals'),data=beauty) +p9.geom_point()) print(fig2_res) figures.append(fig2_res) results = smf.ols("courseevaluation" +"~btystdavepos + btystdave", data=dane).fit() wyn=results.params fig2=(p9.ggplot(p9.aes(x="btystdave",y="courseevaluation"),data=dane)
def mixed_linear_plots(df, x_axis, x_label):
    """Fit two mixed linear models and save four jitter plots with their fits.

    Models ``log_score`` and ``mse`` as functions of ``percent_broken`` and
    ``percent_fail_runs`` (grouped by the row index values), prints both
    summaries, then plots each response against ``x_axis`` coloured once by
    'method' and once by 'task'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'log_score', 'mse', 'percent_broken',
        'percent_fail_runs', 'method' and 'task' columns.  NOTE: the two
        percent columns are rounded to integers **in place**.
    x_axis : str
        Column for the x aesthetic.  It is also looked up in the fitted
        parameters, so it has to be one of the two model regressors.
    x_label : str
        x axis label.

    Side effects
    ------------
    Sets ``plotnine.options.figure_size`` globally and writes
    '<x_axis>_log_rul_by_method.pdf', '<x_axis>_log_rul_by_task.pdf',
    '<x_axis>_mse_by_method.pdf' and '<x_axis>_mse_by_task.pdf'.
    """
    plotnine.options.figure_size = (8, 10)

    md = smf.mixedlm('log_score ~ percent_broken + percent_fail_runs',
                     df, groups=df.index.values)
    mdf_rul = md.fit()
    print('#' * 18 + 'Log RUL' + '#' * 18)
    print(mdf_rul.summary())

    md = smf.mixedlm('mse ~ percent_broken + percent_fail_runs',
                     df, groups=df.index.values)
    mdf_mse = md.fit()
    print('#' * 18 + 'RMSE' + '#' * 18)
    print(mdf_mse.summary())

    # np.int was removed in NumPy 1.24; the builtin int is the replacement.
    df['percent_broken'] = df['percent_broken'].round().astype(int)
    df['percent_fail_runs'] = df['percent_fail_runs'].round().astype(int)

    def fitted_line(fit):
        # line with the model's intercept and its coefficient for x_axis
        return plotnine.geom_abline(
            plotnine.aes(intercept=fit.params['Intercept'],
                         slope=fit.params[x_axis]))

    task_labels = ['4→3', '4→2', '1→3', '1→2', '3→4', '3→1', '2→4', '2→1']

    gg = (plotnine.ggplot(
              df, plotnine.aes(x=x_axis, y='log_score', color='method'))
          + plotnine.geom_jitter(width=2.5, show_legend=False)
          + fitted_line(mdf_rul)
          + plotnine.stat_smooth(method='gls', show_legend=False)
          + plotnine.xlab(x_label)
          + plotnine.ylab('Logarithmic RUL-Score')
          + plotnine.scale_color_discrete(name='Method',
                                          labels=['DAAN', 'JAN'])
          + plotnine.theme_classic(base_size=20))
    gg.save('%s_log_rul_by_method.pdf' % x_axis)

    gg = (plotnine.ggplot(
              df, plotnine.aes(x=x_axis, y='log_score', color='task'))
          + plotnine.geom_jitter(width=2.5, show_legend=False)
          + fitted_line(mdf_rul)
          + plotnine.stat_smooth(method='gls', show_legend=False)
          + plotnine.xlab(x_label)
          + plotnine.ylab('Logarithmic RUL-Score')
          + plotnine.scale_color_discrete(name='Task', labels=task_labels)
          + plotnine.theme_classic(base_size=20))
    gg.save('%s_log_rul_by_task.pdf' % x_axis)

    gg = (plotnine.ggplot(
              df, plotnine.aes(x=x_axis, y='mse', color='method'))
          + plotnine.geom_jitter(width=2.5)
          + fitted_line(mdf_mse)
          + plotnine.stat_smooth(method='gls')
          + plotnine.ylab('RMSE')
          + plotnine.xlab(x_label)
          + plotnine.scale_color_discrete(name='Method',
                                          labels=['DAAN', 'JAN'])
          + plotnine.theme_classic(base_size=20))
    gg.save('%s_mse_by_method.pdf' % x_axis)

    # NOTE(review): unlike the three plots above, this one sets no
    # xlab(x_label) -- confirm whether that omission is intentional.
    gg = (plotnine.ggplot(
              df, plotnine.aes(x=x_axis, y='mse', color='task'))
          + plotnine.geom_jitter(width=2.5)
          + fitted_line(mdf_mse)
          + plotnine.stat_smooth(method='gls')
          + plotnine.ylab('RMSE')
          + plotnine.scale_color_discrete(name='Task', labels=task_labels)
          + plotnine.theme_classic(base_size=20))
    gg.save('%s_mse_by_task.pdf' % x_axis)
def plot_predictions_actual(pred_df, figsize):
    """Plot predictions ('pred') against actual values ('y').

    A ribbon spans the 'lb'..'ub' columns and a slope-1, intercept-0 line
    marks where prediction equals actual.  ``figsize`` is forwarded to the
    theme's ``figure_size``.
    """
    components = [
        pn.geom_point(),
        pn.geom_ribbon(pn.aes(ymin='lb', ymax='ub'), alpha=0.3),
        pn.geom_abline(slope=1, intercept=0),
        pn.theme_bw(),
        pn.theme(figure_size=figsize),
    ]

    plot = pn.ggplot(pred_df, pn.aes(x='y', y='pred'))
    for component in components:
        plot = plot + component
    return plot
# Colours for the two calibration states.  The RGBA components are written on
# a 0-255 scale and normalised to 0-1 before conversion.  Plain Python
# arithmetic replaces `pd.np`, which was removed in pandas 2.0.
color_map = {
    "before": mcolors.to_hex(tuple(v / 255 for v in (178, 223, 138, 255))),
    "after": mcolors.to_hex(tuple(v / 255 for v in (31, 120, 180, 255))),
}


# In[14]:


# Calibration curve per relation, with a dashed y = x line as reference.
g = (
    p9.ggplot(calibration_df,
              p9.aes(x="predicted", y="actual", color="model_calibration"))
    + p9.geom_point()
    + p9.geom_path()
    + p9.geom_abline(p9.aes(slope=1, intercept=0),
                     linetype='dashed', color='black')
    + p9.scale_color_manual(values={
        "before": color_map["before"],
        "after": color_map["after"]
    })
    + p9.facet_wrap("relation")
    + p9.labs(
        x="Predicted",
        y="Actual"
    )
    + p9.guides(color=p9.guide_legend(title="Model Calibration"))
    + p9.theme_bw()
)
print(g)
g.save(filename="../model_calibration.png", dpi=300)
def test_non_mapped_facetting():
    """Facet a plot whose abline takes slope/intercept as fixed parameters."""
    # the abline gets no aes mapping, only constant parameters
    fixed_line = geom_abline(intercept=0, slope=1, size=1)

    p = g + fixed_line
    p = p + facet_wrap('var1')

    assert p == 'non_mapped_facetting'
def test_aes_overwrite():
    """Giving intercept both as a mapping and as a parameter must warn."""
    with pytest.warns(PlotnineWarning):
        mapping = aes(intercept='y')
        geom_abline(mapping, intercept=2)
import pandas as pd
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import plotnine as p9

# Fit y on x1, x2 and their interaction, then show y against x1 together with
# the fitted intercept and x1 coefficient as a line.
df = pd.read_csv('exercise.csv')
results = smf.ols('y ~ x1 + x2 + x1*x2', data=df).fit()
wyn = results.params
print(results.summary())

# data/mapping are passed by keyword: the positional order of ggplot's first
# two parameters differs between plotnine versions.
# The x aesthetic is 'x1' (was the non-column expression 'x1,x2'), matching
# the x1 coefficient used as the line's slope.
fig1 = (p9.ggplot(data=df, mapping=p9.aes(x='x1', y='y'))
        + p9.geom_jitter(width=0.1)
        + p9.geom_abline(p9.aes(intercept=wyn['Intercept'],
                                slope=wyn['x1'])))

# Render the plot so plt.show() has a figure to display.
fig1.draw()
plt.show()
) # In[14]: from sklearn.calibration import calibration_curve cnn_y, cnn_x = calibration_curve(confidence_score_df.curated_dsh, confidence_score_df.uncal, n_bins=10) all_cnn_y, all_cnn_x = calibration_curve(confidence_score_df.curated_dsh, confidence_score_df.cal, n_bins=10) calibration_df = pd.DataFrame.from_records( list(map(lambda x: {"predicted":x[0], "actual": x[1], "model_calibration":'before'}, zip(cnn_x, cnn_y))) + list(map(lambda x: {"predicted":x[0], "actual": x[1], "model_calibration":'after'}, zip(all_cnn_x, all_cnn_y))) ) calibration_df.to_csv("output/dag_calibration.tsv", sep="\t", index=False) # In[15]: ( p9.ggplot(calibration_df, p9.aes(x="predicted", y="actual", color="model_calibration")) + p9.geom_point() + p9.geom_line(p9.aes(group="factor(model_calibration)")) + p9.geom_abline(intercept=0, slope=1, linetype='dashed') + p9.scale_y_continuous(limits=[0,1]) + p9.scale_x_continuous(limits=[0,1]) + p9.theme_bw() )