def estimate_cutoffs_plot(output_file, df_plt, df_cell_estimate_cutoff,
                          df_fit=None, scale_x_log10=False, save_plot=True):
    """Plot UMI counts by sorted cell barcodes."""
    if min(df_plt['umi_counts']) <= 0:
        fix_log_scale = min(df_plt['umi_counts']) + 1
        df_plt['umi_counts'] = df_plt['umi_counts'] + fix_log_scale
    gplt = plt9.ggplot()
    gplt = gplt + plt9.theme_bw()
    if len(df_plt) <= 50000:
        gplt = gplt + plt9.geom_point(
            mapping=plt9.aes(x='barcode', y='umi_counts'),
            data=df_plt,
            alpha=0.05,
            size=0.1)
    else:
        gplt = gplt + plt9.geom_line(
            mapping=plt9.aes(x='barcode', y='umi_counts'),
            data=df_plt,
            alpha=0.25,
            size=0.75,
            color='black')
    gplt = gplt + plt9.geom_vline(
        mapping=plt9.aes(xintercept='n_cells', color='method'),
        data=df_cell_estimate_cutoff,
        alpha=0.75,
        linetype='dashdot')
    gplt = gplt + plt9.scale_color_brewer(palette='Dark2', type='qual')
    if scale_x_log10:
        gplt = gplt + plt9.scale_x_continuous(
            trans='log10', labels=comma_labels, minor_breaks=0)
    else:
        gplt = gplt + plt9.scale_x_continuous(
            labels=comma_labels, minor_breaks=0)
    gplt = gplt + plt9.scale_y_continuous(
        trans='log10', labels=comma_labels, minor_breaks=0)
    gplt = gplt + plt9.labs(
        title='',
        y='UMI counts',
        x='Barcode index, sorted by UMI count',
        color='Cutoff')
    # Add the fit of the droplet utils model.
    # NOTE: `if df_fit:` raises ValueError for a DataFrame; test for None.
    if df_fit is not None:
        gplt = gplt + plt9.geom_line(
            mapping=plt9.aes(x='x', y='y'),
            data=df_fit,
            alpha=1,
            color='yellow')
    if save_plot:
        gplt.save('{}.png'.format(output_file), dpi=300, width=5, height=4)
    return gplt
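
# A minimal usage sketch for estimate_cutoffs_plot (not from the original
# source). Assumes the module imports `plotnine as plt9` and defines the
# `comma_labels` formatter used above; the barcode-rank data are synthetic.
import numpy as np
import pandas as pd

n = 10000
umi = np.sort(np.random.lognormal(mean=4, sigma=2, size=n))[::-1]
df_plt = pd.DataFrame({'barcode': np.arange(1, n + 1), 'umi_counts': umi})
df_cutoff = pd.DataFrame({'n_cells': [1500, 2200],
                          'method': ['knee', 'expected']})
estimate_cutoffs_plot('umi_cutoffs', df_plt, df_cutoff, scale_x_log10=True)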
def test_wrong_bases():
    # x axis not transformed
    p = (ggplot(df, aes('x', 'x'))
         + annotation_logticks(sides='b', size=.75, base=10)
         + geom_point())
    with pytest.warns(PlotnineWarning):
        p.draw_test()

    # x axis transformed with base 8, but ticks requested for base 10
    p = (ggplot(df, aes('x', 'x'))
         + annotation_logticks(sides='b', size=.75, base=10)
         + scale_x_continuous(trans=log_trans(8))
         + geom_point())
    with pytest.warns(PlotnineWarning):
        p.draw_test()

    # x axis is discrete
    df2 = df.assign(discrete=pd.Categorical([str(a) for a in df['x']]))
    p = (ggplot(df2, aes('discrete', 'x'))
         + annotation_logticks(sides='b', size=.75, base=None)
         + geom_point())
    with pytest.warns(PlotnineWarning):
        p.draw_test()

    # y axis is discrete
    df2 = df.assign(discrete=pd.Categorical([str(a) for a in df['x']]))
    p = (ggplot(df2, aes('x', 'discrete'))
         + annotation_logticks(sides='l', size=.75, base=None)
         + geom_point())
    with pytest.warns(PlotnineWarning):
        p.draw_test()
def test_annotation_stripes_continuous_transformed():
    pdf = mtcars.assign(am=pd.Categorical(mtcars.am))
    p = (ggplot(pdf)
         + annotation_stripes(fills=["red", "green", "blue"], alpha=0.1)
         + geom_jitter(aes("hp", "wt", color="am"), random_state=5)
         + scale_x_continuous(trans='log2'))
    assert p == "annotation_stripes_continuous_transformed"
def round_2_plot():
    if not os.path.exists(round_2_df_path):
        eprint(f'Downloading {round_2_df_url} to {round_2_df_path}')
        urlretrieve(round_2_df_url, round_2_df_path)
        verify_checksum(round_2_df_checksum, round_2_df_path)
    df = pd.read_json(round_2_df_path)
    p = (
        ggplot(df)
        + aes(x='char_percent', y='correct', color='Dataset')
        + facet_wrap('Guessing_Model', nrow=1)
        + stat_summary_bin(
            fun_data=mean_no_se, bins=20, shape='.', linetype='None',
            size=0.5)
        + scale_y_continuous(breaks=np.linspace(0, 1, 6))
        + scale_x_continuous(breaks=[0, .5, 1])
        + coord_cartesian(ylim=[0, 0.7])
        + ggtitle('Round 2 Attacks and Models')
        + xlab('Percent of Question Revealed')
        + ylab('Accuracy')
        + theme(
            # legend_position='top',
            legend_box_margin=0,
            legend_title=element_blank(),
            strip_text_x=element_text(margin={'t': 6, 'b': 6, 'l': 1, 'r': 5}))
        + scale_color_manual(
            values=['#FF3333', '#66CC00', '#3333FF', '#FFFF33'],
            name='Questions'))
    p.save('2019_tacl_trick/auto_fig/round_2_json.pdf', width=7.0, height=1.7)
def plot_time_curve_with_threshold(self):
    toplot = self.aggregated.melt(
        id_vars='hour',
        value_vars=['number_bacteria', 'number_actin'],
        value_name='counts',
        var_name='Object')
    colors = self.create_color_list()
    myfig = (
        ggplot(toplot, aes("hour", "counts", color="Object"))
        + geom_point()
        + geom_line()
        + labels.xlab("Time [hours]")
        + labels.ylab("Average number of objects/nuclei")
        + pn.scale_colour_manual(
            values=colors,
            labels=list(self.sel_channel_time.value),
            name="")
        + pn.labs(colour="")
        + pn.scale_x_continuous(
            breaks=np.sort(self.result.hour.unique()),
            labels=list(np.sort(self.result.hour.unique()).astype(str))))
    self.time_curve_fig = myfig
    self.out_plot2.clear_output()
    with self.out_plot2:
        display(myfig)
def plot_time_curve_by_channel(self, b=None):
    """Callback to plot the time curve of the number of bacteria/nuclei
    for each selected channel. Called by plot_time_curve_button."""
    if self.aggregated is None:
        self.data_aggregation()
    if len(self.sel_channel_time.value) == 0:
        print("Select at least one channel")
    else:
        subset = self.aggregated[self.aggregated.channel.isin(
            self.sel_channel_time.value)].copy(deep=True)
        subset.loc[:, "channel"] = subset.channel.astype(
            pd.CategoricalDtype(self.sel_channel_time.value, ordered=True))
        colors = self.create_color_list()
        myfig = (
            ggplot(subset, aes("hour", "normalized", color="channel"))
            + geom_point()
            + geom_line()
            + labels.xlab("Time [hours]")
            + labels.ylab("Average number of bacteria/nuclei")
            + pn.scale_colour_manual(
                values=colors,
                labels=list(self.sel_channel_time.value),
                name="")
            + pn.labs(colour="")
            + pn.scale_x_continuous(
                breaks=np.sort(self.result.hour.unique()),
                labels=list(np.sort(self.result.hour.unique()).astype(str))))
        self.time_curve_fig = myfig
        self.out_plot2.clear_output()
        with self.out_plot2:
            display(myfig)
def round_1_plot():
    df = pd.read_csv('2019_tacl_trick/data/round_1.csv')
    model_dtype = CategoricalDtype(['DAN', 'RNN', 'IR'], ordered=True)
    df['Model'] = df['Model'].astype(model_dtype)

    # The following is a hack (a trailing space in the label) so that the
    # legend widths are the same across plots.
    def rename(x):
        if x == 'Round 1 - IR Adversarial':
            return 'Round 1 - IR Adversarial '
        else:
            return x
    df['Dataset'] = df['Dataset'].map(rename)

    p = (
        ggplot(df)
        + aes(x='x', y='y', color='Dataset')
        + facet_wrap('Model', nrow=1)
        + geom_point(size=1.0, shape='o')
        + scale_y_continuous(breaks=np.linspace(0, 1, 6), limits=[0, 0.6])
        + scale_x_continuous(breaks=[0, .5, 1])
        + xlab('Percent of Question Revealed')
        + ylab('Accuracy')
        + ggtitle('Round 1 Attacks and Models')
        + theme(strip_text_x=element_text(margin={'t': 6, 'b': 6, 'l': 1, 'r': 5}))
        + scale_color_manual(
            values=['#FF3333', '#66CC00', '#3333FF', '#FFFF33'],
            name='Questions'))
    p.save('2019_tacl_trick/auto_fig/round_1_csv.pdf', width=7.0, height=1.7)
def plot_train_test(ags):
    frontiers = data.train_test(ags)
    frontiers, model = data.train_test_model(frontiers)
    labs = frontiers.sort_values('train_flops').groupby('elo').first().reset_index()

    desc = f'log₁₀(test) = {model.params[1]:.1f} · log₁₀(train) + {model.params[2]:.1g} · elo + {model.params[0]:.0f}'

    return (
        pn.ggplot(
            frontiers,
            pn.aes(x='train_flops', y='test_flops', color='elo', group='elo'))
        + pn.geom_line(size=.5, show_legend=False)
        + pn.geom_line(pn.aes(y='test_flops_hat'), size=.25,
                       show_legend=False, linetype='dashed')
        # + pn.geom_point(size=.5, show_legend=False)
        + pn.geom_text(pn.aes(label='elo.astype(int)'), labs,
                       show_legend=False, size=6, nudge_y=+.2)
        + pn.scale_color_cmap(limits=(-1500, 0))
        + pn.scale_x_continuous(trans='log10')
        + pn.scale_y_continuous(trans='log10')
        + pn.annotate('text', 1.5e13, 5e9, label=desc, ha='left', size=6,
                      family='serif')
        + pn.labs(
            x='Train-time compute (FLOPS-seconds)',
            y='Test-time compute (FLOPS-seconds)')
        + plot.IEEE())
def scatter_plot(df, xcol, ycol, domain, xname=None, yname=None, log=False,
                 width=6, height=6, clamp=True, tickCount=5):
    assert len(domain) == 2

    POINT_SIZE = 0.5
    DASH_PATTERN = (0, (3, 1))

    if xname is None:
        xname = xcol
    if yname is None:
        yname = ycol

    # formatter for axes' labels
    ax_formatter = mizani.custom_format('{:n}')

    if clamp:  # clamp overflowing values if required
        df = df.copy(deep=True)
        df.loc[df[xcol] > domain[1], xcol] = domain[1]
        df.loc[df[ycol] > domain[1], ycol] = domain[1]

    # generate scatter plot
    scatter = p9.ggplot(df)
    scatter += p9.aes(x=xcol, y=ycol)
    scatter += p9.geom_point(size=POINT_SIZE, na_rm=True)
    scatter += p9.labs(x=xname, y=yname)

    if log:  # log scale
        scatter += p9.scale_x_log10(limits=domain, labels=ax_formatter)
        scatter += p9.scale_y_log10(limits=domain, labels=ax_formatter)
    else:
        scatter += p9.scale_x_continuous(limits=domain, labels=ax_formatter)
        scatter += p9.scale_y_continuous(limits=domain, labels=ax_formatter)

    # scatter += p9.theme_xkcd()
    scatter += p9.theme_bw()
    scatter += p9.theme(panel_grid_major=p9.element_line(color='#666666', alpha=0.5))
    scatter += p9.theme(figure_size=(width, height))

    # generate additional lines
    scatter += p9.geom_abline(intercept=0, slope=1, linetype=DASH_PATTERN)   # diagonal
    scatter += p9.geom_vline(xintercept=domain[1], linetype=DASH_PATTERN)    # vertical rule
    scatter += p9.geom_hline(yintercept=domain[1], linetype=DASH_PATTERN)    # horizontal rule

    return scatter
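
# A minimal usage sketch for scatter_plot (not from the original source).
# Assumes the module imports `plotnine as p9` and a `mizani` alias exposing
# `custom_format` (e.g. `from mizani import formatters as mizani`); the
# runtime data below are synthetic.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
runtimes = pd.DataFrame({
    'tool_a': rng.lognormal(0, 1, 200),
    'tool_b': rng.lognormal(0.2, 1, 200),
})
p = scatter_plot(runtimes, 'tool_a', 'tool_b', domain=[0.01, 100], log=True)
p.save('tool_a_vs_tool_b.png', dpi=150)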
def limits(x, y=None, xbreaks=None, ybreaks=None):
    if y is None:
        y = x
    x0, x1 = x
    y0, y1 = y
    if xbreaks is None:
        xbreaks = np.linspace(x0, x1, x1 - x0 + 1)
    if ybreaks is None:
        ybreaks = np.linspace(y0, y1, y1 - y0 + 1)
    # We want these plots to continue to the top and left.
    return [
        gg.coord_cartesian(xlim=x, ylim=y),
        gg.scale_x_continuous(limits=(x0, None), breaks=xbreaks),
        gg.scale_y_continuous(limits=(y0, None), breaks=ybreaks)
    ]
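
# A minimal usage sketch for limits (not from the original source). Assumes
# the module imports `plotnine as gg`; plotnine accepts adding a list of
# components to a plot, so the returned list composes directly.
import numpy as np
import pandas as pd

df = pd.DataFrame({'x': np.arange(11), 'y': np.arange(11) ** 0.5})
p = (gg.ggplot(df, gg.aes('x', 'y'))
     + gg.geom_line()
     + limits((0, 10), (0, 4)))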
def __plot(
    self,
    plot_data,
    x,
    y,
    colour,
    lbl_x,
    lbl_y,
    facet,
    facet_scales,
    facet_by,
    smoothed,
    points,
    error_bars,
    save,
):
    cbbPalette = [
        "#000000",
        "#E69F00",
        "#56B4E9",
        "#009E73",
        "#0072B2",
        "#D55E00",
        "#CC79A7",
    ]
    plt = ggplot(data=plot_data, mapping=aes(x=x, y=y, colour=colour))
    plt += xlab(lbl_x)
    plt += ylab(lbl_y)
    # + facet_grid("site~", scales="free")
    # + geom_line()
    if facet:
        # TODO: use facet as save
        nrow, ncol = self.get_facet_rows(plot_data, facet_by)
        plt += facet_wrap(facet_by, nrow=nrow, ncol=ncol, scales=facet_scales)
    if points:
        plt += geom_point()
    if error_bars:
        # TODO: use a generic way to compute them
        pass
        # self.plt += geom_errorbar(aes(ymin="ACI_mean - ACI_std", ymax="ACI_mean + ACI_std"))
    # TODO: use smooth as save
    if smoothed:
        plt += geom_smooth(
            method="mavg",
            se=False,
            method_args={"window": 4, "center": True, "min_periods": 1},
        )
    else:
        plt += geom_line()
    plt += scale_colour_manual(values=cbbPalette, guide=False)
    plt += scale_x_continuous(labels=label_x)
    plt += theme(figure_size=(15, 18), dpi=150)
    if save:
        plt.save(**save)
    return plt
def plot_histogram(df_plot, variable_column, output_file='plot_distribution',
                   facet_column='none', x_log10=False):
    """Plot the distribution of a variable to png.

    Parameters
    ----------
    df_plot : pandas.DataFrame
        DataFrame with <variable_column> as a column.
    variable_column : string
        String of variable_column column to plot.
    output_file : string
        Basename of output file.
    facet_column : string
        Column to facet the plot by.

    Returns
    -------
    int
        0 on success, 1 if x_log10 is requested for data with negative values.
    """
    df_plot['x'] = df_plot[variable_column]
    if x_log10:
        if np.any(df_plot['x'].values < 0):
            return 1
        elif np.any(df_plot['x'].values == 0):
            df_plot['x'] = np.log10(df_plot['x'].values + 1e-10)
            variable_column = variable_column + ' (log10)'
        else:
            df_plot['x'] = np.log10(df_plot['x'].values)
            variable_column = variable_column + ' (log10)'
    gplt = plt9.ggplot(df_plot, plt9.aes(x='x'))
    gplt = gplt + plt9.theme_bw()
    gplt = gplt + plt9.geom_histogram(alpha=0.8)
    gplt = gplt + plt9.scale_x_continuous(
        # trans='log10',
        # labels=comma_labels,
        minor_breaks=0)
    gplt = gplt + plt9.scale_y_continuous(
        # trans='log10',
        # labels=comma_labels,
        minor_breaks=0)
    gplt = gplt + plt9.labs(title='', x=variable_column)
    gplt = gplt + plt9.theme(axis_text_x=plt9.element_text(angle=-45, hjust=0))
    if facet_column != 'none':
        gplt = gplt + plt9.facet_wrap('~ {}'.format(facet_column), ncol=5)
        n_facets = df_plot[facet_column].nunique()
        gplt.save(
            '{}.png'.format(output_file),
            dpi=300,
            width=6 * (n_facets / 4),
            height=4 * (n_facets / 4),
            limitsize=False)
    else:
        gplt.save('{}.png'.format(output_file), dpi=300, width=4, height=4)
    return 0
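
# A minimal usage sketch for plot_histogram (not from the original source).
# Assumes the module imports `plotnine as plt9`; the data are synthetic.
import numpy as np
import pandas as pd

cells = pd.DataFrame({
    'total_counts': np.random.lognormal(8, 1, 5000),
    'sample': np.random.choice(['s1', 's2'], 5000),
})
plot_histogram(cells, 'total_counts', output_file='total_counts_hist',
               facet_column='sample', x_log10=True)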
def scatter_plot2(df1, df2, xcol, ycol, domain, color1='black', color2='red',
                  xname=None, yname=None, log=False, width=6, height=6,
                  clamp=True, tickCount=5):
    assert len(domain) == 2

    POINT_SIZE = 1.5
    DASH_PATTERN = (0, (6, 2))

    if xname is None:
        xname = xcol
    if yname is None:
        yname = ycol

    # formatter for axes' labels
    ax_formatter = mizani.custom_format('{:n}')

    if clamp:  # clamp overflowing values if required
        df1 = df1.copy(deep=True)
        df1.loc[df1[xcol] > domain[1], xcol] = domain[1]
        df1.loc[df1[ycol] > domain[1], ycol] = domain[1]
        df2 = df2.copy(deep=True)
        df2.loc[df2[xcol] > domain[1], xcol] = domain[1]
        df2.loc[df2[ycol] > domain[1], ycol] = domain[1]

    # generate scatter plot
    scatter = p9.ggplot(df1)
    scatter += p9.aes(x=xcol, y=ycol)
    scatter += p9.geom_point(size=POINT_SIZE, na_rm=True, color=color1, alpha=0.5)
    scatter += p9.geom_point(size=POINT_SIZE, na_rm=True, data=df2, color=color2, alpha=0.5)
    scatter += p9.labs(x=xname, y=yname)

    # rug plots
    scatter += p9.geom_rug(na_rm=True, sides="tr", color=color1, alpha=0.05)
    scatter += p9.geom_rug(na_rm=True, sides="tr", data=df2, color=color2, alpha=0.05)

    if log:  # log scale
        scatter += p9.scale_x_log10(limits=domain, labels=ax_formatter)
        scatter += p9.scale_y_log10(limits=domain, labels=ax_formatter)
    else:
        scatter += p9.scale_x_continuous(limits=domain, labels=ax_formatter)
        scatter += p9.scale_y_continuous(limits=domain, labels=ax_formatter)

    # scatter += p9.theme_xkcd()
    scatter += p9.theme_bw()
    scatter += p9.theme(panel_grid_major=p9.element_line(color='#666666', alpha=0.5))
    scatter += p9.theme(panel_grid_minor=p9.element_blank())
    scatter += p9.theme(figure_size=(width, height))
    scatter += p9.theme(text=p9.element_text(size=24, color="black"))

    # generate additional lines
    scatter += p9.geom_abline(intercept=0, slope=1, linetype=DASH_PATTERN)   # diagonal
    scatter += p9.geom_vline(xintercept=domain[1], linetype=DASH_PATTERN)    # vertical rule
    scatter += p9.geom_hline(yintercept=domain[1], linetype=DASH_PATTERN)    # horizontal rule

    return scatter
def test_annotation_logticks_base_8():
    base = 8
    df = pd.DataFrame({'x': base**np.arange(4)})
    # The grid should align with the logticks
    p = (ggplot(df, aes('x', 'x'))
         + annotation_logticks(sides='b', size=.75)
         + geom_point()
         + scale_x_continuous(trans=log_trans(base=base))
         + theme(
             panel_grid_minor=element_line(color='green'),
             panel_grid_major=element_line(color='red')))
    assert p == 'annotation_logticks_base_8'
def plot_optimal_model_size(ags):
    from statsmodels.formula import api as smf

    results = {}
    for b, g in ags.groupby('boardsize'):
        ordered = g.sort_values('elo').copy()
        ordered['params'] = g.width**2 * g.depth

        left = np.log10(g.train_flops.min())
        right = np.log10(g.train_flops.max())
        for f in np.linspace(left, right, 11)[1:]:
            subset = ordered[ordered.train_flops <= 10**f]
            results[b, 10**f] = subset.params.iloc[-1]
    df = pd.Series(results).reset_index()
    df.columns = ['boardsize', 'approx_flops', 'params']

    model = smf.ols('np.log10(params) ~ np.log10(approx_flops) + 1', df).fit()

    left, right = np.log10(df.approx_flops.min()), np.log10(df.approx_flops.max())
    preds = pd.DataFrame({'approx_flops': 10**np.linspace(left, right, 21)})
    preds['params'] = 10**model.predict(preds)

    labs = df.sort_values('approx_flops').groupby('boardsize').last().reset_index()
    labs['params'] = labs.apply(
        lambda r: df[df.approx_flops <= r.approx_flops].params.max(), axis=1)

    points = df.sort_values('approx_flops').groupby('boardsize').last().reset_index()

    desc = f'log₁₀(params) = {model.params[1]:.2f} · log₁₀(compute) − {-model.params[0]:.1f}'

    return (
        pn.ggplot(df, pn.aes(x='approx_flops', y='params'))
        + pn.geom_line(pn.aes(color='factor(boardsize)', group='boardsize'),
                       show_legend=False)
        + pn.geom_line(data=preds, linetype='dashed', size=.25)
        + pn.geom_point(pn.aes(color='factor(boardsize)', group='boardsize'),
                        data=points, size=.5, show_legend=False)
        + pn.geom_text(pn.aes(color='factor(boardsize)', group='boardsize',
                              label='boardsize'),
                       data=labs, nudge_y=+.5, show_legend=False, size=6)
        + pn.annotate('text', 1e9, 2e7, label=desc, ha='left', size=6,
                      family='serif')
        + pn.scale_x_continuous(trans='log10')
        + pn.scale_y_continuous(trans='log10')
        + pn.scale_color_hue(l=.4)
        + pn.labs(
            x='Train-time compute (FLOPS-seconds)',
            y='Optimal model size (params)')
        + plot.IEEE())
def plot_convergence(pile):
    stops = range(100, int(len(pile) / 10), utils.bills_per_pound)
    dist_stats = pd.DataFrame([get_sample_dist(pile, size) for size in stops])
    return (
        pn.ggplot(dist_stats)
        + pn.geom_line(pn.aes(x='size', y='mean'))
        + pn.geom_line(pn.aes(x='size', y='lower'), color='#FF5500',
                       linetype='dotted')
        + pn.geom_line(pn.aes(x='size', y='upper'), color='#FF5500',
                       linetype='dotted')
        + pn.scale_x_continuous(breaks=stops)
        + pn.theme(axis_text_x=pn.element_text(angle=270, hjust=1)))
def plot_fees(fees, title, y_axis, years, filename):
    p = (pn.ggplot(fees, pn.aes('year', y_axis, color='conference',
                                shape='conference'))
         + pn.geom_point()
         + pn.geom_line()
         + pn.labs(title=title, x='Year', y='Fee (€)')
         + pn.ylim(0, 1000)
         + pn.theme_light()
         + pn.scale_x_continuous(breaks=years)
         + pn.scale_colour_discrete(name='Conference')
         + pn.scale_shape_discrete(name='Conference'))
    p.save(filename, width=6, height=3, dpi=300)
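
# A minimal usage sketch for plot_fees (not from the original source).
# Assumes the module imports `plotnine as pn`; the conference names,
# fee values, and the `regular_fee` column are made up for illustration.
import pandas as pd

fees = pd.DataFrame({
    'year': [2019, 2020, 2021] * 2,
    'conference': ['ConfA'] * 3 + ['ConfB'] * 3,
    'regular_fee': [550, 300, 420, 600, 250, 480],
})
plot_fees(fees, 'Registration fees over time', 'regular_fee',
          years=[2019, 2020, 2021], filename='fees.png')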
def plot_series(self, y: str, series_state_df: pd.DataFrame) -> plotnine.ggplot:
    """Plot `y` against the round number, one colored line per game."""
    aes_kwargs = dict(x="round_number", y=y, color="factor(game_id)",
                      group="factor(game_id)")
    return (plotnine.ggplot(series_state_df)
            + plotnine.aes(**aes_kwargs)
            + plotnine.geom_point()
            + plotnine.geom_line()
            + plotnine.theme_light()
            + plotnine.scale_x_continuous(breaks=range(1, 20, 1)))
def create(self, file_path: str) -> None:
    (ggplot(self._data, aes("loc"))
     + geom_histogram(bins=100, fill="#1e4f79")
     + facet_grid(facets="category ~ .", scales='free_y')
     + scale_x_continuous(trans=asinh_trans(), labels=asinh_labels)
     + scale_y_continuous(labels=comma_format())
     # + scale_y_continuous(labels=lambda l: ["%.2f%%" % (v * 100 / len(self._data)) for v in l])
     + ggtitle("Class Sizes")
     + xlab("Lines of Code")
     + ylab("Number of Classes")
     + theme_classic(base_size=32, base_family="Helvetica")
     + theme(text=element_text(size=32),
             subplots_adjust={"hspace": 0.1})
     ).save(file_path, width=8, height=18)
def create(self, file_path: str) -> None:
    (ggplot(self._data, aes("value"))
     + geom_histogram(bins=100, fill="#1e4f79")
     + facet_wrap(facets="variable", scales="free", ncol=3)
     + scale_x_continuous(trans=asinh_trans(), labels=asinh_labels)
     + scale_y_continuous(labels=comma_format())
     + ggtitle("Distributions of QMOOD Quality Attributes")
     + xlab("Quality Attribute Value")
     + ylab("Number of Projects")
     + theme_classic(base_size=32, base_family="Helvetica")
     + theme(text=element_text(size=32),
             subplots_adjust={"wspace": 0.35, "hspace": 0.35})
     ).save(file_path, width=24, height=12)
def plot_training_curves(ags):
    df = ags[ags.test_nodes == 64].copy()
    df['g'] = df.run + df.test_nodes.astype(str)
    return (
        pn.ggplot(df, pn.aes(x='train_flops', y='400/np.log(10)*elo',
                             group='g', color='factor(boardsize)'))
        + pn.geom_line()
        + pn.geom_point(size=.5)
        + pn.scale_x_continuous(trans='log10')
        + pn.scale_color_discrete(name='Boardsize')
        + pn.labs(
            x='Training FLOPS',
            y='Elo v. perfect play',
            title="All agents' training curves")
        + plot.mpl_theme()
        + plot.poster_sizes())
def test_area_aesthetics():
    p = (ggplot(df, aes('x', 'ymax+2', group='factor(z)'))
         + geom_area()
         + geom_area(aes('x+width', alpha='z'))
         + geom_area(aes('x+2*width', linetype='factor(z)'),
                     color='black', fill=None, size=2)
         + geom_area(aes('x+3*width', color='z'), fill=None, size=2)
         + geom_area(aes('x+4*width', fill='factor(z)'))
         + geom_area(aes('x+5*width', size='z'), color='black', fill=None)
         + scale_x_continuous(
             breaks=[i * 2 * np.pi for i in range(7)],
             labels=['0'] + [r'${}\pi$'.format(2 * i) for i in range(1, 7)]))
    assert p + _theme == 'area_aesthetics'
def plot_sample_efficiency(ags):
    df = ags.query('boardsize == 9 & test_nodes == 64').copy()
    df['params'] = df.train_flops / df.samples
    return (
        pn.ggplot(df, pn.aes(x='samples', y='400/np.log(10)*elo',
                             color='params', group='run'))
        + pn.geom_line()
        + pn.geom_point()
        + pn.scale_x_continuous(trans='log10')
        + pn.scale_color_continuous(trans='log10', name='Params')
        + pn.labs(
            title='Bigger networks might not be compute efficient, but they are sample efficient',
            y='Elo v. perfect play',
            x='Train FLOPS')
        + plot.mpl_theme()
        + plot.poster_sizes()
        + plot.no_colorbar_ticks())
def plot_params(ags):
    df = ags.query('boardsize == 9 & test_nodes == 64').copy()
    df['params'] = df.train_flops / df.samples
    return (
        pn.ggplot(df, pn.aes(x='train_flops', y='400/np.log(10)*elo',
                             color='params', group='run'))
        + pn.geom_line()
        + pn.geom_point()
        + pn.scale_x_continuous(trans='log10')
        + pn.scale_color_continuous(trans='log10', name='Params')
        + pn.labs(
            title='Smaller networks are more compute efficient for lower performances, but plateau earlier',
            y='Elo v. perfect play',
            x='Train FLOPS')
        + plot.mpl_theme()
        + plot.poster_sizes()
        + plot.no_colorbar_ticks())
def plot_resid_var_trends(ags):
    resid_var = data.residual_vars(ags)
    return (
        pn.ggplot(resid_var, pn.aes(x='ratio', y='rv',
                                    color='factor(predicted)', group='predicted'))
        + pn.geom_line(size=2)
        + pn.geom_text(pn.aes(label='seen'), nudge_y=-.1, size=14)
        + pn.geom_point(size=4)
        + pn.scale_x_continuous(trans='log10')
        + pn.scale_y_continuous(trans='log10')
        + pn.scale_color_discrete(name='Predicted frontier')
        + pn.labs(
            x='(cost of observed frontier)/(cost of predicted frontier)',
            y='residual variance in performance',
            title='Frontiers of small problems are good, cheap proxies for frontiers of expensive problems')
        + plot.mpl_theme()
        + plot.poster_sizes())
def test_inplace_add():
    p = _p = ggplot(df)

    p += aes('x', 'y')
    assert p is _p

    p += geom_point()
    assert p is _p

    p += stat_identity()
    assert p is _p

    p += scale_x_continuous()
    assert p is _p

    with pytest.warns(PlotnineWarning):
        # Warning for replacing the existing scale added above
        p += xlim(0, 10)
    assert p is _p

    p += lims(y=(0, 10))
    assert p is _p

    p += labs(x='x')
    assert p is _p

    p += coord_trans()
    assert p is _p

    p += facet_null()
    assert p is _p

    p += annotate('point', 5, 5, color='red', size=5)
    assert p is _p

    p += guides()
    assert p is _p

    p += theme_gray()
    assert p is _p

    th = _th = theme_gray()
    th += theme(aspect_ratio=1)
    assert th is _th
def plot_flops_frontier(ags):
    df = data.modelled_elos(ags)
    return (
        pn.ggplot(df, pn.aes(x='train_flops', color='factor(boardsize)',
                             group='boardsize'))
        + pn.geom_line(pn.aes(y='400/np.log(10)*elo'), size=2)
        + pn.geom_line(pn.aes(y='400/np.log(10)*elohat'), size=1,
                       linetype='dashed')
        + pn.labs(
            x='Training FLOPS',
            y='Elo v. perfect play',
            title='Performance is a sigmoid of compute, linearly scaled by board size')
        + pn.scale_x_continuous(trans='log10')
        + pn.scale_color_discrete(name='Boardsize')
        + pn.coord_cartesian(None, (None, 0))
        + plot.mpl_theme()
        + plot.poster_sizes())
def yoy_growth():
    """Create figures showing the number of questions versus year in the dataset."""
    with open('data/external/datasets/qanta.mapped.2018.04.18.json') as f:
        year_pages = defaultdict(set)
        year_questions = Counter()
        for q in json.load(f)['questions']:
            if q['page'] is not None:
                year_pages[q['year']].add(q['page'])
            year_questions[q['year']] += 1

    start_year = min(year_pages)
    # 2017 is the last year we have a full year's worth of data; including
    # the partial 2018 data would not be accurate.
    end_year = min(2017, max(year_pages))

    upto_year_pages = defaultdict(set)
    upto_year_questions = Counter()
    for upto_y in range(start_year, end_year + 1):
        for curr_y in range(start_year, upto_y + 1):
            upto_year_questions[upto_y] += year_questions[curr_y]
            for page in year_pages[curr_y]:
                upto_year_pages[upto_y].add(page)

    year_page_counts = {}
    for y, pages in upto_year_pages.items():
        year_page_counts[y] = len(pages)

    year_rows = []
    for y, page_count in year_page_counts.items():
        year_rows.append({'year': y, 'value': page_count,
                          'Quantity': 'Distinct Answers'})
        year_rows.append({'year': y, 'Quantity': 'Total Questions',
                          'value': upto_year_questions[y]})

    year_df = pd.DataFrame(year_rows)
    count_cat = CategoricalDtype(
        categories=['Total Questions', 'Distinct Answers'], ordered=True)
    year_df['Quantity'] = year_df['Quantity'].astype(count_cat)
    eprint(year_df[year_df.Quantity == 'Total Questions'])
    p = (
        ggplot(year_df)
        + aes(x='year', y='value', color='Quantity')
        + geom_line() + geom_point()
        + xlab('Year') + ylab('Count up to Year (inclusive)')
        + theme_fs()
        + scale_x_continuous(breaks=list(range(start_year, end_year + 1, 2)))
    )
    p.save(path.join(output_path, 'question_answer_counts.pdf'))
def plot_test(ags):
    df = ags.query('boardsize == 9').groupby('run').apply(
        lambda df: df[df.idx == df.idx.max()]).copy()
    df['test_flops'] = df.test_nodes * (df.train_flops / df.samples)

    subset = df.query('test_nodes == 64').sort_values('test_flops')
    selection = [
        subset.loc[ELO * subset.elo > e].iloc[0].run
        for e in np.linspace(-2000, -500, 4)
    ]
    df = df[df.run.isin(selection)].copy()

    df['params'] = df.width**2 * df.depth
    df['arch'] = df.apply(lambda r: '{depth}×{width}'.format(**r), axis=1)
    labels = df.sort_values('test_flops').reset_index(
        drop=True).groupby('run').first().reset_index()

    # NOTE: the original added the test_nodes geom_text layer twice; once is enough.
    return (
        pn.ggplot(df, pn.aes(x='test_flops', y='ELO*elo', color='params',
                             group='run'))
        + pn.geom_point(size=.25, show_legend=False)
        + pn.geom_line(size=.5, show_legend=False)
        + pn.geom_text(pn.aes(label='test_nodes'), nudge_y=-50,
                       show_legend=False, size=4, va='top')
        + pn.geom_text(pn.aes(label='arch'), data=labels, show_legend=False,
                       size=6, nudge_x=-.1, ha='right')
        + pn.scale_x_continuous(trans='log10')
        + pn.scale_color_cmap('plasma', trans='log10',
                              limits=(df.params.min(), 10 * df.params.max()))
        + pn.coord_cartesian((3.5, None))
        + pn.labs(
            x='Test-time compute (FLOPS-seconds)',
            y='Elo v. perfect play')
        + plot.IEEE())
def create_state_plot(
        self, x: str, y: str, data: pd.DataFrame,
        aes_kwargs: Optional[Dict[str, str]] = None) -> plotnine.ggplot:
    """Plot `y` against `x` as a line-and-point series with light theming."""
    # NOTE: the original annotated aes_kwargs as Tuple[str, str], but it is
    # used as a mapping of extra aesthetics; Optional[Dict[str, str]] matches
    # the actual usage (requires `from typing import Dict, Optional`).
    aes_kwargs = ({} if aes_kwargs is None else aes_kwargs)
    x_breaks = data[x]
    max_y = max(data[y])
    y_interval = int(max_y / 20) + 1
    y_breaks = range(0, max_y + 1, y_interval)
    return (plotnine.ggplot(data)
            + plotnine.aes(x=x, y=y, **aes_kwargs)
            + plotnine.geom_line()
            + plotnine.geom_point()
            + plotnine.theme_light()
            + plotnine.scale_x_continuous(breaks=x_breaks)
            + plotnine.scale_y_continuous(breaks=y_breaks))
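
# A minimal usage sketch for create_state_plot (not from the original
# source). `tracker` stands in for an instance of the enclosing class, and
# the column names are hypothetical; the y column must hold integer counts,
# since the breaks above are built with range().
import pandas as pd

state_df = pd.DataFrame({'round_number': [1, 2, 3, 4],
                         'chips': [40, 55, 30, 62]})
fig = tracker.create_state_plot('round_number', 'chips', state_df)
fig.save('chips_by_round.png', width=6, height=4, dpi=150)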
def quick_color_check(target_matrix, source_matrix, num_chips):
    """Quickly plot target matrix values against source matrix values to determine
    over-saturated color chips or other issues.

    Inputs:
    target_matrix = a 22x4 matrix containing the average red value, average green value, and
                    average blue value for each color chip of the target image
    source_matrix = a 22x4 matrix containing the average red value, average green value, and
                    average blue value for each color chip of the source image
    num_chips     = number of color card chips included in the matrices (integer)

    :param source_matrix: numpy.ndarray
    :param target_matrix: numpy.ndarray
    :param num_chips: int
    """
    # Imports
    from plotnine import ggplot, geom_point, geom_smooth, theme_seaborn, facet_grid, geom_label, \
        scale_x_continuous, scale_y_continuous, scale_color_manual, aes
    import pandas as pd

    # Extract and organize matrix info
    tr = target_matrix[:num_chips, 1:2]
    tg = target_matrix[:num_chips, 2:3]
    tb = target_matrix[:num_chips, 3:4]
    sr = source_matrix[:num_chips, 1:2]
    sg = source_matrix[:num_chips, 2:3]
    sb = source_matrix[:num_chips, 3:4]

    # Create columns of color labels
    red = []
    blue = []
    green = []
    for i in range(num_chips):
        red.append('red')
        blue.append('blue')
        green.append('green')

    # Make a column of chip numbers
    chip = np.arange(0, num_chips).reshape((num_chips, 1))
    chips = np.row_stack((chip, chip, chip))

    # Combine info
    color_data_r = np.column_stack((sr, tr, red))
    color_data_g = np.column_stack((sg, tg, green))
    color_data_b = np.column_stack((sb, tb, blue))
    all_color_data = np.row_stack((color_data_b, color_data_g, color_data_r))

    # Create a dataframe with headers
    dataset = pd.DataFrame({'source': all_color_data[:, 0],
                            'target': all_color_data[:, 1],
                            'color': all_color_data[:, 2]})

    # Add chip numbers to the dataframe
    dataset['chip'] = chips
    dataset = dataset.astype({'color': str, 'chip': str,
                              'target': float, 'source': float})

    # Make the plot
    p1 = (ggplot(dataset, aes(x='target', y='source', color='color', label='chip'))
          + geom_point(show_legend=False, size=2)
          + geom_smooth(method='lm', size=.5, show_legend=False)
          + theme_seaborn() + facet_grid('.~color')
          + geom_label(angle=15, size=7, nudge_y=-.25, nudge_x=.5, show_legend=False)
          + scale_x_continuous(limits=(-5, 270)) + scale_y_continuous(limits=(-5, 275))
          + scale_color_manual(values=['blue', 'green', 'red']))

    # Print or plot the figure depending on the debug mode
    if params.debug is not None:
        if params.debug == 'print':
            p1.save(os.path.join(params.debug_outdir, 'color_quick_check.png'))
        elif params.debug == 'plot':
            print(p1)
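
# A minimal usage sketch for quick_color_check (not from the original
# source). Assumes the plantcv-style module context above (`params`, numpy
# as np); the chip matrices are synthetic stand-ins for real card reads,
# with chip indices in column 0 and RGB means in columns 1-3.
import numpy as np

num_chips = 22
chip_idx = np.arange(num_chips).reshape(-1, 1)
rng = np.random.default_rng(1)
target = np.column_stack([chip_idx, rng.uniform(0, 255, (num_chips, 3))])
source = np.clip(target + rng.normal(0, 10, target.shape), 0, 255)
source[:, 0] = chip_idx[:, 0]
# params.debug = 'plot'  # uncomment to display the figure
quick_color_check(target, source, num_chips)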
def analyze_nir_intensity(gray_img, mask, bins=256, histplot=False):
    """This function calculates the intensity of each pixel associated with the plant and
    writes the values out to a file. It can also print out a histogram plot of pixel
    intensity and a pseudocolor image of the plant.

    Inputs:
    gray_img = 8- or 16-bit grayscale image data
    mask     = Binary mask made from selected contours
    bins     = number of classes to divide spectrum into
    histplot = if True plots histogram of intensity values

    Returns:
    analysis_images = NIR histogram image

    :param gray_img: numpy array
    :param mask: numpy array
    :param bins: int
    :param histplot: bool
    :return analysis_images: list
    """
    params.device += 1

    # Apply the plant-shaped mask to the image
    mask1 = binary_threshold(mask, 0, 255, 'light')
    mask1 = (mask1 / 255)
    masked = np.multiply(gray_img, mask1)

    # Calculate histogram
    if gray_img.dtype == 'uint16':
        maxval = 65536
    else:
        maxval = 256

    # Make a pseudo-RGB image
    rgbimg = cv2.cvtColor(gray_img, cv2.COLOR_GRAY2BGR)

    hist_nir, hist_bins = np.histogram(masked, bins, (1, maxval))

    hist_bins1 = hist_bins[:-1]
    hist_bins2 = [float(round(b, 2)) for b in hist_bins1]
    hist_nir1 = [float(n) for n in hist_nir]

    # Make hist percentage for plotting
    pixels = cv2.countNonZero(mask1)
    hist_percent = (hist_nir / float(pixels)) * 100

    # No longer returning a pseudocolored image
    # make mask to select the background
    # mask_inv = cv2.bitwise_not(mask)
    # img_back = cv2.bitwise_and(rgbimg, rgbimg, mask=mask_inv)
    # img_back1 = cv2.applyColorMap(img_back, colormap=1)
    # mask the background and color the plant with color scheme 'jet'
    # cplant = cv2.applyColorMap(rgbimg, colormap=2)
    # masked1 = apply_mask(cplant, mask, 'black')
    masked1 = cv2.bitwise_and(rgbimg, rgbimg, mask=mask)
    # cplant_back = cv2.add(masked1, img_back1)

    if params.debug is not None:
        if params.debug == "print":
            print_image(masked1, os.path.join(params.debug_outdir,
                                              str(params.device) + "_masked_nir_plant.jpg"))
        if params.debug == "plot":
            plot_image(masked1)

    analysis_images = []

    if histplot is True:
        hist_x = hist_percent
        bin_labels = np.arange(0, bins)
        dataset = pd.DataFrame({'Grayscale pixel intensity': bin_labels,
                                'Proportion of pixels (%)': hist_x})
        fig_hist = (ggplot(data=dataset,
                           mapping=aes(x='Grayscale pixel intensity',
                                       y='Proportion of pixels (%)'))
                    + geom_line(color='red')
                    + scale_x_continuous(breaks=list(range(0, bins, 25))))

        analysis_images.append(fig_hist)
        if params.debug == "print":
            fig_hist.save(os.path.join(params.debug_outdir,
                                       str(params.device) + '_nir_hist.png'))
        elif params.debug == "plot":
            print(fig_hist)

    outputs.add_observation(variable='nir_frequencies',
                            trait='near-infrared frequencies',
                            method='plantcv.plantcv.analyze_nir_intensity',
                            scale='frequency', datatype=list,
                            value=hist_nir1, label=hist_bins2)

    # Store images
    outputs.images.append(analysis_images)
    return analysis_images
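
# A minimal usage sketch for analyze_nir_intensity (not from the original
# source). Assumes the plantcv-style module context above (`params`,
# `outputs`, `binary_threshold`, cv2, numpy, pandas, plotnine); the image
# data are synthetic.
import numpy as np

gray_img = np.random.randint(0, 255, (120, 120), dtype=np.uint8)
mask = np.zeros_like(gray_img)
mask[30:90, 30:90] = 255  # pretend this square is the plant
hist_figs = analyze_nir_intensity(gray_img, mask, bins=256, histplot=True)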
def analyze_color(rgb_img, mask, hist_plot_type=None):
    """Analyze the color properties of an image object

    Inputs:
    rgb_img        = RGB image data
    mask           = Binary mask made from selected contours
    hist_plot_type = None, 'all', 'rgb', 'lab' or 'hsv'

    Returns:
    analysis_images = histogram output

    :param rgb_img: numpy.ndarray
    :param mask: numpy.ndarray
    :param hist_plot_type: str
    :return analysis_images: list
    """
    params.device += 1

    if len(np.shape(rgb_img)) < 3:
        fatal_error("rgb_img must be an RGB image")

    # Mask the input image
    masked = cv2.bitwise_and(rgb_img, rgb_img, mask=mask)
    # Extract the blue, green, and red channels
    b, g, r = cv2.split(masked)
    # Convert the BGR image to LAB
    lab = cv2.cvtColor(masked, cv2.COLOR_BGR2LAB)
    # Extract the lightness, green-magenta, and blue-yellow channels
    l, m, y = cv2.split(lab)
    # Convert the BGR image to HSV
    hsv = cv2.cvtColor(masked, cv2.COLOR_BGR2HSV)
    # Extract the hue, saturation, and value channels
    h, s, v = cv2.split(hsv)

    # Color channel dictionary
    channels = {"b": b, "g": g, "r": r, "l": l, "m": m, "y": y, "h": h, "s": s, "v": v}

    # Histogram plot types
    hist_types = {"ALL": ("b", "g", "r", "l", "m", "y", "h", "s", "v"),
                  "RGB": ("b", "g", "r"),
                  "LAB": ("l", "m", "y"),
                  "HSV": ("h", "s", "v")}

    if hist_plot_type is not None and hist_plot_type.upper() not in hist_types:
        fatal_error("The histogram plot type was " + str(hist_plot_type) +
                    ', but can only be one of the following: None, "all", "rgb", "lab", or "hsv"!')

    # Store histograms, plotting colors, and plotting labels
    histograms = {
        "b": {"label": "blue", "graph_color": "blue",
              "hist": [float(i[0]) for i in cv2.calcHist([channels["b"]], [0], mask, [256], [0, 255])]},
        "g": {"label": "green", "graph_color": "forestgreen",
              "hist": [float(i[0]) for i in cv2.calcHist([channels["g"]], [0], mask, [256], [0, 255])]},
        "r": {"label": "red", "graph_color": "red",
              "hist": [float(i[0]) for i in cv2.calcHist([channels["r"]], [0], mask, [256], [0, 255])]},
        "l": {"label": "lightness", "graph_color": "dimgray",
              "hist": [float(i[0]) for i in cv2.calcHist([channels["l"]], [0], mask, [256], [0, 255])]},
        "m": {"label": "green-magenta", "graph_color": "magenta",
              "hist": [float(i[0]) for i in cv2.calcHist([channels["m"]], [0], mask, [256], [0, 255])]},
        "y": {"label": "blue-yellow", "graph_color": "yellow",
              "hist": [float(i[0]) for i in cv2.calcHist([channels["y"]], [0], mask, [256], [0, 255])]},
        "h": {"label": "hue", "graph_color": "blueviolet",
              "hist": [float(i[0]) for i in cv2.calcHist([channels["h"]], [0], mask, [256], [0, 255])]},
        "s": {"label": "saturation", "graph_color": "cyan",
              "hist": [float(i[0]) for i in cv2.calcHist([channels["s"]], [0], mask, [256], [0, 255])]},
        "v": {"label": "value", "graph_color": "orange",
              "hist": [float(i[0]) for i in cv2.calcHist([channels["v"]], [0], mask, [256], [0, 255])]}
    }

    # Create list of bin labels for 8-bit data
    binval = np.arange(0, 256)
    bin_values = [i for i in binval]

    analysis_images = []

    # Create a dataframe of bin labels and histogram data
    dataset = pd.DataFrame({'bins': binval,
                            'blue': histograms["b"]["hist"],
                            'green': histograms["g"]["hist"],
                            'red': histograms["r"]["hist"],
                            'lightness': histograms["l"]["hist"],
                            'green-magenta': histograms["m"]["hist"],
                            'blue-yellow': histograms["y"]["hist"],
                            'hue': histograms["h"]["hist"],
                            'saturation': histograms["s"]["hist"],
                            'value': histograms["v"]["hist"]})

    # Make the histogram figure using plotnine
    if hist_plot_type is not None:
        if hist_plot_type.upper() == 'RGB':
            df_rgb = pd.melt(dataset, id_vars=['bins'],
                             value_vars=['blue', 'green', 'red'],
                             var_name='Color Channel', value_name='Pixels')
            hist_fig = (ggplot(df_rgb, aes(x='bins', y='Pixels', color='Color Channel'))
                        + geom_line()
                        + scale_x_continuous(breaks=list(range(0, 256, 25)))
                        + scale_color_manual(['blue', 'green', 'red']))
            analysis_images.append(hist_fig)

        elif hist_plot_type.upper() == 'LAB':
            df_lab = pd.melt(dataset, id_vars=['bins'],
                             value_vars=['lightness', 'green-magenta', 'blue-yellow'],
                             var_name='Color Channel', value_name='Pixels')
            hist_fig = (ggplot(df_lab, aes(x='bins', y='Pixels', color='Color Channel'))
                        + geom_line()
                        + scale_x_continuous(breaks=list(range(0, 256, 25)))
                        + scale_color_manual(['yellow', 'magenta', 'dimgray']))
            analysis_images.append(hist_fig)

        elif hist_plot_type.upper() == 'HSV':
            df_hsv = pd.melt(dataset, id_vars=['bins'],
                             value_vars=['hue', 'saturation', 'value'],
                             var_name='Color Channel', value_name='Pixels')
            hist_fig = (ggplot(df_hsv, aes(x='bins', y='Pixels', color='Color Channel'))
                        + geom_line()
                        + scale_x_continuous(breaks=list(range(0, 256, 25)))
                        + scale_color_manual(['blueviolet', 'cyan', 'orange']))
            analysis_images.append(hist_fig)

        elif hist_plot_type.upper() == 'ALL':
            s = pd.Series(['blue', 'green', 'red', 'lightness', 'green-magenta',
                           'blue-yellow', 'hue', 'saturation', 'value'], dtype="category")
            color_channels = ['blue', 'yellow', 'green', 'magenta', 'blueviolet',
                              'dimgray', 'red', 'cyan', 'orange']
            df_all = pd.melt(dataset, id_vars=['bins'], value_vars=s,
                             var_name='Color Channel', value_name='Pixels')
            hist_fig = (ggplot(df_all, aes(x='bins', y='Pixels', color='Color Channel'))
                        + geom_line()
                        + scale_x_continuous(breaks=list(range(0, 256, 25)))
                        + scale_color_manual(color_channels))
            analysis_images.append(hist_fig)

    # Hue values of zero are red but are also the value for pixels where hue is undefined.
    # The hue of a pixel is undefined when the pixel is achromatic.
    # Therefore, hue values of zero are excluded from the calculations below.

    # Calculate the median hue value
    # The median is rescaled from the encoded 0-179 range to the 0-359 degree range
    hue_median = np.median(h[np.where(h > 0)]) * 2

    # Calculate the circular mean and standard deviation of the encoded hue values
    # Both are rescaled from the encoded 0-179 range to the 0-359 degree range
    hue_circular_mean = stats.circmean(h[np.where(h > 0)], high=179, low=0) * 2
    hue_circular_std = stats.circstd(h[np.where(h > 0)], high=179, low=0) * 2

    # Plot or print the histogram
    if hist_plot_type is not None:
        if params.debug == 'print':
            hist_fig.save(os.path.join(params.debug_outdir,
                                       str(params.device) + '_analyze_color_hist.png'))
        elif params.debug == 'plot':
            print(hist_fig)

    # Store into global measurements
    # RGB signal values are in an unsigned 8-bit scale of 0-255
    rgb_values = [i for i in range(0, 256)]
    # Hue values are in a 0-359 degree scale, every 2 degrees at the midpoint of the interval
    hue_values = [i * 2 + 1 for i in range(0, 180)]
    # Percentage values on a 0-100 scale (lightness, saturation, and value)
    percent_values = [round((i / 255) * 100, 2) for i in range(0, 256)]
    # Diverging values on a -128 to 127 scale (green-magenta and blue-yellow)
    diverging_values = [i for i in range(-128, 128)]

    outputs.add_observation(variable='blue_frequencies', trait='blue frequencies',
                            method='plantcv.plantcv.analyze_color', scale='frequency', datatype=list,
                            value=histograms["b"]["hist"], label=rgb_values)
    outputs.add_observation(variable='green_frequencies', trait='green frequencies',
                            method='plantcv.plantcv.analyze_color', scale='frequency', datatype=list,
                            value=histograms["g"]["hist"], label=rgb_values)
    outputs.add_observation(variable='red_frequencies', trait='red frequencies',
                            method='plantcv.plantcv.analyze_color', scale='frequency', datatype=list,
                            value=histograms["r"]["hist"], label=rgb_values)
    outputs.add_observation(variable='lightness_frequencies', trait='lightness frequencies',
                            method='plantcv.plantcv.analyze_color', scale='frequency', datatype=list,
                            value=histograms["l"]["hist"], label=percent_values)
    outputs.add_observation(variable='green-magenta_frequencies', trait='green-magenta frequencies',
                            method='plantcv.plantcv.analyze_color', scale='frequency', datatype=list,
                            value=histograms["m"]["hist"], label=diverging_values)
    outputs.add_observation(variable='blue-yellow_frequencies', trait='blue-yellow frequencies',
                            method='plantcv.plantcv.analyze_color', scale='frequency', datatype=list,
                            value=histograms["y"]["hist"], label=diverging_values)
    outputs.add_observation(variable='hue_frequencies', trait='hue frequencies',
                            method='plantcv.plantcv.analyze_color', scale='frequency', datatype=list,
                            value=histograms["h"]["hist"], label=hue_values)
    outputs.add_observation(variable='saturation_frequencies', trait='saturation frequencies',
                            method='plantcv.plantcv.analyze_color', scale='frequency', datatype=list,
                            value=histograms["s"]["hist"], label=percent_values)
    outputs.add_observation(variable='value_frequencies', trait='value frequencies',
                            method='plantcv.plantcv.analyze_color', scale='frequency', datatype=list,
                            value=histograms["v"]["hist"], label=percent_values)
    outputs.add_observation(variable='hue_circular_mean', trait='hue circular mean',
                            method='plantcv.plantcv.analyze_color', scale='degrees', datatype=float,
                            value=hue_circular_mean, label='degrees')
    # NOTE: the original recorded hue_median here by mistake; store the
    # circular standard deviation computed above.
    outputs.add_observation(variable='hue_circular_std', trait='hue circular standard deviation',
                            method='plantcv.plantcv.analyze_color', scale='degrees', datatype=float,
                            value=hue_circular_std, label='degrees')
    outputs.add_observation(variable='hue_median', trait='hue median',
                            method='plantcv.plantcv.analyze_color', scale='degrees', datatype=float,
                            value=hue_median, label='degrees')

    # Store images
    outputs.images.append(analysis_images)
    return analysis_images
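
# A minimal usage sketch for analyze_color (not from the original source).
# Assumes the plantcv-style module context above (`params`, `outputs`,
# `fatal_error`, cv2, numpy, pandas, scipy.stats, plotnine); the image is
# synthetic.
import numpy as np

rgb_img = np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8)
mask = np.zeros((100, 100), dtype=np.uint8)
mask[20:80, 20:80] = 255  # analyze only this square region
figs = analyze_color(rgb_img, mask, hist_plot_type='hsv')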
def plot_char_percent_vs_accuracy_smooth(self, expo=False, no_models=False, columns=False):
    if self.y_max is not None:
        limits = [0, float(self.y_max)]
        eprint(f'Setting limits to: {limits}')
    else:
        limits = [0, 1]
    if expo:
        if os.path.exists('data/external/all_human_gameplay.json') and not self.no_humans:
            with open('data/external/all_human_gameplay.json') as f:
                all_gameplay = json.load(f)

            frames = []
            for event, name in [('parents', 'Intermediate'),
                                ('maryland', 'Expert'),
                                ('live', 'National')]:
                if self.merge_humans:
                    name = 'Human'
                gameplay = all_gameplay[event]
                if event != 'live':
                    control_correct_positions = gameplay['control_correct_positions']
                    control_wrong_positions = gameplay['control_wrong_positions']
                    control_positions = control_correct_positions + control_wrong_positions
                    control_positions = np.array(control_positions)
                    control_result = np.array(
                        len(control_correct_positions) * [1]
                        + len(control_wrong_positions) * [0])
                    argsort_control = np.argsort(control_positions)
                    control_x = control_positions[argsort_control]
                    control_sorted_result = control_result[argsort_control]
                    control_y = control_sorted_result.cumsum() / control_sorted_result.shape[0]
                    control_df = pd.DataFrame({'correct': control_y,
                                               'char_percent': control_x})
                    control_df['Dataset'] = 'Regular Test'
                    control_df['Guessing_Model'] = f' {name}'
                    frames.append(control_df)

                adv_correct_positions = gameplay['adv_correct_positions']
                adv_wrong_positions = gameplay['adv_wrong_positions']
                adv_positions = adv_correct_positions + adv_wrong_positions
                adv_positions = np.array(adv_positions)
                adv_result = np.array(
                    len(adv_correct_positions) * [1]
                    + len(adv_wrong_positions) * [0])
                argsort_adv = np.argsort(adv_positions)
                adv_x = adv_positions[argsort_adv]
                adv_sorted_result = adv_result[argsort_adv]
                adv_y = adv_sorted_result.cumsum() / adv_sorted_result.shape[0]
                adv_df = pd.DataFrame({'correct': adv_y, 'char_percent': adv_x})
                adv_df['Dataset'] = 'IR Adversarial'
                adv_df['Guessing_Model'] = f' {name}'
                frames.append(adv_df)

                if len(gameplay['advneural_correct_positions']) > 0:
                    adv_correct_positions = gameplay['advneural_correct_positions']
                    adv_wrong_positions = gameplay['advneural_wrong_positions']
                    adv_positions = adv_correct_positions + adv_wrong_positions
                    adv_positions = np.array(adv_positions)
                    adv_result = np.array(
                        len(adv_correct_positions) * [1]
                        + len(adv_wrong_positions) * [0])
                    argsort_adv = np.argsort(adv_positions)
                    adv_x = adv_positions[argsort_adv]
                    adv_sorted_result = adv_result[argsort_adv]
                    adv_y = adv_sorted_result.cumsum() / adv_sorted_result.shape[0]
                    adv_df = pd.DataFrame({'correct': adv_y, 'char_percent': adv_x})
                    adv_df['Dataset'] = 'RNN Adversarial'
                    adv_df['Guessing_Model'] = f' {name}'
                    frames.append(adv_df)

            human_df = pd.concat(frames)
            human_vals = sort_humans(list(human_df['Guessing_Model'].unique()))
            human_dtype = CategoricalDtype(human_vals, ordered=True)
            human_df['Guessing_Model'] = human_df['Guessing_Model'].astype(human_dtype)
            dataset_dtype = CategoricalDtype(
                ['Regular Test', 'IR Adversarial', 'RNN Adversarial'], ordered=True)
            human_df['Dataset'] = human_df['Dataset'].astype(dataset_dtype)

        if no_models:
            p = ggplot(human_df) + geom_point(shape='.')
        else:
            df = self.char_plot_df
            if 1 not in self.rounds:
                df = df[df['Dataset'] != 'Round 1 - IR Adversarial']
            if 2 not in self.rounds:
                df = df[df['Dataset'] != 'Round 2 - IR Adversarial']
                df = df[df['Dataset'] != 'Round 2 - RNN Adversarial']
            p = ggplot(df)
            if self.save_df is not None:
                eprint(f'Saving df to: {self.save_df}')
                df.to_json(self.save_df)

            if os.path.exists('data/external/all_human_gameplay.json') and not self.no_humans:
                eprint('Loading human data')
                p = p + geom_line(data=human_df)

        if columns:
            facet_conf = facet_wrap('Guessing_Model', ncol=1)
        else:
            facet_conf = facet_wrap('Guessing_Model', nrow=1)

        if not no_models:
            if self.mvg_avg_char:
                chart = stat_smooth(method='mavg', se=False,
                                    method_args={'window': 400})
            else:
                chart = stat_summary_bin(fun_data=mean_no_se, bins=20,
                                         shape='.', linetype='None', size=0.5)
        else:
            chart = None

        p = (p + facet_conf
             + aes(x='char_percent', y='correct', color='Dataset'))
        if chart is not None:
            p += chart
        p = (
            p
            + scale_y_continuous(breaks=np.linspace(0, 1, 6))
            + scale_x_continuous(breaks=[0, .5, 1])
            + coord_cartesian(ylim=limits)
            + xlab('Percent of Question Revealed')
            + ylab('Accuracy')
            + theme(
                # legend_position='top',
                legend_box_margin=0,
                legend_title=element_blank(),
                strip_text_x=element_text(margin={'t': 6, 'b': 6, 'l': 1, 'r': 5}))
            + scale_color_manual(values=['#FF3333', '#66CC00', '#3333FF', '#FFFF33'],
                                 name='Questions')
        )
        if self.title != '':
            p += ggtitle(self.title)
        return p
    else:
        if self.save_df is not None:
            eprint(f'Saving df to: {self.save_df}')
            # NOTE: the original referenced an undefined `df` in this branch;
            # save the plotted frame instead.
            self.char_plot_df.to_json(self.save_df)
        return (
            ggplot(self.char_plot_df)
            + aes(x='char_percent', y='correct', color='Guessing_Model')
            + stat_smooth(method='mavg', se=False, method_args={'window': 500})
            + scale_y_continuous(breaks=np.linspace(0, 1, 6))
            + coord_cartesian(ylim=limits)
        )
def syntactic_diversity_plots():
    with open('data/external/syntactic_diversity_table.json') as f:
        rows = json.load(f)
    parse_df = pd.DataFrame(rows)
    parse_df['parse_ratio'] = parse_df['unique_parses'] / parse_df['parses']
    melt_df = pd.melt(
        parse_df,
        id_vars=['dataset', 'depth', 'overlap', 'parses'],
        value_vars=['parse_ratio', 'unique_parses'],
        var_name='metric',
        value_name='y'
    )

    def label_facet(name):
        if name == 'parse_ratio':
            return 'Average Unique Parses per Instance'
        elif name == 'unique_parses':
            return 'Count of Unique Parses'

    def label_y(ys):
        formatted_ys = []
        for y in ys:
            y = str(y)
            if y.endswith('000.0'):
                formatted_ys.append(y[:-5] + 'K')
            else:
                formatted_ys.append(y)
        return formatted_ys

    p = (
        ggplot(melt_df)
        + aes(x='depth', y='y', color='dataset')
        + facet_wrap('metric', scales='free_y', nrow=2, labeller=label_facet)
        + geom_line() + geom_point()
        + xlab('Parse Truncation Depth')
        + ylab('')
        + scale_color_discrete(name='Dataset')
        + scale_y_continuous(labels=label_y)
        + scale_x_continuous(
            breaks=list(range(1, 11)),
            minor_breaks=list(range(1, 11)),
            limits=[1, 10])
        + theme_fs()
    )
    p.save(path.join(output_path, 'syn_div_plot.pdf'))

    p = (
        ggplot(parse_df)
        + aes(x='depth', y='unique_parses', color='dataset')
        + geom_line() + geom_point()
        + xlab('Parse Truncation Depth')
        + ylab('Count of Unique Parses')
        + scale_color_discrete(name='Dataset')
        + scale_x_continuous(
            breaks=list(range(1, 11)),
            minor_breaks=list(range(1, 11)),
            limits=[1, 10])
        + theme_fs()
    )
    p.save(path.join(output_path, 'n_unique_parses.pdf'))

    p = (
        ggplot(parse_df)
        + aes(x='depth', y='parse_ratio', color='dataset')
        + geom_line() + geom_point()
        + xlab('Parse Truncation Depth')
        + ylab('Average Unique Parses per Instance')
        + scale_color_discrete(name='Dataset')
        + scale_x_continuous(
            breaks=list(range(1, 11)),
            minor_breaks=list(range(1, 11)),
            limits=[1, 10])
        + scale_y_continuous(limits=[0, 1])
        + theme_fs()
    )
    p.save(path.join(output_path, 'parse_ratio.pdf'))