def test_labels():
    """
    Check that label components set and replace plot labels, and that
    invalid arguments are rejected.
    """
    plot = (ggplot(df, aes(x='x', y='y'))
            + geom_point()
            + xlab('xlab')
            + ylab('ylab')
            + ggtitle('title'))
    for key, expected in (('x', 'xlab'), ('y', 'ylab'), ('title', 'title')):
        assert plot.labels[key] == expected

    # labs() replaces all three labels in one go
    plot = plot + labs(x='xlab2', y='ylab2', title='title2')
    for key, expected in (('x', 'xlab2'), ('y', 'ylab2'),
                          ('title', 'title2')):
        assert plot.labels[key] == expected

    # Each of these is expected to raise PlotnineError
    invalid_components = (
        lambda: xlab(None),
        lambda: ylab(None),
        lambda: ggtitle(None),
        lambda: labs('x', 'y'),
    )
    for make_component in invalid_components:
        with pytest.raises(PlotnineError):
            plot = plot + make_component()
def plot_overlap_duration(self, data, options):
    """Scatter plot of tag overlap against tag duration.

    Only rows of ``data["matches"]`` whose ``tag_overlap`` is positive are
    plotted. The title reports the model, database and class found in
    ``options["scenario_info"]`` followed by the full ``options`` value.

    Returns the ggplot object.
    """
    all_matches = data["matches"]
    overlapping = all_matches.loc[all_matches.tag_overlap > 0]

    scatter = ggplot(
        data=overlapping,
        mapping=aes(x="tag_duration", y="tag_overlap"),
    )

    text_theme = theme(
        axis_text_x=element_text(angle=90, vjust=1, hjust=1, margin={"r": -30}),
        plot_title=element_text(weight="bold", size=14, margin={"t": 10, "b": 10}),
        figure_size=(10, 10),
        text=element_text(size=12, weight="bold"),
    )

    title_template = (
        "Proportion of tag overlapping with matching event depending on duration "
        + "size for model {}, database {}, class {}\n"
        + "with detector options {}"
    )
    scenario = options["scenario_info"]
    title = title_template.format(
        scenario["model"],
        scenario["database"],
        scenario["class"],
        options,
    )

    return (
        scatter
        + geom_point()
        + xlab("Tag duration")
        + ylab("Proportion tag overlapping with matching event")
        + theme_classic()
        + text_theme
        + ggtitle(title)
    )
def plot_replicate_groups(self):
    """Plot current against time for both replicate data sets.

    ``self.data1df`` and ``self.data2df`` are each tagged with an
    ``'Experiment'`` column ('1' and '2') and drawn as lines coloured by
    ``'Channel'``. Both frames and the plot are printed.

    Returns the ggplot object.
    """
    from plotnine import ggplot, aes, ylab, xlab, geom_line

    df1 = self.data1df
    df2 = self.data2df
    # DataFrame.insert raises ValueError when the column already exists, so
    # only tag a frame the first time; this keeps repeated calls working.
    for frame, experiment_id in ((df1, '1'), (df2, '2')):
        if 'Experiment' not in frame.columns:
            frame.insert(0, 'Experiment', experiment_id)

    print(df1)
    print(df2)

    plot = (ggplot()
            + ylab(u'Current (μA)')
            + xlab('Time (seconds)')
            + geom_line(df1, aes('Time', 'Current', color='Channel'))
            + geom_line(df2, aes('Time', 'Current', color='Channel')))
    print(plot)
    return plot
def accPlot(accsByNFeats):
    """Plot accuracy against ``p`` for every set and return the plotted data.

    ``accsByNFeats`` is a nested mapping ``set -> p -> accuracy``. The plot is
    printed; the long-format frame (columns ``p``, ``acc``, ``set``) is
    returned.
    """
    def _frame_for(set_name):
        # One single-row frame per p, stacked into one frame per set
        accs = accsByNFeats[set_name]
        rows = [
            pd.DataFrame(
                {"p": p, "acc": accs[p], "set": set_name},
                index=[str(p)],
            )
            for p in accs
        ]
        return pd.concat(rows, axis=0)

    ggd = pd.concat([_frame_for(s) for s in accsByNFeats])
    ggd['acc'] = ggd['acc'].astype(float)

    ggo = (
        gg.ggplot(ggd, gg.aes(x='p', y='acc', color='set'))
        + gg.geom_line(alpha=0.5)
        + gg.geom_point()
        + gg.theme_bw()
        + gg.scale_x_log10(breaks=[10, 100, 1000, 10000])
        + gg.scale_color_manual(
            values=['darkgray', 'black', 'red', 'dodgerblue'])
        + gg.ylab('Accuracy (5-fold CV)')
    )
    print(ggo)
    return ggd
def plot_key_stock_indicators(df, stock):
    """Plot key indicators over time, one facet per indicator.

    Args:
        df: DataFrame indexed by fetch date; must contain ``eps``, ``pe`` and
            ``annual_dividend_yield`` (asserted), and is also read for
            ``volume`` and ``last_price``.
        stock: not used by this function.

    Returns:
        Whatever ``plot_as_inline_html_data`` returns for the plot.
    """
    assert isinstance(df, pd.DataFrame)
    assert all([
        'eps' in df.columns,
        'pe' in df.columns,
        'annual_dividend_yield' in df.columns,
    ])

    # Work on a copy: the caller's frame must not be rescaled or gain a
    # 'fetch_date' column, otherwise repeated calls compound the scaling.
    df = df.copy()
    df['volume'] = df['last_price'] * df['volume'] / 1000000  # express as $(M)
    df['fetch_date'] = df.index

    plot_df = pd.melt(
        df,
        id_vars='fetch_date',
        value_vars=['pe', 'eps', 'annual_dividend_yield', 'volume',
                    'last_price'],
        var_name='indicator',
        value_name='value',
    )
    plot_df['value'] = pd.to_numeric(plot_df['value'])
    plot_df['fetch_date'] = pd.to_datetime(plot_df['fetch_date'])

    plot = (
        p9.ggplot(plot_df, p9.aes('fetch_date', 'value', color='indicator'))
        + p9.geom_line(size=1.5, show_legend=False)
        + p9.facet_wrap('~ indicator', nrow=6, ncol=1, scales='free_y')
        + p9.theme(axis_text_x=p9.element_text(angle=30, size=7),
                   figure_size=(8, 7))
        + p9.xlab("")
        + p9.ylab("")
    )
    return plot_as_inline_html_data(plot)
def plot_fundamentals(df, stock) -> str:
    """Plot fundamental indicators over time, one facet per indicator.

    Args:
        df: DataFrame indexed by fetch date containing every column listed in
            ``columns_to_report`` below (asserted).
        stock: not used by this function.

    Returns:
        The result of ``plot_as_inline_html_data`` for the plot.
    """
    assert isinstance(df, pd.DataFrame)
    columns_to_report = [
        "pe", "eps", "annual_dividend_yield", "volume",
        "last_price", "change_in_percent_cumulative",
        "change_price", "market_cap", "number_of_shares",
    ]
    colnames = df.columns
    for column in columns_to_report:
        assert column in colnames

    # Work on a copy: rescaling below must not leak into the caller's frame,
    # otherwise a second call would scale the same values again.
    df = df.copy()
    df["volume"] = df["last_price"] * df["volume"] / 1000000  # express as $(M)
    df["market_cap"] /= 1000 * 1000
    df["number_of_shares"] /= 1000 * 1000
    df["fetch_date"] = df.index

    plot_df = pd.melt(
        df,
        id_vars="fetch_date",
        value_vars=columns_to_report,
        var_name="indicator",
        value_name="value",
    )
    plot_df["value"] = pd.to_numeric(plot_df["value"])
    plot_df["fetch_date"] = pd.to_datetime(plot_df["fetch_date"])

    n_panels = len(columns_to_report)
    plot = (
        p9.ggplot(plot_df, p9.aes("fetch_date", "value", color="indicator"))
        + p9.geom_line(size=1.5, show_legend=False)
        + p9.facet_wrap("~ indicator", nrow=n_panels, ncol=1, scales="free_y")
        + p9.theme(axis_text_x=p9.element_text(angle=30, size=7),
                   axis_text_y=p9.element_text(size=7),
                   figure_size=(8, n_panels))
        + p9.xlab("")
        + p9.ylab("")
    )
    return plot_as_inline_html_data(plot)
def round_2_plot():
    """Build the round 2 accuracy figure and save it as a PDF.

    The data file is downloaded (and its checksum verified) only when it is
    not already present at ``round_2_df_path``.
    """
    data_missing = not os.path.exists(round_2_df_path)
    if data_missing:
        eprint(f'Downloading {round_2_df_url} to {round_2_df_path}')
        urlretrieve(round_2_df_url, round_2_df_path)
        verify_checksum(round_2_df_checksum, round_2_df_path)

    df = pd.read_json(round_2_df_path)

    binned_means = stat_summary_bin(
        fun_data=mean_no_se, bins=20, shape='.', linetype='None', size=0.5)
    figure_theme = theme(
        legend_box_margin=0,
        legend_title=element_blank(),
        strip_text_x=element_text(margin={'t': 6, 'b': 6, 'l': 1, 'r': 5}),
    )
    dataset_colors = scale_color_manual(
        values=['#FF3333', '#66CC00', '#3333FF', '#FFFF33'],
        name='Questions',
    )

    p = (
        ggplot(df)
        + aes(x='char_percent', y='correct', color='Dataset')
        + facet_wrap('Guessing_Model', nrow=1)
        + binned_means
        + scale_y_continuous(breaks=np.linspace(0, 1, 6))
        + scale_x_continuous(breaks=[0, .5, 1])
        + coord_cartesian(ylim=[0, 0.7])
        + ggtitle('Round 2 Attacks and Models')
        + xlab('Percent of Question Revealed')
        + ylab('Accuracy')
        + figure_theme
        + dataset_colors
    )
    p.save('2019_tacl_trick/auto_fig/round_2_json.pdf', width=7.0, height=1.7)
def plot_predict(forecast):
    """Plot observed values, predictions and the prediction band.

    ``forecast`` is read for the columns ``ds``, ``y``, ``yhat``,
    ``yhat_lower`` and ``yhat_upper``. The chart is written to
    ``png/predict_pressure_chart.png`` next to this module and also returned.
    """
    observed = aes(x='ds', y='y')
    predicted = aes(x='ds', y='yhat')
    band = aes(ymin='yhat_lower', ymax='yhat_upper')
    axis_theme = theme(
        axis_text_x=element_text(
            angle=45, hjust=1, face='bold', color='black'),
        axis_text_y=element_text(face='bold', colour='black'),
    )

    p = (
        ggplot(data=forecast, mapping=observed)
        + geom_point(colour='blue', alpha=0.3, na_rm=True)
        + geom_line(colour='blue', na_rm=True)
        + geom_line(data=forecast, mapping=predicted, colour='red')
        + geom_ribbon(data=forecast, mapping=band, fill='blue', alpha=0.1)
        + scale_x_datetime(breaks='1 days', date_labels='%y-%m-%d %H:%M')
        + xlab('Time')
        + ylab('Pressure')
        + theme_bw()
        + axis_theme
    )

    module_dir = os.path.abspath(os.path.dirname(__file__))
    p.save(
        filename='predict_pressure_chart.png',
        path=os.path.join(module_dir, 'png'),
        width=8,
        height=6,
        units='in',
        dpi=326,
        verbose=False,
    )
    return p
def make_sentiment_plot(sentiment_df, exclude_zero_bin=True, plot_text_labels=True):
    """Tile plot of how many entries fall in each change bin per date.

    Columns named ``bin_<YYYY-MM-DD>`` are counted value by value. With
    ``exclude_zero_bin`` set, the "0.0" bin and any non-string bin value are
    skipped. With ``plot_text_labels`` set, each tile shows its count.

    Returns the result of ``plot_as_inline_html_data`` for the plot.
    """
    print(
        "Sentiment plot: exclude zero bins? {} show text? {}".format(
            exclude_zero_bin, plot_text_labels
        )
    )

    records = []
    bin_columns = [c for c in sentiment_df.columns if c.startswith("bin_")]
    for column in bin_columns:
        day = column[4:]  # text after the "bin_" prefix
        for bin_name, count in Counter(sentiment_df[column]).items():
            unwanted = bin_name == "0.0" or not isinstance(bin_name, str)
            if exclude_zero_bin and unwanted:
                continue
            records.append(
                {
                    "date": datetime.strptime(day, "%Y-%m-%d"),
                    "bin": str(bin_name),
                    "value": int(count),
                }
            )
    df = pd.DataFrame.from_records(records)

    # HACK TODO FIXME: should get from price_change_bins()...
    order = [
        "-1000.0", "-100.0", "-10.0", "-5.0", "-3.0", "-2.0", "-1.0",
        "-1e-06", "1e-06",
        "1.0", "2.0", "3.0", "5.0", "10.0", "25.0", "100.0", "1000.0",
    ]
    df["bin_ordered"] = pd.Categorical(df["bin"], categories=order)

    plot = (
        p9.ggplot(df, p9.aes("date", "bin_ordered", fill="value"))
        + p9.geom_tile(show_legend=False)
        + p9.theme_bw()
        + p9.xlab("")
        + p9.ylab("Percentage daily change")
        + p9.theme(axis_text_x=p9.element_text(angle=30, size=7),
                   figure_size=(10, 5))
    )
    if plot_text_labels:
        plot = plot + p9.geom_text(p9.aes(label="value"), size=8, color="white")
    return plot_as_inline_html_data(plot)
def plot_replicate_density(
    df,
    batch,
    plate,
    output_file_base=None,
    output_file_extensions=None,
    dpi=300,
    height=1.5,
    width=2,
):
    """Density plot of pairwise correlations, filled by replicate status.

    Args:
        df: frame with ``pairwise_correlation`` and ``replicate_info`` columns.
        batch, plate: shown in the title as ``"<batch>: <plate>"``.
        output_file_base: when truthy, the figure is also passed to
            ``save_figure`` with this base name.
        output_file_extensions: extensions handed to ``save_figure``;
            defaults to ``[".png", ".pdf", ".svg"]``.
        dpi, height, width: forwarded to ``save_figure``.

    Returns:
        The ggplot object.
    """
    # A fresh list per call: a shared list default would be visible to (and
    # mutable by) save_figure across calls.
    if output_file_extensions is None:
        output_file_extensions = [".png", ".pdf", ".svg"]

    density_gg = (
        gg.ggplot(df, gg.aes(x="pairwise_correlation", fill="replicate_info"))
        + gg.geom_density(alpha=0.3)
        + gg.scale_fill_manual(
            name="Replicate",
            labels={"True": "True", "False": "False"},
            values=["#B99638", "#2DB898"],
        )
        + gg.xlab("Pearson Correlation")
        + gg.ylab("Density")
        + gg.ggtitle("{}: {}".format(batch, plate))
        + gg.theme_bw()
        + gg.theme(
            title=gg.element_text(size=9),
            axis_text=gg.element_text(size=5),
            axis_title=gg.element_text(size=8),
            legend_text=gg.element_text(size=6),
            legend_title=gg.element_text(size=7),
            strip_text=gg.element_text(size=4, color="black"),
            strip_background=gg.element_rect(colour="black", fill="#fdfff4"),
        )
    )

    if output_file_base:
        save_figure(density_gg, output_file_base, output_file_extensions,
                    dpi, height, width)
    return density_gg
def customized_algorithm_plot(experiment_name='finite_simple_sanity',
                              data_path=_DEFAULT_DATA_PATH):
  """Plot mean instantaneous regret per timestep with a custom legend.

  Args:
    experiment_name: string = name of experiment config.
    data_path: string = where to look for the files.

  Returns:
    p: ggplot plot
  """
  raw_df = load_data(experiment_name, data_path)

  # Average the regret over everything sharing a (t, agent) pair
  grouped = raw_df.groupby(['t', 'agent'])
  plt_df = grouped.agg({'instant_regret': np.mean}).reset_index()
  plt_df['agent_new_name'] = plt_df.agent.apply(rename_agent)

  legend = gg.scale_color_manual(
      name='agent',
      labels=['Laplace TS', 'Langevin TS', 'TS', 'bootstrap TS'],
      values=["#E41A1C", "#377EB8", "#4DAF4A", "#984EA3"])

  return (gg.ggplot(plt_df)
          + gg.aes('t', 'instant_regret', colour='agent_new_name')
          + gg.geom_line(size=1.25, alpha=0.75)
          + gg.xlab('time period (t)')
          + gg.ylab('per-period regret')
          + legend)
def plot_mem(df):
    """Bar chart of ``pcnt`` per column with a ``size`` label on every bar.

    Bars above 30% of the largest ``pcnt`` get a white label just below the
    bar top; the remaining bars get a gray label just above it. The input
    frame is copied, not modified.
    """
    frame = df.copy()

    # Ordered categorical keeps the bars in the frame's row order
    frame['new_cols'] = list(map(str, frame['col_name']))
    frame['new_cols'] = pd.Categorical(
        frame['new_cols'], categories=frame['new_cols'], ordered=True)

    # Vertical offset for labels: 1/70th of the tallest bar
    pcnt_values = frame.pcnt.values
    label_offset = np.max(frame.pcnt.values) / 70
    frame['cnt_print_loc_pos'] = pcnt_values + label_offset
    frame['cnt_print_loc_neg'] = pcnt_values - label_offset

    bars = (
        p9.ggplot(frame, p9.aes(x='new_cols', y='pcnt', fill='new_cols'))
        + p9.geom_bar(stat='identity')
        + p9.guides(fill=False)
        + p9.ylab('% of total size')
        + p9.xlab('')
        + p9.theme(axis_text_x=p9.element_text(rotation=45, hjust=1))
    )

    tall_threshold = 0.3 * np.max(frame.pcnt)

    # Labels inside the tall bars
    tall = frame.copy()[frame.pcnt > tall_threshold]
    bars = bars + p9.geom_text(
        p9.aes(x='new_cols', y='cnt_print_loc_neg', label='size',
               fill='col_name'),
        inherit_aes=False, data=tall, color='white',
        angle=90, vjust='top')

    # Labels on top of the short bars
    short = frame.copy()[frame.pcnt <= tall_threshold]
    bars = bars + p9.geom_text(
        p9.aes(x='new_cols', y='cnt_print_loc_pos', label='size',
               fill='col_name'),
        inherit_aes=False, data=short, color='gray',
        angle=90, vjust='bottom')

    return bars
def create_length_plot(len_df, legend_position='right', legend_box='vertical'):
    """Density histogram of ``x`` per method, faceted by task.

    The per (Task, Method) mean of ``x`` is marked with a dashed vertical
    segment and printed above it. ``legend_position`` and ``legend_box`` are
    forwarded to the theme.
    """
    means = len_df.groupby(['Task', 'Method']).mean().reset_index()
    # Single-valued column so the mean marker gets its own legend entry
    means[' '] = 'Mean Length'

    mean_labels = geom_text(
        aes(x='x', y=.22, label='x', color='Method'),
        means,
        inherit_aes=False,
        format_string='{:.1f}',
        show_legend=False,
    )
    mean_markers = geom_segment(
        aes(x='x', xend='x', y=0, yend=.205, linetype=' '),
        means,
        inherit_aes=False,
        color='black',
    )
    legend_theme = theme(
        aspect_ratio=1,
        legend_title=element_blank(),
        legend_position=legend_position,
        legend_box=legend_box,
    )

    return (
        ggplot(len_df)
        + aes(x='x', fill='Method', y='..density..')
        + geom_histogram(binwidth=2, position='identity', alpha=.6)
        + mean_labels
        + mean_markers
        + scale_linetype_manual(['dashed'])
        + facet_wrap('Task')
        + xlim(0, 20)
        + ylim(0, .23)
        + xlab('Example Length')
        + ylab('Frequency')
        + scale_color_manual(values=COLORS)
        + scale_fill_manual(values=COLORS)
        + theme_fs()
        + legend_theme
    )
def cum_regret_plot(experiment_name, data_path=_DEFAULT_DATA_PATH):
  """Plot mean cumulative regret per agent over time with an interval band.

  Args:
    experiment_name: string = name of experiment config.
    data_path: string = where to look for the files.

  Returns:
    dict mapping experiment_name + '_cum_regret' to the ggplot plot.
  """
  raw_df = load_data(experiment_name, data_path)

  aggregated = raw_df.groupby(['t', 'agent']).agg(
      {'cum_regret': [np.mean, lower_interval, upper_interval]})
  plt_df = aggregated.reset_index()
  # Flatten the two-level column names, e.g. ('t', '') -> 't_'
  plt_df.columns = ['_'.join(levels) for levels in plt_df.columns.values]

  interval_band = gg.geom_ribbon(
      gg.aes(ymin='cum_regret_lower_interval',
             ymax='cum_regret_upper_interval',
             fill='agent_'),
      alpha=0.1)

  p = (gg.ggplot(plt_df)
       + gg.aes('t_', 'cum_regret_mean', colour='agent_')
       + gg.geom_line(size=1.25, alpha=0.75)
       + interval_band
       + gg.xlab('time period (t)')
       + gg.ylab('cumulative regret')
       + gg.scale_colour_brewer(name='agent_', type='qual', palette='Set1'))

  return {experiment_name + '_cum_regret': p}
def round_1_plot():
    """Build the round 1 accuracy figure from the CSV and save it as a PDF."""
    df = pd.read_csv('2019_tacl_trick/data/round_1.csv')
    df['Model'] = df['Model'].astype(
        CategoricalDtype(['DAN', 'RNN', 'IR'], ordered=True))

    # Hack: pad one legend entry so legend widths match across plots
    def _pad_label(name):
        padded = 'Round 1 - IR Adversarial '
        return padded if name == 'Round 1 - IR Adversarial' else name

    df['Dataset'] = df['Dataset'].map(_pad_label)

    strip_theme = theme(
        strip_text_x=element_text(margin={'t': 6, 'b': 6, 'l': 1, 'r': 5}))
    dataset_colors = scale_color_manual(
        values=['#FF3333', '#66CC00', '#3333FF', '#FFFF33'],
        name='Questions')

    p = (
        ggplot(df)
        + aes(x='x', y='y', color='Dataset')
        + facet_wrap('Model', nrow=1)
        + geom_point(size=1.0, shape='o')
        + scale_y_continuous(breaks=np.linspace(0, 1, 6), limits=[0, 0.6])
        + scale_x_continuous(breaks=[0, .5, 1])
        + xlab('Percent of Question Revealed')
        + ylab('Accuracy')
        + ggtitle('Round 1 Attacks and Models')
        + strip_theme
        + dataset_colors
    )
    p.save('2019_tacl_trick/auto_fig/round_1_csv.pdf', width=7.0, height=1.7)
def plot_zmw_stats(self, **kwargs):
    """Plot of ZMW stats for all runs.

    Note
    ----
    Raises an error if :meth:`Summaries.has_zmw_stats` is not `True`.

    Parameters
    ----------
    ``**kwargs`` : dict
        Keyword arguments passed to :meth:`Summaries.zmw_stats`.

    Returns
    -------
    plotnine.ggplot.ggplot
        Stacked bar graph of ZMW stats for each run.

    """
    df = self.zmw_stats(**kwargs)

    # Figure width grows with the number of distinct run names
    n_names = len(df['name'].unique())
    bar_theme = p9.theme(
        axis_text_x=p9.element_text(angle=90, vjust=1, hjust=0.5),
        figure_size=(0.4 * n_names, 2.5),
    )

    p = (
        p9.ggplot(df, p9.aes(x='name', y='number', fill='status'))
        + p9.geom_col(position=p9.position_stack(reverse=True), width=0.8)
        + bar_theme
        + p9.ylab('number of ZMWs')
        + p9.xlab('')
    )

    # Use the palette (minus its first entry) only when it has enough colors
    n_statuses = len(df['status'].unique())
    if n_statuses < len(CBPALETTE):
        p = p + p9.scale_fill_manual(CBPALETTE[1:])

    return p
def summary(tags, opts=None):
    """Count annotations per tag and background, then plot and save them.

    Prints ``tags`` and the summary table, writes a dodged bar chart to
    ``tag_species_bg.png`` and prints the plot.

    Args:
        tags: frame with ``tag`` and ``background`` columns.
        opts: not used by this function.
    """
    print(tags)
    tags_summary = (
        tags.groupby(["tag", "background"])
        .agg({"tag": "count"})
        .rename(columns={"tag": "n_tags"})
        .reset_index()
        .astype({"background": "category", "tag": "category"})
    )
    print(tags_summary)

    plot = (
        ggplot(
            data=tags_summary,
            mapping=aes(x="tag", y="n_tags", fill="background"),
        )
        + geom_bar(stat="identity", show_legend=True, position=position_dodge())
        + xlab("Species")
        + ylab("Number of annotations")
        + geom_text(mapping=aes(label="n_tags"), nudge_y=15)
        + theme_classic()
        + theme(axis_text_x=element_text(angle=90, vjust=1, hjust=1, margin={"r": -30}))
    )
    # Keep the plot object itself: binding the name to the result of .save()
    # would leave nothing but None to print afterwards.
    plot.save("tag_species_bg.png", width=10, height=8)
    print(plot)
def misspecified_plot(experiment_name='finite_misspecified',
                      data_path=_DEFAULT_DATA_PATH):
  """Specialized plotting script for TS tutorial paper misspecified TS.

  Returns a dict with the regret plot under 'misspecified_regret' and the
  posterior-mean plot under 'misspecified_action'.
  """
  df = load_data(experiment_name, data_path)

  def _parse_np_array(np_string):
    # Drop the brackets and split the remaining text on whitespace
    without_brackets = np_string.replace('[', '').replace(']', '')
    return np.array(without_brackets.strip().split())

  df['posterior_mean'] = df.posterior_mean.apply(_parse_np_array)

  # One float column per entry of the parsed posterior mean
  new_col_list = ['mean_0', 'mean_1', 'mean_2']
  for n, col in enumerate(new_col_list):
    df[col] = df['posterior_mean'].apply(lambda x, n=n: float(x[n]))

  aggregations = {'instant_regret': np.mean}
  aggregations.update({col: np.mean for col in new_col_list})
  plt_df = df.groupby(['agent', 't']).agg(aggregations).reset_index()

  regret_plot = (gg.ggplot(plt_df)
                 + gg.aes('t', 'instant_regret', colour='agent')
                 + gg.geom_line(size=1.25, alpha=0.75)
                 + gg.xlab('Timestep (t)')
                 + gg.ylab('Average instantaneous regret')
                 + gg.scale_colour_brewer(
                     name='Agent', type='qual', palette='Set1')
                 + gg.coord_cartesian(ylim=(0, 0.02)))

  melt_df = pd.melt(plt_df, id_vars=['agent', 't'], value_vars=new_col_list)
  # One line per (agent, mean column) pair
  melt_df['group_id'] = melt_df.agent + melt_df.variable

  action_plot = (gg.ggplot(melt_df)
                 + gg.aes('t', 'value', colour='agent', group='group_id')
                 + gg.geom_line(size=1.25, alpha=0.75)
                 + gg.coord_cartesian(ylim=(0, 0.05))
                 + gg.xlab('Timestep (t)')
                 + gg.ylab('Expected mean reward')
                 + gg.scale_colour_brewer(
                     name='Agent', type='qual', palette='Set1'))

  return {
      'misspecified_regret': regret_plot,
      'misspecified_action': action_plot,
  }
def plot_seeds(df: pd.DataFrame,
               sweep_vars: Sequence[str] = None) -> gg.ggplot:
  """Plot per-work-unit performance, coloured by noise scale."""
  seeds_plot = mnist_analysis.plot_seeds(
      df_in=df,
      sweep_vars=sweep_vars,
      colour_var='noise_scale',
  )
  y_label = gg.ylab('average accuracy (removing noise)')
  return seeds_plot + y_label
def plot_seeds(df: pd.DataFrame,
               sweep_vars: Optional[Sequence[str]] = None) -> gg.ggplot:
  """Plot per-work-unit performance, coloured by reward scale."""
  seeds_plot = bandit_analysis.plot_seeds(
      df_in=df,
      sweep_vars=sweep_vars,
      colour_var='reward_scale',
  )
  y_label = gg.ylab('average episodic return (after rescaling)')
  return seeds_plot + y_label
def plot_seeds(df: pd.DataFrame,
               sweep_vars: Optional[Sequence[str]] = None) -> gg.ggplot:
  """Plot per-work-unit performance, coloured by noise scale."""
  seeds_plot = catch_analysis.plot_seeds(
      df_in=df,
      sweep_vars=sweep_vars,
      colour_var='noise_scale',
  )
  y_label = gg.ylab('average episodic return (removing noise)')
  return seeds_plot + y_label
def __plot(
    self,
    plot_data,
    x,
    y,
    colour,
    lbl_x,
    lbl_y,
    facet,
    facet_scales,
    facet_by,
    smoothed,
    points,
    error_bars,
    save,
):
    """Assemble a line (or smoothed) plot from ``plot_data``.

    ``x``, ``y`` and ``colour`` are the mapped columns; ``lbl_x`` / ``lbl_y``
    are the axis titles. ``facet`` enables a facet wrap on ``facet_by`` using
    ``facet_scales``; ``points`` adds a point layer; ``smoothed`` swaps the
    plain line for a moving-average smoother. ``error_bars`` is accepted but
    currently has no effect. When ``save`` is truthy it is expanded as keyword
    arguments to the plot's ``save`` method.

    Returns the ggplot object.
    """
    palette = [
        "#000000",
        "#E69F00",
        "#56B4E9",
        "#009E73",
        "#0072B2",
        "#D55E00",
        "#CC79A7",
    ]

    figure = ggplot(data=plot_data, mapping=aes(x=x, y=y, colour=colour))

    components = [xlab(lbl_x), ylab(lbl_y)]

    if facet:
        # TODO: use facet as save
        nrow, ncol = self.get_facet_rows(plot_data, facet_by)
        components.append(
            facet_wrap(facet_by, nrow=nrow, ncol=ncol, scales=facet_scales)
        )

    if points:
        components.append(geom_point())

    # TODO: error bars are not implemented yet (needs a generic way to
    # compute them)

    # TODO: use smooth as save
    if smoothed:
        line_layer = geom_smooth(
            method="mavg",
            se=False,
            method_args={"window": 4, "center": True, "min_periods": 1},
        )
    else:
        line_layer = geom_line()
    components.append(line_layer)

    components.append(scale_colour_manual(values=palette, guide=False))
    # NOTE(review): label_x is not a parameter here; presumably a
    # module-level label formatter — confirm.
    components.append(scale_x_continuous(labels=label_x))
    components.append(theme(figure_size=(15, 18), dpi=150))

    for component in components:
        figure += component

    if save:
        figure.save(**save)
    return figure
def plot_market_wide_sector_performance(all_stocks_cip: pd.DataFrame):
    """
    Display specified dates for average sector performance. Each company is
    assumed to start at zero at the beginning of the observation period.
    A plot as base64 data is returned.

    ``all_stocks_cip`` is merged (by index) against the ``asx_code`` column of
    ``stocks_by_sector()``, the values are summed cumulatively across the
    columns, averaged per sector and drawn with one facet per sector.
    """
    n_stocks = len(all_stocks_cip)

    # merge in sector information for each company
    code_and_sector = stocks_by_sector()
    n_unique_sectors = len(code_and_sector["sector_name"].unique())
    print("Found {} unique sectors".format(n_unique_sectors))

    df = all_stocks_cip.merge(code_and_sector, left_index=True, right_on="asx_code")
    print(
        "Found {} stocks, {} sectors and merged total: {}".format(
            n_stocks, len(code_and_sector), len(df)
        )
    )

    # cumulative sum over the date columns, merged back date-wise into df
    cumulative_pct_change = df.expanding(axis="columns").sum()
    for date in cumulative_pct_change.columns:
        df[date] = cumulative_pct_change[date]

    grouped_df = df.groupby("sector_name").mean()

    # ready the dataframe for plotting
    grouped_df = pd.melt(
        grouped_df,
        ignore_index=False,
        var_name="date",
        value_name="cumulative_change_percent",
    )
    grouped_df["sector"] = grouped_df.index
    grouped_df["date"] = pd.to_datetime(grouped_df["date"])

    n_col = 3
    # Ceiling division: `n // n_col + 1` adds a spare empty row whenever the
    # sector count is an exact multiple of n_col. Never go below one row.
    n_row = max(1, -(-n_unique_sectors // n_col))

    plot = (
        p9.ggplot(
            grouped_df, p9.aes("date", "cumulative_change_percent", color="sector")
        )
        + p9.geom_line(size=1.0)
        + p9.facet_wrap("~sector", nrow=n_row, ncol=n_col, scales="free_y")
        + p9.xlab("")
        + p9.ylab("Average sector change (%)")
        + p9.theme(
            axis_text_x=p9.element_text(angle=30, size=6),
            axis_text_y=p9.element_text(size=6),
            figure_size=(12, 6),
            panel_spacing=0.3,
            legend_position="none",
        )
    )
    return plot_as_inline_html_data(plot)
def plot_compare_accuracy(self, expo=False):
    """Bar chart of accuracy per guesser, faceted by position.

    With ``expo`` set, bars are additionally split (dodged) by ``Dataset``
    and the axes get explicit titles.
    """
    faceted = ggplot(self.acc_df) + facet_wrap('position')

    if expo:
        return (faceted
                + aes(x='guesser', y='accuracy', fill='Dataset')
                + geom_bar(stat='identity', position='dodge')
                + xlab('Guessing Model')
                + ylab('Accuracy'))

    return (faceted
            + aes(x='guesser', y='accuracy')
            + geom_bar(stat='identity'))
def plot_ccs_stats(self, variable, *, trim_frac=0.005, bins=25,
                   histogram_stat='count', maxcol=None, panelsize=1.75):
    """Plot histograms of CCS stats for all runs.

    Parameters
    ----------
    variable : {'length', 'passes', 'accuracy'}
        Variable for which we plot stats. You will get an error
        if :meth:`Summaries.has_stat` is not true for `variable`.
    trim_frac : float
        Trim this amount of the bottom and top fraction from the
        data before plotting. Useful if outliers greatly extend scale.
    bins : int
        Number of histogram bins
    histogram_stat : {'count', 'density'}
        Plot the count of CCSs or their density normalized for each run.
    maxcol : None or int
        Max number of columns in faceted plot.
    panelsize : float
        Size of each plot panel.

    Returns
    -------
    plotnine.ggplot.ggplot
        A panel of histograms.

    """
    # Drop values outside the [trim_frac, 1 - trim_frac] quantile range
    df = (self.ccs_stats(variable)
          .assign(lower=lambda x: x[variable].quantile(trim_frac),
                  upper=lambda x: x[variable].quantile(1 - trim_frac),
                  trim=lambda x: ((x[variable] > x['upper']) |
                                  (x[variable] < x['lower']))
                  )
          .query('not trim')
          )

    npanels = len(df['name'].unique())
    if maxcol is None:
        ncol = npanels
    else:
        ncol = min(maxcol, npanels)
    nrow = math.ceil(npanels / ncol)

    # The y axis shows whatever statistic was requested, so label it to
    # match instead of always calling it a count.
    if histogram_stat == 'count':
        y_label = 'number of CCSs'
    else:
        y_label = f"{histogram_stat} of CCSs"

    p = (p9.ggplot(df, p9.aes(variable, y=f"..{histogram_stat}..")) +
         p9.geom_histogram(bins=bins) +
         p9.facet_wrap('~ name', ncol=ncol) +
         p9.theme(figure_size=(panelsize * ncol, panelsize * nrow),
                  axis_text_x=p9.element_text(angle=90,
                                              vjust=1,
                                              hjust=0.5)
                  ) +
         p9.ylab(y_label)
         )

    return p
def plot_seeds(df: pd.DataFrame,
               sweep_vars: Sequence[str] = None,
               num_episodes: int = NUM_EPISODES) -> gg.ggplot:
  """Plot per-run returns through time with a reference line at exp(-1)."""
  seeds_plot = deep_sea_analysis.plot_seeds(
      df_in=df,
      sweep_vars=sweep_vars,
      yintercept=np.exp(-1),
      num_episodes=num_episodes,
  )
  y_label = gg.ylab('average episodic return (excluding additive noise)')
  return seeds_plot + y_label
def plot_scores(df, title=None, xlab=None, ylab=None):
    """Line plot of the configured score columns.

    ``title``, ``xlab`` and ``ylab`` are each applied only when not None.
    Returns the ggplot object.
    """
    mapping = gg.aes(x=cfg.SCORE_COLNAME_X, y=cfg.SCORE_COLNAME_Y)
    g = gg.ggplot(df, mapping) + gg.geom_line()

    optional_labels = (
        (title, gg.ggtitle),
        (xlab, gg.xlab),
        (ylab, gg.ylab),
    )
    for text, make_label in optional_labels:
        if text is not None:
            g += make_label(text)
    return g
def plot_scaling_log(plt_df: pd.DataFrame,
                     sweep_vars: Optional[Sequence[str]] = None,
                     with_baseline=True) -> gg.ggplot:
  """Plot learning-time scaling on log-log axes, faceted by sweep variables."""
  p = _base_scaling(plt_df, sweep_vars, with_baseline)
  log_axes_and_labels = (
      gg.scale_x_log10(breaks=[5, 10, 20, 50]),
      gg.scale_y_log10(breaks=[100, 300, 1000, 3000, 10000, 30000]),
      gg.xlab('deep sea problem size (log scale)'),
      gg.ylab('#episodes until < 90% bad episodes (log scale)'),
  )
  for component in log_axes_and_labels:
    p += component
  return plotting.facet_sweep_plot(p, sweep_vars)
def ensemble_plot(experiment_name='ensemble_nn', data_path=_DEFAULT_DATA_PATH):
  """Specialized plotting script for TS tutorial paper ensemble NN.

  Args:
    experiment_name: string = name of experiment config.
    data_path: string = where to look for the files.

  Returns:
    dict mapping experiment_name + '_' + agent family to a ggplot plot of
    mean instantaneous regret per timestep for the agents of that family.
  """
  df = load_data(experiment_name, data_path)
  plt_df = (df.groupby(['agent', 't']).agg({
      'instant_regret': np.mean
  }).reset_index())

  def _get_agent_family(agent_name):
    """Classify an agent by substrings of its lower-cased name."""
    if 'dropout' in agent_name.lower():
      return 'Dropout'
    elif 'ensemble' in agent_name.lower():
      return 'Ensemble'
    elif '/' in agent_name.lower():
      return 'Annealing epsilon'
    else:
      return 'Fixed epsilon'

  def _rename_ensemble(agent_name):
    """Rewrite '<n>-...ensemble...' names as 'ensemble=<n zero-padded>'."""
    if 'ensemble' in agent_name:
      n_ensemble = agent_name.split('-')[0]
      # zero-pad so the names sort numerically
      new_name = 'ensemble=' + n_ensemble.zfill(3)
      return new_name
    else:
      return agent_name

  plt_df['agent_name'] = plt_df.agent.apply(_rename_ensemble)
  plt_df['agent_family'] = plt_df.agent.apply(_get_agent_family)

  custom_colors = ['#d53e4f', '#fdae61', '#a6d96a', '#66c2a5', '#5e4fa2']

  plot_dict = {}
  # Group by a scalar key, not a one-element list: with a list, newer pandas
  # yields 1-tuples as group keys, which breaks both the comparison and the
  # string concatenation below.
  for agent_family, df_family in plt_df.groupby('agent_family'):
    if agent_family == 'Ensemble':
      custom_labels = [
          'Ensemble 3', 'Ensemble 10', 'Ensemble 30', 'Ensemble 100',
          'Ensemble 300'
      ]
      gg_legend = gg.scale_colour_manual(values=custom_colors,
                                         labels=custom_labels,
                                         name='Agent')
    else:
      gg_legend = gg.scale_colour_manual(custom_colors, name='Agent')

    p = (gg.ggplot(df_family)
         + gg.aes('t', 'instant_regret', colour='agent_name')
         + gg.geom_line(size=1.25, alpha=0.75)
         + gg.facet_wrap('~ agent_family')
         + gg_legend
         + gg.coord_cartesian(ylim=(0, 60))
         + gg.xlab('Timestep (t)')
         + gg.ylab('Average instantaneous regret')
         + gg.theme(figure_size=(6, 6)))
    plot_dict[experiment_name + '_' + agent_family] = p

  return plot_dict
def create(self, file_path: str) -> None:
    """Render the pattern-count bar chart and save it to ``file_path``.

    Bars follow the order of ``self._data["pattern"]``; each bar is
    annotated with its ``fraction`` formatted as a percentage.
    """
    text_theme = theme(
        text=element_text(size=32),
        axis_text_x=element_text(rotation=45, ha="right"),
    )
    chart = (
        ggplot(self._data, aes(x="pattern", y="count", label="fraction"))
        + geom_bar(stat="identity", fill="#1e4f79")
        + geom_text(va='bottom', size=24, format_string='{:.1%}')
        + scale_x_discrete(limits=self._data["pattern"])
        + scale_y_continuous(labels=comma_format(), expand=[0.1, 0])
        + ggtitle("Design Pattern Counts")
        + xlab("Design Pattern")
        + ylab("Count")
        + theme_classic(base_size=32, base_family="Helvetica")
        + text_theme
    )
    chart.save(file_path, width=24, height=8)
def plot_compare_accuracy(self, expo=False):
    """Accuracy per guesser as bars, one facet per position.

    The default is a plain bar chart; ``expo=True`` colours and dodges the
    bars by ``Dataset`` and titles both axes.
    """
    plot = ggplot(self.acc_df) + facet_wrap('position')

    if not expo:
        plot = plot + aes(x='guesser', y='accuracy')
        plot = plot + geom_bar(stat='identity')
        return plot

    for component in (
        aes(x='guesser', y='accuracy', fill='Dataset'),
        geom_bar(stat='identity', position='dodge'),
        xlab('Guessing Model'),
        ylab('Accuracy'),
    ):
        plot = plot + component
    return plot
def yoy_growth():
    """
    This creates figures showing the number of questions versus year in dataset.

    For every year from the first year with data up to 2017 (or the last year
    with data, if earlier) the cumulative number of questions and of distinct
    answer pages is plotted, and the figure is saved as
    ``question_answer_counts.pdf`` under ``output_path``.
    """
    with open('data/external/datasets/qanta.mapped.2018.04.18.json',
              encoding='utf-8') as f:
        questions = json.load(f)['questions']

    # Per-year answer pages and question counts; questions without a page
    # are ignored.
    year_pages = defaultdict(set)
    year_questions = Counter()
    for q in questions:
        if q['page'] is not None:
            year_pages[q['year']].add(q['page'])
            year_questions[q['year']] += 1

    start_year = min(year_pages)
    # 2017 is the latest year we have a full year's worth of data,
    # including partial 2018 isn't accurate
    end_year = min(2017, max(year_pages))

    # Single pass with running totals instead of re-summing all earlier
    # years for every year.
    year_rows = []
    pages_so_far = set()
    questions_so_far = 0
    for year in range(start_year, end_year + 1):
        pages_so_far |= year_pages.get(year, set())
        questions_so_far += year_questions[year]
        year_rows.append({'year': year, 'value': len(pages_so_far),
                          'Quantity': 'Distinct Answers'})
        year_rows.append({'year': year, 'Quantity': 'Total Questions',
                          'value': questions_so_far})

    year_df = pd.DataFrame(year_rows)
    count_cat = CategoricalDtype(
        categories=['Total Questions', 'Distinct Answers'], ordered=True)
    year_df['Quantity'] = year_df['Quantity'].astype(count_cat)

    eprint(year_df[year_df.Quantity == 'Total Questions'])

    p = (
        ggplot(year_df)
        + aes(x='year', y='value', color='Quantity')
        + geom_line() + geom_point()
        + xlab('Year')
        + ylab('Count up to Year (inclusive)')
        + theme_fs()
        + scale_x_continuous(breaks=list(range(start_year, end_year + 1, 2)))
    )
    p.save(path.join(output_path, 'question_answer_counts.pdf'))
def accPlot(accsByNFeats):
    """Print a plot of accuracy against ``p`` with one line per set.

    ``accsByNFeats`` is a nested mapping ``set -> p -> accuracy``.
    """
    per_set_frames = []
    for set_name, accs in ((s, accsByNFeats[s]) for s in accsByNFeats):
        single_rows = [
            DataFrame({"p": p, "acc": accs[p], "set": set_name},
                      index=[str(p)])
            for p in accs
        ]
        per_set_frames.append(pd.concat(single_rows, axis=0))

    ggd = pd.concat(per_set_frames)
    ggd['acc'] = ggd['acc'].astype(float)

    layers = (
        gg.geom_line(alpha=0.5),
        gg.geom_point(),
        gg.theme_bw(),
        gg.scale_x_log10(breaks=[10, 100, 1000, 10000]),
        gg.scale_color_manual(values=['darkgray', 'black', 'red', 'dodgerblue']),
        gg.ylab('Accuracy (5-fold CV)'),
    )
    ggo = gg.ggplot(ggd, gg.aes(x='p', y='acc', color='set'))
    for layer in layers:
        ggo += layer
    print(ggo)
def create_length_plot(len_df, legend_position='right', legend_box='vertical'):
    """Overlayed density histograms of ``x`` by method, one facet per task.

    Each (Task, Method) mean of ``x`` is drawn as a dashed vertical segment
    with its value printed above. The legend placement arguments are passed
    to the theme unchanged.
    """
    group_means = len_df.groupby(['Task', 'Method']).mean()
    mean_len_df = group_means.reset_index()
    # Constant column that names the linetype legend entry
    mean_len_df[' '] = 'Mean Length'

    histogram_layers = [
        aes(x='x', fill='Method', y='..density..'),
        geom_histogram(binwidth=2, position='identity', alpha=.6),
    ]
    mean_layers = [
        geom_text(
            aes(x='x', y=.22, label='x', color='Method'),
            mean_len_df, inherit_aes=False, format_string='{:.1f}',
            show_legend=False,
        ),
        geom_segment(
            aes(x='x', xend='x', y=0, yend=.205, linetype=' '),
            mean_len_df, inherit_aes=False, color='black',
        ),
        scale_linetype_manual(['dashed']),
    ]
    layout_layers = [
        facet_wrap('Task'),
        xlim(0, 20),
        ylim(0, .23),
        xlab('Example Length'),
        ylab('Frequency'),
    ]
    appearance_layers = [
        scale_color_manual(values=COLORS),
        scale_fill_manual(values=COLORS),
        theme_fs(),
        theme(
            aspect_ratio=1,
            legend_title=element_blank(),
            legend_position=legend_position,
            legend_box=legend_box,
        ),
    ]

    plt = ggplot(len_df)
    for layer in (histogram_layers + mean_layers
                  + layout_layers + appearance_layers):
        plt = plt + layer
    return plt
def syntactic_diversity_plots():
    """
    Render the three syntactic-diversity figures from the JSON table.

    Loads ``data/external/syntactic_diversity_table.json``, derives
    ``parse_ratio = unique_parses / parses`` and writes under ``output_path``:

    * ``syn_div_plot.pdf``: both metrics against depth, one facet row each
    * ``n_unique_parses.pdf``: unique parse count against depth
    * ``parse_ratio.pdf``: parse ratio against depth, y limited to [0, 1]
    """
    with open('data/external/syntactic_diversity_table.json') as f:
        rows = json.load(f)
    parse_df = pd.DataFrame(rows)
    parse_df['parse_ratio'] = parse_df['unique_parses'] / parse_df['parses']
    melt_df = pd.melt(
        parse_df,
        id_vars=['dataset', 'depth', 'overlap', 'parses'],
        value_vars=['parse_ratio', 'unique_parses'],
        var_name='metric',
        value_name='y'
    )

    facet_titles = {
        'parse_ratio': 'Average Unique Parses per Instance',
        'unique_parses': 'Count of Unique Parses',
    }

    def label_facet(name):
        # Unknown metric names map to None.
        return facet_titles.get(name)

    def label_y(ys):
        # Shorten labels such as '3000.0' to '3K'; leave everything else alone.
        def shorten(value):
            text = str(value)
            return text[:-5] + 'K' if text.endswith('000.0') else text

        return [shorten(value) for value in ys]

    def depth_axis():
        # Fresh scale per plot: integer depth ticks 1..10.
        ticks = list(range(1, 11))
        return scale_x_continuous(breaks=ticks, minor_breaks=list(ticks), limits=[1, 10])

    combined = (
        ggplot(melt_df)
        + aes(x='depth', y='y', color='dataset')
        + facet_wrap('metric', scales='free_y', nrow=2, labeller=label_facet)
        + geom_line()
        + geom_point()
        + xlab('Parse Truncation Depth')
        + ylab('')
        + scale_color_discrete(name='Dataset')
        + scale_y_continuous(labels=label_y)
        + depth_axis()
        + theme_fs()
    )
    combined.save(path.join(output_path, 'syn_div_plot.pdf'))

    unique_counts = (
        ggplot(parse_df)
        + aes(x='depth', y='unique_parses', color='dataset')
        + geom_line()
        + geom_point()
        + xlab('Parse Truncation Depth')
        + ylab('Count of Unique Parses')
        + scale_color_discrete(name='Dataset')
        + depth_axis()
        + theme_fs()
    )
    unique_counts.save(path.join(output_path, 'n_unique_parses.pdf'))

    ratio = (
        ggplot(parse_df)
        + aes(x='depth', y='parse_ratio', color='dataset')
        + geom_line()
        + geom_point()
        + xlab('Parse Truncation Depth')
        + ylab('Average Unique Parses per Instance')
        + scale_color_discrete(name='Dataset')
        + depth_axis()
        + scale_y_continuous(limits=[0, 1])
        + theme_fs()
    )
    ratio.save(path.join(output_path, 'parse_ratio.pdf'))
# Script section: embed the 'shen' samples in 2-D with t-SNE and print a
# labelled scatter plot colored by each sample's 'System' annotation.
# NOTE(review): `plt`, `TSNE`, `pd` and `gg` come from imports outside this
# section (presumably matplotlib.pyplot, sklearn.manifold.TSNE, pandas and
# plotnine); confirm against the file header.
plt.ion()

import RestrictedData
xnorms = RestrictedData.xnorms
annots = RestrictedData.annots

# random_state is fixed so repeated runs use the same seed.
tsne = TSNE(n_components=2, verbose=1, perplexity=10, method='barnes_hut', angle=0.5, init='pca', early_exaggeration=12, learning_rate=200, n_iter=1000, random_state=123)
tsneResults = tsne.fit_transform(xnorms['shen'].values)

# One row per sample; annotations are reindexed to the row order of
# xnorms['shen'] before the 'System' column is taken.
ggd = pd.DataFrame({'sample' : xnorms['shen'].index, 'system' : annots['shen'].reindex(xnorms['shen'].index)['System'], 'coord1' : tsneResults[:, 0], 'coord2' : tsneResults[:, 1]})

plt.close()
ggo = gg.ggplot(ggd, gg.aes(x='coord1', y='coord2', color='system', label='sample'))
ggo += gg.geom_point()
# Sample-name labels are nudged in y relative to their points and kept out of the legend.
ggo += gg.geom_text(nudge_y=9, show_legend=False)
ggo += gg.scale_color_manual(values=['firebrick', 'goldenrod', 'lightseagreen', 'darkorchid', 'darkslategray', 'dodgerblue'])
ggo += gg.theme_bw()
ggo += gg.xlab('tSNE coordinate 1')
ggo += gg.ylab('tSNE coordinate 2')
print(ggo)
def plot_char_percent_vs_accuracy_smooth(self, expo=False, no_models=False, columns=False):
    """
    Build the accuracy versus percent-of-question-revealed plot.

    Args:
        expo: when true, build the faceted plot (one facet per
            ``Guessing_Model``, colored by ``Dataset``) and overlay human
            gameplay curves if ``data/external/all_human_gameplay.json``
            exists and ``self.no_humans`` is false. When false, return a
            moving-average plot of ``self.char_plot_df`` colored by
            ``Guessing_Model``.
        no_models: (``expo`` only) plot just the human gameplay points and
            skip the model layer.
        columns: (``expo`` only) lay the facets out in one column instead of
            one row.

    Returns:
        The composed plot object; nothing is drawn or saved as an image here.
        If ``self.save_df`` is not None the plotted model frame is written
        there as JSON.

    Raises:
        ValueError: if ``expo`` and ``no_models`` are set but no human
            gameplay data is available to plot.
    """
    def cumulative_accuracy_frame(correct_positions, wrong_positions, dataset, model_name):
        # Sort all buzz positions and accumulate the fraction answered
        # correctly at or before each position.
        positions = np.array(correct_positions + wrong_positions)
        outcomes = np.array(len(correct_positions) * [1] + len(wrong_positions) * [0])
        order = np.argsort(positions)
        sorted_outcomes = outcomes[order]
        frame = pd.DataFrame({
            'correct': sorted_outcomes.cumsum() / sorted_outcomes.shape[0],
            'char_percent': positions[order],
        })
        frame['Dataset'] = dataset
        # Human names carry a leading space in the 'Guessing_Model' column.
        frame['Guessing_Model'] = f' {model_name}'
        return frame

    if self.y_max is not None:
        limits = [0, float(self.y_max)]
        eprint(f'Setting limits to: {limits}')
    else:
        limits = [0, 1]

    if not expo:
        if self.save_df is not None:
            eprint(f'Saving df to: {self.save_df}')
            # Save the frame that is plotted on this path; the local `df`
            # is only bound in the expo branch, so using it here raised
            # UnboundLocalError.
            self.char_plot_df.to_json(self.save_df)
        return (
            ggplot(self.char_plot_df)
            + aes(x='char_percent', y='correct', color='Guessing_Model')
            + stat_smooth(method='mavg', se=False, method_args={'window': 500})
            + scale_y_continuous(breaks=np.linspace(0, 1, 6))
            + coord_cartesian(ylim=limits)
        )

    human_path = 'data/external/all_human_gameplay.json'
    use_humans = os.path.exists(human_path) and not self.no_humans
    human_df = None
    if use_humans:
        with open(human_path) as f:
            all_gameplay = json.load(f)

        frames = []
        for event, name in [('parents', 'Intermediate'), ('maryland', 'Expert'), ('live', 'National')]:
            if self.merge_humans:
                name = 'Human'
            gameplay = all_gameplay[event]
            # The 'live' event contributes no control (regular test) frame.
            if event != 'live':
                frames.append(cumulative_accuracy_frame(
                    gameplay['control_correct_positions'],
                    gameplay['control_wrong_positions'],
                    'Regular Test',
                    name,
                ))
            frames.append(cumulative_accuracy_frame(
                gameplay['adv_correct_positions'],
                gameplay['adv_wrong_positions'],
                'IR Adversarial',
                name,
            ))
            if len(gameplay['advneural_correct_positions']) > 0:
                frames.append(cumulative_accuracy_frame(
                    gameplay['advneural_correct_positions'],
                    gameplay['advneural_wrong_positions'],
                    'RNN Adversarial',
                    name,
                ))

        human_df = pd.concat(frames)
        human_vals = sort_humans(list(human_df['Guessing_Model'].unique()))
        human_dtype = CategoricalDtype(human_vals, ordered=True)
        human_df['Guessing_Model'] = human_df['Guessing_Model'].astype(human_dtype)
        dataset_dtype = CategoricalDtype(['Regular Test', 'IR Adversarial', 'RNN Adversarial'], ordered=True)
        human_df['Dataset'] = human_df['Dataset'].astype(dataset_dtype)

    if no_models:
        if human_df is None:
            # Fail with a clear message instead of an UnboundLocalError.
            raise ValueError(
                'no_models=True requires human gameplay data, but it is '
                f'disabled or {human_path} does not exist'
            )
        p = ggplot(human_df) + geom_point(shape='.')
        chart = None
    else:
        df = self.char_plot_df
        # Drop datasets belonging to rounds that were not requested.
        if 1 not in self.rounds:
            df = df[df['Dataset'] != 'Round 1 - IR Adversarial']
        if 2 not in self.rounds:
            df = df[df['Dataset'] != 'Round 2 - IR Adversarial']
            df = df[df['Dataset'] != 'Round 2 - RNN Adversarial']
        p = ggplot(df)
        if self.save_df is not None:
            eprint(f'Saving df to: {self.save_df}')
            df.to_json(self.save_df)
        if use_humans:
            eprint('Loading human data')
            p = p + geom_line(data=human_df)

        if self.mvg_avg_char:
            chart = stat_smooth(method='mavg', se=False, method_args={'window': 400})
        else:
            chart = stat_summary_bin(fun_data=mean_no_se, bins=20, shape='.', linetype='None', size=0.5)

    if columns:
        facet_conf = facet_wrap('Guessing_Model', ncol=1)
    else:
        facet_conf = facet_wrap('Guessing_Model', nrow=1)

    p = (
        p + facet_conf
        + aes(x='char_percent', y='correct', color='Dataset')
    )
    if chart is not None:
        p += chart
    p = (
        p
        + scale_y_continuous(breaks=np.linspace(0, 1, 6))
        + scale_x_continuous(breaks=[0, .5, 1])
        + coord_cartesian(ylim=limits)
        + xlab('Percent of Question Revealed')
        + ylab('Accuracy')
        + theme(
            legend_box_margin=0,
            legend_title=element_blank(),
            strip_text_x=element_text(margin={'t': 6, 'b': 6, 'l': 1, 'r': 5})
        )
        + scale_color_manual(values=['#FF3333', '#66CC00', '#3333FF', '#FFFF33'], name='Questions')
    )
    if self.title != '':
        p += ggtitle(self.title)
    return p