def test_jitterdodge():
    """Draw black reference points under jitter-dodged, fill-mapped points."""
    letters = list(string.ascii_lowercase[:n])
    data = pd.DataFrame({
        'x': np.ones(n*2),
        'y': np.repeat(np.arange(n), 2),
        'letters': np.repeat(letters, 2),
    })
    jitterdodge = position_jitterdodge(random_state=random_state)

    base = ggplot(data, aes('x', 'y', fill='letters'))
    reference_points = geom_point(size=10, fill='black')
    dodged_points = geom_point(size=10, position=jitterdodge)

    p = base + reference_points + dodged_points
    assert p + _theme == 'jitterdodge'
def test_position_from_geom():
    """position.from_geom must accept a name, a full name, an instance or a class."""
    position_specs = (
        'jitter',
        'position_jitter',
        position_jitter(),
        position_jitter,
    )
    for spec in position_specs:
        geom = geom_point(position=spec)
        assert isinstance(position.from_geom(geom), position_jitter)
def test_no_fill():
    """Render points whose fill is None, 'none', '' and an actual color."""
    data = pd.DataFrame({'x': range(5), 'y': range(5)})

    fill_none = geom_point(color='red', fill=None, size=5, stroke=1.5)
    fill_none_string = geom_point(
        aes(y='y+1'), color='blue', fill='none', size=5, stroke=1.5)
    fill_empty_string = geom_point(
        aes(y='y+2'), color='green', fill='', size=5, stroke=1.5)
    fill_gray = geom_point(
        aes(y='y+3'), color='yellow', fill='gray', size=5, stroke=1.5)

    p = (ggplot(data, aes('x', 'y'))
         + fill_none
         + fill_none_string
         + fill_empty_string
         + fill_gray)
    assert p == 'no_fill'
def test_aesthetics():
    """Draw one column of points per geom_point aesthetic that is mapped."""
    data = pd.DataFrame({
        'a': range(5),
        'b': 2, 'c': 3, 'd': 4, 'e': 5,
        'f': 6, 'g': 7, 'h': 8, 'i': 9,
    })

    # One layer per x position; each maps a different aesthetic to 'a'.
    layers = (
        geom_point(aes(x='b')),
        geom_point(aes(x='c', size='a')),
        geom_point(aes(x='d', alpha='a'), size=10, show_legend=False),
        geom_point(aes(x='e', shape='factor(a)'), size=10,
                   show_legend=False),
        geom_point(aes(x='f', color='factor(a)'), size=10,
                   show_legend=False),
        geom_point(aes(x='g', fill='a'), stroke=0, size=10,
                   show_legend=False),
        geom_point(aes(x='h', stroke='a'), fill='white', color='green',
                   size=10),
        geom_point(aes(x='i', shape='factor(a)'), fill='brown', stroke=2,
                   size=10, show_legend=False),
    )

    p = ggplot(data, aes(y='a'))
    for lyr in layers:
        p = p + lyr
    p = p + theme(subplots_adjust={'right': 0.85})

    assert p == 'aesthetics'
def test_bool_mapping():
    """Plot points whose y values are booleans."""
    data = pd.DataFrame({'x': [1, 2, 3], 'y': [True, False, False]})
    base = ggplot(data, aes('x', 'y'))
    p = base + geom_point()
    assert p == 'bool_mapping'
def test_continuous_x():
    """Smooth over the data with the first and last three rows left out."""
    n_rows = len(df_continuous_x)
    trimmed = df_continuous_x[3:n_rows-3]

    points = geom_point()
    smoother = geom_smooth(
        trimmed, method='loess', color='blue', fullrange=False)

    p = ggplot(df_continuous_x, aes('x', 'y')) + points + smoother
    assert p == 'continuous_x'
def test_legend_fill_ratio():
    """Points plus a linear smooth, colored by the expression 'x<0.5'."""
    noisy = aes(y='y_noisy')
    base = ggplot(df_linear, aes('x', color='x<0.5'))
    points = geom_point(noisy)
    smoother = geom_smooth(aes(y='y_noisy'), method='lm', size=0.5, span=.3)

    p = base + points + smoother
    assert p == 'legend_fill_ratio'
def test_expand_limits():
    """Extend the lower y limit to 0 while leaving the upper limit alone."""
    data = pd.DataFrame({'x': range(5, 11), 'y': range(5, 11)})
    base = ggplot(aes('x', 'y'), data=data)
    widened = expand_limits(y=(0, None))

    p = base + geom_point() + widened
    assert p == 'expand_limits'
def test_hull():
    """Points and stat_hull output on mtcars, colored by factor(cyl)."""
    mapping = aes('wt', 'mpg', color='factor(cyl)')
    p = ggplot(mtcars) + mapping
    p = p + geom_point()
    p = p + stat_hull(size=1)
    assert p + _theme == 'hull'
def test_aes_inheritance():
    """geom_abline gets no mapping of its own here.

    slope and intercept are only mapped in the plot-level aes.
    """
    plot_mapping = aes('x', 'y', color='factor(z)',
                       slope='slope', intercept='intercept')
    points = geom_point(size=10, show_legend=False)
    lines = geom_abline(size=2)

    p = ggplot(df, plot_mapping) + points + lines
    assert p == 'aes_inheritance'
def test_non_linear_smooth_no_ci():
    """Loess smooth over the noisy data with se=False."""
    base = ggplot(df_linear, aes('x'))
    points = geom_point(aes(y='y_noisy'))
    smoother = geom_smooth(
        aes(y='y_noisy'),
        method='loess',
        span=.3,
        color='blue',
        se=False,
    )
    p = base + points + smoother
    assert p == 'non_linear_smooth_no_ci'
def test_linear_smooth():
    """Linear-model smooth over the noisy data."""
    base = ggplot(df_linear, aes('x'))
    points = geom_point(aes(y='y_noisy'))
    smoother = geom_smooth(
        aes(y='y_noisy'), method='lm', span=.3, color='blue')
    p = base + points + smoother
    assert p == 'linear_smooth'
def plot():
    """Save the protobowl per-user summary figures as PDFs.

    Writes four files under ``output/protobowl/``: a scatter of the
    per-user means and one density histogram each for the log record
    count, the accuracy and the relative buzzing position. Progress is
    printed to stdout.
    """
    outdir = 'output/protobowl/'
    pathlib.Path(outdir).mkdir(parents=True, exist_ok=True)

    records = load_protobowl()
    # Anything that is not literally True counts as False.
    records.result = records.result.apply(lambda outcome: outcome is True)
    records['log_n_records'] = records.user_n_records.apply(np.log)

    user_stat = records.groupby('uid').agg(np.mean)
    print('{} users'.format(len(user_stat)))
    print('{} records'.format(len(records)))

    # Alpha is the log record count scaled by its maximum.
    max_color = user_stat.log_n_records.max()
    scaled = user_stat.log_n_records.apply(lambda v: v / max_color)
    user_stat['alpha'] = pd.Series(scaled, index=user_stat.index)

    # 2D user plot
    point_mapping = aes(x='relative_position', y='result',
                        size='user_n_records', color='log_n_records',
                        alpha='alpha')
    no_legends = {'color': False, 'alpha': False, 'size': False}
    p0 = (ggplot(user_stat)
          + geom_point(point_mapping, show_legend=no_legends)
          + scale_color_gradient(high='#e31a1c', low='#ffffcc')
          + labs(x='Average buzzing position', y='Accuracy')
          + theme(aspect_ratio=1))
    p0.save(os.path.join(outdir, 'protobowl_users.pdf'))
    print('p0 done')

    # Density histograms:
    # (column, x label, outline color, fill color, file name, message)
    histograms = (
        ('log_n_records', 'Log number of records',
         '#e6550d', '#fee6ce', 'protobowl_hist.pdf', 'p1 done'),
        ('result', 'Accuracy',
         '#31a354', '#e5f5e0', 'protobowl_acc.pdf', 'p2 done'),
        ('relative_position', 'Average buzzing position',
         '#3182bd', '#deebf7', 'protobowl_pos.pdf', 'p3 done'),
    )
    for column, x_label, outline, fill_color, filename, done in histograms:
        hist = (ggplot(user_stat, aes(x=column, y='..density..'))
                + geom_histogram(color=outline, fill=fill_color)
                + geom_density()
                + labs(x=x_label, y='Density')
                + theme(aspect_ratio=0.3))
        hist.save(os.path.join(outdir, filename))
        print(done)
def test_scale_without_a_mapping():
    """Drawing with a color scale but no color mapping must warn."""
    data = pd.DataFrame({'x': [1, 2, 3]})
    unused_scale = scale_color.scale_color_continuous()
    p = ggplot(data, aes('x', 'x')) + geom_point() + unused_scale

    with pytest.warns(UserWarning):
        p.draw_test()
def test_ellipse():
    """Overlay the 't', 'norm' and 'euclid' ellipse types on the points."""
    t_ellipse = stat_ellipse(type='t')
    norm_ellipse = stat_ellipse(type='norm', color='red')
    euclid_ellipse = stat_ellipse(type='euclid', color='blue')

    p = ggplot(df, aes('x', 'y')) + geom_point()
    p = p + t_ellipse + norm_ellipse + euclid_ellipse
    assert p == 'ellipse'
def test_points():
    """Draw density_2d output as points sized and filled by calc(density)."""
    density_mapping = aes(fill='calc(density)', size='calc(density)')
    density_points = geom_point(
        density_mapping,
        stat='density_2d',
        stroke=0,
        n=16,
        contour=False,
    )
    radius_scale = scale_size_radius(range=(0, 6))

    p = p0 + density_points + radius_scale
    assert p == 'points'
def test_lines():
    """Quantile lines at .001, .5 and .999 over semi-transparent points.

    The two extreme quantile lines should bound the points from below
    and from above, and the .5 line should pass (approximately) through
    the middle.
    """
    points = geom_point(alpha=.5)
    quantile_lines = geom_quantile(
        quantiles=[.001, .5, .999], formula='y~x', size=2)

    p = ggplot(df, aes(x='x', y='y')) + points + quantile_lines
    assert p == 'lines'
def test_addition(self):
    """Layers can be added to a plot one by one, as a list or as Layers.

    Adding a layer or a list of layers to a bare geom must raise
    PlotnineError.
    """
    p = ggplot(df, aes('x', 'y'))

    # One geom at a time
    p1 = p
    for idx in range(3):
        p1 = p1 + self.lyrs[idx]
    assert _get_colors(p1) == colors

    # The whole list at once
    p2 = p + self.lyrs
    assert _get_colors(p2) == colors

    # Real layers
    real_layers = Layers(layer.from_geom(geom) for geom in self.lyrs)
    p3 = p + real_layers
    assert _get_colors(p3) == colors

    # In-place addition
    p += self.lyrs
    assert _get_colors(p) == colors

    with pytest.raises(PlotnineError):
        geom_point() + layer.from_geom(geom_point())

    with pytest.raises(PlotnineError):
        geom_point() + self.lyrs
def test_watermark():
    """Place the plotnine watermark image twice, the second with alpha 0.5."""
    here = os.path.dirname(os.path.realpath(__file__))
    filename = os.path.join(here, 'images/plotnine-watermark.png')
    data = pd.DataFrame({'x': [1, 2, 3], 'y': [1, 2, 3]})

    opaque_mark = watermark(filename, 150, 160)
    faded_mark = watermark(filename, 150, 210, 0.5)

    p = ggplot(data) + geom_point(aes('x', 'y'))
    p = p + opaque_mark + faded_mark
    assert p == 'watermark'
def test_jitter():
    """Default and narrow jitter over fixed points.

    Passing both a position and a width to geom_jitter must raise
    PlotnineError.
    """
    data = pd.DataFrame({'x': [1, 2, 1, 2], 'y': [1, 1, 2, 2]})

    fixed = geom_point(size=10)
    default_jitter = geom_jitter(
        size=10, color='red', random_state=random_state)
    narrow_jitter = geom_jitter(
        size=10, color='blue', width=0.1, height=0.1,
        random_state=random_state)

    p = ggplot(data, aes('x', 'y')) + fixed + default_jitter + narrow_jitter
    assert p + _theme == 'jitter'

    with pytest.raises(PlotnineError):
        geom_jitter(position=position_jitter(), width=0.1)
def test_multiple_annotation_geoms():
    """Annotate a scatter with a point, a text, a rect and a segment."""
    point_note = annotate('point', 0, 1, color='red', size=5)
    text_note = annotate(
        'text', 1, 2, label='Text', color='red', size=15, angle=45)
    rect_note = annotate(
        'rect', xmin=1.8, xmax=2.2, ymin=2.8, ymax=3.2,
        size=1, color='red', alpha=0.3)
    segment_note = annotate(
        'segment', x=2.8, y=3.8, xend=3.2, yend=4.2, color='red', size=1)

    p = ggplot(df, aes('x', 'y')) + geom_point()
    p = p + point_note + text_note + rect_note + segment_note
    assert p == 'multiple_annotation_geoms'
def test_aesthetics():
    """Stack horizontal lines, each mapping a different geom_hline aesthetic."""
    hlines = (
        geom_hline(aes(yintercept='yintercept'), size=2),
        geom_hline(aes(yintercept='yintercept+.1', alpha='z'), size=2),
        geom_hline(aes(yintercept='yintercept+.2', linetype='factor(z)'),
                   size=2),
        geom_hline(aes(yintercept='yintercept+.3', color='factor(z)'),
                   size=2),
        geom_hline(aes(yintercept='yintercept+.4', size='z')),
    )

    p = ggplot(df) + geom_point(aes('x', 'y'))
    for hline in hlines:
        p = p + hline

    assert p + _theme == 'aesthetics'
def yoy_growth():
    """
    This creates figures showing the number of questions versus year in
    dataset.

    For every year from the first year present up to ``min(2017, last
    year)`` it plots two cumulative counts (inclusive of that year): the
    number of questions with a non-null page and the number of distinct
    pages. The figure is saved as ``question_answer_counts.pdf`` under
    ``output_path``, and the 'Total Questions' rows are echoed through
    ``eprint``.
    """
    # Load everything first so the file is closed before any processing.
    with open('data/external/datasets/qanta.mapped.2018.04.18.json') as f:
        questions = json.load(f)['questions']

    year_pages = defaultdict(set)
    year_questions = Counter()
    for q in questions:
        # Questions without a page are left out of both counts.
        if q['page'] is not None:
            year_pages[q['year']].add(q['page'])
            year_questions[q['year']] += 1

    start_year = min(year_pages)
    # 2017 is the last year with a full year's worth of data; including
    # the partial 2018 would not be accurate.
    end_year = min(2017, max(year_pages))

    # One pass with running totals instead of re-summing every earlier
    # year for each target year.
    year_rows = []
    pages_so_far = set()
    questions_so_far = 0
    for year in range(start_year, end_year + 1):
        # .get avoids creating empty defaultdict entries for gap years.
        pages_so_far |= year_pages.get(year, set())
        questions_so_far += year_questions[year]
        year_rows.append({'year': year, 'value': len(pages_so_far),
                          'Quantity': 'Distinct Answers'})
        year_rows.append({'year': year, 'Quantity': 'Total Questions',
                          'value': questions_so_far})

    year_df = pd.DataFrame(year_rows)
    # Ordered categories fix the order of the two series.
    count_cat = CategoricalDtype(
        categories=['Total Questions', 'Distinct Answers'], ordered=True)
    year_df['Quantity'] = year_df['Quantity'].astype(count_cat)

    eprint(year_df[year_df.Quantity == 'Total Questions'])
    p = (
        ggplot(year_df)
        + aes(x='year', y='value', color='Quantity')
        + geom_line()
        + geom_point()
        + xlab('Year')
        + ylab('Count up to Year (inclusive)')
        + theme_fs()
        + scale_x_continuous(breaks=list(range(start_year, end_year + 1, 2)))
    )
    p.save(path.join(output_path, 'question_answer_counts.pdf'))
def fit_curve(self):
    """Fit a cubic polynomial to the protobowl position/weight curve.

    Records are sorted by relative position. For each position bucket
    (position truncated to 1/1000) the weight is one minus the running
    count of correct records divided by the total number of records. A
    degree-3 polynomial regression is fitted to those points, the
    fitted coefficients are printed, and a plot of the points and the
    fitted curve is saved to ``output/reporting/curve_score.pdf``.

    Returns
    -------
    pipeline : sklearn Pipeline
        The fitted PolynomialFeatures + LinearRegression pipeline.
    """
    df, _ = load_protobowl()
    # convert prompt to false: only a literal True counts as correct
    df.result = df.result.apply(lambda x: x is True)

    xy = list(zip(df.relative_position.tolist(), df.result.tolist()))
    xy = sorted(xy, key=lambda pair: pair[0])

    # Running count of correct records seen *before* each record; the
    # last record in a bucket wins.
    ratios = dict()
    cnt = 0
    for position, correct in xy:
        ratios[int(position*1000)] = cnt
        cnt += correct
    ratios = sorted(ratios.items(), key=lambda item: item[0])
    ratios = [(bucket / 1000, count) for bucket, count in ratios]

    # NOTE(review): the original first computed the number of correct
    # records and then immediately overwrote it with the total record
    # count. Only the total was ever used, so the dead computation is
    # gone; confirm that the total is the intended denominator.
    n_records = len(xy)
    curve = [(x, 1 - count / n_records) for x, count in ratios]

    X, y = list(map(list, zip(*curve)))
    X = np.asarray(X)
    y = np.asarray(y)

    degree = 3
    polynomial_features = PolynomialFeatures(degree=degree,
                                             include_bias=False)
    linear_regression = LinearRegression()
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])
    pipeline.fit(X[:, np.newaxis], y)
    print(pipeline.steps[1][1].coef_)

    def get_weight(x):
        return pipeline.predict(np.asarray([[x]]))[0]

    ddf = pd.DataFrame({'x': X, 'y': y})
    p0 = ggplot(ddf, aes(x='x', y='y')) \
        + geom_point(size=0.3, color='blue', alpha=0.5, shape='+') \
        + stat_function(fun=get_weight, color='red', size=2, alpha=0.5) \
        + labs(x='Position', y='Weight')
    p0.save('output/reporting/curve_score.pdf')
    p0.draw()
    return pipeline
def accPlot(accsByNFeats):
    """Print a line-and-point plot of accuracy against p, one color per set.

    ``accsByNFeats`` is indexed first by set name and then by ``p``;
    each value is an accuracy.
    """
    per_set_frames = []
    for set_name in accsByNFeats:
        accs = accsByNFeats[set_name]
        # One single-row frame per p, indexed by str(p).
        rows = [
            DataFrame({"p" : p, "acc" : accs[p], "set" : set_name},
                      index = [str(p)])
            for p in accs
        ]
        per_set_frames.append(pd.concat(rows, axis = 0))

    ggd = pd.concat(per_set_frames)
    ggd['acc'] = ggd['acc'].astype(float)

    ggo = (
        gg.ggplot(ggd, gg.aes(x='p', y='acc', color='set'))
        + gg.geom_line(alpha=0.5)
        + gg.geom_point()
        + gg.theme_bw()
        + gg.scale_x_log10(breaks=[10, 100, 1000, 10000])
        + gg.scale_color_manual(
            values=['darkgray', 'black', 'red', 'dodgerblue'])
        + gg.ylab('Accuracy (5-fold CV)')
    )
    print(ggo)
def fitted_actual(self, figure_size=(4, 4), sample_frac=1.0):
    """Scatter of fitted against actual values with a y = x reference line

    Parameters
    ----------
    figure_size : tuple(int, int), optional default=(4, 4)
        Plot size (width, height)

    sample_frac : float, optional default=1.0
        Fraction of data points to plot

    Returns
    -------
    plot : ggplot object
    """
    sampled = self.df.sample(frac=sample_frac)
    title = "Fitted vs Actual (R2 = {:.3f})".format(self.r2_score)

    points = geom_point(alpha=0.25)
    # Dashed red line where fitted equals actual.
    identity_line = geom_abline(
        slope=1, intercept=0, color="red", linetype="dashed")

    plot = ggplot(sampled, aes(x="y", y="yhat")) + points + identity_line
    plot = plot + labs(title=title, x="Actual", y="Fitted")
    return plot + theme(figure_size=figure_size)
def plot(self, pc1=1, pc2=2, selection=None, color='cell type', shape=None,
         filter_on='cell type', alpha=.8, bl_rm=False, data=None):
    """Scatter two principal components against each other.

    ``data`` defaults to ``self.plot_data``. When ``selection`` is given,
    only rows whose ``filter_on`` value is in it are kept. When ``bl_rm``
    is exactly True, rows flagged in ``self.blacklisted`` are dropped.
    ``shape`` optionally names a column to map to point shape.
    """
    frame = self.plot_data if data is None else data

    if selection is not None:
        keep = [value in selection for value in frame[filter_on]]
        frame = frame.loc[keep, ]
    if bl_rm is True:
        frame = frame.loc[~self.blacklisted, ]

    x_col = 'PC ' + str(pc1)
    y_col = 'PC ' + str(pc2)
    mapping = pn.aes(x_col, y_col, color=color)
    pl = pn.ggplot(mapping, frame) + pn.geom_point(alpha=alpha)

    if shape is None:
        return pl
    return pl + pn.aes(shape=shape)
def plot_scale(df: pd.DataFrame,
               sweep_vars: Sequence[Text] = None) -> gg.ggplot:
    """Plots the best episode observed by height_threshold."""
    processed = cp_swingup_preprocess(df_in=df)

    group_vars = ['height_threshold']
    if sweep_vars:
        group_vars += sweep_vars
    plt_df = processed.groupby(group_vars)['best_episode'].max().reset_index()

    # Color separates runs above GOOD_EPISODE from the rest.
    mapping = gg.aes(x='factor(height_threshold)',
                     y='best_episode',
                     colour='best_episode > {}'.format(GOOD_EPISODE))
    y_label = 'best return in first {} episodes'.format(NUM_EPISODES)

    p = gg.ggplot(plt_df) + mapping
    p = p + gg.geom_point(size=5, alpha=0.8)
    p = p + gg.scale_colour_manual(values=['#d73027', '#313695'])
    # axis hack: an invisible line at 0
    p = p + gg.geom_hline(gg.aes(yintercept=0.0), alpha=0)
    p = p + gg.scale_x_discrete(breaks=[0, 0.25, 0.5, 0.75, 1.0])
    p = p + gg.ylab(y_label)
    p = p + gg.xlab('height threshold')
    return plotting.facet_sweep_plot(p, sweep_vars)
class TestOther(object):
    """Draw geom_smooth with each of the remaining smoothing methods."""

    # Shared base plot: noisy points over x.
    p = ggplot(df_linear, aes('x')) + geom_point(aes(y='y_noisy'))

    def test_wls(self):
        smoother = geom_smooth(aes(y='y_noisy'), method='wls')
        (self.p + smoother).draw_test()

    def test_rlm(self):
        smoother = geom_smooth(aes(y='y_noisy'), method='rlm')
        p = self.p + smoother
        with pytest.warns(UserWarning):
            p.draw_test()

    def test_glm(self):
        smoother = geom_smooth(aes(y='y_noisy'), method='glm')
        (self.p + smoother).draw_test()

    def test_gls(self):
        smoother = geom_smooth(aes(y='y_noisy'), method='gls')
        (self.p + smoother).draw_test()

    def test_lowess(self):
        smoother = geom_smooth(aes(y='y_noisy'), method='lowess')
        p = self.p + smoother
        with pytest.warns(UserWarning):
            p.draw_test()

    def test_mavg(self):
        smoother = geom_smooth(
            aes(y='y_noisy'),
            method='mavg',
            method_args={'window': 10},
        )
        (self.p + smoother).draw_test()

    def test_gpr(self):
        # Nothing is drawn when scikit-learn is not installed.
        try:
            from sklearn import gaussian_process  # noqa:401
        except ImportError:
            return

        smoother = geom_smooth(aes(y='y_noisy'), method='gpr')
        (self.p + smoother).draw_test()
def qq_plot(self, figure_size=(6, 4), sample_frac=1.0):
    """Normal QQ plot of the residuals with a line through the quartiles

    Parameters
    ----------
    figure_size : tuple(int, int), optional default=(6, 4)
        Plot size (width, height)

    sample_frac : float, optional default=1.0
        Fraction of data points to plot

    Returns
    -------
    plot : ggplot object
    """
    # Normal quantiles at (i + 1) / (n + 1) for i = 0 .. n - 1
    n_obs = len(self.y)
    probabilities = [(rank + 1) / (n_obs + 1) for rank in range(n_obs)]
    q = stats.norm.ppf(probabilities)

    # The QQ line passes through the first and third quartiles of both
    # distributions.
    quartiles = [0.25, 0.75]
    r_quantiles = np.quantile(self.df.residual, quartiles)
    norm_quantiles = stats.norm.ppf(quartiles)
    residual_spread = r_quantiles[1] - r_quantiles[0]
    normal_spread = norm_quantiles[1] - norm_quantiles[0]
    qq_grad = residual_spread / normal_spread
    qq_int = r_quantiles[0] - qq_grad * norm_quantiles[0]

    # Pair the sorted residuals with the normal quantiles.
    sorted_residuals = self.df.residual.sort_values(ascending=True)
    qq = pd.DataFrame(zip(sorted_residuals, q), columns=["x", "norm_q"])

    qq_line = geom_abline(
        intercept=qq_int, slope=qq_grad, color="red", linetype="dashed")
    plot = ggplot(qq.sample(frac=sample_frac), aes(x="norm_q", y="x"))
    plot = plot + geom_point(alpha=0.25) + qq_line
    plot = plot + labs(
        title="QQ Plot", x="Normal Quantiles", y="Sample Quantiles")
    return plot + theme(figure_size=figure_size)
def generate_scatter_plots(
        data,
        x="pca1",
        y="pca2",
        nsample=200,
        random_state=100,
        selected_categories=['bioinformatics', 'neuroscience'],
        color_palette=['#a6cee3', '#1f78b4'],
        save_file_path="output/pca_plots/scatterplot_files/pca01_v_pca02.png"):
    """Scatter two columns of ``data`` for the selected categories.

    Rows are restricted to ``selected_categories``. Each category is
    down-sampled to ``nsample`` rows (seeded by ``random_state``) when
    it has more than that. Categories are colored by pairing
    ``selected_categories`` with ``color_palette`` in order. The plot is
    saved to ``save_file_path`` and printed.
    """
    in_selection = data.query(f"category in {selected_categories}")

    def cap_group(group):
        # Only groups larger than nsample are sampled.
        if len(group) > nsample:
            return group.sample(nsample, random_state=random_state)
        return group

    sampled = (
        in_selection
        .groupby("category")
        .apply(cap_group)
        .reset_index(drop=True)
    )
    category_colors = dict(zip(selected_categories, color_palette))

    g = p9.ggplot(sampled)
    g = g + p9.aes(x=x, y=y, color="factor(category)")
    g = g + p9.geom_point()
    g = g + p9.scale_color_manual(category_colors)
    g = g + p9.labs(title="PCA of BioRxiv (Word Dim: 300)",
                    color="Article Category")
    g = g + p9.theme(figure_size=(4.59, 3.44), dpi=300)

    g.save(save_file_path)
    print(g)
def ikuya_sys_plot():
    """Plot accuracy against percent of question revealed, per model.

    Combines the JSON curve data with the frame from
    ``load_ikuya_nips()``, relabels the models, and saves the figure to
    ``2019_tacl_trick/auto_fig/ikuya_cdf.pdf``.
    """
    nips_df = load_ikuya_nips()
    with open('2019_tacl_trick/data/ikuya_cdf.json') as f:
        cdf_df = pd.DataFrame(json.load(f))

    df = pd.concat([cdf_df, nips_df])
    df['model'] = df['model'].map(relabel)
    # Ordered categories fix the order of the models.
    model_dtype = CategoricalDtype(
        ['Regular Test', 'IR Adversarial', 'RNN Adversarial'], ordered=True)
    df['model'] = df['model'].astype(model_dtype)

    mapping = aes(x='x', y='y', color='model', xmin='x', xmax='x')
    y_scale = scale_y_continuous(breaks=np.linspace(0, 1, 6), limits=[0, 1])
    legend_theme = theme(
        legend_position=(.335, .7),
        legend_background=element_blank(),
        legend_box_margin=0,
        legend_title=element_blank(),
    )

    p = (
        ggplot(df)
        + mapping
        + geom_point(size=1.0, shape='.')
        + xlab('Percent of Question Revealed')
        + ylab('Accuracy')
        + y_scale
        + legend_theme
    )
    p.save('2019_tacl_trick/auto_fig/ikuya_cdf.pdf', width=3.5, height=2.5)
def test_coord_trans():
    """Draw a rect with infinite y bounds over points under coord_trans."""
    points_df = pd.DataFrame({'x': range(10), 'y': range(10)})
    rect_df = pd.DataFrame({
        'xmin': [3],
        'xmax': 7,
        'ymin': -np.inf,
        'ymax': np.inf,
    })

    rect_mapping = aes(xmin='xmin', xmax='xmax', ymin='ymin', ymax='ymax')
    band = geom_rect(
        data=rect_df, mapping=rect_mapping, alpha=0.2, inherit_aes=False)

    p = ggplot(points_df, aes('x', 'y')) + geom_point() + band
    p = p + coord_trans()
    assert p == 'coord-trans'
def go_to_time_plot1(go_to_time_probs_new: list, go_to_time_probs_old: list,
                     average_minutes_per_game_values: list):
    """
    Plot go-to-time probability, new vs. old rules, no blowouts,
    85 matches/round

    The 'New' and 'Old' series are stacked into one long frame and the
    figure is written to ``figures/go_to_time_prob_plot.png``.
    """
    minutes = average_minutes_per_game_values
    n_points = len(minutes)

    # Long format: all 'New' rows first, then all 'Old' rows.
    stacked_minutes = np.concatenate([minutes, minutes])
    stacked_probs = np.concatenate(
        [go_to_time_probs_new, go_to_time_probs_old])
    rule_labels = np.concatenate(
        [np.repeat('New', n_points), np.repeat('Old', n_points)])

    time_prob_data = pd.DataFrame({
        'Average minutes per game': stacked_minutes,
        'P(Go to time)': stacked_probs,
        'Rules': rule_labels,
    })

    mapping = plt.aes(x='Average minutes per game',
                      y='P(Go to time)',
                      color='Rules')
    figure = (plt.ggplot(time_prob_data, mapping)
              + plt.geom_line()
              + plt.geom_point()
              + plt.ylim([0, 1])
              + plt.theme_classic())
    figure.save(filename='figures/go_to_time_prob_plot.png')
def ggfuntile(f, d, xrng=(0, 1), yrng=(0, 1), limits=(0, 1), density=51,
              xlab="x", ylab="y", zlab="f", breaks=None, **kwargs):
    """Tile plot of ``f`` over a grid with the points of ``d`` on top.

    ``f`` is called once per grid row with the row's first two column
    values. The grid steps are ``(range width) / (density - 1)``, built
    with ``np.arange`` (so the upper bound itself is excluded). ``d`` is
    drawn as red points shaped by its ``class`` column. ``breaks`` and
    ``**kwargs`` are accepted but not used.
    """
    intervals = density - 1.0
    axes = OrderedDict()
    axes[xlab] = np.arange(xrng[0], xrng[1], (xrng[1] - xrng[0]) / intervals)
    axes[ylab] = np.arange(yrng[0], yrng[1], (yrng[1] - yrng[0]) / intervals)
    ggdata = expandGrid(axes)

    # Evaluate f row by row on the first two grid columns.
    heights = []
    for row in range(ggdata.shape[0]):
        heights.append(f(ggdata.iloc[row, 0], ggdata.iloc[row, 1]))
    ggdata["z"] = heights

    fill_scale = scale_fill_gradientn(
        colors=[
            "black", "#202020", "#404040", "#808080", "white",
            "dodgerblue", "blue", "darkblue", "midnightblue"
        ],
        name=zlab,
        limits=limits,
    )
    overlay = geom_point(
        data=d,
        mapping=aes(shape="class"),
        color="red",
        size=2,
        alpha=0.8,
    )

    return (
        ggplot(ggdata, aes(x=xlab, y=ylab))
        + geom_tile(aes(fill="z"))
        + fill_scale
        + theme_classic()
        + overlay
        + scale_shape_manual(values=["x", "^"])
    )
def create_highlighted_scatter_plot(gwas_df):
    """Scatter the GWAS data with a fixed set of SNPs highlighted by color.

    A boolean ``snp_of_interest`` column is added to ``gwas_df`` in
    place, so the caller's frame is modified. A red horizontal line is
    drawn at y = 6.

    Inputs
    ------
    gwas_df: pandas.DataFrame
        A dataframe containing information from a genome-wide
        association study

    Return
    ------
    plot: plotnine.ggplot
        The scatter plot representing the GWAS data
    """
    snps_of_interest = [
        'rs12752601',
        'rs117018967',
        'rs188695075',
        'rs6604965',
        'rs542289952',
    ]
    gwas_df['snp_of_interest'] = gwas_df['rsid'].isin(snps_of_interest)

    mapping = aes(x='Position_hg19', y='neg_log_p', color='snp_of_interest')
    threshold_line = geom_hline(yintercept=6, color='red')
    return ggplot(gwas_df, mapping) + geom_point() + threshold_line
def lineplot_celldiv_moment(adata):
    """
    Plots total_counts as a function of the principal circle nodes to
    visualize the moment of cell division.

    Parameters
    ----------------
    adata: AnnData
        The AnnData object being used for the analysis. Must be
        previously evaluated by `tl.celldiv_moment`.

    Returns
    ------------
    A plotnine line-plot to help visualize the moment of cell division
    and direction of the cell cycle. The moment of cell division is
    defined by the largest drop in total_counts. The changes in counts
    are represented by the bars at the bottom, and the suggested moment
    of cell division is marked in red. The cell cycle should follow an
    incremental increase in total counts until around the moment of cell
    division.
    """
    div_info = adata.uns['scycle']['cell_div_moment']
    edge_to_0 = div_info['cell_div_edge'][0]
    ref_var = div_info['ref_var']

    # The flag column is written into the edges table held in adata.uns.
    edges = adata.uns['princirc_gr']['edges']
    is_division_edge = edges['e1'] == edge_to_0
    edges['cell_div'] = is_division_edge
    cell_div_count = edges[is_division_edge]['mean_var']

    trend = geom_smooth(aes(y='mean_var'), method='lm', linetype='dashed')
    division_marker = annotate(
        "point", x=edge_to_0, y=cell_div_count, color='red', size=2)
    change_bars = geom_col(aes(y='diff_var', fill='cell_div'))
    bar_colors = scale_fill_manual(values=['darkgrey', 'red'], guide=False)

    cell_div_plot = (
        ggplot(edges, aes('e1', 'mean_var'))
        + geom_point(aes(y='mean_var'), size=2)
        + geom_path(aes(y='mean_var'))
        + trend
        + division_marker
        + labs(x='Edge position', y=ref_var)
        + change_bars
        + bar_colors
        + theme_std
    )
    return cell_div_plot
# Wrap the UMAP embedding in a frame indexed like the normalized compendium
input_data_UMAPencoded_df = pd.DataFrame(
    data=input_data_UMAPencoded,
    index=normalized_compendium.index,
    columns=['1','2'],
)

# Add label: every sample is 'training' unless listed in val_samples
input_data_UMAPencoded_df['dataset'] = 'training'
input_data_UMAPencoded_df.loc[val_samples,'dataset'] = 'validation'

input_data_UMAPencoded_df


# In[12]:


# Plot
sans_15 = dict(family='sans-serif', size=15)
sans_12 = dict(family='sans-serif', size=12)

fig = (
    ggplot(input_data_UMAPencoded_df, aes(x='1', y='2'))
    + geom_point(aes(color='dataset'), alpha=0.2)
    + labs(x ='UMAP 1',
           y = 'UMAP 2',
           title = 'UMAP of normalized compendium')
    + theme_bw()
    + theme(
        legend_title_align = "center",
        plot_background=element_rect(fill='white'),
        legend_key=element_rect(fill='white', colour='white'),
        legend_title=element_text(**sans_15),
        legend_text=element_text(**sans_12),
        plot_title=element_text(**sans_15),
        axis_text=element_text(**sans_12),
        axis_title=element_text(**sans_15)
    )
    # Legend keys are drawn fully opaque although the points are not.
    + guides(colour=guide_legend(override_aes={'alpha': 1}))
)
.reset_index(drop=True) ), } # # UMAP of the Documents # This section is to highlight the differences between embedding models using UMAP. # The three models being compared are: # 1. initialized Word2Vec Model # 2. Doc2vec model # 3. Pretrained Word2Vec Model - first trained on Google news dataset 300 dim, then trained on bioRxiv g = ( p9.ggplot(biorxiv_umap_models_latest["original"]) + p9.aes(x="umap1", y="umap2", color="factor(category)") + p9.geom_point() + p9.labs(title="UMAP of BioRxiv (Word dim: 300)", color="Article Category") ) g.save("output/embedding_output/umap/figures/biorxiv_umap_300.png", dpi=500) print(g) g = ( p9.ggplot(biorxiv_umap_models_latest["doc2vec"]) + p9.aes(x="umap1", y="umap2", color="factor(category)") + p9.geom_point() + p9.labs(title="UMAP of BioRxiv (Doc2vec Word dim: 300)", color="Article Category") ) g.save("output/embedding_output/umap/figures/biorxiv_umap_300_doc2vec.png", dpi=500) print(g) g = (
x['k'], x['resubAccuracy'], x['testAccuracy']) for x in repeatedKnnResults], columns = ['p', 'k', 'resubAccuracy', 'testAccuracy']) ggdata = pd.concat( [DataFrame({'p' : knnResultsSimplified.p, 'k' : knnResultsSimplified.k.apply(int), 'type' : 'resub', 'Accuracy' : knnResultsSimplified.resubAccuracy}), DataFrame({'p' : knnResultsSimplified.p, 'k' : knnResultsSimplified.k.apply(int), 'type' : 'test', 'Accuracy' : knnResultsSimplified.testAccuracy})], axis = 0 ) plt.close() ggo = gg.ggplot(ggdata, gg.aes(x='p', y='Accuracy', color='type', group='type', linetype='type')) ggo += gg.facet_wrap('~ k') ggo += gg.scale_x_log10() ggo += gg.geom_point(alpha=0.6) ggo += gg.stat_smooth() ggo += gg.theme_bw() print(ggo)
def ologram_merge_stats(inputfiles=None,
                        pdf_width=None,
                        pdf_height=None,
                        output=None,
                        labels=None):
    """Merge several tab-separated statistics files into one heatmap PDF.

    Each input file becomes one column ('dataset') of the heatmap and
    each 'feature_type' one row. Tiles are filled by
    'summed_bp_overlaps_log2_fold_change'; a diamond colored by
    -log10('summed_bp_overlaps_pvalue') is drawn on tiles whose p-value
    is greater than 0.

    :param inputfiles: Open file objects (their ``.name`` is read).
    :param pdf_width: Page width; defaults to 0.6 per dataset, capped at 100.
    :param pdf_height: Page height; defaults to 0.6 per feature, capped at 500.
    :param output: Open file object whose ``.name`` is the PDF path.
    :param labels: Optional comma-separated labels, one per input file. \
    When omitted, the name of each file's enclosing directory is used.
    """
    # -------------------------------------------------------------------------
    # Check user provided labels
    # -------------------------------------------------------------------------

    if labels is not None:

        labels = labels.split(",")

        for elmt in labels:
            if not re.search("^[A-Za-z0-9_]+$", elmt):
                # This message used to name an option of another command.
                message(
                    "Only alphanumeric characters and '_' allowed for --labels",
                    type="ERROR")

        if len(labels) != len(inputfiles):
            message("--labels: the number of labels should be"
                    " the same as the number of input files ", type="ERROR")

        if len(labels) != len(set(labels)):
            message("Redundant labels not allowed.", type="ERROR")

    # -------------------------------------------------------------------------
    # Loop over input files
    # -------------------------------------------------------------------------

    df_list = list()
    df_label = list()

    for pos, infile in enumerate(inputfiles):

        message("Reading file : " + infile.name)

        # Read the dataset into a temporary dataframe
        df_tmp = pd.read_csv(infile, sep='\t', header=0, index_col=None)

        # Change name of 'feature_type' column.
        df_tmp = df_tmp.rename(index=str, columns={"feature_type": "Feature"})

        # Assign the name of the dataset to a new column
        if labels is None:
            # Without labels, the enclosing directory names the dataset.
            file_short_name = os.path.basename(
                os.path.normpath(os.path.dirname(infile.name)))
        else:
            file_short_name = labels[pos]
        df_label.append(file_short_name)

        df_tmp = df_tmp.assign(
            **{"dataset": [file_short_name] * df_tmp.shape[0]})

        # Pval set to 0 or -1 are changed to 1e-320 and NaN respectively
        pval = 'summed_bp_overlaps_pvalue'
        df_tmp.loc[df_tmp[pval] == 0, pval] = 1e-320
        df_tmp.loc[df_tmp[pval] == -1, pval] = np.nan

        # Compute -log10(pval)
        df_tmp = df_tmp.assign(
            **{"-log_10(pval)": -np.log10(df_tmp.summed_bp_overlaps_pvalue)})

        # Which p-values are significant ?
        # TODO: For now, draws all p-values. Add Benjamini-Hochberg
        # correction, and distinguish between NaN and 0.
        # NaN > 0 is False, so NaN p-values get no point.
        df_tmp = df_tmp.assign(
            **{"pval_signif": df_tmp.summed_bp_overlaps_pvalue > 0})

        # Add the df to the list to be subsequently merged
        df_list.append(df_tmp)

    if len(set(df_label)) < len(df_label):
        message('Enclosing directories are ambiguous and cannot be used as labels. You may use "--labels".',
                type="ERROR")

    # -------------------------------------------------------------------------
    # Concatenate dataframes (row bind)
    # -------------------------------------------------------------------------

    message("Merging dataframes.")
    df_merged = pd.concat(df_list, axis=0)

    # -------------------------------------------------------------------------
    # Plotting
    # -------------------------------------------------------------------------

    message("Plotting")
    my_plot = ggplot(data=df_merged, mapping=aes(y='Feature', x='dataset'))
    my_plot += geom_tile(aes(fill='summed_bp_overlaps_log2_fold_change'))
    my_plot += scale_fill_gradient2()
    my_plot += labs(fill="log2(fold change) for summed bp overlaps")

    # Points for p-val. Must be after geom_tile()
    my_plot += geom_point(data=df_merged.loc[df_merged['pval_signif']],
                          mapping=aes(x='dataset', y='Feature',
                                      color='-log_10(pval)'),
                          size=4, shape='D', inherit_aes=False)
    my_plot += scale_color_gradientn(colors=["#160E00", "#FFB025", "#FFE7BD"])
    my_plot += labs(color="-log10(p-value)")

    # Theming
    my_plot += theme_bw()
    my_plot += theme(panel_grid_major=element_blank(),
                     axis_text_x=element_text(rotation=90),
                     panel_border=element_blank(),
                     axis_ticks=element_blank())

    # -------------------------------------------------------------------------
    # Page size
    # -------------------------------------------------------------------------

    message("Saving")

    nb_ft = len(list(df_merged['Feature'].unique()))
    nb_datasets = len(list(df_merged['dataset'].unique()))

    if pdf_width is None:
        panel_width = 0.6
        pdf_width = panel_width * nb_datasets

        if pdf_width > 100:
            pdf_width = 100
            message("Setting --pdf-width to 100 (limit)")

    if pdf_height is None:
        panel_height = 0.6
        pdf_height = panel_height * nb_ft

        if pdf_height > 500:
            pdf_height = 500
            message("Setting --pdf-height to 500 (limit)")

    message("Page width set to " + str(pdf_width))
    message("Page height set to " + str(pdf_height))
    figsize = (pdf_width, pdf_height)

    # -------------------------------------------------------------------------
    # Saving
    #
    # Both pandas and plotnine use warnings for deprecated functions, so
    # every warning raised while saving is silenced. Not a satisfying
    # solution, but it keeps the console output readable.
    # -------------------------------------------------------------------------

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        message("Saving diagram to file : " + output.name)
        message("Be patient. This may be long for large datasets.")

        # NOTE : We must manually specify figure size with save_as_pdf_pages
        save_as_pdf_pages(filename=output.name,
                          plots=[my_plot + theme(figure_size=figsize)],
                          width=pdf_width,
                          height=pdf_height)
import pandas as pd import pytest from plotnine import ggplot, aes, geom_point, facet_grid, facet_wrap from plotnine import geom_abline, theme n = 10 df = pd.DataFrame({'x': range(n), 'y': range(n), 'var1': np.repeat(range(n//2), 2), 'var2': np.tile(['a', 'b'], n//2), }) df['class'] = df['var1'] # python keyword as column g = (ggplot(df, aes('x', 'y')) + geom_point(aes(color='factor(var1)'), size=5, show_legend=False)) # facet_wrap def test_facet_wrap_one_var(): p = g + facet_wrap('~var1') p2 = g + facet_wrap('~class') # python keyword in formula assert p == 'facet_wrap_one_var' assert p2 == 'facet_wrap_one_var' # https://github.com/pandas-dev/pandas/issues/16276 @pytest.mark.xfail def test_facet_wrap_expression(): p = g + facet_wrap('pd.cut(var1, (0, 2, 4), include_lowest=True)')
geom_line(size=0.5) + ylim(0, 202) + labs(x="time", y="$[S^{**}]$") + scale_color_distiller( palette='RdYlBu', type="diverging", name="$B_{tot}$") + facet_wrap('~dir') + theme_bw()) g.save(filename="./num_cont_graphs/sim_fwd_rev2.png", format="png", width=8, height=4, units='in', verbose=False) eq = out[out.time == max(out.time)] g = (ggplot(eq) + aes(x='signal', y=response, color='dir') + labs(x="$B_{tot}$", y="$[S^{**}]$", color="") + geom_path(size=2, alpha=0.5) + geom_point(color="black") + theme_bw() + geom_point(color="black") + annotate("point", x=plot_specifications[2][0][0], y=plot_specifications[2][0][1], colour="red", shape="*", size=3.5) + annotate("text", x=plot_specifications[2][0][0], y=plot_specifications[2][0][1], label=plot_specifications[2][0][2])) # + annotate("point", x=plot_specifications[2][1][0], y=plot_specifications[2][1][1], colour="red", shape="*", # size=3.5) # + annotate("text", x=plot_specifications[2][1][0], y=plot_specifications[2][1][1], # label=plot_specifications[2][1][2])) g.save(filename="./num_cont_graphs/sim_bif_diag2.png",
def plot(self):
    """Build the figure for ``self.figtype`` with plotnine and save it.

    The data in ``self.data`` is loaded into a DataFrame with the columns
    ``self.datacols`` (made unique via ``make_unique``).  By default the
    first column is mapped to ``y`` and the second to ``x``; a third
    column, when present, is mapped to fill/color.  Histogram, density and
    freqpoly figures instead map the first column to ``x``.

    When ``self.savedata`` is truthy the frame is also written to
    ``<self.outprefix>.csv``.  Nothing is plotted (a warning is logged)
    when the frame has no rows.  The finished plot and the theme
    overrides chosen for the figure type are passed to ``self.save_plot``.

    Raises:
        ValueError: If ``self.figtype`` is not one of the handled types.
    """
    df = pandas.DataFrame(
        self.data,
        columns=self.datacols,
    )
    with capture_c_msg("datar", prefix=f"[r]{self.title}[/r]: "):
        df.columns = make_unique(df.columns.tolist())

    if self.savedata:
        datafile = self.outprefix + ".csv"
        logger.info(
            "[r]%s[/r]: Saving data to: %r",
            self.title,
            datafile,
            extra={"markup": True},
        )
        df.to_csv(datafile, index=False)

    if df.shape[0] == 0:
        logger.warning("No data points to plot")
        return

    aes_for_geom_fill = None
    aes_for_geom_color = None
    # Default theme override; individual figure types replace or clear it.
    theme_elems = p9.theme(axis_text_x=p9.element_text(angle=60, hjust=2))
    if df.shape[1] > 2:
        aes_for_geom_fill = p9.aes(fill=df.columns[2])
        aes_for_geom_color = p9.aes(color=df.columns[2])

    plt = p9.ggplot(df, p9.aes(y=df.columns[0], x=df.columns[1]))
    if self.figtype == "scatter":
        plt = plt + p9.geom_point(aes_for_geom_color)
        theme_elems = None
    elif self.figtype == "line":
        # NOTE(review): no geom is added for "line", so the plot has no
        # layer of its own -- looks unimplemented; confirm intent.
        pass
    elif self.figtype == "bar":
        plt = plt + p9.geom_bar(p9.aes(fill=df.columns[0]))
    elif self.figtype == "col":
        plt = plt + p9.geom_col(aes_for_geom_fill)
    elif self.figtype == "pie":
        logger.warning("Pie chart is not support by plotnine yet, "
                       "plotting bar chart instead.")
        col0 = df.iloc[:, 0]
        if df.shape[1] > 2:
            plt = plt + p9.geom_bar(
                p9.aes(x=df.columns[2], y=col0.name, fill=df.columns[2]),
                stat="identity"
            )
        else:
            col0 = factor(col0, levels=rev(unique(as_character(col0))))
            fills = rev(levels(col0))
            # NOTE(review): ``sums`` is consumed several times below; if
            # ``map`` is the builtin this is a one-shot iterator -- verify.
            sums = map(lambda x: sum(col0 == x), fills)
            # Previously bare print() calls; routed through the logger so
            # library users do not get stray stdout output.
            logger.debug("pie levels: %s", col0)
            logger.debug("pie fills: %s", fills)
            plt = (p9.ggplot(df, p9.aes(x=df.columns[1])) +
                   p9.geom_bar(p9.aes(fill=df.columns[0])) +
                   p9.geom_label(
                       x=1,
                       y=cumsum(sums) - sums / 2,
                       label=paste0(round(sums / sum(sums) * 100, 1), "%"),
                       show_legend=False,
                   ))
        theme_elems = p9.theme(
            axis_title_x=p9.element_blank(),
            axis_title_y=p9.element_blank(),
            axis_text_y=p9.element_blank(),
        )
    elif self.figtype == "violin":
        plt = plt + p9.geom_violin(aes_for_geom_fill)
    elif self.figtype == "boxplot":
        plt = plt + p9.geom_boxplot(aes_for_geom_fill)
    elif self.figtype in ("histogram", "density"):
        plt = p9.ggplot(df, p9.aes(x=df.columns[0]))
        geom = getattr(p9, f"geom_{self.figtype}")
        # A second column named "ONE" means "no grouping".
        if df.columns[1] != "ONE":
            plt = plt + geom(p9.aes(fill=df.columns[1]), alpha=0.6)
            theme_elems = None
        else:
            plt = plt + geom(alpha=0.6)
            theme_elems = p9.theme(legend_position="none")
    elif self.figtype == "freqpoly":
        plt = p9.ggplot(df, p9.aes(x=df.columns[0]))
        if df.columns[1] != "ONE":
            # NOTE(review): maps ``fill`` on a line geom; ``color`` may
            # have been intended -- confirm.
            plt = plt + p9.geom_freqpoly(p9.aes(fill=df.columns[1]))
        else:
            plt = plt + p9.geom_freqpoly()
        theme_elems = None
    else:
        raise ValueError(f"Unknown figure type: {self.figtype}")

    plt = plt + p9.ggtitle(self.title)
    self.save_plot(plt, theme_elems)
from plotnine.data import economics
from plotnine import ggplot, aes, facet_grid, labs, geom_point, geom_smooth, xlab, ylab

# Scatter of the `uempmed` column against `date` with a red smoothed trend,
# written to 19.png.
g = (ggplot(economics)
     + aes(x="date", y="uempmed")
     + geom_point()
     + geom_smooth(color="red", span=0.5)
     + xlab("date (year)")
     + ylab("unemployment"))  # axis label typo ("unemploynment") fixed

g.save("19.png")
def scatter_cell_cycle(
    adata,
    scores=["signatures", "components"][0],
    size=1.5,
    alpha=1,
    curvature_shrink=1,
    lab_ypos=2,
):
    """Plots cell cycle signatures vs pseudotime

    Parameters
    ----------------
    adata: AnnData
        The AnnData object being used for the analysis. Must be previously
        evaluated by `tl.cell_cycle_phase`.
    scores: str
        A string indicating what to plot as cell cycle scores against
        pseudotime. If 'signatures', standard S-phase, G2-M and Histones
        signatures are used; if 'components', the 4 cell cycle related
        components are used.
    size: float
        Controls the point size of the plot.
    alpha: float
        A value between 0 and 1. Controls point transparency.
    curvature_shrink: float
        Divisor applied to the z-scored curvature before it is drawn.
    lab_ypos: float
        Controls the y-axis position of the cell cycle phase annotation.

    Returns
    --------------
    A plotnine scatter plot of pseudotime vs the selected scores. When
    ``adata.uns["scycle"]`` holds a ``"cell_cycle_division"`` entry, the
    curvature trace, its peaks and the phase boundaries are added.

    Raises
    --------------
    ValueError
        If `scores` is neither 'signatures' nor 'components'.
    """
    if scores == "signatures":
        y = ["S-phase", "G2-M", "Histones"]
        colors = ["#66c2a5", "#fc8d62", "#8da0cb", "black"]
    elif scores == "components":
        _add_compScores(adata)
        y = ["G1/S comp", "G2/M+ comp", "G2/M- comp", "Histones comp"]
        colors = ["#66c2a5", "#fc8d62", "#8da0cb", "#e5c494", "black"]
    else:
        # Previously fell through and failed later on the undefined `y`.
        raise ValueError(
            "scores must be 'signatures' or 'components', got %r" % (scores,)
        )

    time_scatter = scatter_pseudotime(
        adata, y=y, size=size, alpha=alpha) + labs(
            x="Pseudotime", y="Signature scores", color="Signature")

    # Without division results there is nothing to annotate.
    if "cell_cycle_division" not in adata.uns["scycle"]:
        return time_scatter

    # -- Add cell cycle annotations
    cc_divs = adata.uns["scycle"]["cell_cycle_division"]

    # -- Curvature data
    # Work on a copy so the frame stored in `adata.uns` is not overwritten
    # with the rescaled values used only for drawing.
    curv_data = cc_divs["curvature"].copy()
    curv = curv_data["curvature"].values
    cvz = zscore(curv) / curvature_shrink
    cvz = cvz - np.max(cvz)  # shift so the maximum sits at 0
    curv_data.loc[:, "curvature"] = cvz
    curv_data.loc[:, "signature"] = "Curvature"

    # -- Peak data (for segments)
    gr_min = np.min(curv_data["curvature"])
    # Explicit copy: assigning into a filtered slice is unreliable.
    pk_data = curv_data[curv_data["ispeak"] == "peak"].copy()
    pk_data.loc[:, "ymin"] = gr_min

    # -- Cell cycle annotation
    cc_phase = pd.DataFrame(
        dict(
            starts=[
                None,
                cc_divs["s_start"],
                cc_divs["g2_start"],
                cc_divs["m_start"],
            ],
            labels=["G1", "S", "G2", "M"],
            # Each label is centred between consecutive phase starts.
            labpos=[
                np.mean([0, cc_divs["s_start"]]),
                np.mean([cc_divs["s_start"], cc_divs["g2_start"]]),
                np.mean([cc_divs["g2_start"], cc_divs["m_start"]]),
                np.mean([cc_divs["m_start"], 1]),
            ],
            y=lab_ypos,
        ))

    cell_cycle_plt = (
        time_scatter +
        geom_point(aes("pseudotime", "curvature", color="signature"),
                   data=curv_data) +
        geom_line(aes("pseudotime", "curvature"), data=curv_data) +
        scale_color_manual(values=colors) +
        geom_segment(
            aes(x="pseudotime", xend="pseudotime", y="ymin",
                yend="curvature"),
            linetype="dotted",
            data=pk_data,
        ) +
        geom_vline(
            aes(xintercept="starts"), linetype="dashed", data=cc_phase) +
        geom_text(aes(x="labpos", y="y", label="labels"), data=cc_phase))
    return cell_cycle_plt
class TestThemes(object):
    """One test per theme, all built on the same faceted scatter plot.

    Every test adds a title and a theme to ``g`` and compares the themed
    plot (plus the module-level ``_theme``) with its expected name.
    """

    g = (ggplot(mtcars, aes(x='wt', y='mpg', color='factor(gear)'))
         + geom_point()
         + facet_grid('vs ~ am'))

    def _titled(self, title):
        # Shared first step of every test: the base plot plus a title.
        return self.g + labs(title=title)

    def test_theme_538(self):
        p = self._titled('Theme 538') + theme_538()
        assert p + _theme == 'theme_538'

    def test_theme_bw(self):
        p = self._titled('Theme BW') + theme_bw()
        assert p + _theme == 'theme_bw'

    def test_theme_classic(self):
        p = self._titled('Theme Classic') + theme_classic()
        assert p + _theme == 'theme_classic'

    def test_theme_dark(self):
        p = self._titled('Theme Dark') + theme_dark()
        assert p + _theme == 'theme_dark'

    def test_theme_gray(self):
        p = self._titled('Theme Gray') + theme_gray()
        assert p + _theme == 'theme_gray'

    def test_theme_light(self):
        p = self._titled('Theme Light') + theme_light()
        assert p + _theme == 'theme_light'

    def test_theme_linedraw(self):
        p = self._titled('Theme Linedraw') + theme_linedraw()
        if not six.PY2:
            assert p + _theme == 'theme_linedraw'
            return
        # Small displacement in title
        assert p + _theme == ('theme_linedraw', {'tol': 8})

    def test_theme_matplotlib(self):
        p = self._titled('Theme Matplotlib') + theme_matplotlib()
        assert p + _theme == 'theme_matplotlib'

    def test_theme_minimal(self):
        p = self._titled('Theme Minimal') + theme_minimal()
        assert p + _theme == 'theme_minimal'

    def test_theme_seaborn(self):
        p = self._titled('Theme Seaborn') + theme_seaborn()
        assert p + _theme == 'theme_seaborn'

    def test_theme_void(self):
        p = self._titled('Theme Void') + theme_void()
        assert p + _theme == 'theme_void'

    def test_theme_xkcd(self):
        p = self._titled('Theme Xkcd') + theme_xkcd()
        if os.environ.get('TRAVIS'):
            # Travis does not have the fonts, we still check
            # to catch any other errors
            assert p + _theme != 'theme_gray'
            return
        assert p + _theme == 'theme_xkcd'
) # .assign( # abstract_only_distance_log10=lambda x: -np.log10(x.abstract_only_distance), # full_text_distance_log10=lambda x: -np.log10(x.full_text_distance), # ) ) plot_df.head() # Pearson's R for correlation # Shows a weak but positive correlation scipy.stats.pearsonr(plot_df.full_text_distance, plot_df.abstract_only_distance) g = ( p9.ggplot(plot_df) + p9.aes(x="full_text_distance", y="abstract_only_distance") + p9.geom_point(fill="#a6cee3") + p9.scale_y_continuous(trans="log10") + p9.labs(x="Full Text Distance", y="Abstract Only Distance") + p9.theme_seaborn(context="paper", style="ticks", font="Arial", font_scale=1.35) ) g.save("output/figures/biorxiv_full_text_v_abstract_only.svg", dpi=250) g.save("output/figures/biorxiv_full_text_v_abstract_only.png", dpi=250) print(g) # Remove outliers for shape of distribution g = ( p9.ggplot(plot_df.query("abstract_only_distance>1e-3")) + p9.aes(x="full_text_distance", y="abstract_only_distance") + p9.geom_point(fill="#a6cee3") + p9.scale_y_continuous(trans="log10") + p9.labs(x="Full Text Distance", y="Abstract Only Distance")
def quick_color_check(target_matrix, source_matrix, num_chips):
    """Quickly plot target matrix values against source matrix values to
    determine over saturated color chips or other issues.

    Inputs:
    source_matrix = a 22x4 matrix containing the average red value, average
                    green value, and average blue value for each color chip
                    of the source image
    target_matrix = a 22x4 matrix containing the average red value, average
                    green value, and average blue value for each color chip
                    of the target image
    num_chips     = number of color card chips included in the matrices
                    (integer)

    The plot is saved to ``params.debug_outdir`` when ``params.debug`` is
    'print' and printed when it is 'plot'; nothing is returned.

    :param source_matrix: numpy.ndarray
    :param target_matrix: numpy.ndarray
    :param num_chips: int
    """
    # Imports
    from plotnine import ggplot, geom_point, geom_smooth, theme_seaborn, facet_grid, geom_label, scale_x_continuous, \
        scale_y_continuous, scale_color_manual, aes
    import pandas as pd

    # Extract and organize matrix info (columns 1, 2, 3 are read as R, G, B)
    tr = target_matrix[:num_chips, 1:2]
    tg = target_matrix[:num_chips, 2:3]
    tb = target_matrix[:num_chips, 3:4]
    sr = source_matrix[:num_chips, 1:2]
    sg = source_matrix[:num_chips, 2:3]
    sb = source_matrix[:num_chips, 3:4]

    # Create columns of color labels
    red = ['red'] * num_chips
    blue = ['blue'] * num_chips
    green = ['green'] * num_chips

    # Make a column of chip numbers, repeated once per color channel.
    # np.vstack replaces np.row_stack, which is deprecated in NumPy 2.0.
    chip = np.arange(0, num_chips).reshape((num_chips, 1))
    chips = np.vstack((chip, chip, chip)).ravel()

    # Combine info
    color_data_r = np.column_stack((sr, tr, red))
    color_data_g = np.column_stack((sg, tg, green))
    color_data_b = np.column_stack((sb, tb, blue))
    all_color_data = np.vstack((color_data_b, color_data_g, color_data_r))

    # Create a dataframe with headers
    dataset = pd.DataFrame({'source': all_color_data[:, 0],
                            'target': all_color_data[:, 1],
                            'color': all_color_data[:, 2]})

    # Add chip numbers to the dataframe
    dataset['chip'] = chips
    # Stacking with the label column turned everything into strings;
    # restore numeric dtypes for plotting.
    dataset = dataset.astype({'color': str, 'chip': str, 'target': float, 'source': float})

    # Make the plot
    p1 = ggplot(dataset, aes(x='target', y='source', color='color', label='chip')) + \
        geom_point(show_legend=False, size=2) + \
        geom_smooth(method='lm', size=.5, show_legend=False) + \
        theme_seaborn() + facet_grid('.~color') + \
        geom_label(angle=15, size=7, nudge_y=-.25, nudge_x=.5, show_legend=False) + \
        scale_x_continuous(limits=(-5, 270)) + scale_y_continuous(limits=(-5, 275)) + \
        scale_color_manual(values=['blue', 'green', 'red'])

    # Emit the plot according to the debug mode
    if params.debug is not None:
        if params.debug == 'print':
            p1.save(os.path.join(params.debug_outdir, 'color_quick_check.png'))
        elif params.debug == 'plot':
            print(p1)
def syntactic_diversity_plots():
    """Render the three syntactic-diversity figures as PDFs.

    Reads ``data/external/syntactic_diversity_table.json``, derives
    ``parse_ratio`` (unique parses divided by parses) and writes
    ``syn_div_plot.pdf``, ``n_unique_parses.pdf`` and ``parse_ratio.pdf``
    under ``output_path``.
    """
    with open('data/external/syntactic_diversity_table.json') as f:
        parse_df = pd.DataFrame(json.load(f))
    parse_df['parse_ratio'] = parse_df['unique_parses'] / parse_df['parses']

    # Long format: one row per (record, metric) for the faceted figure.
    melt_df = pd.melt(
        parse_df,
        id_vars=['dataset', 'depth', 'overlap', 'parses'],
        value_vars=['parse_ratio', 'unique_parses'],
        var_name='metric',
        value_name='y'
    )

    facet_titles = {
        'parse_ratio': 'Average Unique Parses per Instance',
        'unique_parses': 'Count of Unique Parses',
    }

    def label_facet(name):
        # Unknown metric names yield None.
        return facet_titles.get(name)

    def label_y(ys):
        # Abbreviate whole thousands: '3000.0' -> '3K'.
        return [
            text[:-5] + 'K' if text.endswith('000.0') else text
            for text in map(str, ys)
        ]

    def depth_axis():
        # Same x axis on every figure: integer depths 1..10.
        ticks = list(range(1, 11))
        return scale_x_continuous(
            breaks=ticks, minor_breaks=list(range(1, 11)), limits=[1, 10])

    faceted = (
        ggplot(melt_df)
        + aes(x='depth', y='y', color='dataset')
        + facet_wrap('metric', scales='free_y', nrow=2, labeller=label_facet)
        + geom_line() + geom_point()
        + xlab('Parse Truncation Depth') + ylab('')
        + scale_color_discrete(name='Dataset')
        + scale_y_continuous(labels=label_y)
        + depth_axis()
        + theme_fs()
    )
    faceted.save(path.join(output_path, 'syn_div_plot.pdf'))

    counts = (
        ggplot(parse_df)
        + aes(x='depth', y='unique_parses', color='dataset')
        + geom_line() + geom_point()
        + xlab('Parse Truncation Depth')
        + ylab('Count of Unique Parses')
        + scale_color_discrete(name='Dataset')
        + depth_axis()
        + theme_fs()
    )
    counts.save(path.join(output_path, 'n_unique_parses.pdf'))

    ratios = (
        ggplot(parse_df)
        + aes(x='depth', y='parse_ratio', color='dataset')
        + geom_line() + geom_point()
        + xlab('Parse Truncation Depth')
        + ylab('Average Unique Parses per Instance')
        + scale_color_discrete(name='Dataset')
        + depth_axis()
        + scale_y_continuous(limits=[0, 1])
        + theme_fs()
    )
    ratios.save(path.join(output_path, 'parse_ratio.pdf'))
import pandas as pd
from plotnine import ggplot, geom_point, aes
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

df = pd.read_csv("/home/shaury/Downloads/nptel/winequality-red.csv",
                 delimiter=",")

# Exploratory scatter; as a bare expression it is only displayed in an
# interactive session/notebook.
ggplot(df) + geom_point(
    mapping=aes(x=df["fixed acidity"], y=df["sulphates"], color=df["quality"]))

# Every column except the target is used as a feature.
x, y = df[[
    "fixed acidity", "volatile acidity", "citric acid", "residual sugar",
    "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density",
    "pH", "sulphates", "alcohol"
]], df["quality"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15)

ld = LR(solver='liblinear', random_state=0)
ld.fit(x_train, y_train)
y_pred = ld.predict(x_test)

# scikit-learn metrics take (y_true, y_pred); the arguments were swapped,
# which transposed the confusion matrix (rows must be the true labels).
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
inc_start = iglo_nest[iglo_nest.type == "incubation"].julian.min() inc_end = iglo_nest[iglo_nest.type == "incubation"].julian.max() inc_lbl_pos = inc_start + (inc_end - inc_start) / 2 hatch_start = iglo_nest[iglo_nest.type == "hatch"].julian.min() hatch_end = min(iglo_nest[iglo_nest.type == "hatch"].julian.max(), iglo.julian.max() + 2) hatch_lbl_pos = hatch_start + (hatch_end - hatch_start) / 2 xmin = min(inc_start, iglo.julian.min()) xmax = min(iglo_nest[iglo_nest.type == "hatch"].julian.max(), iglo.julian.max() + 2) (ggplot(data=iglo, mapping=aes(x='julian', y='ACI_mean', colour='site')) #+ facet_grid("panel~", scales="free") + xlab("Day") + ylab("Mean daily ACI (standardized)") + geom_point() + theme(legend_position="none") # + geom_errorbar(aes(ymin="ACI_mean - ACI_std", ymax="ACI_mean + ACI_std")) + geom_smooth(method="mavg", se=False, method_args={ "window": 4, "center": True, "min_periods": 1 }) + annotate("rect", xmin=[inc_start, hatch_start], xmax=[inc_end, hatch_end], ymin=-math.inf, ymax=math.inf, alpha=0.1, fill=["red", "blue"]) +
def day_night_attacks(Data, Data_m):
    """Plot the number of 'sy' records per hour and save both figures.

    ``Data`` feeds the all-time figure (``Graph_ALL_3.jpeg``) and ``Data_m``
    the monthly one (``Graph_3.jpeg``); both are written to
    ``pdf/iteration/``.  Progress and "no data" messages are printed.
    Returns None.
    """
    print('======= Creating day_night_attacks =======')

    def _hourly_counts(frame):
        # Rows with both year and month present, counted per hour; only the
        # first two columns are kept and "Unnamed: 0" is renamed to "n".
        dated = frame[frame.year.notna() & frame.month.notna()]
        counts = pd.DataFrame(dated.groupby("hour", as_index=False).count())
        counts = counts.iloc[:, 0:2]
        return counts.rename(columns={"Unnamed: 0": "n"})

    def _hourly_plot(counts, with_line):
        # Red points (optionally joined by a line) on a 1..24 hour axis.
        fig = (p9.ggplot(data=counts, mapping=p9.aes(x='hour', y='n'))
               + p9.geom_point(color='red', size=10))
        if with_line:
            fig = fig + p9.geom_line(color='red', size=1)
        return (fig
                + p9.theme_classic()
                + p9.theme(axis_text=p9.element_text(size=40),
                           axis_title=p9.element_text(size=40, face='bold'))
                + p9.coord_cartesian(xlim=(1, 25))
                + p9.labs(x='Hours', y='No. of attacks')
                + p9.scale_x_discrete(limits=(range(1, 25))))

    # Filter monthly and all-time symptom records
    freq_all = Data[Data.Group == 'sy']
    freq_m = Data_m[Data_m.Group == 'sy']

    Test_3 = _hourly_counts(freq_all)
    Test_3_m = _hourly_counts(freq_m)

    plot = _hourly_plot(Test_3, with_line=True)
    plot_month = _hourly_plot(Test_3_m, with_line=False)

    # Creating and saving the MONTHLY graph
    if len(Test_3_m) > 0:
        plot_month.save(filename='Graph_3.jpeg', plot=plot_month,
                        path="pdf/iteration/", width=25, height=5, dpi=320)
    else:
        print('Plot not created; no data found.')

    # Creating and saving the all-time graph
    if len(freq_all) > 0:
        plot.save(filename='Graph_ALL_3.jpeg', plot=plot,
                  path="pdf/iteration/", width=25, height=5, dpi=320)
    else:
        print('Plot not created; no data found.')

    print('=================================day_night_attacks DONE =============================')
    return None
def plot_scatter(dat, figsize=(16, 12)):
    """Scatter `response` against `val` with a linear fit, one panel per `var`.

    Panels get independent x scales; `figsize` is passed to the theme's
    `figure_size`.  Returns the plotnine plot object.
    """
    fig = pn.ggplot(dat, pn.aes(x='val', y='response'))
    parts = (
        pn.geom_point(),
        pn.geom_smooth(method='lm'),
        pn.facet_wrap("var", scales='free_x'),
        pn.theme_bw(),
        pn.theme(figure_size=figsize, subplots_adjust={'hspace': 0.25}),
    )
    # Add the components in the listed order.
    for part in parts:
        fig = fig + part
    return fig
def plot_char_percent_vs_accuracy_smooth(
    self, expo=False, no_models=False, columns=False
):
    """Plot accuracy against the fraction of the question revealed.

    Args:
        expo: When true, build the faceted figure (one panel per guessing
            model, colored by dataset) and, if
            ``data/external/all_human_gameplay.json`` exists and
            ``self.no_humans`` is false, include human gameplay curves.
            When false, return a single smoothed panel colored by
            guessing model.
        no_models: With ``expo``, plot only the human data as points.
        columns: With ``expo``, stack the facets in one column instead of
            one row.

    Returns:
        The plotnine plot object.

    The y-axis limits are ``[0, self.y_max]`` when ``self.y_max`` is set,
    otherwise ``[0, 1]``.  When ``self.save_df`` is set, the plotted model
    frame is also written there as JSON.
    """

    def _accuracy_frame(correct_positions, wrong_positions, dataset, name):
        # Sort all buzz positions, then take the running count of correct
        # answers divided by the total number of answers.
        positions = np.array(correct_positions + wrong_positions)
        result = np.array(
            len(correct_positions) * [1] + len(wrong_positions) * [0]
        )
        order = np.argsort(positions)
        sorted_result = result[order]
        frame = pd.DataFrame(
            {
                "correct": sorted_result.cumsum() / sorted_result.shape[0],
                "char_percent": positions[order],
            }
        )
        frame["Dataset"] = dataset
        frame["Guessing_Model"] = f" {name}"
        return frame

    if self.y_max is not None:
        limits = [0, float(self.y_max)]
        eprint(f"Setting limits to: {limits}")
    else:
        limits = [0, 1]

    if not expo:
        if self.save_df is not None:
            eprint(f"Saving df to: {self.save_df}")
            # Was `df.to_json(...)`, but `df` is never bound on this path;
            # the frame being plotted here is `self.char_plot_df`.
            self.char_plot_df.to_json(self.save_df)
        return (
            ggplot(self.char_plot_df)
            + aes(x="char_percent", y="correct", color="Guessing_Model")
            + stat_smooth(method="mavg", se=False, method_args={"window": 500})
            + scale_y_continuous(breaks=np.linspace(0, 1, 6))
            + coord_cartesian(ylim=limits)
        )

    if (
        os.path.exists("data/external/all_human_gameplay.json")
        and not self.no_humans
    ):
        with open("data/external/all_human_gameplay.json") as f:
            all_gameplay = json.load(f)
        frames = []
        for event, name in [
            ("parents", "Intermediate"),
            ("maryland", "Expert"),
            ("live", "National"),
        ]:
            if self.merge_humans:
                name = "Human"
            gameplay = all_gameplay[event]
            # The "live" event contributes no control curve.
            if event != "live":
                frames.append(
                    _accuracy_frame(
                        gameplay["control_correct_positions"],
                        gameplay["control_wrong_positions"],
                        "Regular Test",
                        name,
                    )
                )
            frames.append(
                _accuracy_frame(
                    gameplay["adv_correct_positions"],
                    gameplay["adv_wrong_positions"],
                    "IR Adversarial",
                    name,
                )
            )
            if len(gameplay["advneural_correct_positions"]) > 0:
                frames.append(
                    _accuracy_frame(
                        gameplay["advneural_correct_positions"],
                        gameplay["advneural_wrong_positions"],
                        "RNN Adversarial",
                        name,
                    )
                )

        human_df = pd.concat(frames)
        human_vals = sort_humans(list(human_df["Guessing_Model"].unique()))
        human_dtype = CategoricalDtype(human_vals, ordered=True)
        human_df["Guessing_Model"] = human_df["Guessing_Model"].astype(
            human_dtype
        )
        dataset_dtype = CategoricalDtype(
            ["Regular Test", "IR Adversarial", "RNN Adversarial"],
            ordered=True,
        )
        human_df["Dataset"] = human_df["Dataset"].astype(dataset_dtype)

    if no_models:
        # NOTE(review): `human_df` only exists when the gameplay file was
        # loaded above; no_models without human data fails here.
        p = ggplot(human_df) + geom_point(shape=".")
    else:
        df = self.char_plot_df
        if 1 not in self.rounds:
            df = df[df["Dataset"] != "Round 1 - IR Adversarial"]
        if 2 not in self.rounds:
            df = df[df["Dataset"] != "Round 2 - IR Adversarial"]
            df = df[df["Dataset"] != "Round 2 - RNN Adversarial"]
        p = ggplot(df)
        if self.save_df is not None:
            eprint(f"Saving df to: {self.save_df}")
            df.to_json(self.save_df)

        if (
            os.path.exists("data/external/all_human_gameplay.json")
            and not self.no_humans
        ):
            eprint("Loading human data")
            p = p + geom_line(data=human_df)

    if columns:
        facet_conf = facet_wrap("Guessing_Model", ncol=1)
    else:
        facet_conf = facet_wrap("Guessing_Model", nrow=1)

    if not no_models:
        if self.mvg_avg_char:
            chart = stat_smooth(
                method="mavg", se=False, method_args={"window": 400}
            )
        else:
            chart = stat_summary_bin(
                fun_data=mean_no_se,
                bins=20,
                shape=".",
                linetype="None",
                size=0.5,
            )
    else:
        chart = None

    p = p + facet_conf + aes(x="char_percent", y="correct", color="Dataset")
    if chart is not None:
        p += chart
    p = (
        p
        + scale_y_continuous(breaks=np.linspace(0, 1, 6))
        + scale_x_continuous(breaks=[0, 0.5, 1])
        + coord_cartesian(ylim=limits)
        + xlab("Percent of Question Revealed")
        + ylab("Accuracy")
        + theme(
            # legend_position='top', legend_box_margin=0,
            legend_title=element_blank(),
            strip_text_x=element_text(margin={"t": 6, "b": 6, "l": 1, "r": 5})
        )
        + scale_color_manual(
            values=["#FF3333", "#66CC00", "#3333FF", "#FFFF33"],
            name="Questions",
        )
    )
    if self.title != "":
        p += ggtitle(self.title)

    return p
def plot_predictions_actual(pred_df, figsize):
    """Plot `pred` against `y` with an `lb`-`ub` ribbon and the identity line.

    `figsize` is passed to the theme's `figure_size`.  Returns the plotnine
    plot object.
    """
    fig = pn.ggplot(pred_df, pn.aes(x='y', y='pred'))
    fig = fig + pn.geom_point()
    fig = fig + pn.geom_ribbon(pn.aes(ymin='lb', ymax='ub'), alpha=0.3)
    # Reference line pred == y.
    fig = fig + pn.geom_abline(slope=1, intercept=0)
    fig = fig + pn.theme_bw()
    fig = fig + pn.theme(figure_size=figsize)
    return fig
def quick_color_check(target_matrix, source_matrix, num_chips):
    """Quickly plot target matrix values against source matrix values to
    determine over saturated color chips or other issues.

    Inputs:
    source_matrix = a 22x4 matrix containing the average red value, average
                    green value, and average blue value for each color chip
                    of the source image
    target_matrix = a 22x4 matrix containing the average red value, average
                    green value, and average blue value for each color chip
                    of the target image
    num_chips     = number of color card chips included in the matrices
                    (integer)

    Increments ``params.device``.  The plot is saved to
    ``params.debug_outdir`` when ``params.debug`` is 'print' and printed
    when it is 'plot'; nothing is returned.

    :param source_matrix: numpy.ndarray
    :param target_matrix: numpy.ndarray
    :param num_chips: int
    """
    # Imports
    from plotnine import ggplot, geom_point, geom_smooth, theme_seaborn, facet_grid, geom_label, scale_x_continuous, \
        scale_y_continuous, scale_color_manual, aes
    import pandas as pd

    # Extract and organize matrix info (columns 1, 2, 3 are read as R, G, B)
    tr = target_matrix[:num_chips, 1:2]
    tg = target_matrix[:num_chips, 2:3]
    tb = target_matrix[:num_chips, 3:4]
    sr = source_matrix[:num_chips, 1:2]
    sg = source_matrix[:num_chips, 2:3]
    sb = source_matrix[:num_chips, 3:4]

    # Create columns of color labels
    red = ['red'] * num_chips
    blue = ['blue'] * num_chips
    green = ['green'] * num_chips

    # Make a column of chip numbers, repeated once per color channel.
    # np.vstack replaces np.row_stack, which is deprecated in NumPy 2.0.
    chip = np.arange(0, num_chips).reshape((num_chips, 1))
    chips = np.vstack((chip, chip, chip)).ravel()

    # Combine info
    color_data_r = np.column_stack((sr, tr, red))
    color_data_g = np.column_stack((sg, tg, green))
    color_data_b = np.column_stack((sb, tb, blue))
    all_color_data = np.vstack((color_data_b, color_data_g, color_data_r))

    # Create a dataframe with headers
    dataset = pd.DataFrame({
        'source': all_color_data[:, 0],
        'target': all_color_data[:, 1],
        'color': all_color_data[:, 2]
    })

    # Add chip numbers to the dataframe
    dataset['chip'] = chips
    # Stacking with the label column turned everything into strings;
    # restore numeric dtypes for plotting.
    dataset = dataset.astype({
        'color': str,
        'chip': str,
        'target': float,
        'source': float
    })

    # Make the plot
    p1 = ggplot(dataset, aes(x='target', y='source', color='color', label='chip')) + \
        geom_point(show_legend=False, size=2) + \
        geom_smooth(method='lm', size=.5, show_legend=False) + \
        theme_seaborn() + facet_grid('.~color') + \
        geom_label(angle=15, size=7, nudge_y=-.25, nudge_x=.5, show_legend=False) + \
        scale_x_continuous(limits=(-5, 270)) + scale_y_continuous(limits=(-5, 275)) + \
        scale_color_manual(values=['blue', 'green', 'red'])

    # Autoincrement the device counter
    params.device += 1

    # Emit the plot according to the debug mode
    if params.debug is not None:
        if params.debug == 'print':
            p1.save(os.path.join(params.debug_outdir, 'color_quick_check.png'))
        elif params.debug == 'plot':
            print(p1)
def plot_predictions_residuals(pred_df, figsize):
    """Plot `resid` against `y` with a horizontal reference line at zero.

    `figsize` is passed to the theme's `figure_size`.  Returns the plotnine
    plot object.
    """
    fig = pn.ggplot(pred_df, pn.aes(x='y', y='resid'))
    fig = fig + pn.geom_point()
    fig = fig + pn.geom_hline(yintercept=0)
    fig = fig + pn.theme_bw()
    fig = fig + pn.theme(figure_size=figsize)
    return fig
#
#  (C) Copyright 2021  Pavel Tisnovsky
#
#  All rights reserved. This program and the accompanying materials
#  are made available under the terms of the Eclipse Public License v1.0
#  which accompanies this distribution, and is available at
#  http://www.eclipse.org/legal/epl-v10.html
#
#  Contributors:
#      Pavel Tisnovsky
#

from plotnine import ggplot, geom_point, aes, stat_smooth
from plotnine.data import mtcars

# mpg against wt, colored by gear, with a linear fit; written to 10.png.
g = ggplot(mtcars, aes("wt", "mpg", color="factor(gear)"))
g = g + geom_point()
g = g + stat_smooth(method="lm")

g.save("10.png")
user_stat = user_stat.loc[user_stat.n_records > 20] print(len(user_stat)) print(len(df.loc[df.user_n_records > 20])) print(len(df)) print(len(set(df.qid))) user_stat['log_n_records'] = pd.Series(user_stat.n_records.apply(np.log), index=user_stat.index) max_color = user_stat.log_n_records.max() user_stat['alpha'] = pd.Series( user_stat.log_n_records.apply(lambda x: x / max_color), index=user_stat.index) p0 = ggplot(user_stat) \ + geom_point(aes(x='ratio', y='accuracy', size='n_records', color='log_n_records', alpha='alpha'), show_legend={'color': False, 'alpha': False, 'size': False}) \ + scale_color_gradient(high='#e31a1c', low='#ffffcc') \ + theme(aspect_ratio=1) p0.save('protobowl_users.pdf') # p0.draw() print('p0 done') p1 = ggplot(user_stat, aes(x='log_n_records', y='..density..')) \ + geom_histogram(color='#e6550d', fill='#fee6ce') \ + geom_density() \ + theme(aspect_ratio=0.3) p1.save('protobowl_hist.pdf') # p1.draw() print('p1 done')