def plot_squiggle(args, filename, start_times, mean_signals):
    """
    Use rpy2 to create a squiggle plot of the read.

    args: parsed CLI options (uses num_facets, theme_bw, saveas).
    filename: read file name, used in the title and the output file name.
    start_times: per-event start times, seconds (assumed sorted ascending).
    mean_signals: per-event mean signal values, picoamps.
    """
    r = robjects.r
    r.library("ggplot2")
    grdevices = importr('grDevices')

    # set t_0 as the first measured time for the read.
    t_0 = start_times[0]
    total_time = start_times[-1] - start_times[0]
    # adjust times to be relative to t_0
    r_start_times = robjects.FloatVector([t - t_0 for t in start_times])
    r_mean_signals = robjects.FloatVector(mean_signals)

    # Spread the events across the requested number of facets.
    # BUGFIX: floor division is required; with Python 3 true division
    # events_per_facet was a float and every event landed in its own
    # distinct (fractional) facet category.
    num_events = len(r_mean_signals)
    events_per_facet = (num_events // args.num_facets) + 1
    # dummy variable to control faceting
    facet_category = robjects.FloatVector([(i // events_per_facet) + 1
                                           for i in range(len(start_times))])

    # make a data frame of the start times and mean signals
    df = robjects.DataFrame({'start': r_start_times,
                             'mean': r_mean_signals,
                             'cat': facet_category})

    # Build the plot once; the original duplicated this whole chain in
    # both branches and differed only in the trailing theme_bw() layer.
    # theme_bw() is deliberately added AFTER theme(plot.title=...) to
    # reproduce the original layer order (a complete theme resets
    # earlier element tweaks).
    pp = ggplot2.ggplot(df) \
        + ggplot2.aes_string(x='start', y='mean') \
        + ggplot2.geom_step(size=0.25) \
        + ggplot2.facet_wrap(robjects.Formula('~cat'), ncol=1,
                             scales="free_x") \
        + ggplot2.scale_x_continuous('Time (seconds)') \
        + ggplot2.scale_y_continuous('Mean signal (picoamps)') \
        + ggplot2.ggtitle('Squiggle plot for read: ' + filename
                          + "\nTotal time (sec): " + str(total_time)) \
        + ggplot2.theme(**{'plot.title': ggplot2.element_text(size=11)})
    if args.theme_bw:
        pp = pp + ggplot2.theme_bw()

    if args.saveas is not None:
        plot_file = os.path.basename(filename) + "." + args.saveas
        if os.path.isfile(plot_file):
            raise Exception('Cannot create plot for %s: plot file %s already exists'
                            % (filename, plot_file))
        if args.saveas == "pdf":
            grdevices.pdf(plot_file, width=8.5, height=11)
        elif args.saveas == "png":
            grdevices.png(plot_file, width=8.5, height=11,
                          units="in", res=300)
        pp.plot()
        grdevices.dev_off()
    else:
        pp.plot()
        # keep the plot open until user hits enter
        print('Type enter to exit.')
        raw_input()  # NOTE(review): Python 2 builtin; use input() on Python 3
def makePlot(grdevices, plotName, samp_set1_vals, samp_set2_vals, image_file_type):
    """Draw a boxplot-with-jitter comparing two sample sets and save it.

    grdevices: the imported R grDevices package.
    plotName: output file path.
    samp_set1_vals / samp_set2_vals: numeric value lists for the two groups.
    image_file_type: "pdf" selects a PDF device; anything else a 512x512 PNG.
    """
    # Group labels, one per value, in set1-then-set2 order.
    labels = ["set1"] * len(samp_set1_vals) + ["set2"] * len(samp_set2_vals)
    frame = robjects.DataFrame({
        "sample": robjects.StrVector(labels),
        "value": robjects.FloatVector(samp_set1_vals + samp_set2_vals),
    })
    boxplot = (ggplot2.ggplot(frame)
               + ggplot2.aes_string(x="sample", y='value')
               + ggplot2.geom_boxplot()
               + ggplot2.geom_jitter()
               + ggplot2.theme_bw())
    if image_file_type == "pdf":
        grdevices.pdf(file=plotName)
    else:
        grdevices.png(file=plotName, width=512, height=512)
    boxplot.plot()
    grdevices.dev_off()
def main():
    """Plot mean main/control coverage around TSSs from a 3-column raw file.

    Columns of the input: coordinate, main coverage, control coverage.
    Writes <out_prefix>_and.pdf via the module-level grdevices.
    """
    usage = 'usage: %prog [options] <raw file>'
    parser = OptionParser(usage)
    parser.add_option('-d', dest='downstream', default=2000, type='int',
                      help='TSS downstream [Default: %default]')
    parser.add_option('-o', dest='out_prefix', default='tss',
                      help='Output prefix [Default: %default]')
    parser.add_option('-u', dest='upstream', default=5000, type='int',
                      help='TSS upstream [Default: %default]')
    parser.add_option('--ymax', dest='ymax', default=None, type='float',
                      help='Y-coordinate limit [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide raw file')
    else:
        raw_file = args[0]

    # collect data
    coords = []
    main_cov = []
    control_cov = []
    # BUGFIX: use a context manager so the file handle is closed.
    with open(raw_file) as raw_in:
        for line in raw_in:
            a = line.split()
            coords.append(int(a[0]))
            main_cov.append(float(a[1]))
            control_cov.append(float(a[2]))

    # data structures: x positions and a Main/Control label per position
    tss_i = ro.IntVector(range(-options.upstream, options.downstream + 1))
    labels = ro.StrVector(['Main'] * (options.upstream + options.downstream + 1)
                          + ['Control'] * (options.upstream + options.downstream + 1))
    cov = ro.FloatVector(main_cov + control_cov)
    df = ro.DataFrame({'tss_i': tss_i, 'cov': cov, 'label': labels})

    # plot: loess-smoothed coverage per label
    gp = ggplot2.ggplot(df) + \
        ggplot2.aes_string(x='tss_i', y='cov', colour='label') + \
        ggplot2.geom_smooth(method='loess', size=1, span=0.2, se=False) + \
        ggplot2.scale_x_continuous('TSS Position') + \
        ggplot2.scale_colour_discrete('') + \
        ggplot2.theme_bw()

    # BUGFIX: identity comparison with None (was `== None`).
    if options.ymax is None:
        gp += ggplot2.scale_y_continuous('Coverage')
    else:
        gp += ggplot2.scale_y_continuous('Coverage',
                                         limits=ro.FloatVector([0, options.ymax]))

    # save to file (grdevices is presumably a module-level import of
    # grDevices — not visible in this chunk; TODO confirm)
    grdevices.pdf(file='%s_and.pdf' % options.out_prefix)
    gp.plot()
    grdevices.dev_off()
def plot_coef(feat_mat_dir, model_dir, expt_names, pref, outfile=None,
              height=120, fsize=12):
    """Boxplot of per-feature model coefficients across experiments.

    Loads one model and feature matrix per experiment, stacks the
    classification and regression coefficients column-wise, and plots a
    boxplot per feature, faceted by coefficient type.  Returns the wide
    (un-melted) pandas DataFrame used for plotting.
    """
    for expt_idx, ex in enumerate(expt_names):
        feat_mat_file = os.path.join(feat_mat_dir, ex + '_feat_mat.npz')
        model_file = os.path.join(model_dir, pref + ex + '_model.pkl')
        model = read_model(model_file)
        (tmp_feat, tmp_y, tmp_feat_names, tmp_gene_names) = read_feat_mat(feat_mat_file)
        if expt_idx == 0:
            # First experiment fixes the feature set and coefficient order.
            feat_names = tmp_feat_names
            clf_coef = model.clf_coef()
            reg_coef = model.reg_coef()
        else:
            # Every experiment must report the same features in the same order.
            assert (all(f[0] == f[1] for f in zip(feat_names, tmp_feat_names)))
            clf_coef = np.concatenate((clf_coef, model.clf_coef()), axis=1)
            reg_coef = np.concatenate((reg_coef, model.reg_coef()), axis=1)
    # Relies on the loop variable leaking out of the for; raises NameError
    # if expt_names is empty.
    nexpt = expt_idx + 1

    # Now clf_coef has one row per coefficient and one column per experiment.
    # The reshape below will read the data row-first.
    df = pd.DataFrame({
        'feature': np.repeat(feat_names, nexpt),
        'Classification': np.reshape(clf_coef, (clf_coef.size, )),
        'Regression': np.reshape(reg_coef, (reg_coef.size, ))
    })
    df2 = pd.melt(df, id_vars='feature', var_name='fun')
    r_df = com.convert_to_r_dataframe(df2)
    gp = ggplot2.ggplot(r_df) + ggplot2.aes_string(x = 'factor(feature)', y = 'value') + \
        ggplot2.facet_wrap('fun', scales = 'free_y') + \
        ggplot2.geom_boxplot() + ggplot2.scale_y_continuous('Importance') + \
        ggplot2.scale_x_discrete('') + ggplot2.theme_bw() + \
        ggplot2.theme(**{'axis.text.x':ggplot2.element_text(size = fsize, angle = 65, vjust = 1, hjust = 1),
                         'axis.text.y':ggplot2.element_text(size = fsize),
                         'strip.text.x':ggplot2.element_text(size = fsize + 1)})
    # Width grows with the number of experiments (mm), floor of 80.
    w = max(22 * nexpt, 80)
    if outfile is None:
        gp.plot()
    else:
        ro.r.ggsave(filename=outfile, plot=gp, width=w, height=height, unit='mm')
    return df
def plot_cv_r2(pandas_df, outfile, fsize = 10, height = 120, max_width = 50, xlab = 'Parameters'):
    """Makes boxplots of cross-validation results for different parameter settings"""
    # One box per distinct parameter setting (the 'title' column).
    num_settings = len(set(list(pandas_df['title'])))
    rdf = com.convert_to_r_dataframe(pandas_df)
    axis_text = ggplot2.theme(**{
        'axis.text.x': ggplot2.element_text(size=fsize, angle=65, vjust=1, hjust=1),
        'axis.text.y': ggplot2.element_text(size=fsize)})
    plot_obj = (ggplot2.ggplot(rdf)
                + ggplot2.aes_string(x='factor(title)', y='r2')
                + ggplot2.geom_boxplot()
                + ggplot2.scale_y_continuous('R-squared')
                + ggplot2.scale_x_discrete(xlab)
                + ggplot2.theme_bw()
                + axis_text)
    # Width (mm) scales with the number of settings but never below max_width.
    plot_width = max(5 * num_settings, max_width)
    ro.r.ggsave(filename=outfile, plot=plot_obj, width=plot_width,
                height=height, unit='mm')
def plot_hist(sizes, args):
    """
    Use rpy2 to plot a histogram of the read sizes.

    sizes: iterable of read lengths.
    args: parsed CLI options (uses min_length, max_length, num_bins,
    theme_bw, saveas).
    """
    r = robjects.r
    r.library("ggplot2")
    grdevices = importr('grDevices')

    # Keep only reads strictly inside the requested length window.
    filtered = [s for s in sizes
                if args.min_length < s < args.max_length]
    # BUGFIX: the original crashed in min()/max() when the filter removed
    # everything; fail with a clear message instead.
    if not filtered:
        logger.error("No reads with length between %d and %d!"
                     % (args.min_length, args.max_length))
        sys.exit()
    sizes = robjects.IntVector(filtered)
    sizes_min = min(sizes)
    sizes_max = max(sizes)
    # BUGFIX: explicit float division — on Python 2 the original integer
    # division could yield binwidth 0 and break geom_histogram.
    binwidth = float(sizes_max - sizes_min) / args.num_bins

    df = robjects.DataFrame({'sizes': sizes})

    # plot; the two original branches differed only by theme_bw()
    pp = ggplot2.ggplot(df) \
        + ggplot2.aes_string(x='sizes') \
        + ggplot2.geom_histogram(binwidth=binwidth)
    if args.theme_bw:
        pp = pp + ggplot2.theme_bw()

    if args.saveas is not None:
        plot_file = args.saveas
        if plot_file.endswith(".pdf"):
            grdevices.pdf(plot_file, width=8.5, height=8.5)
        elif plot_file.endswith(".png"):
            grdevices.png(plot_file, width=8.5, height=8.5,
                          units="in", res=300)
        else:
            logger.error("Unrecognized extension for %s!" % (plot_file))
            sys.exit()
        pp.plot()
        grdevices.dev_off()
    else:
        pp.plot()
        # keep the plot open until user hits enter
        print('Type enter to exit.')
        raw_input()  # NOTE(review): Python 2 builtin; use input() on Python 3
def plot_cels(expr, expt_names, expt_name_idx, cel_names, outdir = None):
    """Makes correlation plots between CEL files for the same cell type"""
    fsize = 10
    # Parallel lists accumulating one row per plotted pair.
    names_1 = []
    names_2 = []
    cors = []
    titles = []
    for ex_idx, ex in enumerate(expt_names):
        # Indices of CEL files (columns of expr) corresponding to that cell type
        tmp_idx = expt_name_idx[ex]
        plot_idx = 0
        # All unordered column pairs within this cell type.
        for i in range(len(tmp_idx)):
            name1 = re.sub('_', '.', cel_names[tmp_idx[i]])
            for j in range(i + 1, len(tmp_idx)):
                name2 = re.sub('_', '.', cel_names[tmp_idx[j]])
                plot_idx += 1
                # Pearson correlation between the two expression columns.
                cor = np.corrcoef(expr[:, tmp_idx[i]], expr[:, tmp_idx[j]])[0, 1]
                names_1.append(name1)
                names_2.append(name2)
                cors.append(cor)
                titles.append(ex + '-' + str(plot_idx))
                df = ro.DataFrame({'x':ro.FloatVector(expr[:, tmp_idx[i]]),
                                   'y':ro.FloatVector(expr[:, tmp_idx[j]])})
                gp = ggplot2.ggplot(df) + ggplot2.aes_string(x = 'x', y = 'y') + \
                    ggplot2.geom_point(size = 1) + \
                    ggplot2.scale_x_continuous(name1) + ggplot2.scale_y_continuous(name2) + \
                    ggplot2.theme_bw() + ggplot2.ggtitle('{:s}-{:d} ({:.4f})'.format(ex, plot_idx, cor)) + \
                    ggplot2.theme(**{'axis.text.x':ggplot2.element_text(size = fsize),
                                     'axis.title.x':ggplot2.element_text(size = 8),
                                     'axis.text.y':ggplot2.element_text(size = fsize),
                                     'axis.title.y':ggplot2.element_text(size = 8, angle = 90),
                                     'plot.title':ggplot2.element_text(size = fsize)})
                if outdir is None:
                    # Interactive display.
                    gp.plot()
                else:
                    if not os.path.isdir(outdir):
                        os.makedirs(outdir)
                    outfile = os.path.join(outdir, ex + '-' + str(plot_idx) + '.png')
                    ro.r.ggsave(filename = outfile, plot = gp, width = 85, height = 85, unit = 'mm')
    # Summary table of all pairwise correlations, indexed by '<expt>-<k>'.
    df = pd.DataFrame({'name1':names_1, 'name2':names_2, 'cor':cors}, index = titles)
    if not outdir is None:
        df.to_csv(os.path.join(outdir, 'cor_summary.txt'), sep = '\t')
    return df
def generate_histogram(subgroups_to_sses_to_n_count, tname, file_name):
    """Render a dodged bar chart of counts per subgroup, filled by `tname`.

    subgroups_to_sses_to_n_count: {subgroup: {category: count}}.
    tname: name of the category column (also the legend fill).
    file_name: output PNG name inside OUTPUT_PATH.
    """
    columns_to_data = {'subgroup': [], tname: [], 'count': []}
    max_count = 0
    # Flatten the nested dict into parallel columns, tracking the maximum.
    for subgroup, sses_to_n_count in subgroups_to_sses_to_n_count.items():
        for ss, n_count in sses_to_n_count.items():
            columns_to_data['subgroup'].append(subgroup)
            columns_to_data[tname].append(ss)
            columns_to_data['count'].append(n_count)
            if n_count > max_count:
                max_count = n_count
    r_columns_to_data = {
        'subgroup': ro.FactorVector(columns_to_data['subgroup'],
                                    levels=ro.StrVector(
                                        _sort_subgroup(set(columns_to_data['subgroup'])))),
        tname: ro.StrVector(columns_to_data[tname]),
        'count': ro.IntVector(columns_to_data['count'])
    }
    df = ro.DataFrame(r_columns_to_data)

    # Y-axis limit: round up to the next multiple of 1000.
    # BUGFIX: floor division — with Python 3 true division the original
    # computed roughly max_count + 1000 (as a float) instead of rounding
    # down to a multiple of 1000 first.
    max_count = max_count // 1000 * 1000 + 1000

    histogram_file_path = os.path.join(OUTPUT_PATH, file_name)
    logging.debug(
        str.format("The Data Frame for file {}: \n{}", histogram_file_path, df))
    grdevices.png(file=histogram_file_path, width=1200, height=800)
    gp = ggplot2.ggplot(df)
    pp = gp + \
        ggplot2.aes_string(x='subgroup', y='count', fill=tname) + \
        ggplot2.geom_bar(position="dodge", width=0.8, stat="identity") + \
        ggplot2.theme_bw() + \
        ggplot2.theme_classic() + \
        ggplot2.theme(**{'legend.title': ggplot2.element_blank()}) + \
        ggplot2.theme(**{'legend.text': ggplot2.element_text(size=40)}) + \
        ggplot2.theme(**{'axis.text.x': ggplot2.element_text(size=40, angle=45)}) + \
        ggplot2.theme(**{'axis.text.y': ggplot2.element_text(size=40)}) + \
        ggplot2.scale_y_continuous(expand=ro.IntVector([0, 0]),
                                   limits=ro.IntVector([0, max_count])) + \
        ggplot2.geom_text(ggplot2.aes_string(label='count'),
                          size=6, angle=35, hjust=-0.1,
                          position=ggplot2.position_dodge(width=0.8),
                          vjust=-0.2)
    pp.plot()
    logging.info(str.format("Output step3 file {}", histogram_file_path))
    grdevices.dev_off()
def generate_step3_5_lrr_acc20_line_chart(subgroups_to_lrrs_acc20mean, prefix=''):
    """Line chart of mean ACC20 per LRR position for each subgroup.

    subgroups_to_lrrs_acc20mean: {subgroup: (acc20means, acc20_count)} —
    presumably one mean per motif position; TODO confirm against caller.
    Also writes the per-subgroup LRR counts to a text file.
    """
    pandas2ri.activate()
    subgroups_to_lrr_count = {}
    columns_to_data = {'subgroup': [], 'pos': [], 'acc20': []}
    for subgroup, (acc20means, acc20_count) in subgroups_to_lrrs_acc20mean.items():
        subgroups_to_lrr_count[subgroup] = acc20_count
        for index, acc20mean in enumerate(acc20means):
            columns_to_data['subgroup'].append(subgroup)
            # Positions are 1-based in the chart.
            columns_to_data['pos'].append(index + 1)
            columns_to_data['acc20'].append(acc20mean)
    # Write the count of LRRs for each subgroup to file
    with open(os.path.join(OUTPUT_PATH, prefix + "step3_5_lrr_count.txt"), 'w') as f:
        for subgroup, lrr_count in subgroups_to_lrr_count.items():
            f.write(str.format("{}: {}\n", subgroup, lrr_count))
    # Generate the line chart file
    r_columns_to_data = {
        'subgroup': ro.StrVector(columns_to_data['subgroup']),
        'pos': ro.IntVector(columns_to_data['pos']),
        'acc20': ro.FloatVector(columns_to_data['acc20'])
    }
    df = ro.DataFrame(r_columns_to_data)
    line_chart_file_path = os.path.join(OUTPUT_PATH,
                                        prefix + "step3_5_lrr_acc20_line.png")
    logging.debug(
        str.format("The Data Frame for file {}: \n{}", line_chart_file_path, df))
    grdevices.png(file=line_chart_file_path, width=1024, height=512)
    gp = ggplot2.ggplot(df)
    # X axis labeled with the 24-character LRR consensus pattern.
    pp = gp + \
        ggplot2.theme_bw() + \
        ggplot2.theme_classic() + \
        ggplot2.theme(**{'axis.text.x': ggplot2.element_text(size=35)}) + \
        ggplot2.theme(**{'axis.text.y': ggplot2.element_text(size=35)}) + \
        ggplot2.aes_string(x='pos', y='acc20', group='subgroup', colour='subgroup') + \
        ggplot2.geom_point(size=4, shape=20) + \
        ggplot2.geom_line(size=3) + \
        ggplot2.theme(**{'legend.title': ggplot2.element_blank()}) + \
        ggplot2.theme(**{'legend.text': ggplot2.element_text(size=20)}) + \
        ggplot2.scale_x_continuous(breaks=ro.IntVector(range(1, 25)),
                                   labels=ro.StrVector(list('LxxLxLxxNxLsGxIPxxLxxLxx')))
    pp.plot()
    logging.info(str.format("Output step3 file {}", line_chart_file_path))
    grdevices.dev_off()
def plot_thresh_distr(motif_names, thresh, out_dir, width = 350):
    """Creates boxplots of the thresholds used with each feature."""
    df = pd.DataFrame({'motif':motif_names, 'thresh':thresh})
    # Only keep features whose threshold exceeds 1.
    df = df[df['thresh'] > 1]
    # Persist the filtered table next to the plots.
    df.to_csv(os.path.join(out_dir, 'count_thresh.txt'), sep = '\t', index = False)
    fsize = 10
    r_df = com.convert_to_r_dataframe(df)
    # Horizontal boxplot (coord_flip), y clamped to [0, 70].
    gp = ggplot2.ggplot(r_df) + ggplot2.aes_string(x = 'factor(motif)', y = 'thresh') + \
        ggplot2.geom_boxplot() + ggplot2.scale_y_continuous('Threshold counts', limits = ro.IntVector([0, 70])) + \
        ggplot2.scale_x_discrete('') + ggplot2.theme_bw() + ggplot2.coord_flip() + \
        ggplot2.theme(**{'axis.text.x':ggplot2.element_text(size = fsize),
                         'axis.text.y':ggplot2.element_text(size = fsize, hjust = 1),
                         'strip.text.x':ggplot2.element_text(size = fsize + 1)})
    # Save both a PDF and a PNG version.
    for ext in ['.pdf', '.png']:
        ro.r.ggsave(filename = os.path.join(out_dir, 'count_thresh_bar' + ext),
                    plot = gp, width = width, height = 300, unit = 'mm')
def plot_cv_r2(pandas_df, outfile, fsize=10, height=120, max_width=50,
               xlab='Parameters'):
    """Makes boxplots of cross-validation results for different parameter settings"""
    # Number of distinct parameter settings (one box each).
    ncv = len(set(list(pandas_df['title'])))
    r_df = com.convert_to_r_dataframe(pandas_df)
    gp = ggplot2.ggplot(r_df) + ggplot2.aes_string(x = 'factor(title)', y = 'r2') + \
        ggplot2.geom_boxplot() + ggplot2.scale_y_continuous('R-squared') + \
        ggplot2.scale_x_discrete(xlab) + ggplot2.theme_bw() + \
        ggplot2.theme(**{'axis.text.x':ggplot2.element_text(size = fsize, angle = 65, vjust = 1, hjust = 1),
                         'axis.text.y':ggplot2.element_text(size = fsize)})
    # Plot width (mm) grows with the number of settings, floor of max_width.
    w = max(5 * ncv, max_width)
    ro.r.ggsave(filename=outfile, plot=gp, width=w, height=height, unit='mm')
def generate_step3_9_n_count_histogram(place_type_pos_type_to_count, file_name):
    """Dodged bar chart of counts keyed by '<place>_<pos>' composite keys.

    place_type_pos_type_to_count: {"<place>_<pos>": count}.
    file_name: output PNG name inside OUTPUT_PATH.
    """
    columns_to_data = {'place': [], 'pos': [], 'count': []}
    max_count = 0
    # Split each composite key into its place/pos parts and flatten.
    for place_pos_type, n_count in place_type_pos_type_to_count.items():
        place_type, pos_type = place_pos_type.split('_')
        columns_to_data['place'].append(place_type)
        columns_to_data['pos'].append(pos_type)
        columns_to_data['count'].append(n_count)
        if n_count > max_count:
            max_count = n_count
    r_columns_to_data = {
        'place': ro.StrVector(columns_to_data['place']),
        'pos': ro.StrVector(columns_to_data['pos']),
        'count': ro.IntVector(columns_to_data['count'])
    }
    df = ro.DataFrame(r_columns_to_data)

    # Y-axis limit: round up to the next multiple of 1000 (or 100 for
    # small counts).
    # BUGFIX: floor division — with Python 3 true division the original
    # produced roughly max_count + 1000 (a float), not a rounded limit.
    if max_count > 1000:
        max_count = max_count // 1000 * 1000 + 1000
    else:
        max_count = max_count // 100 * 100 + 100

    histogram_file_path = os.path.join(OUTPUT_PATH, file_name)
    logging.debug(
        str.format("The Data Frame for file {}: \n{}", histogram_file_path, df))
    grdevices.png(file=histogram_file_path, width=1024, height=512)
    gp = ggplot2.ggplot(df)
    pp = gp + \
        ggplot2.aes_string(x='pos', y='count', fill='place') + \
        ggplot2.geom_bar(position="dodge", stat="identity") + \
        ggplot2.theme_bw() + \
        ggplot2.theme_classic() + \
        ggplot2.theme(**{'axis.text.x': ggplot2.element_text(size=35)}) + \
        ggplot2.theme(**{'axis.text.y': ggplot2.element_text(size=35)}) + \
        ggplot2.scale_y_continuous(expand=ro.IntVector([0, 0]),
                                   limits=ro.IntVector([0, max_count])) + \
        ggplot2.geom_text(ggplot2.aes_string(label='count'),
                          position=ggplot2.position_dodge(width=0.8),
                          size=10, angle=35, hjust=-0.2, vjust=-0.5)
    pp.plot()
    logging.info(str.format("Output step3 file {}", histogram_file_path))
    grdevices.dev_off()
def _generate_step3_5_ss_acc20_line_chart(ts_to_acc20s, tname, line_chart_file_path):
    """Line chart of mean ACC20 around a site (-5..+5) per `tname` category.

    ts_to_acc20s: {category: acc20 value lists} — averaged via
    calc_acc20mean_by_types (defined elsewhere in the project).
    """
    logging.debug(
        str.format("Begin to generate {}, data {}", line_chart_file_path, ts_to_acc20s))
    ts_to_acc20mean = calc_acc20mean_by_types(ts_to_acc20s)
    columns_to_data = {tname: [], 'site': [], 'acc20': []}
    for ss, acc20means in ts_to_acc20mean.items():
        for index, acc20mean in enumerate(acc20means):
            columns_to_data[tname].append(ss)
            # Index 5 is the site itself ('N'); shift to -5..+5.
            columns_to_data['site'].append(index - 5)
            columns_to_data['acc20'].append(acc20mean)
    # Generate the line chart file
    r_columns_to_data = {
        tname: ro.StrVector(columns_to_data[tname]),
        'site': ro.IntVector(columns_to_data['site']),
        'acc20': ro.FloatVector(columns_to_data['acc20'])
    }
    df = ro.DataFrame(r_columns_to_data)
    logging.debug(
        str.format("The Data Frame for file {}: \n{}", line_chart_file_path, df))
    grdevices.png(file=line_chart_file_path, width=1024, height=512)
    gp = ggplot2.ggplot(df)
    pp = gp + \
        ggplot2.theme_bw() + \
        ggplot2.theme_classic() + \
        ggplot2.theme(**{'axis.text.x': ggplot2.element_text(size=35)}) + \
        ggplot2.theme(**{'axis.text.y': ggplot2.element_text(size=35)}) + \
        ggplot2.aes_string(x='site', y='acc20', group=tname, colour=tname) + \
        ggplot2.geom_point(size=4, shape=20) + \
        ggplot2.geom_line(size=3) + \
        ggplot2.theme(**{'legend.title': ggplot2.element_blank()}) + \
        ggplot2.theme(**{'legend.text': ggplot2.element_text(size=20)}) + \
        ggplot2.scale_x_continuous(breaks=ro.IntVector(list(range(-5, 6))),
                                   labels=ro.StrVector(['-5', '-4', '-3', '-2', '-1',
                                                        'N', '1', '2', '3', '4', '5']))
    pp.plot()
    logging.info(str.format("Output step3 file {}", line_chart_file_path))
    grdevices.dev_off()
def plot_thresh_distr(motif_names, thresh, out_dir, width=350):
    """Creates boxplots of the thresholds used with each feature.

    Rows with threshold <= 1 are dropped; the kept rows are written to
    count_thresh.txt and the flipped boxplot is saved as PDF and PNG.
    """
    label_size = 10
    data = pd.DataFrame({'motif': motif_names, 'thresh': thresh})
    data = data[data['thresh'] > 1]
    data.to_csv(os.path.join(out_dir, 'count_thresh.txt'), sep='\t', index=False)
    rdf = com.convert_to_r_dataframe(data)
    text_theme = ggplot2.theme(**{
        'axis.text.x': ggplot2.element_text(size=label_size),
        'axis.text.y': ggplot2.element_text(size=label_size, hjust=1),
        'strip.text.x': ggplot2.element_text(size=label_size + 1)})
    plot_obj = (ggplot2.ggplot(rdf)
                + ggplot2.aes_string(x='factor(motif)', y='thresh')
                + ggplot2.geom_boxplot()
                + ggplot2.scale_y_continuous('Threshold counts',
                                             limits=ro.IntVector([0, 70]))
                + ggplot2.scale_x_discrete('')
                + ggplot2.theme_bw()
                + ggplot2.coord_flip()
                + text_theme)
    for ext in ('.pdf', '.png'):
        ro.r.ggsave(filename=os.path.join(out_dir, 'count_thresh_bar' + ext),
                    plot=plot_obj, width=width, height=300, unit='mm')
def plot_coef(feat_mat_dir, model_dir, expt_names, pref, outfile = None, height = 120, fsize = 12):
    """Boxplot of per-feature model coefficients across experiments.

    Loads one model and feature matrix per experiment, stacks the
    classification and regression coefficients column-wise, and draws a
    boxplot per feature faceted by coefficient type.  Returns the wide
    (un-melted) pandas DataFrame used for plotting.
    """
    for idx, name in enumerate(expt_names):
        model = read_model(os.path.join(model_dir, pref + name + '_model.pkl'))
        (tmp_feat, tmp_y, tmp_feat_names, tmp_gene_names) = read_feat_mat(
            os.path.join(feat_mat_dir, name + '_feat_mat.npz'))
        if idx == 0:
            # The first experiment fixes the feature order.
            feat_names = tmp_feat_names
            clf_coef = model.clf_coef()
            reg_coef = model.reg_coef()
        else:
            # All experiments must expose identical features, in order.
            assert all(a == b for a, b in zip(feat_names, tmp_feat_names))
            clf_coef = np.concatenate((clf_coef, model.clf_coef()), axis=1)
            reg_coef = np.concatenate((reg_coef, model.reg_coef()), axis=1)
        nexpt = idx + 1

    # One row per (feature, experiment); reshape reads the matrices row-first.
    wide = pd.DataFrame({'feature': np.repeat(feat_names, nexpt),
                         'Classification': np.reshape(clf_coef, (clf_coef.size,)),
                         'Regression': np.reshape(reg_coef, (reg_coef.size,))})
    molten = pd.melt(wide, id_vars='feature', var_name='fun')
    rdf = com.convert_to_r_dataframe(molten)
    text_theme = ggplot2.theme(**{
        'axis.text.x': ggplot2.element_text(size=fsize, angle=65, vjust=1, hjust=1),
        'axis.text.y': ggplot2.element_text(size=fsize),
        'strip.text.x': ggplot2.element_text(size=fsize + 1)})
    plot_obj = (ggplot2.ggplot(rdf)
                + ggplot2.aes_string(x='factor(feature)', y='value')
                + ggplot2.facet_wrap('fun', scales='free_y')
                + ggplot2.geom_boxplot()
                + ggplot2.scale_y_continuous('Importance')
                + ggplot2.scale_x_discrete('')
                + ggplot2.theme_bw()
                + text_theme)
    if outfile is None:
        plot_obj.plot()
    else:
        # Width (mm) scales with the number of experiments, floor of 80.
        ro.r.ggsave(filename=outfile, plot=plot_obj,
                    width=max(22 * nexpt, 80), height=height, unit='mm')
    return wide
def makePlot(grdevices, plotName, samp_set1_vals, samp_set2_vals, image_file_type):
    """Boxplot-with-jitter comparing two sample sets, saved to plotName.

    grdevices: imported R grDevices package.
    image_file_type: "pdf" selects a PDF device; anything else a 512x512 PNG.
    """
    # One label per value, set1 first then set2.
    samp_vector = ["set1" for i in range(len(samp_set1_vals))]
    samp_vector.extend(["set2" for i in range(len(samp_set2_vals))])
    dframe = robjects.DataFrame({"sample":robjects.StrVector(samp_vector),
                                 "value":robjects.FloatVector(samp_set1_vals + samp_set2_vals)})
    gp = ggplot2.ggplot(dframe)
    pp = gp + \
        ggplot2.aes_string(x="sample", y='value') + \
        ggplot2.geom_boxplot() +\
        ggplot2.geom_jitter() +\
        ggplot2.theme_bw()
    if image_file_type == "pdf":
        grdevices.pdf(file=plotName)
    else:
        grdevices.png(file=plotName, width=512, height=512)
    pp.plot()
    # Close the device so the file is flushed to disk.
    grdevices.dev_off()
# Demo: ggplot2 plots drawn into grid viewports (inset plot over a main plot).
grdevices.dev_off()
grdevices.png('../../_static/graphics_ggplot2withgrid.png',
              width = 612, height = 612, antialias="subpixel", type="cairo")
#-- gridwithggplot2-begin
grid.newpage()
# create a viewport as the main plot
vp = grid.viewport(width = 1, height = 1)
vp.push()
p = ggplot2.ggplot(datasets.rock) + \
    ggplot2.geom_point(ggplot2.aes_string(x = 'area', y = 'peri')) + \
    ggplot2.theme_bw()
p.plot(vp = vp)
# Smaller inset viewport drawn on top of the main plot.
vp = grid.viewport(width = 0.6, height = 0.6, x = 0.37, y=0.69)
vp.push()
# NOTE(review): ggplot2.opts/theme_text were removed in modern ggplot2 —
# current rpy2 exposes ggplot2.theme/element_text instead; this snippet
# targets an old ggplot2 release. Confirm the installed version.
p = ggplot2.ggplot(datasets.rock) + \
    ggplot2.geom_point(ggplot2.aes_string(x = 'area', y = 'shape')) + \
    ggplot2.opts(**{'axis.text.x': ggplot2.theme_text(angle = 45)})
p.plot(vp = vp)
#-- gridwithggplot2-end
grdevices.dev_off()
def plot_collectors_curve(args, start_times, read_lengths):
    """
    Use rpy2 to create a collectors curve of the run.

    Plots cumulative yield (reads or base pairs, per args.plot_type)
    against hours since the first read, subsampling every args.skip-th
    event, with optional nls extrapolation, theme, and file output.
    """
    r = robjects.r
    r.library("ggplot2")
    grdevices = importr('grDevices')

    # set t_0 as the first measured time for the read.
    t_0 = start_times[0]

    # adjust times to be relative to t_0, in hours; the epsilon keeps the
    # first point strictly positive for the power-law fit below.
    r_start_times = robjects.FloatVector([float(t - t_0) / float(3600) + 0.00000001
                                          for t in start_times])
    r_read_lengths = robjects.IntVector(read_lengths)

    # compute the cumulative based on reads or total base pairs
    if args.plot_type == 'reads':
        y_label = "Total reads"
        cumulative = r.cumsum(robjects.IntVector([1] * len(start_times)))
    elif args.plot_type == 'basepairs':
        y_label = "Total base pairs"
        cumulative = r.cumsum(r_read_lengths)

    # Subsample every `step`-th event to keep the data frame small.
    # BUGFIX: the original used Python-2-only xrange(); range() behaves
    # identically here and works on Python 3 as well.
    step = args.skip
    d = {'start': robjects.FloatVector(
             [r_start_times[n] for n in range(0, len(r_start_times), step)]),
         'lengths': robjects.IntVector(
             [r_read_lengths[n] for n in range(0, len(r_read_lengths), step)]),
         'cumul': robjects.IntVector(
             [cumulative[n] for n in range(0, len(cumulative), step)])}
    df = robjects.DataFrame(d)

    if args.savedf:
        robjects.r("write.table")(df, file=args.savedf, sep="\t")

    # title
    total_reads = len(read_lengths)
    total_bp = sum(read_lengths)
    plot_title = "Yield: " \
        + str(total_reads) + " reads and " \
        + str(total_bp) + " base pairs."

    # plot
    gp = ggplot2.ggplot(df)
    pp = gp + ggplot2.aes_string(x='start', y='cumul') \
        + ggplot2.geom_step(size=2) \
        + ggplot2.scale_x_continuous('Time (hours)') \
        + ggplot2.scale_y_continuous(y_label) \
        + ggplot2.ggtitle(plot_title)

    # extrapolation: fit y ~ a * (seconds ** b) and extend the x axis
    if args.extrapolate:
        start = robjects.ListVector({'a': 1, 'b': 1})
        pp = pp + ggplot2.stat_smooth(fullrange='TRUE', method='nls',
                                      formula='y~a*I((x*3600)^b)',
                                      se='FALSE', start=start) \
            + ggplot2.xlim(0, float(args.extrapolate))

    if args.theme_bw:
        pp = pp + ggplot2.theme_bw()

    if args.saveas is not None:
        plot_file = args.saveas
        if plot_file.endswith(".pdf"):
            grdevices.pdf(plot_file, width=8.5, height=8.5)
        elif plot_file.endswith(".png"):
            grdevices.png(plot_file, width=8.5, height=8.5,
                          units="in", res=300)
        else:
            logger.error("Unrecognized extension for %s!" % (plot_file))
            sys.exit()
        pp.plot()
        grdevices.dev_off()
    else:
        pp.plot()
        # keep the plot open until user hits enter
        print('Type enter to exit.')
        raw_input()  # NOTE(review): Python 2 builtin; use input() on Python 3
height=612, antialias="subpixel", type="cairo") #-- gridwithggplot2-begin grid.newpage() # create a viewport as the main plot vp = grid.viewport(width=1, height=1) vp.push() tmpenv = data(datasets).fetch("rock") rock = tmpenv["rock"] p = ggplot2.ggplot(rock) + \ ggplot2.geom_point(ggplot2.aes_string(x = 'area', y = 'peri')) + \ ggplot2.theme_bw() p.plot(vp=vp) vp = grid.viewport(width=0.6, height=0.6, x=0.37, y=0.69) vp.push() p = ggplot2.ggplot(rock) + \ ggplot2.geom_point(ggplot2.aes_string(x = 'area', y = 'shape')) + \ ggplot2.theme(**{'axis.text.x': ggplot2.element_text(angle = 45)}) p.plot(vp=vp) #-- gridwithggplot2-end grdevices.dev_off() #---
grdevices = importr('grDevices') ro.r('''change_name=function(pop_size, generations,freq){ name=sprintf("../results/mcm_%sNe_%sfreq_%sgen.png", pop_size,freq,generations) return(name)} ''') name = ro.r['change_name'] name = name(args.ps, args.gen, args.freq) print("Output figure in:", name) grdevices.png(file=name, width=700, height=700) gp = ggplot2.ggplot(res2) pp = gp + ggplot2.aes_string( x='Counts', y='Proportion') + ggplot2.geom_bar( stat="identity", color="darkgoldenrod3") + ggplot2.theme_bw() pp.plot() grdevices.dev_off() print("Plot done!") elif ( args.diff ): ###references:doi: 10.1093/molbev/msx254 && https://doi.org/10.1111/j.1365-294X.2010.04997.x p = args.freq N = args.ps t = args.gen newx = [] x = np.arange(0, 1.001, 0.001001001) res = [0] * len(x) print("Estimating allele counts") for i in range(1, 101):
#!/usr/bin/python from datetime import datetime import sys import subprocess import os import math import rpy2.robjects as robjects robjects.r('library("scales")') import rpy2.robjects.lib.ggplot2 as ggplot2 ggplot2.theme_set(ggplot2.theme_bw()) #print ggplot2.theme_get() from rpy2.robjects.packages import importr from rpy2.robjects import FloatVector, StrVector, IntVector, DataFrame def ggplot2_options(): return ggplot2.opts( **{ 'axis.title.x': ggplot2.theme_blank(), 'axis.title.y': ggplot2.theme_text( family='serif', face='bold', size=15, angle=90, vjust=0.2), 'axis.text.x': ggplot2.theme_text(family='serif', size=15), 'axis.text.y': ggplot2.theme_text(family='serif', size=15), 'legend.title': ggplot2.theme_text(family='serif', face='bold', size=15),
# Demo: per-species Sepal correlations on the iris data, then a faceted
# rpy2/ggplot2 scatter plot. NOTE(review): this fragment is Python 2
# (print statement) and references names defined elsewhere
# (iris, Formula, r, aes_string) — presumably set up earlier in the
# original script; confirm before reuse.
iris_py = pandas.read_csv("/home/yarden/iris.csv")
iris_py = iris_py.rename(columns={"Name": "Species"})
corrs = []
from scipy.stats import spearmanr
for species in set(iris_py.Species):
    entries = iris_py[iris_py["Species"] == species]
    c = spearmanr(entries["SepalLength"], entries["SepalWidth"])
    print "c: ", c
# compute r.cor(x, y) and divide up by Species
# Assume we get a vector of length Species saying what the
# correlation is for each Species' Petal Length/Width
p = ggplot2.ggplot(iris) + \
    ggplot2.geom_point(ggplot2.aes_string(x="Sepal.Length", y="Sepal.Width")) + \
    ggplot2.facet_wrap(Formula("~Species"))
p.plot()
r["dev.off"]()
sys.exit(1)
grdevices = importr('grDevices')
ggplot2.theme_set(ggplot2.theme_bw(12))
# NOTE(review): aes_string below is unqualified (no ggplot2. prefix) —
# presumably imported elsewhere in the original script.
p = ggplot2.ggplot(iris) + \
    ggplot2.geom_point(ggplot2.aes_string(x="Sepal.Length", y="Sepal.Width")) + \
    ggplot2.facet_wrap(Formula('~ Species'), ncol=2, nrow = 2) + \
    ggplot2.geom_text(aes_string(x="Sepal.Length", y="Sepal.Width"), label="t") + \
    ggplot2.GBaseObject(r('ggplot2::coord_fixed')())  # aspect ratio
p.plot()
#!/usr/bin/env python from problems import * from utils import * from config import * import sys import rpy2.robjects as robjects robjects.r('library("scales")') import rpy2.robjects.lib.ggplot2 as ggplot2 ggplot2.theme_set(ggplot2.theme_bw ()) from rpy2.robjects.packages import importr from rpy2.robjects import FloatVector, StrVector, IntVector, DataFrame def ggplot2_options (): def normal_text(): return ggplot2.theme_text(family = 'serif', size = 15) def bold_text(): return ggplot2.theme_text(family = 'serif', face = 'bold', size = 15) def rotated_text(): return ggplot2.theme_text(family = 'serif', face = 'bold', size = 15, angle=90, vjust=0.2) return ggplot2.opts (**{'axis.title.x' : ggplot2.theme_blank(), 'axis.title.y' : rotated_text(), 'axis.text.x' : normal_text(), 'axis.text.y' : normal_text(), 'legend.title' : bold_text(), 'legend.text' : normal_text(), 'aspect.ratio' : 0.6180339888, 'strip.text.x' : normal_text(), })
# Fragment: per-peak coefficient of variation (CV) computed from an R data
# frame `dataf`, merged back in, written to CSV, and scatter-plotted.
# NOTE(review): dataf, cv, options, utilis, r and robjects come from
# earlier in the original script — not visible here.
number_of_peaks = len(dataf[0])
cvI = []
newRow = []
# R rx() indexing is 1-based, hence the 1..number_of_peaks range.
for i in range(1,number_of_peaks+1):
    row = dataf.rx(i,True)
    rowA = np.array(row)
    # First two columns are presumably identifiers; stats use the rest.
    newRow.append(rowA[2:])
    cvI.append(cv(rowA[2:]))
    #cv.append(rowA[2:].std()/rowA[2:].mean())
cv_r=robjects.conversion.py2ri(cvI)
df_cv = {'CV' : cv_r}
dataf_cv = robjects.DataFrame(df_cv)
dtf_cv = robjects.r.melt(dataf_cv)
# Append the melted CV column to the original frame and rename it 'CV'.
d=dataf.cbind(dtf_cv.rx(2))
d.names[tuple(d.colnames).index('value')] = 'CV'
#d = base.merge_data_frame(dataf,dtf_cv.rx(2))
utilis.write_csv(d, options.csv_output)
dc = dtf_cv.cbind(n_peak = robjects.IntVector(range(1,number_of_peaks+1)))
#n_peak = robjects.IntVector(1,number_of_peaks)
gp = ggplot2.ggplot(dc)
pp=gp+ggplot2.aes_string(x='n_peak',y='value') + ggplot2.geom_point()+ggplot2.theme_bw()+ ggplot2.ggtitle('Coefficient of Variation')+ \
    ggplot2.scale_x_continuous("Number of Peaks")+ ggplot2.scale_y_continuous("CV")
# Interactive X11 device for on-screen display.
r.X11()
pp.plot()
def plot_collectors_curve(args, start_times, read_lengths):
    """
    Use rpy2 to create a collectors curve of the run.

    args: parsed options; uses plot_type ('reads' | 'basepairs'), savedf,
          extrapolate, theme_bw and saveas.
    start_times: per-read start times (seconds; only differences are used).
    read_lengths: per-read lengths in base pairs.

    Raises ValueError for an unrecognized args.plot_type.
    """
    r = robjects.r
    r.library("ggplot2")
    grdevices = importr('grDevices')

    # set t_0 as the first measured time for the read.
    t_0 = start_times[0]

    # adjust times to be relative to t_0, converted to hours; the tiny
    # epsilon keeps the first point strictly positive.
    r_start_times = robjects.FloatVector([float(t - t_0) / float(3600) + 0.00000001
                                          for t in start_times])
    r_read_lengths = robjects.IntVector(read_lengths)

    # compute the cumulative based on reads or total base pairs
    if args.plot_type == 'reads':
        y_label = "Total reads"
        cumulative = \
            r.cumsum(robjects.IntVector([1] * len(start_times)))
    elif args.plot_type == 'basepairs':
        y_label = "Total base pairs"
        cumulative = r.cumsum(r_read_lengths)
    else:
        # Previously an unknown plot_type fell through and triggered a
        # NameError on y_label/cumulative below; fail fast with context.
        raise ValueError("Unrecognized plot type: %s" % args.plot_type)

    # make a data frame of the lists
    d = {'start': r_start_times,
         'lengths': r_read_lengths,
         'cumul': cumulative}
    df = robjects.DataFrame(d)

    if args.savedf:
        robjects.r("write.table")(df, file=args.savedf, sep="\t")

    # title
    total_reads = len(read_lengths)
    total_bp = sum(read_lengths)
    plot_title = "Yield: " \
        + str(total_reads) + " reads and " \
        + str(total_bp) + " base pairs."

    # plot
    gp = ggplot2.ggplot(df)
    pp = gp + ggplot2.aes_string(x='start', y='cumul') \
        + ggplot2.geom_step(size=2) \
        + ggplot2.scale_x_continuous('Time (hours)') \
        + ggplot2.scale_y_continuous(y_label) \
        + ggplot2.ggtitle(plot_title)

    # extrapolation: fit y ~ a * (seconds ^ b) with nls and extend the x axis.
    if args.extrapolate:
        start = robjects.ListVector({'a': 1, 'b': 1})
        pp = pp + ggplot2.stat_smooth(fullrange='TRUE', method='nls',
                                      formula='y~a*I((x*3600)^b)',
                                      se='FALSE', start=start) \
            + ggplot2.xlim(0, float(args.extrapolate))

    if args.theme_bw:
        pp = pp + ggplot2.theme_bw()

    if args.saveas is not None:
        plot_file = args.saveas
        if plot_file.endswith(".pdf"):
            grdevices.pdf(plot_file, width=8.5, height=8.5)
        elif plot_file.endswith(".png"):
            grdevices.png(plot_file, width=8.5, height=8.5,
                          units="in", res=300)
        else:
            logger.error("Unrecognized extension for %s!" % (plot_file))
            sys.exit()
        pp.plot()
        grdevices.dev_off()
    else:
        pp.plot()
        # keep the plot open until user hits enter
        print('Type enter to exit.')
        raw_input()
def plot_cels(expr, expt_names, expt_name_idx, cel_names, outdir=None):
    """Makes correlation plots between CEL files for the same cell type.

    For every pair of CEL columns belonging to one cell type, draws a
    scatter plot of their expression values (shown interactively, or saved
    as PNG when outdir is given) and records the Pearson correlation.
    Returns a pandas DataFrame summarizing the pairwise correlations,
    indexed by '<cell type>-<pair index>'; also written to
    outdir/cor_summary.txt when outdir is set.
    """
    fsize = 10
    names_1 = []
    names_2 = []
    cors = []
    titles = []
    for ex_idx, ex in enumerate(expt_names):
        # Indices of CEL files (columns of expr) corresponding to that cell type
        tmp_idx = expt_name_idx[ex]
        plot_idx = 0
        # All unordered pairs (i, j) of replicate columns for this cell type.
        for i in range(len(tmp_idx)):
            name1 = re.sub('_', '.', cel_names[tmp_idx[i]])
            for j in range(i + 1, len(tmp_idx)):
                name2 = re.sub('_', '.', cel_names[tmp_idx[j]])
                plot_idx += 1
                # Pearson correlation between the two expression columns.
                cor = np.corrcoef(expr[:, tmp_idx[i]], expr[:, tmp_idx[j]])[0, 1]
                names_1.append(name1)
                names_2.append(name2)
                cors.append(cor)
                titles.append(ex + '-' + str(plot_idx))
                df = ro.DataFrame({
                    'x': ro.FloatVector(expr[:, tmp_idx[i]]),
                    'y': ro.FloatVector(expr[:, tmp_idx[j]])
                })
                gp = ggplot2.ggplot(df) + ggplot2.aes_string(x = 'x', y = 'y') + \
                    ggplot2.geom_point(size = 1) + \
                    ggplot2.scale_x_continuous(name1) + ggplot2.scale_y_continuous(name2) + \
                    ggplot2.theme_bw() + ggplot2.ggtitle('{:s}-{:d} ({:.4f})'.format(ex, plot_idx, cor)) + \
                    ggplot2.theme(**{'axis.text.x': ggplot2.element_text(size = fsize),
                                     'axis.title.x': ggplot2.element_text(size = 8),
                                     'axis.text.y': ggplot2.element_text(size = fsize),
                                     'axis.title.y': ggplot2.element_text(size = 8, angle = 90),
                                     'plot.title': ggplot2.element_text(size = fsize)})
                if outdir is None:
                    # No output directory: show the plot interactively.
                    gp.plot()
                else:
                    if not os.path.isdir(outdir):
                        os.makedirs(outdir)
                    outfile = os.path.join(outdir, ex + '-' + str(plot_idx) + '.png')
                    ro.r.ggsave(filename=outfile, plot=gp, width=85, height=85, unit='mm')
    # Summary table of all pairwise correlations across all cell types.
    df = pd.DataFrame({'name1': names_1,
                       'name2': names_2,
                       'cor': cors}, index=titles)
    if not outdir is None:
        df.to_csv(os.path.join(outdir, 'cor_summary.txt'), sep='\t')
    return df
def main():
    """Plot loess-smoothed TSS coverage for main vs. control tracks.

    Reads a raw file whose lines are '<position> <main cov> <control cov>',
    builds a long-form data frame over the [-upstream, downstream] window,
    and writes '<out_prefix>_and.pdf'.
    """
    usage = 'usage: %prog [options] <raw file>'
    parser = OptionParser(usage)
    parser.add_option('-d', dest='downstream', default=2000, type='int',
                      help='TSS downstream [Default: %default]')
    parser.add_option('-o', dest='out_prefix', default='tss',
                      help='Output prefix [Default: %default]')
    parser.add_option('-u', dest='upstream', default=5000, type='int',
                      help='TSS upstream [Default: %default]')
    parser.add_option('--ymax', dest='ymax', default=None, type='float',
                      help='Y-coordinate limit [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide raw file')
    else:
        raw_file = args[0]

    # collect data: one line per position, main and control coverage columns
    coords = []
    main_cov = []
    control_cov = []
    for line in open(raw_file):
        a = line.split()
        coords.append(int(a[0]))
        main_cov.append(float(a[1]))
        control_cov.append(float(a[2]))

    # data structures: stack main then control coverage with a label column
    tss_i = ro.IntVector(range(-options.upstream, options.downstream + 1))
    labels = ro.StrVector(['Main'] * (options.upstream + options.downstream + 1) +
                          ['Control'] * (options.upstream + options.downstream + 1))
    cov = ro.FloatVector(main_cov + control_cov)
    df = ro.DataFrame({'tss_i': tss_i, 'cov': cov, 'label': labels})

    # plot: loess-smoothed coverage around the TSS, one colour per track
    gp = ggplot2.ggplot(df) + \
        ggplot2.aes_string(x='tss_i', y='cov', colour='label') + \
        ggplot2.geom_smooth(method='loess', size=1, span=0.2, se=False) + \
        ggplot2.scale_x_continuous('TSS Position') + \
        ggplot2.scale_colour_discrete('') + \
        ggplot2.theme_bw()

    # identity comparison with None, not `== None`
    if options.ymax is None:
        gp += ggplot2.scale_y_continuous('Coverage')
    else:
        gp += ggplot2.scale_y_continuous('Coverage',
                                         limits=ro.FloatVector([0, options.ymax]))

    # save to file
    grdevices.pdf(file='%s_and.pdf' % options.out_prefix)
    gp.plot()
    grdevices.dev_off()
def plot_squiggle(args, filename, start_times, mean_signals):
    """
    Use rpy2 to create a squiggle plot of the read.

    args: parsed options; uses num_facets, theme_bw and saveas.
    filename: source read file name (used in the title and output name).
    start_times: per-event start times (seconds).
    mean_signals: per-event mean signal (picoamps), same length order.

    Raises Exception if the output plot file already exists.
    """
    r = robjects.r
    r.library("ggplot2")
    grdevices = importr('grDevices')

    # set t_0 as the first measured time for the read.
    t_0 = start_times[0]
    total_time = start_times[-1] - start_times[0]
    # adjust times to be relative to t_0
    r_start_times = robjects.FloatVector([t - t_0 for t in start_times])
    r_mean_signals = robjects.FloatVector(mean_signals)

    # infer the appropriate number of events given the number of facets.
    num_events = len(r_mean_signals)
    # Floor division: the per-facet bucket must be an integer (plain `/`
    # yields floats under Python 3 and puts nearly every event in its own
    # facet category).
    events_per_facet = (num_events // args.num_facets) + 1
    # dummy variable to control faceting
    facet_category = robjects.FloatVector([(i // events_per_facet) + 1
                                           for i in range(len(start_times))])

    # make a data frame of the start times and mean signals
    d = {'start': r_start_times, 'mean': r_mean_signals, 'cat': facet_category}
    df = robjects.DataFrame(d)
    gp = ggplot2.ggplot(df)

    # Build the plot once; both themed and default variants share this base
    # (previously duplicated across two nearly identical branches).
    pp = gp + ggplot2.aes_string(x='start', y='mean') \
        + ggplot2.geom_step(size=0.25) \
        + ggplot2.facet_wrap(robjects.Formula('~cat'), ncol=1, scales="free_x") \
        + ggplot2.scale_x_continuous('Time (seconds)') \
        + ggplot2.scale_y_continuous('Mean signal (picoamps)') \
        + ggplot2.ggtitle('Squiggle plot for read: ' + filename +
                          "\nTotal time (sec): " + str(total_time))
    if args.theme_bw:
        # Apply the complete theme BEFORE the element tweak below: adding a
        # complete theme (theme_bw) resets all theme elements, so the old
        # order silently discarded the plot.title size setting.
        pp = pp + ggplot2.theme_bw()
    pp = pp + ggplot2.theme(**{'plot.title': ggplot2.element_text(size=11)})

    if args.saveas is not None:
        # Single expression: the original left `+ args.saveas` as a dangling
        # unary-plus statement, so the extension was never appended.
        plot_file = os.path.basename(filename) + "." + args.saveas
        if os.path.isfile(plot_file):
            raise Exception(
                'Cannot create plot for %s: plot file %s already exists'
                % (filename, plot_file))
        if args.saveas == "pdf":
            grdevices.pdf(plot_file, width=8.5, height=11)
        elif args.saveas == "png":
            grdevices.png(plot_file, width=8.5, height=11,
                          units="in", res=300)
        pp.plot()
        grdevices.dev_off()
    else:
        pp.plot()
        # keep the plot open until user hits enter
        print('Type enter to exit.')
        raw_input()
def plot_hist(sizes, args):
    """
    Use rpy2 to plot a histogram of the read sizes.

    Reads with lengths outside (args.min_length, args.max_length) are
    discarded; the remainder is binned into args.num_bins bins.

    Raises ValueError when no reads survive the length filter.
    """
    r = robjects.r
    r.library("ggplot2")
    grdevices = importr("grDevices")

    # keep only reads strictly inside the configured length window
    kept = [s for s in sizes if args.min_length < s < args.max_length]
    if not kept:
        # min()/max() below would raise an uninformative ValueError on an
        # empty vector; report the actual problem instead.
        raise ValueError("No reads with length between %s and %s"
                         % (args.min_length, args.max_length))
    sizes = robjects.IntVector(kept)

    sizes_min = min(sizes)
    sizes_max = max(sizes)
    binwidth = (sizes_max - sizes_min) / args.num_bins

    d = {"sizes": sizes}
    df = robjects.DataFrame(d)

    # plot (theme_bw is an additive tweak; no need to duplicate the base)
    gp = ggplot2.ggplot(df)
    pp = gp + ggplot2.aes_string(x="sizes") + ggplot2.geom_histogram(binwidth=binwidth)
    if args.theme_bw:
        pp = pp + ggplot2.theme_bw()

    if args.saveas is not None:
        plot_file = args.saveas
        if plot_file.endswith(".pdf"):
            grdevices.pdf(plot_file, width=8.5, height=8.5)
        elif plot_file.endswith(".png"):
            grdevices.png(plot_file, width=8.5, height=8.5,
                          units="in", res=300)
        else:
            logger.error("Unrecognized extension for %s!" % (plot_file))
            sys.exit()
        pp.plot()
        grdevices.dev_off()
    else:
        pp.plot()
        # keep the plot open until user hits enter
        print("Type enter to exit.")
        raw_input()
def plot_qc_reads(qc_df): """ Plot number of reads part of a pipeline QC file. """ # Record NA values as 0 qc_df = qc_df.fillna(0)#.set_index("sample") cols = ["sample", "num_reads", "num_mapped", "num_unique_mapped", "num_junctions"] qc_df = qc_df[cols] melted_qc = pandas.melt(qc_df, id_vars=["sample"]) qc_r = conversion_pydataframe(melted_qc) labels = tuple(["num_reads", "num_mapped", "num_unique_mapped", "num_junctions"]) labels = robj.StrVector(labels) variable_i = qc_r.names.index('variable') qc_r[variable_i] = robj.FactorVector(qc_r[variable_i], levels = labels) ggplot2.theme_set(ggplot2.theme_bw(12)) scales = importr("scales") r_opts = r.options(scipen=4) p = ggplot2.ggplot(qc_r) + \ ggplot2.geom_point(aes_string(x="sample", y="value")) + \ ggplot2.scale_y_continuous(trans=scales.log10_trans(), breaks=scales.trans_breaks("log10", robj.r('function(x) 10^x')), labels=scales.trans_format("log10", robj.r('math_format(10^.x)'))) + \ r.xlab("CLIP-Seq samples") + \ r.ylab("No. reads") + \ ggplot2.coord_flip() + \ ggplot2.facet_wrap(Formula("~ variable"), ncol=1) + \ theme(**{"panel.grid.major.x": element_blank(), "panel.grid.minor.x": element_blank(), "panel.grid.major.y": theme_line(size=0.5,colour="grey66",linetype=3)}) p.plot() return r.par(mfrow=np.array([1,2])) num_samples = len(qc_df.num_reads) r.par(bty="n", lwd=1.7, lty=2) r_opts = r.options(scipen=4) r.options(r_opts) r.dotchart(convert_to_r_matrix(qc_df[["num_reads", "num_mapped", "num_unique_mapped"]]), xlab="No. reads", lcolor="black", pch=19, gcolor="darkblue", cex=0.8) r.par(bty="n") r.dotchart(convert_to_r_matrix(qc_df[["num_ribosub_mapped", "num_ribo", "num_junctions"]]), xlab="No. reads", lcolor="black", pch=19, gcolor="darkblue", cex=0.8)