def plot_squiggle(args, filename, start_times, mean_signals):
    """
    Use rpy2 to create a squiggle plot of the read.

    The mean signal of every event is drawn as a step function of its start
    time (relative to the first event) and split over stacked facets.

    args         -- options object; ``num_facets``, ``theme_bw`` and
                    ``saveas`` are read from it.
    filename     -- name of the read; shown in the title and, when saving,
                    its basename plus ``"." + args.saveas`` is the plot file.
    start_times  -- event start times; the first one becomes time zero.
    mean_signals -- mean signal per event, parallel to ``start_times``.

    Raises Exception when the plot file to be written already exists.
    """
    r = robjects.r
    r.library("ggplot2")
    grdevices = importr('grDevices')

    # set t_0 as the first measured time for the read.
    t_0 = start_times[0]
    total_time = start_times[-1] - t_0

    # adjust times to be relative to t_0
    r_start_times = robjects.FloatVector([t - t_0 for t in start_times])
    r_mean_signals = robjects.FloatVector(mean_signals)

    # infer the appropriate number of events given the number of facets.
    # Floor division is spelled out so the facet ids stay whole numbers
    # regardless of how "/" behaves (assumes args.num_facets is an int).
    num_events = len(r_mean_signals)
    events_per_facet = (num_events // args.num_facets) + 1

    # dummy variable to control faceting
    facet_category = robjects.FloatVector(
        [(i // events_per_facet) + 1 for i in range(len(start_times))])

    # make a data frame of the start times and mean signals
    d = {'start': r_start_times, 'mean': r_mean_signals, 'cat': facet_category}
    df = robjects.DataFrame(d)

    title = ('Squiggle plot for read: ' + filename
             + "\nTotal time (sec): " + str(total_time))
    pp = ggplot2.ggplot(df) \
        + ggplot2.aes_string(x='start', y='mean') \
        + ggplot2.geom_step(size=0.25) \
        + ggplot2.facet_wrap(robjects.Formula('~cat'), ncol=1, scales="free_x") \
        + ggplot2.scale_x_continuous('Time (seconds)') \
        + ggplot2.scale_y_continuous('Mean signal (picoamps)') \
        + ggplot2.ggtitle(title)

    # A complete theme such as theme_bw() replaces earlier theme() settings,
    # so it has to be added before the title-size tweak, not after it.
    if args.theme_bw:
        pp = pp + ggplot2.theme_bw()
    pp = pp + ggplot2.theme(**{'plot.title': ggplot2.element_text(size=11)})

    if args.saveas is not None:
        plot_file = os.path.basename(filename) + "." + args.saveas
        if os.path.isfile(plot_file):
            raise Exception('Cannot create plot for %s: plot file %s already exists' % (filename, plot_file))
        if args.saveas == "pdf":
            grdevices.pdf(plot_file, width = 8.5, height = 11)
        elif args.saveas == "png":
            grdevices.png(plot_file, width = 8.5, height = 11,
                          units = "in", res = 300)
        pp.plot()
        grdevices.dev_off()
    else:
        pp.plot()
        # keep the plot open until user hits enter
        print('Type enter to exit.')
        try:
            wait_for_enter = raw_input  # Python 2
        except NameError:
            wait_for_enter = input  # Python 3
        wait_for_enter()
def make_output(tss_cov, out_prefix, upstream, downstream):
    """Dump TSS coverage to a text file and plot it to two PDFs.

    tss_cov    -- coverage values indexed so that ``tss_cov[upstream + i]``
                  is the value at position ``i`` relative to the TSS.
    out_prefix -- prefix of the three output files: ``_raw.txt``,
                  ``_full.pdf`` and ``_zoom.pdf``.
    upstream   -- number of positions before the TSS.
    downstream -- number of positions after the TSS.
    """
    positions = list(range(-upstream, downstream + 1))

    # dump raw counts to file; the with-block closes the handle even if an
    # index error interrupts the loop
    with open('%s_raw.txt' % out_prefix, 'w') as raw_out:
        for i in positions:
            raw_out.write('%d\t%e\n' % (i, tss_cov[upstream + i]))

    # make plot data structures
    tss_i = ro.IntVector(positions)
    cov = ro.FloatVector(tss_cov)
    df = ro.DataFrame({'tss_i': tss_i, 'cov': cov})

    # the full and the zoomed plot differ only in their x scale
    x_scales = (
        ('full', ggplot2.scale_x_continuous('TSS index')),
        ('zoom', ggplot2.scale_x_continuous(
            'TSS index', limits=ro.IntVector([-1000, 1000]))),
    )
    for suffix, x_scale in x_scales:
        gp = ggplot2.ggplot(df) + \
            ggplot2.aes_string(x='tss_i', y='cov') + \
            ggplot2.geom_point() + \
            x_scale + \
            ggplot2.scale_y_continuous('Coverage')

        # plot to file
        grdevices.pdf(file='%s_%s.pdf' % (out_prefix, suffix))
        gp.plot()
        grdevices.dev_off()
def interval(locus_table, interval_table, intervals, loci, boxplot = True):
    """Build a ggplot of phylogenetic informativeness per interval.

    The data is fetched on the R side with ``dbGetQuery`` using the query
    from ``get_interval_query``. With ``boxplot`` true, a box plot with
    jittered, locus-coloured points is returned; otherwise a bar plot
    faceted by locus. The plot object is returned, not drawn.
    """
    query = get_interval_query(intervals, loci, locus_table, interval_table)
    fetch_code = '''data <- dbGetQuery(con, {})'''.format(query)
    frame = robjects.r(fetch_code)

    # The interval column is a factor, so its levels are re-declared in the
    # order given by order_intervals() rather than R's default ordering.
    levels = order_intervals(frame[1])
    robjects.r('''data$interval <- factor(data$interval, {})'''.format(levels))

    if boxplot:
        layers = [
            ggplot2.aes_string(x = 'interval', y = 'pi'),
            ggplot2.geom_boxplot(**{'outlier.size': 0, 'alpha': 0.3}),
            ggplot2.geom_jitter(
                ggplot2.aes_string(color = 'locus'),
                size = 3,
                alpha = 0.6,
                position = ggplot2.position_jitter(width=0.25)),
        ]
    else:
        # NOTE(review): opts()/theme_text() look like the old ggplot2 theming
        # API -- confirm the installed ggplot2/rpy2 still provide them.
        layers = [
            ggplot2.aes_string(x = 'interval', y = 'pi', fill='locus'),
            ggplot2.geom_bar(),
            ggplot2.facet_wrap(robjects.Formula('~ locus')),
            ggplot2.opts(**{
                'axis.text.x': ggplot2.theme_text(angle = -90, hjust = 0),
                'legend.position': 'none',
            }),
        ]
    layers.append(ggplot2.scale_y_continuous('phylogenetic informativeness'))
    layers.append(ggplot2.scale_x_discrete('interval (years ago)'))

    plot = ggplot2.ggplot(robjects.r('''data'''))
    for layer in layers:
        plot = plot + layer
    return plot
def main():
    """Plot smoothed main and control TSS coverage from a raw counts file.

    The single positional argument is a whitespace-separated file whose
    second and third columns are the main and control coverage. The plot is
    written to ``<out_prefix>_and.pdf``.
    """
    usage = 'usage: %prog [options] <raw file>'
    parser = OptionParser(usage)
    parser.add_option('-d', dest='downstream', default=2000, type='int', help='TSS downstream [Default: %default]')
    parser.add_option('-o', dest='out_prefix', default='tss', help='Output prefix [Default: %default]')
    parser.add_option('-u', dest='upstream', default=5000, type='int', help='TSS upstream [Default: %default]')
    parser.add_option('--ymax', dest='ymax', default=None, type='float', help='Y-coordinate limit [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        # parser.error() exits, so nothing below runs without a raw file
        parser.error('Must provide raw file')
    raw_file = args[0]

    # collect data; the first column (position) is not needed for plotting
    main_cov = []
    control_cov = []
    with open(raw_file) as raw_in:
        for line in raw_in:
            a = line.split()
            main_cov.append(float(a[1]))
            control_cov.append(float(a[2]))

    # data structures
    num_positions = options.upstream + options.downstream + 1
    # NOTE(review): tss_i spans the positions once while cov/labels hold two
    # series -- presumably R's data.frame recycling repeats it; confirm.
    tss_i = ro.IntVector(range(-options.upstream, options.downstream + 1))
    labels = ro.StrVector(['Main'] * num_positions + ['Control'] * num_positions)
    cov = ro.FloatVector(main_cov + control_cov)

    df = ro.DataFrame({'tss_i': tss_i, 'cov': cov, 'label': labels})

    # plot
    gp = ggplot2.ggplot(df) + \
        ggplot2.aes_string(x='tss_i', y='cov', colour='label') + \
        ggplot2.geom_smooth(method='loess', size=1, span=0.2, se=False) + \
        ggplot2.scale_x_continuous('TSS Position') + \
        ggplot2.scale_colour_discrete('') + \
        ggplot2.theme_bw()

    if options.ymax is None:
        gp += ggplot2.scale_y_continuous('Coverage')
    else:
        gp += ggplot2.scale_y_continuous('Coverage', limits=ro.FloatVector([0, options.ymax]))

    # save to file
    grdevices.pdf(file='%s_and.pdf' % options.out_prefix)
    gp.plot()
    grdevices.dev_off()
def make_output_and(cov, control_cov, out_prefix, window):
    """Dump main and control splice-site coverage and plot both to a PDF.

    cov, control_cov -- coverage lists; ``cov[window // 2 + i]`` is the value
                        at position ``i`` relative to the splice site.
    out_prefix       -- prefix of ``<out_prefix>_raw.txt`` and
                        ``<out_prefix>.pdf``.
    window           -- total width of the region around the splice site.
    """
    half = window // 2
    # NOTE(review): for an odd window, -window // 2 is -(half + 1), so the
    # first row reads index -1 (the last element) -- confirm callers only
    # pass even windows.
    positions = list(range(-window // 2, half + 1))

    # dump raw counts to file; the handle is closed even if indexing fails
    with open("%s_raw.txt" % out_prefix, "w") as raw_out:
        for i in positions:
            raw_out.write("%d\t%e\t%e\n" % (i, cov[half + i], control_cov[half + i]))

    # make plot data structures: positions are repeated once per series
    splice_i = ro.IntVector(2 * positions)
    cov_r = ro.FloatVector(cov + control_cov)
    labels = ro.StrVector(["Main"] * len(cov) + ["Control"] * len(control_cov))
    df = ro.DataFrame({"splice_i": splice_i, "cov": cov_r, "label": labels})

    # construct plot
    gp = (
        ggplot2.ggplot(df)
        + ggplot2.aes_string(x="splice_i", y="cov", colour="label")
        + ggplot2.geom_point()
        + ggplot2.scale_x_continuous("Position relative to splice site")
        + ggplot2.scale_y_continuous("Coverage")
        + ggplot2.scale_colour_discrete("")
    )

    # plot to file
    grdevices.pdf(file="%s.pdf" % out_prefix)
    gp.plot()
    grdevices.dev_off()
def make_output_and(cov, control_cov, out_prefix, window):
    """Write main/control splice-site coverage to disk and plot it.

    Produces ``<out_prefix>_raw.txt`` (position, main, control per line) and
    ``<out_prefix>.pdf`` (scatter plot coloured by series). Both coverage
    lists are indexed as ``window // 2 + position``.
    """
    offset = window // 2
    # NOTE(review): with an odd window the range starts at -(offset + 1) and
    # the first lookup wraps to index -1 -- confirm windows are always even.
    rel_positions = list(range(-window // 2, offset + 1))

    # dump raw counts to file (with-block guarantees the file is closed)
    with open('%s_raw.txt' % out_prefix, 'w') as raw_out:
        for i in rel_positions:
            row = (i, cov[offset + i], control_cov[offset + i])
            raw_out.write('%d\t%e\t%e\n' % row)

    # make plot data structures
    splice_i = ro.IntVector(2 * rel_positions)
    cov_r = ro.FloatVector(cov + control_cov)
    labels = ro.StrVector(['Main'] * len(cov) + ['Control'] * len(control_cov))

    df = ro.DataFrame({'splice_i': splice_i, 'cov': cov_r, 'label': labels})

    # construct plot
    gp = ggplot2.ggplot(df) + \
        ggplot2.aes_string(x='splice_i', y='cov', colour='label') + \
        ggplot2.geom_point() + \
        ggplot2.scale_x_continuous('Position relative to splice site') + \
        ggplot2.scale_y_continuous('Coverage') + \
        ggplot2.scale_colour_discrete('')

    # plot to file
    grdevices.pdf(file='%s.pdf' % out_prefix)
    gp.plot()
    grdevices.dev_off()
def main():
    """Plot a histogram of aligned query lengths found in a BAM file.

    The single positional argument is the BAM file. Counts per ``qlen`` are
    drawn as bars to ``align_lengths.pdf`` in the current directory.
    """
    usage = 'usage: %prog [options] arg'
    parser = OptionParser(usage)
    #parser.add_option()
    (options, args) = parser.parse_args()

    if len(args) != 1:
        # parser.error() exits the program
        parser.error('Must provide BAM file')
    bam_file = args[0]

    align_lengths = {}
    bam_in = pysam.Samfile(bam_file, 'rb')
    try:
        for aligned_read in bam_in:
            align_lengths[aligned_read.qlen] = align_lengths.get(aligned_read.qlen, 0) + 1
    finally:
        bam_in.close()

    # min()/max() on an empty dict would raise an unhelpful ValueError
    if not align_lengths:
        parser.error('No alignments found in %s' % bam_file)

    min_len = min(align_lengths)
    max_len = max(align_lengths)

    # construct data frame, filling lengths that never occurred with zero
    lengths = list(range(min_len, max_len + 1))
    len_r = ro.IntVector(lengths)
    counts_r = ro.IntVector([align_lengths.get(l, 0) for l in lengths])

    df = ro.DataFrame({'length': len_r, 'counts': counts_r})

    # construct full plot
    gp = ggplot2.ggplot(df) + \
        ggplot2.aes_string(x='length', y='counts') + \
        ggplot2.geom_bar(stat='identity') + \
        ggplot2.scale_x_continuous('Alignment length') + \
        ggplot2.scale_y_continuous('')

    # plot to file
    grdevices.pdf(file='align_lengths.pdf')
    gp.plot()
    grdevices.dev_off()
def compare_sum_barplot(locus_table, interval_table, intervals, loci, names, rows):
    """Return a dodged bar plot of summed informativeness per interval and db.

    ``pi`` is summed per (interval, db) on the R side into ``agg_data``;
    when more than one interval is given, the interval factor levels are
    re-declared via ``order_intervals``. ``loci`` is not used here.
    """
    # The return value is not needed; the aggregate() call below reads the
    # R-side `data` object (presumably set up by this helper -- confirm).
    get_r_data_by_top(locus_table, interval_table, intervals, names, rows)
    aggregated = robjects.r('''agg_data <- aggregate(pi ~ interval + db, data = data, sum)''')

    if len(intervals) > 1:
        levels = order_intervals(aggregated[0])
        robjects.r('''agg_data$interval <- factor(agg_data$interval,{})'''.format(levels))

    layers = (
        ggplot2.aes_string(x = 'interval', y = 'pi', fill='factor(db)'),
        ggplot2.geom_bar(position='dodge', colour='#767676', alpha=0.6),
        ggplot2.scale_y_continuous('net phylogenetic informativeness'),
        ggplot2.scale_x_discrete('interval (years ago)'),
        ggplot2.scale_fill_brewer("database", palette="Blues"),
    )
    plot = ggplot2.ggplot(robjects.r('''agg_data'''))
    for layer in layers:
        plot = plot + layer
    return plot
def make_output(cov, out_prefix, window):
    """Dump splice-site coverage to a text file and plot it to a PDF.

    cov        -- coverage list; ``cov[window // 2 + i]`` is the value at
                  position ``i`` relative to the splice site.
    out_prefix -- prefix of ``<out_prefix>_raw.txt`` and ``<out_prefix>.pdf``.
    window     -- total width of the region around the splice site.
    """
    half = window // 2
    # NOTE(review): for an odd window, -window // 2 is -(half + 1), so the
    # first row reads index -1 (the last element) -- confirm callers only
    # pass even windows.
    positions = list(range(-window // 2, half + 1))

    # dump raw counts to file; the handle is closed even if indexing fails
    with open("%s_raw.txt" % out_prefix, "w") as raw_out:
        for i in positions:
            raw_out.write("%d\t%e\n" % (i, cov[half + i]))

    # make plot data structures
    splice_i = ro.IntVector(positions)
    cov_r = ro.FloatVector(cov)
    df = ro.DataFrame({"splice_i": splice_i, "cov": cov_r})

    # construct plot
    gp = (
        ggplot2.ggplot(df)
        + ggplot2.aes_string(x="splice_i", y="cov")
        + ggplot2.geom_point()
        + ggplot2.scale_x_continuous("Position relative to splice site")
        + ggplot2.scale_y_continuous("Coverage")
    )

    # plot to file
    grdevices.pdf(file="%s.pdf" % out_prefix)
    gp.plot()
    grdevices.dev_off()
def make_output_and(te_tss_cov, control_te_tss_cov, out_prefix, upstream, downstream):
    """Dump and plot main/control TSS coverage for selected TE classes.

    te_tss_cov, control_te_tss_cov -- dicts keyed by ``te`` where ``te[0]``
        and ``te[1]`` are strings (matched against the repeat and family
        lists below); values are coverage lists indexed by
        ``upstream + position``.
    out_prefix -- ``<out_prefix>_raw`` and ``<out_prefix>_plot`` directories
        are wiped and recreated, then filled with one file per selected key.
    upstream, downstream -- extent of the plotted region around the TSS.
    """
    repeats = ('n', '*', 'HERVH-int', 'L2a', 'AluSx', 'AluJb', 'MIRb', 'LTR7')
    families = ('n', '*', 'LINE/L1', 'SINE/Alu', 'LTR/ERV1', 'LTR/ERVL-MaLR',
                'LINE/L2', 'LTR/ERVL', 'SINE/MIR', 'DNA/hAT-Charlie',
                'LTR/ERVK', 'DNA/TcMar-Tigger')
    # only these keys are written out; the same selection feeds both passes
    selected = [te for te in te_tss_cov
                if te[0] in repeats and te[1] in families]

    positions = list(range(-upstream, downstream + 1))

    # clean raw counts dir
    raw_dir = '%s_raw' % out_prefix
    if os.path.isdir(raw_dir):
        shutil.rmtree(raw_dir)
    os.mkdir(raw_dir)

    # dump raw counts to file; '/' in names would otherwise create subdirs
    for te in selected:
        stem = '%s_%s' % (te[0].replace('/', '_'), te[1].replace('/', '_'))
        with open('%s/%s.txt' % (raw_dir, stem), 'w') as raw_out:
            for i in positions:
                raw_out.write('%d\t%e\t%e\n' % (
                    i, te_tss_cov[te][upstream + i],
                    control_te_tss_cov[te][upstream + i]))

    # clean plot dirs
    plot_dir = '%s_plot' % out_prefix
    if os.path.isdir(plot_dir):
        shutil.rmtree(plot_dir)
    os.mkdir(plot_dir)

    # make data structures shared by every plot
    num_positions = upstream + downstream + 1
    tss_i = ro.IntVector(2 * positions)
    labels = ro.StrVector(['Main'] * num_positions + ['Control'] * num_positions)

    for te in selected:
        cov = ro.FloatVector(te_tss_cov[te] + control_te_tss_cov[te])
        df = ro.DataFrame({'tss_i': tss_i, 'cov': cov, 'label': labels})

        # construct full plot
        gp = ggplot2.ggplot(df) + \
            ggplot2.aes_string(x='tss_i', y='cov', colour='label') + \
            ggplot2.geom_point() + \
            ggplot2.scale_x_continuous('TSS index') + \
            ggplot2.scale_y_continuous('Coverage') + \
            ggplot2.scale_colour_discrete('')

        # plot to file
        stem = '%s_%s' % (te[0].replace('/', '_'), te[1].replace('/', '_'))
        grdevices.pdf(file='%s/%s.pdf' % (plot_dir, stem))
        gp.plot()
        grdevices.dev_off()
def plot(data, x, y, ylabel, color, filename):
    """Save a line plot of column ``y`` against column ``x`` of ``data``.

    The line is drawn in ``color``, the y axis is titled ``ylabel``, x tick
    labels are rotated by 90 degrees, and the figure is written to
    ``filename`` with ggsave.
    """
    line_layer = ggplot2.geom_line(ggplot2.aes_string(x=x, y=y), color=color)
    text_theme = ggplot2.theme(**{
        'axis.text.x': ggplot2.element_text(angle = 90, hjust = 1),
        'strip.text.y': ggplot2.element_text(size = 6, angle=90),
    })
    y_scale = ggplot2.scale_y_continuous(ylabel)

    figure = ggplot2.ggplot(data=data) + line_layer + text_theme + y_scale
    ggplot2.ggplot2.ggsave(filename, figure)
def _plot_with_rpy2(self, regions, filename):
    """Plot intensity, p-value and q-value profiles of regions into one PDF.

    For each of the first ``self.num_regs`` regions, three line plots (the
    weighted values on a log2 axis, -log10 of the p-values, -log10 of the
    corrected p-values) over the central ``2 * self.num_bins + 1`` bins are
    stacked and drawn on their own page of ``filename + '.pdf'``. Regions for
    which ``np.any(region.weighted)`` is false are skipped with a warning.

    Exits the process with status 1 when a region has fewer bins around its
    centre than ``self.num_bins``.
    """
    from rpy2 import robjects
    import rpy2.robjects.lib.ggplot2 as ggplot2
    from rpy2.robjects.lib import grid
    from rpy2.robjects.packages import importr
    grdevices = importr('grDevices')
    base = importr('base')

    def line_layer(column):
        # One line per map, coloured by map, for the given y column.
        return ggplot2.geom_line(
            mapping=ggplot2.aes_string(x='t', y=column, group='map', colour='map'),
            alpha=0.8)

    grdevices.pdf(file=filename + '.pdf')

    t = list(range(-self.num_bins, self.num_bins + 1))
    for region in regions[:self.num_regs]:
        if not np.any(region.weighted):
            logger.warning(
                "Warning: No data for region located on bin " + str(region.bin) + ". Not plotting this one.")
            continue
        # Floor division: `middle` is used as a slice bound below and must be
        # an int even where "/" is true division.
        middle = (len(region.weighted[0]) - 1) // 2
        if middle < self.num_bins:
            logger.error("Warning: There are less bins calculated for regions than you want to plot.")
            sys.exit(1)

        # central window of bins shared by all three value tracks
        window = slice(middle - self.num_bins, middle + self.num_bins + 1)
        n_maps = len(region.weighted)
        d = {'map': robjects.StrVector(
                 [str(m) for m in range(n_maps) for _ in t]),
             't': robjects.FloatVector(t * n_maps),
             'e': robjects.FloatVector(
                 [i for sublist in region.weighted for i in sublist[window]]),
             'p': robjects.FloatVector(
                 [-np.log10(x) for sublist in region.pvalues for x in sublist[window]]),
             'c': robjects.FloatVector(
                 [-np.log10(x) for sublist in region.corrected_pvalues for x in sublist[window]])}
        dataf = robjects.DataFrame(d)
        gp = ggplot2.ggplot(dataf)  # first yellow second red

        title = "\n".join(wrap("Bin " + str(region.bin) + " : " + str(region.positions)))
        p1 = gp + line_layer('e') + \
            ggplot2.scale_y_continuous(trans='log2') + \
            ggplot2.ggtitle(title) + \
            ggplot2.labs(y="log Intensity") + \
            ggplot2.theme_classic() + \
            ggplot2.theme(**{'axis.title.x': ggplot2.element_blank(),
                             'axis.text.y': ggplot2.element_text(angle=45),
                             'axis.text.x': ggplot2.element_blank(),
                             'legend.position': 'none'}) + \
            ggplot2.scale_colour_brewer(palette="Set1")
        p2 = gp + line_layer('p') + \
            ggplot2.labs(y="-log10(p-value)") + \
            ggplot2.theme_classic() + \
            ggplot2.theme(**{'axis.title.x': ggplot2.element_blank(),
                             'axis.text.x': ggplot2.element_blank(),
                             'legend.position': 'none'}) + \
            ggplot2.scale_colour_brewer(palette="Set1")
        p3 = gp + line_layer('c') + \
            ggplot2.labs(y="-log10(q-value)",
                         x='bins (' + str(self.bin_res) + ' bp each)') + \
            ggplot2.geom_hline(
                mapping=ggplot2.aes_string(yintercept=str(-np.log10(self.threshold))),
                colour='black', alpha=0.8, linetype='dashed') + \
            ggplot2.theme_classic() + \
            ggplot2.theme(**{'legend.position': 'none'}) + \
            ggplot2.scale_colour_brewer(palette="Set1")

        # stack the three panels into one grob and draw it on its own page
        g1 = ggplot2.ggplot2.ggplotGrob(p1)
        g2 = ggplot2.ggplot2.ggplotGrob(p2)
        g3 = ggplot2.ggplot2.ggplotGrob(p3)
        robjects.globalenv["g"] = base.rbind(g1, g2, g3, size='first')
        robjects.r("grid::grid.draw(g)")
        grid.newpage()
        logger.debug('Plotted region ' + str(region.bin))

    grdevices.dev_off()
def plot_coef(feat_mat_dir, model_dir, expt_names, pref, outfile=None, height=120, fsize=12):
    """Box-plot model coefficients per feature across experiments.

    For every experiment the model (``<pref><ex>_model.pkl``) and feature
    matrix (``<ex>_feat_mat.npz``) are read; classification and regression
    coefficients are stacked column-wise, melted into long form and plotted
    in one facet per coefficient kind. The plot is shown when ``outfile`` is
    None, otherwise saved with ggsave. Returns the wide pandas data frame.
    """
    for expt_idx, ex in enumerate(expt_names):
        feat_mat_file = os.path.join(feat_mat_dir, ex + '_feat_mat.npz')
        model_file = os.path.join(model_dir, pref + ex + '_model.pkl')
        model = read_model(model_file)
        _, _, tmp_feat_names, _ = read_feat_mat(feat_mat_file)

        if expt_idx > 0:
            # every experiment must list the same features in the same order
            assert (all(known == new for known, new in zip(feat_names, tmp_feat_names)))
            clf_coef = np.concatenate((clf_coef, model.clf_coef()), axis=1)
            reg_coef = np.concatenate((reg_coef, model.reg_coef()), axis=1)
        else:
            feat_names = tmp_feat_names
            clf_coef = model.clf_coef()
            reg_coef = model.reg_coef()
        nexpt = expt_idx + 1

    # Now clf_coef has one row per coefficient and one column per experiment.
    # The reshape below will read the data row-first.
    columns = {
        'feature': np.repeat(feat_names, nexpt),
        'Classification': np.reshape(clf_coef, (clf_coef.size, )),
        'Regression': np.reshape(reg_coef, (reg_coef.size, )),
    }
    df = pd.DataFrame(columns)
    long_df = pd.melt(df, id_vars='feature', var_name='fun')
    r_df = com.convert_to_r_dataframe(long_df)

    text_theme = ggplot2.theme(**{
        'axis.text.x': ggplot2.element_text(size = fsize, angle = 65, vjust = 1, hjust = 1),
        'axis.text.y': ggplot2.element_text(size = fsize),
        'strip.text.x': ggplot2.element_text(size = fsize + 1),
    })
    gp = ggplot2.ggplot(r_df) + ggplot2.aes_string(x = 'factor(feature)', y = 'value') + \
        ggplot2.facet_wrap('fun', scales = 'free_y') + \
        ggplot2.geom_boxplot() + ggplot2.scale_y_continuous('Importance') + \
        ggplot2.scale_x_discrete('') + ggplot2.theme_bw() + \
        text_theme

    w = max(22 * nexpt, 80)
    if outfile is not None:
        ro.r.ggsave(filename=outfile, plot=gp, width=w, height=height, unit='mm')
    else:
        gp.plot()
    return df
def plot_cv_r2(pandas_df, outfile, fsize = 10, height = 120, max_width = 50, xlab = 'Parameters'):
    """Makes boxplots of cross-validation results for different parameter settings.

    ``pandas_df`` needs the columns ``title`` (one box per distinct value)
    and ``r2``. The figure is saved to ``outfile``; its width in mm is
    ``5 * <number of titles>`` but at least ``max_width``.
    """
    distinct_titles = set(pandas_df['title'])
    ncv = len(distinct_titles)
    r_df = com.convert_to_r_dataframe(pandas_df)

    tick_theme = ggplot2.theme(**{
        'axis.text.x': ggplot2.element_text(size = fsize, angle = 65, vjust = 1, hjust = 1),
        'axis.text.y': ggplot2.element_text(size = fsize),
    })
    gp = ggplot2.ggplot(r_df) + ggplot2.aes_string(x = 'factor(title)', y = 'r2') + \
        ggplot2.geom_boxplot() + ggplot2.scale_y_continuous('R-squared') + \
        ggplot2.scale_x_discrete(xlab) + ggplot2.theme_bw() + \
        tick_theme

    w = max(5 * ncv, max_width)
    ro.r.ggsave(filename = outfile, plot = gp, width = w, height = height, unit = 'mm')
def single_locus_net_informativeness(locus_table, net_pi_table, locus):
    """Return a scatter plot of informativeness over time for one locus.

    The rows are fetched on the R side by joining ``locus_table`` and
    ``net_pi_table`` on ``id`` and filtering on ``locus``; the x axis
    (``time``) is reversed and the plot is titled with the locus name.
    """
    # The SQL is wrapped in double quotes because it is pasted into R code.
    sql_template = '''"SELECT {0}.locus, time, pi FROM {0}, {1} WHERE {0}.id = {1}.id AND locus = '{2}'"'''
    qry = sql_template.format(locus_table, net_pi_table, locus)
    frame = robjects.r('''dbGetQuery(con, {})'''.format(qry))

    layers = (
        ggplot2.aes_string(x = 'time', y='pi'),
        ggplot2.geom_point(size = 3, alpha = 0.4),
        ggplot2.scale_x_reverse('years ago'),
        ggplot2.scale_y_continuous('phylogenetic informativeness'),
        ggplot2.opts(title = locus),
    )
    plot = ggplot2.ggplot(frame)
    for layer in layers:
        plot = plot + layer
    return plot
def plot_summary(barcodes_obs, barcode_table, directory, expt_id):
    """Save a box plot of barcode read counts, split by match status.

    Vectors come from ``get_vectors``; counts are shown in millions with
    jittered points on top of the boxes. The PNG is written to
    ``<directory>/<expt_id>.png`` and titled with ``expt_id``.
    """
    barcodes, counts, matches = get_vectors(barcodes_obs, barcode_table)
    columns = {'barcode': barcodes, 'count': counts, 'matched': matches}
    df = DataFrame(columns)

    layers = (
        ggplot2.aes_string(x='factor(matched)', y='count / 1000000'),
        ggplot2.geom_boxplot(outlier_size = 0),
        ggplot2.geom_jitter(),
        ggplot2.ggtitle(label = expt_id),
        ggplot2.ggplot2.xlab(label = ""),
        ggplot2.scale_y_continuous(name = "Count\n(million reads)"),
    )
    p = ggplot2.ggplot(df)
    for layer in layers:
        p = p + layer

    filename = "{0}/{1}.png".format(directory, expt_id)
    grdevices.png(filename=filename, width=4, height=5, unit='in', res=300)
    p.plot()
    grdevices.dev_off()
def plot_cels(expr, expt_names, expt_name_idx, cel_names, outdir = None):
    """Makes correlation plots between CEL files for the same cell type.

    expr          -- 2-D array; columns are CEL files.
    expt_names    -- cell-type names to process.
    expt_name_idx -- maps each cell-type name to the column indices of its
                     CEL files in ``expr``.
    cel_names     -- CEL file names by column index ('_' is shown as '.').
    outdir        -- when given, each scatter plot is saved there as
                     ``<cell type>-<n>.png`` and the correlation table as
                     ``cor_summary.txt``; when None, plots are only shown.

    Returns a pandas data frame with one row per plotted pair (columns
    ``name1``, ``name2``, ``cor``), indexed by ``<cell type>-<n>``.
    """
    fsize = 10
    names_1 = []
    names_2 = []
    cors = []
    titles = []

    # Create the directory once, up front: the summary file below is written
    # even when no cell type has two CEL files to compare.
    if outdir is not None and not os.path.isdir(outdir):
        os.makedirs(outdir)

    for ex in expt_names:
        # Indices of CEL files (columns of expr) corresponding to that cell type
        tmp_idx = expt_name_idx[ex]
        plot_idx = 0
        for i in range(len(tmp_idx)):
            name1 = re.sub('_', '.', cel_names[tmp_idx[i]])
            for j in range(i + 1, len(tmp_idx)):
                name2 = re.sub('_', '.', cel_names[tmp_idx[j]])
                plot_idx += 1
                cor = np.corrcoef(expr[:, tmp_idx[i]], expr[:, tmp_idx[j]])[0, 1]
                names_1.append(name1)
                names_2.append(name2)
                cors.append(cor)
                titles.append(ex + '-' + str(plot_idx))
                df = ro.DataFrame({'x': ro.FloatVector(expr[:, tmp_idx[i]]),
                                   'y': ro.FloatVector(expr[:, tmp_idx[j]])})
                gp = ggplot2.ggplot(df) + ggplot2.aes_string(x = 'x', y = 'y') + \
                    ggplot2.geom_point(size = 1) + \
                    ggplot2.scale_x_continuous(name1) + ggplot2.scale_y_continuous(name2) + \
                    ggplot2.theme_bw() + ggplot2.ggtitle('{:s}-{:d} ({:.4f})'.format(ex, plot_idx, cor)) + \
                    ggplot2.theme(**{'axis.text.x': ggplot2.element_text(size = fsize),
                                     'axis.title.x': ggplot2.element_text(size = 8),
                                     'axis.text.y': ggplot2.element_text(size = fsize),
                                     'axis.title.y': ggplot2.element_text(size = 8, angle = 90),
                                     'plot.title': ggplot2.element_text(size = fsize)})
                if outdir is None:
                    gp.plot()
                else:
                    outfile = os.path.join(outdir, ex + '-' + str(plot_idx) + '.png')
                    ro.r.ggsave(filename = outfile, plot = gp, width = 85, height = 85, unit = 'mm')

    df = pd.DataFrame({'name1': names_1, 'name2': names_2, 'cor': cors}, index = titles)
    if outdir is not None:
        df.to_csv(os.path.join(outdir, 'cor_summary.txt'), sep = '\t')
    return df
def generate_histogram(subgroups_to_sses_to_n_count, tname, file_name):
    """Write a dodged bar chart of counts per subgroup as a PNG.

    subgroups_to_sses_to_n_count -- ``{subgroup: {category: count}}``.
    tname     -- name of the category column; also used as the fill.
    file_name -- file name below ``OUTPUT_PATH`` for the PNG.

    The y axis runs from 0 to the largest count rounded down to a multiple
    of 1000, plus 1000; each bar is labelled with its count.
    """
    columns_to_data = {'subgroup': [], tname: [], 'count': []}
    max_count = 0
    for subgroup, sses_to_n_count in subgroups_to_sses_to_n_count.items():
        for ss, n_count in sses_to_n_count.items():
            columns_to_data['subgroup'].append(subgroup)
            columns_to_data[tname].append(ss)
            columns_to_data['count'].append(n_count)
            max_count = max(max_count, n_count)

    subgroup_levels = ro.StrVector(
        _sort_subgroup(set(columns_to_data['subgroup'])))
    r_columns_to_data = {
        'subgroup': ro.FactorVector(columns_to_data['subgroup'],
                                    levels=subgroup_levels),
        tname: ro.StrVector(columns_to_data[tname]),
        'count': ro.IntVector(columns_to_data['count'])
    }
    df = ro.DataFrame(r_columns_to_data)

    # Floor division: with true division "/ 1000 * 1000" cancels out and the
    # limit would not be rounded to a multiple of 1000.
    max_count = int(max_count // 1000 * 1000 + 1000)

    histogram_file_path = os.path.join(OUTPUT_PATH, file_name)
    # lazy %-formatting: the data frame is only rendered when DEBUG is on
    logging.debug("The Data Frame for file %s: \n%s", histogram_file_path, df)

    grdevices.png(file=histogram_file_path, width=1200, height=800)
    gp = ggplot2.ggplot(df)
    pp = gp + \
        ggplot2.aes_string(x='subgroup', y='count', fill=tname) + \
        ggplot2.geom_bar(position="dodge", width=0.8, stat="identity") + \
        ggplot2.theme_bw() + \
        ggplot2.theme_classic() + \
        ggplot2.theme(**{'legend.title': ggplot2.element_blank()}) + \
        ggplot2.theme(**{'legend.text': ggplot2.element_text(size=40)}) + \
        ggplot2.theme(**{'axis.text.x': ggplot2.element_text(size=40, angle=45)}) + \
        ggplot2.theme(**{'axis.text.y': ggplot2.element_text(size=40)}) + \
        ggplot2.scale_y_continuous(expand=ro.IntVector([0, 0]),
                                   limits=ro.IntVector([0, max_count])) + \
        ggplot2.geom_text(ggplot2.aes_string(label='count'), size=6, angle=35, hjust=-0.1,
                          position=ggplot2.position_dodge(width=0.8), vjust=-0.2)

    pp.plot()
    logging.info("Output step3 file %s", histogram_file_path)
    grdevices.dev_off()
def multiple_locus_net_informativeness_scatterplot(locus_table, net_pi_table, loci):
    """Return a scatter plot of informativeness over time, coloured by locus.

    locus_table, net_pi_table -- table names joined on ``id``.
    loci -- locus names to include; when the first entry is 'all' (any
            case) no locus filter is applied.

    The query is run on the R side via ``dbGetQuery``; the x axis (``time``)
    is reversed. The plot object is returned, not drawn.
    """
    if loci[0].lower() != 'all':
        # Build the IN list by hand: formatting tuple(loci) renders a single
        # locus as ('x',) and unicode names as u'x', neither of which is
        # valid SQL.
        locus_list = '({})'.format(
            ', '.join("'{}'".format(locus) for locus in loci))
        qry = '''"SELECT {0}.locus, time, pi FROM {0}, {1} WHERE {0}.id = {1}.id and locus in {2}"'''.format(locus_table, net_pi_table, locus_list)
    else:
        qry = '''"SELECT {0}.locus, time, pi FROM {0}, {1} WHERE {0}.id = {1}.id"'''.format(locus_table, net_pi_table)

    frame = robjects.r('''dbGetQuery(con, {})'''.format(qry))
    gg_frame = ggplot2.ggplot(frame)
    plot = gg_frame + ggplot2.aes_string(x = 'time', y = 'pi') + \
        ggplot2.geom_point(ggplot2.aes_string(colour = 'locus'),
                           size = 3, alpha = 0.4) + \
        ggplot2.scale_x_reverse('years ago') + \
        ggplot2.scale_y_continuous('phylogenetic informativeness')
    return plot
def plot_thresh_distr(motif_names, thresh, out_dir, width = 350):
    """Creates boxplots of the thresholds used with each feature.

    Only thresholds greater than 1 are kept. The filtered table is written
    to ``count_thresh.txt`` in ``out_dir`` and the horizontal box plot to
    ``count_thresh_bar.pdf`` and ``count_thresh_bar.png`` (``width`` x 300 mm).
    """
    table = pd.DataFrame({'motif':motif_names, 'thresh':thresh})
    table = table[table['thresh'] > 1]
    table.to_csv(os.path.join(out_dir, 'count_thresh.txt'), sep = '\t', index = False)

    fsize = 10
    r_df = com.convert_to_r_dataframe(table)
    text_theme = ggplot2.theme(**{
        'axis.text.x': ggplot2.element_text(size = fsize),
        'axis.text.y': ggplot2.element_text(size = fsize, hjust = 1),
        'strip.text.x': ggplot2.element_text(size = fsize + 1),
    })
    gp = ggplot2.ggplot(r_df) + ggplot2.aes_string(x = 'factor(motif)', y = 'thresh') + \
        ggplot2.geom_boxplot() + \
        ggplot2.scale_y_continuous('Threshold counts', limits = ro.IntVector([0, 70])) + \
        ggplot2.scale_x_discrete('') + ggplot2.theme_bw() + ggplot2.coord_flip() + \
        text_theme

    for ext in ['.pdf', '.png']:
        target = os.path.join(out_dir, 'count_thresh_bar' + ext)
        ro.r.ggsave(filename = target, plot = gp, width = width, height = 300, unit = 'mm')
def plot_cv_r2(pandas_df, outfile, fsize=10, height=120, max_width=50, xlab='Parameters'):
    """Makes boxplots of cross-validation results for different parameter settings.

    One box of ``r2`` values is drawn per distinct ``title`` in
    ``pandas_df``. The figure is saved to ``outfile`` with a width (mm) of
    five per title, never less than ``max_width``.
    """
    r_df = com.convert_to_r_dataframe(pandas_df)

    element_text = ggplot2.element_text
    layers = [
        ggplot2.aes_string(x = 'factor(title)', y = 'r2'),
        ggplot2.geom_boxplot(),
        ggplot2.scale_y_continuous('R-squared'),
        ggplot2.scale_x_discrete(xlab),
        ggplot2.theme_bw(),
        ggplot2.theme(**{
            'axis.text.x': element_text(size = fsize, angle = 65, vjust = 1, hjust = 1),
            'axis.text.y': element_text(size = fsize),
        }),
    ]
    gp = ggplot2.ggplot(r_df)
    for layer in layers:
        gp = gp + layer

    # width grows with the number of distinct parameter settings
    ncv = len(set(pandas_df['title']))
    w = max(5 * ncv, max_width)
    ro.r.ggsave(filename=outfile, plot=gp, width=w, height=height, unit='mm')
def compare_mean_boxplot(locus_table, interval_table, intervals, loci, names, rows):
    """Return box plots of informativeness per interval, filled by database.

    Data comes from ``get_r_data_by_top``; with more than one interval, the
    interval factor levels are re-declared via ``order_intervals``. ``loci``
    is not used here. The plot object is returned, not drawn.
    """
    frame = get_r_data_by_top(locus_table, interval_table, intervals, names, rows)
    if len(intervals) > 1:
        levels = order_intervals(frame[1])
        robjects.r('''data$interval <- factor(data$interval, {})'''.format(levels))

    # dotted R argument names cannot be written as Python keywords
    box_options = {
        'outlier.size': 3,
        'outlier.colour': '#767676',
        'outlier.alpha': 0.3,
        'alpha': 0.6,
    }
    layers = (
        ggplot2.aes_string(x = 'interval', y = 'pi'),
        ggplot2.geom_boxplot(ggplot2.aes_string(fill = 'factor(db)'), **box_options),
        ggplot2.scale_y_continuous('mean phylogenetic informativeness'),
        ggplot2.scale_x_discrete('interval (years ago)'),
        ggplot2.scale_fill_brewer("database", palette='Blues'),
    )
    plot = ggplot2.ggplot(robjects.r('''data'''))
    for layer in layers:
        plot = plot + layer
    return plot
def generate_step3_9_n_count_histogram(place_type_pos_type_to_count, file_name):
    """Write a dodged bar chart of counts per position type as a PNG.

    place_type_pos_type_to_count -- ``{"<place>_<pos>": count}``; each key
        must contain exactly one underscore.
    file_name -- file name below ``OUTPUT_PATH`` for the PNG.

    The y-axis limit is the largest count rounded down to a multiple of
    1000 (of 100 when the count is at most 1000) plus one such step.
    """
    columns_to_data = {'place': [], 'pos': [], 'count': []}
    max_count = 0
    for place_pos_type, n_count in place_type_pos_type_to_count.items():
        place_type, pos_type = place_pos_type.split('_')
        columns_to_data['place'].append(place_type)
        columns_to_data['pos'].append(pos_type)
        columns_to_data['count'].append(n_count)
        max_count = max(max_count, n_count)

    r_columns_to_data = {
        'place': ro.StrVector(columns_to_data['place']),
        'pos': ro.StrVector(columns_to_data['pos']),
        'count': ro.IntVector(columns_to_data['count'])
    }
    df = ro.DataFrame(r_columns_to_data)

    # Floor division: with true division "/ step * step" cancels out and the
    # limit would not be rounded to a multiple of the step.
    step = 1000 if max_count > 1000 else 100
    max_count = int(max_count // step * step + step)

    histogram_file_path = os.path.join(OUTPUT_PATH, file_name)
    # lazy %-formatting: the data frame is only rendered when DEBUG is on
    logging.debug("The Data Frame for file %s: \n%s", histogram_file_path, df)

    grdevices.png(file=histogram_file_path, width=1024, height=512)
    gp = ggplot2.ggplot(df)
    pp = gp + \
        ggplot2.aes_string(x='pos', y='count', fill='place') + \
        ggplot2.geom_bar(position="dodge", stat="identity") + \
        ggplot2.theme_bw() + \
        ggplot2.theme_classic() + \
        ggplot2.theme(**{'axis.text.x': ggplot2.element_text(size=35)}) + \
        ggplot2.theme(**{'axis.text.y': ggplot2.element_text(size=35)}) + \
        ggplot2.scale_y_continuous(expand=ro.IntVector([0, 0]),
                                   limits=ro.IntVector([0, max_count])) + \
        ggplot2.geom_text(ggplot2.aes_string(label='count'),
                          position=ggplot2.position_dodge(width=0.8), size=10, angle=35, hjust=-0.2,
                          vjust=-0.5)

    pp.plot()
    logging.info("Output step3 file %s", histogram_file_path)
    grdevices.dev_off()
def plot_thresh_distr(motif_names, thresh, out_dir, width=350):
    """Creates boxplots of the thresholds used with each feature.

    Rows with a threshold of 1 or less are dropped. The remaining rows go
    to ``count_thresh.txt`` (tab-separated) in ``out_dir``; the flipped box
    plot is saved there as ``count_thresh_bar`` in PDF and PNG form.
    """
    all_rows = pd.DataFrame({'motif': motif_names, 'thresh': thresh})
    df = all_rows[all_rows['thresh'] > 1]
    summary_path = os.path.join(out_dir, 'count_thresh.txt')
    df.to_csv(summary_path, sep='\t', index=False)

    fsize = 10
    element_text = ggplot2.element_text
    layers = [
        ggplot2.aes_string(x = 'factor(motif)', y = 'thresh'),
        ggplot2.geom_boxplot(),
        ggplot2.scale_y_continuous('Threshold counts', limits = ro.IntVector([0, 70])),
        ggplot2.scale_x_discrete(''),
        ggplot2.theme_bw(),
        ggplot2.coord_flip(),
        ggplot2.theme(**{
            'axis.text.x': element_text(size = fsize),
            'axis.text.y': element_text(size = fsize, hjust = 1),
            'strip.text.x': element_text(size = fsize + 1),
        }),
    ]
    gp = ggplot2.ggplot(com.convert_to_r_dataframe(df))
    for layer in layers:
        gp = gp + layer

    for ext in ['.pdf', '.png']:
        ro.r.ggsave(filename=os.path.join(out_dir, 'count_thresh_bar' + ext),
                    plot=gp, width=width, height=300, unit='mm')
def plot_coef(feat_mat_dir, model_dir, expt_names, pref, outfile = None, height = 120, fsize = 12):
    """Box-plot classification and regression coefficients per feature.

    Reads ``<pref><ex>_model.pkl`` from ``model_dir`` and
    ``<ex>_feat_mat.npz`` from ``feat_mat_dir`` for every experiment name,
    joins the coefficient arrays along axis 1 and plots them in long form,
    one facet per coefficient kind. Shows the plot when ``outfile`` is None,
    else saves it. Returns the wide pandas data frame.
    """
    for expt_idx, ex in enumerate(expt_names):
        feat_mat_file = os.path.join(feat_mat_dir, ex + '_feat_mat.npz')
        model_file = os.path.join(model_dir, pref + ex + '_model.pkl')
        model = read_model(model_file)
        feat_mat = read_feat_mat(feat_mat_file)
        (tmp_feat, tmp_y, tmp_feat_names, tmp_gene_names) = feat_mat

        is_first = expt_idx == 0
        if is_first:
            feat_names = tmp_feat_names
            clf_coef, reg_coef = model.clf_coef(), model.reg_coef()
        else:
            # feature order has to agree with the first experiment
            pairs = zip(feat_names, tmp_feat_names)
            assert(all(left == right for left, right in pairs))
            clf_coef = np.concatenate((clf_coef, model.clf_coef()), axis = 1)
            reg_coef = np.concatenate((reg_coef, model.reg_coef()), axis = 1)
        nexpt = expt_idx + 1

    # Now clf_coef has one row per coefficient and one column per experiment.
    # The reshape below will read the data row-first.
    flat_clf = np.reshape(clf_coef, (clf_coef.size,))
    flat_reg = np.reshape(reg_coef, (reg_coef.size,))
    df = pd.DataFrame({'feature':np.repeat(feat_names, nexpt),
                       'Classification':flat_clf,
                       'Regression':flat_reg})
    df2 = pd.melt(df, id_vars = 'feature', var_name = 'fun')
    r_df = com.convert_to_r_dataframe(df2)

    element_text = ggplot2.element_text
    layers = [
        ggplot2.aes_string(x = 'factor(feature)', y = 'value'),
        ggplot2.facet_wrap('fun', scales = 'free_y'),
        ggplot2.geom_boxplot(),
        ggplot2.scale_y_continuous('Importance'),
        ggplot2.scale_x_discrete(''),
        ggplot2.theme_bw(),
        ggplot2.theme(**{
            'axis.text.x': element_text(size = fsize, angle = 65, vjust = 1, hjust = 1),
            'axis.text.y': element_text(size = fsize),
            'strip.text.x': element_text(size = fsize + 1),
        }),
    ]
    gp = ggplot2.ggplot(r_df)
    for layer in layers:
        gp = gp + layer

    w = max(22 * nexpt, 80)
    if outfile is None:
        gp.plot()
    else:
        ro.r.ggsave(filename = outfile, plot = gp, width = w, height = height, unit = 'mm')
    return df
def make_output_and(te_tss_cov, control_te_tss_cov, out_prefix, upstream, downstream):
    """Write and plot main/control TSS coverage for a fixed set of TE keys.

    Both coverage dicts are keyed by ``te`` (``te[0]`` and ``te[1]`` are
    strings) and hold lists indexed by ``upstream + position``. Only keys
    whose two parts appear in the lists below are processed. The
    ``<out_prefix>_raw`` and ``<out_prefix>_plot`` directories are removed if
    present, recreated, and filled with one ``.txt`` / ``.pdf`` per key.
    """
    wanted_repeats = ('n', '*', 'HERVH-int', 'L2a', 'AluSx', 'AluJb', 'MIRb', 'LTR7')
    wanted_families = ('n', '*', 'LINE/L1', 'SINE/Alu', 'LTR/ERV1',
                       'LTR/ERVL-MaLR', 'LINE/L2', 'LTR/ERVL', 'SINE/MIR',
                       'DNA/hAT-Charlie', 'LTR/ERVK', 'DNA/TcMar-Tigger')

    def is_wanted(te):
        # Same filter for the raw dump and for the plots.
        return te[0] in wanted_repeats and te[1] in wanted_families

    def file_stem(te):
        # '/' in family names would otherwise be read as a path separator.
        return '%s_%s' % (te[0].replace('/', '_'), te[1].replace('/', '_'))

    positions = list(range(-upstream, downstream + 1))

    # clean raw counts dir
    if os.path.isdir('%s_raw' % out_prefix):
        shutil.rmtree('%s_raw' % out_prefix)
    os.mkdir('%s_raw' % out_prefix)

    # dump raw counts to file; each handle is closed even if a lookup fails
    for te in te_tss_cov:
        if not is_wanted(te):
            continue
        with open('%s_raw/%s.txt' % (out_prefix, file_stem(te)), 'w') as raw_out:
            for i in positions:
                raw_out.write('%d\t%e\t%e\n' % (i, te_tss_cov[te][upstream+i], control_te_tss_cov[te][upstream+i]))

    # clean plot dirs
    if os.path.isdir('%s_plot' % out_prefix):
        shutil.rmtree('%s_plot' % out_prefix)
    os.mkdir('%s_plot' % out_prefix)

    # make data structures
    span = upstream + downstream + 1
    tss_i = ro.IntVector(2 * positions)
    labels = ro.StrVector(['Main'] * span + ['Control'] * span)

    for te in te_tss_cov:
        if not is_wanted(te):
            continue
        cov = ro.FloatVector(te_tss_cov[te] + control_te_tss_cov[te])
        df = ro.DataFrame({'tss_i':tss_i, 'cov':cov, 'label':labels})

        # construct full plot
        gp = ggplot2.ggplot(df) + \
            ggplot2.aes_string(x='tss_i', y='cov', colour='label') + \
            ggplot2.geom_point() + \
            ggplot2.scale_x_continuous('TSS index') + \
            ggplot2.scale_y_continuous('Coverage') + \
            ggplot2.scale_colour_discrete('')

        # plot to file
        grdevices.pdf(file='%s_plot/%s.pdf' % (out_prefix, file_stem(te)))
        gp.plot()
        grdevices.dev_off()
def make_output(cov, out_prefix, window):
    """Dump and plot coverage around a splice site.

    Writes ``<out_prefix>_raw.txt`` (tab-separated position and coverage) and
    ``<out_prefix>.pdf`` (scatter plot of coverage against position).

    Args:
        cov: Coverage values; element ``window // 2 + i`` belongs to relative
            position ``i``.
        out_prefix: Prefix of both output files.
        window: Integer width of the window centred on the splice site.
    """
    # Explicit floor division keeps the Python 2 results on Python 3 as well.
    half = window // 2
    positions = list(range((-window) // 2, half + 1))
    # NOTE(review): for an odd window the first position is -(half + 1), so
    # the raw dump reads cov[-1] (the last element) for it - confirm whether
    # odd windows are expected by the callers.

    # dump raw counts to file; the file is closed even if a row fails
    with open('%s_raw.txt' % out_prefix, 'w') as raw_out:
        for i in positions:
            raw_out.write('%d\t%e\n' % (i, cov[half + i]))

    # make plot data structures
    splice_i = ro.IntVector(positions)
    r_cov = ro.FloatVector(cov)
    df = ro.DataFrame({'splice_i': splice_i, 'cov': r_cov})

    # construct plot
    gp = ggplot2.ggplot(df) + \
        ggplot2.aes_string(x='splice_i', y='cov') + \
        ggplot2.geom_point() + \
        ggplot2.scale_x_continuous('Position relative to splice site') + \
        ggplot2.scale_y_continuous('Coverage')

    # plot to file
    grdevices.pdf(file='%s.pdf' % out_prefix)
    gp.plot()
    grdevices.dev_off()
def main():
    """Plot a histogram of alignment lengths found in a BAM file.

    Expects exactly one positional command line argument, the BAM file, and
    writes the bar chart to ``align_lengths.pdf`` in the working directory.
    Exits through ``parser.error`` when the argument is missing or the file
    contains no alignments.
    """
    usage = 'usage: %prog [options] arg'
    parser = OptionParser(usage)
    #parser.add_option()
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide BAM file')
    bam_file = args[0]

    # count how often every aligned query length occurs
    align_lengths = {}
    bam_in = pysam.Samfile(bam_file, 'rb')
    try:
        for aligned_read in bam_in:
            align_lengths[aligned_read.qlen] = align_lengths.get(aligned_read.qlen, 0) + 1
    finally:
        # release the file handle even when reading fails half-way
        bam_in.close()

    if not align_lengths:
        # min()/max() below would otherwise fail with an unhelpful ValueError
        parser.error('No alignments found in %s' % bam_file)

    min_len = min(align_lengths)
    max_len = max(align_lengths)
    lengths = list(range(min_len, max_len + 1))

    # construct data frame; lengths that never occur get a count of 0
    len_r = ro.IntVector(lengths)
    counts_r = ro.IntVector([align_lengths.get(l, 0) for l in lengths])
    df = ro.DataFrame({'length': len_r, 'counts': counts_r})

    # construct full plot
    gp = ggplot2.ggplot(df) + \
        ggplot2.aes_string(x='length', y='counts') + \
        ggplot2.geom_bar(stat='identity') + \
        ggplot2.scale_x_continuous('Alignment length') + \
        ggplot2.scale_y_continuous('')

    # plot to file
    grdevices.pdf(file='align_lengths.pdf')
    gp.plot()
    grdevices.dev_off()
def plot_collectors_curve(args, start_times, read_lengths):
    """
    Use rpy2 to create a collectors curve of the run

    The x axis is the start time of each read in hours relative to the first
    entry of ``start_times``; the y axis is the cumulative number of reads or
    of base pairs.

    Args:
        args: Options object.  ``args.plot_type`` selects ``'reads'`` or
            ``'basepairs'``; ``args.saveas`` is either None (draw on screen
            and wait for enter) or a file name ending in ``.pdf`` / ``.png``.
        start_times: Read start times in seconds; must not be empty.
        read_lengths: Read lengths, parallel to ``start_times``.

    Raises:
        ValueError: If ``args.plot_type`` is not a supported value.
        SystemExit: With status 1 if ``args.saveas`` has an unknown extension.
    """
    r = robjects.r
    r.library("ggplot2")
    grdevices = importr('grDevices')

    # set t_0 as the first measured time for the read.
    t_0 = start_times[0]

    # adjust times to be relative to t_0 and convert seconds to hours
    r_start_times = robjects.FloatVector([float(t - t_0) / float(3600)
                                          for t in start_times])
    r_read_lengths = robjects.IntVector(read_lengths)

    # compute the cumulative based on reads or total base pairs
    if args.plot_type == 'reads':
        y_label = "Total reads"
        cumulative = r.cumsum(robjects.IntVector([1] * len(start_times)))
    elif args.plot_type == 'basepairs':
        y_label = "Total base pairs"
        cumulative = r.cumsum(r_read_lengths)
    else:
        # fail with a clear message instead of an UnboundLocalError below
        raise ValueError("Unsupported plot type: %r" % (args.plot_type,))

    # make a data frame of the lists
    d = {'start': r_start_times,
         'lengths': r_read_lengths,
         'cumul': cumulative}
    df = robjects.DataFrame(d)

    # title
    total_reads = len(read_lengths)
    total_bp = sum(read_lengths)
    plot_title = "Yield: " \
        + str(total_reads) + " reads and " \
        + str(total_bp) + " base pairs."

    # plot
    gp = ggplot2.ggplot(df)
    pp = gp + ggplot2.aes_string(x='start', y='cumul') \
        + ggplot2.geom_point() \
        + ggplot2.geom_line() \
        + ggplot2.scale_x_continuous('Time (hours)') \
        + ggplot2.scale_y_continuous(y_label) \
        + ggplot2.ggtitle(plot_title)

    if args.saveas is not None:
        plot_file = args.saveas
        if plot_file.endswith(".pdf"):
            grdevices.pdf(plot_file, width=8.5, height=8.5)
        elif plot_file.endswith(".png"):
            grdevices.png(plot_file, width=8.5, height=8.5,
                          units="in", res=300)
        else:
            sys.stderr.write("Unrecognized extension for %s!\n" % (plot_file))
            # non-zero status so that calling scripts can detect the failure
            sys.exit(1)

        pp.plot()
        grdevices.dev_off()
    else:
        pp.plot()
        # keep the plot open until user hits enter
        print('Type enter to exit.')
        raw_input()
# Add the loop count of every benchmark run to the data: the last item of
# each entry of `combos` followed by item 3 of each entry of `combos_r`.
# NOTE(review): with rpy2 vectors `+` appends (Python sequence semantics)
# rather than adding element-wise - confirm against the rpy2 version in use.
d['n_loop'] = IntVector([x[-1] for x in combos]) + IntVector([x[3] for x in combos_r])
# One "<code>:<sequence>" label per row.
d['group'] = StrVector([d['code'][x] + ':' + d['sequence'][x] for x in range(len(d['n_loop']))])
dataf = DataFrame(d)

from rpy2.robjects.lib import ggplot2
# Running time against loop count, one colour per `code` value and one panel
# per `sequence` value.
p = ggplot2.ggplot(dataf) + \
    ggplot2.geom_line(ggplot2.aes_string(x="n_loop", y="time", colour="code")) + \
    ggplot2.geom_point(ggplot2.aes_string(x="n_loop", y="time", colour="code")) + \
    ggplot2.facet_wrap(Formula('~sequence')) + \
    ggplot2.scale_y_continuous('running time') + \
    ggplot2.scale_x_continuous('repeated n times', ) + \
    ggplot2.xlim(0, max(n_loops)) + \
    ggplot2.labs(title = "Benchmark (running time)")

from rpy2.robjects.packages import importr
grdevices = importr('grDevices')
# Render the plot into a PNG below the _static directory.
grdevices.png('../../_static/benchmark_sum.png', width = 712, height = 512)
p.plot()
grdevices.dev_off()

#base = importr("base")
stats = importr('stats')
nlme = importr("nlme")
def plot_squiggle(args, filename, start_times, mean_signals):
    """
    Use rpy2 to create a squiggle plot of the read

    The mean signal of every event is drawn as a step function over time
    (seconds relative to the first event) and split into ``args.num_facets``
    stacked panels.

    Args:
        args: Options object.  ``args.num_facets`` is the number of panels,
            ``args.theme_bw`` switches to the black-and-white theme and
            ``args.saveas`` is None (draw on screen and wait for enter),
            ``"pdf"`` or ``"png"``.
        filename: Name of the read file; used in the title and, when saving,
            its base name plus ``.<saveas>`` becomes the plot file name.
        start_times: Event start times in seconds; must not be empty.
        mean_signals: Mean signal of every event, parallel to ``start_times``.

    Raises:
        Exception: If the plot file already exists.
        ValueError: If ``args.saveas`` is neither None, "pdf" nor "png".
    """
    r = robjects.r
    r.library("ggplot2")
    grdevices = importr('grDevices')

    # set t_0 as the first measured time for the read.
    t_0 = start_times[0]
    total_time = start_times[-1] - start_times[0]

    # adjust times to be relative to t_0
    r_start_times = robjects.FloatVector([t - t_0 for t in start_times])
    r_mean_signals = robjects.FloatVector(mean_signals)

    # infer the appropriate number of events given the number of facets.
    # Floor division is spelled out: with true division every event would end
    # up with its own fractional facet category.
    num_events = len(r_mean_signals)
    events_per_facet = (num_events // args.num_facets) + 1
    # dummy variable to control faceting
    facet_category = robjects.FloatVector([(i // events_per_facet) + 1
                                           for i in range(len(start_times))])

    # make a data frame of the start times and mean signals
    d = {'start': r_start_times, 'mean': r_mean_signals, 'cat': facet_category}
    df = robjects.DataFrame(d)

    pp = ggplot2.ggplot(df) \
        + ggplot2.aes_string(x='start', y='mean') \
        + ggplot2.geom_step(size=0.25) \
        + ggplot2.facet_wrap(robjects.Formula('~cat'), ncol=1, scales="free_x") \
        + ggplot2.scale_x_continuous('Time (seconds)') \
        + ggplot2.scale_y_continuous('Mean signal (picoamps)') \
        + ggplot2.ggtitle('Squiggle plot for read: ' + filename
                          + "\nTotal time (sec): " + str(total_time))
    if args.theme_bw:
        # theme_bw() is a complete theme: adding it after theme() would reset
        # the title size again, so it has to come first.
        pp = pp + ggplot2.theme_bw()
    pp = pp + ggplot2.theme(**{'plot.title': ggplot2.element_text(size=11)})

    if args.saveas is not None:
        plot_file = os.path.basename(filename) + "." + args.saveas
        if os.path.isfile(plot_file):
            raise Exception('Cannot create plot for %s: plot file %s already exists'
                            % (filename, plot_file))
        if args.saveas == "pdf":
            grdevices.pdf(plot_file, width=8.5, height=11)
        elif args.saveas == "png":
            grdevices.png(plot_file, width=8.5, height=11,
                          units="in", res=300)
        else:
            # without an opened device the plot would silently go elsewhere
            raise ValueError('Unsupported plot format: %s' % args.saveas)
        pp.plot()
        grdevices.dev_off()
    else:
        pp.plot()
        # keep the plot open until user hits enter
        print('Type enter to exit.')
        raw_input()
ggplot2.scale_fill_gradient(high = 'blue', low = 'red') + \ ggplot2.scale_fill_continuous(name = "Obama Vote Share") + \ ggplot2.scale_colour_continuous(name = "Obama Vote Share") + \ ggplot2.opts(**{'legend.position': 'left', 'legend.key.size': robjects.r.unit(2, 'lines'), 'legend.title' : ggplot2.theme_text(size = 14, hjust=0), \ 'legend.text': ggplot2.theme_text(size = 12), 'title' : "Obama Vote Share and Distance to Railroads in IL", \ 'plot.title': ggplot2.theme_text(size = 24), 'plot.margin': robjects.r.unit(robjects.r.rep(0,4),'lines'), \ 'panel.background': ggplot2.theme_blank(), 'panel.grid.minor': ggplot2.theme_blank(), 'panel.grid.major': ggplot2.theme_blank(), \ 'axis.ticks': ggplot2.theme_blank(), 'axis.title.x': ggplot2.theme_blank(), 'axis.title.y': ggplot2.theme_blank(), \ 'axis.title.x': ggplot2.theme_blank(), 'axis.title.x': ggplot2.theme_blank(), 'axis.text.x': ggplot2.theme_blank(), \ 'axis.text.y': ggplot2.theme_blank()} ) + \ ggplot2.geom_line(ggplot2.aes(x='long', y='lat', group='group'), data=IL_railroads, color='grey', size=0.2) + \ ggplot2.coord_equal() p_map.plot() ## add the scatterplot ## define layout of subplot with viewports vp_sub = grid.viewport(x = 0.19, y = 0.2, width = 0.32, height = 0.4) p_sub = ggplot2.ggplot(RR_distance) + \ ggplot2.aes_string(x = 'OBAMA_SHAR', y= 'NEAR_DIST') + \ ggplot2.geom_point(ggplot2.aes(color='OBAMA_SHAR')) + \ ggplot2.stat_smooth(color="black") + \ ggplot2.opts(**{'legend.position': 'none'}) + \ ggplot2.scale_x_continuous("Obama Vote Share") + \ ggplot2.scale_y_continuous("Distance to nearest Railroad") p_sub.plot(vp=vp_sub) grdevices.dev_off()
def plot_qc_reads(qc_df):
    """
    Plot number of reads part of a pipeline QC file.

    Draws one dot per sample for each of the four read counts on a log10
    axis, with one panel per count type.

    Args:
        qc_df: pandas DataFrame with the columns ``sample``, ``num_reads``,
            ``num_mapped``, ``num_unique_mapped`` and ``num_junctions``.

    Returns:
        None.  The plot is drawn on the current R graphics device.
    """
    count_cols = ["num_reads", "num_mapped", "num_unique_mapped", "num_junctions"]

    # Record NA values as 0 and keep only the columns that get plotted
    qc_df = qc_df.fillna(0)
    qc_df = qc_df[["sample"] + count_cols]
    melted_qc = pandas.melt(qc_df, id_vars=["sample"])
    qc_r = conversion_pydataframe(melted_qc)

    # Turn 'variable' into a factor so the panels follow the column order
    labels = robj.StrVector(tuple(count_cols))
    variable_i = qc_r.names.index('variable')
    qc_r[variable_i] = robj.FactorVector(qc_r[variable_i], levels=labels)

    ggplot2.theme_set(ggplot2.theme_bw(12))
    scales = importr("scales")
    # Called for its side effect on the R session (scipen option)
    r.options(scipen=4)
    p = ggplot2.ggplot(qc_r) + \
        ggplot2.geom_point(aes_string(x="sample", y="value")) + \
        ggplot2.scale_y_continuous(trans=scales.log10_trans(),
                                   breaks=scales.trans_breaks("log10",
                                                              robj.r('function(x) 10^x')),
                                   labels=scales.trans_format("log10",
                                                              robj.r('math_format(10^.x)'))) + \
        r.xlab("CLIP-Seq samples") + \
        r.ylab("No. reads") + \
        ggplot2.coord_flip() + \
        ggplot2.facet_wrap(Formula("~ variable"), ncol=1) + \
        theme(**{"panel.grid.major.x": element_blank(),
                 "panel.grid.minor.x": element_blank(),
                 "panel.grid.major.y": theme_line(size=0.5, colour="grey66", linetype=3)})
    p.plot()
    # The dotchart code that used to follow an early return here could never
    # run and has been removed.
def plot_cels(expr, expt_names, expt_name_idx, cel_names, outdir=None):
    """Makes correlation plots between CEL files for the same cell type

    For every experiment all pairs of its CEL files are compared: their
    expression columns are drawn as a scatter plot and the Pearson
    correlation is recorded.

    Args:
        expr: 2-D numpy array of expression values, one column per CEL file.
        expt_names: Names of the experiments (cell types) to process.
        expt_name_idx: Mapping from experiment name to the column indices of
            its CEL files in ``expr``.
        cel_names: CEL file names, indexed like the columns of ``expr``.
        outdir: When given, plots are saved there as
            ``<experiment>-<n>.png`` together with ``cor_summary.txt``;
            the directory is created if needed.  When None, plots are drawn
            on the current device.

    Returns:
        pandas DataFrame with the columns ``name1``, ``name2`` and ``cor``,
        indexed by ``<experiment>-<n>``.
    """
    fsize = 10
    names_1 = []
    names_2 = []
    cors = []
    titles = []

    # Create the output directory up front: the summary below is written even
    # when no experiment has two CEL files to compare.
    if outdir is not None and not os.path.isdir(outdir):
        os.makedirs(outdir)

    for ex in expt_names:
        # Indices of CEL files (columns of expr) corresponding to that cell type
        tmp_idx = expt_name_idx[ex]
        plot_idx = 0
        for pos, col_1 in enumerate(tmp_idx):
            name1 = re.sub('_', '.', cel_names[col_1])
            for col_2 in tmp_idx[pos + 1:]:
                name2 = re.sub('_', '.', cel_names[col_2])
                plot_idx += 1
                cor = np.corrcoef(expr[:, col_1], expr[:, col_2])[0, 1]
                names_1.append(name1)
                names_2.append(name2)
                cors.append(cor)
                titles.append(ex + '-' + str(plot_idx))

                pair_df = ro.DataFrame({'x': ro.FloatVector(expr[:, col_1]),
                                        'y': ro.FloatVector(expr[:, col_2])})
                gp = ggplot2.ggplot(pair_df) + ggplot2.aes_string(x='x', y='y') + \
                    ggplot2.geom_point(size=1) + \
                    ggplot2.scale_x_continuous(name1) + ggplot2.scale_y_continuous(name2) + \
                    ggplot2.theme_bw() + \
                    ggplot2.ggtitle('{:s}-{:d} ({:.4f})'.format(ex, plot_idx, cor)) + \
                    ggplot2.theme(**{'axis.text.x': ggplot2.element_text(size=fsize),
                                     'axis.title.x': ggplot2.element_text(size=8),
                                     'axis.text.y': ggplot2.element_text(size=fsize),
                                     'axis.title.y': ggplot2.element_text(size=8, angle=90),
                                     'plot.title': ggplot2.element_text(size=fsize)})

                if outdir is None:
                    gp.plot()
                else:
                    outfile = os.path.join(outdir, ex + '-' + str(plot_idx) + '.png')
                    ro.r.ggsave(filename=outfile, plot=gp, width=85, height=85, unit='mm')

    df = pd.DataFrame({'name1': names_1, 'name2': names_2, 'cor': cors}, index=titles)
    if outdir is not None:
        df.to_csv(os.path.join(outdir, 'cor_summary.txt'), sep='\t')
    return df
ggplot2.scale_fill_gradient(high = 'blue', low = 'red') + \ ggplot2.scale_fill_continuous(name = "Obama Vote Share") + \ ggplot2.scale_colour_continuous(name = "Obama Vote Share") + \ ggplot2.opts(**{'legend.position': 'left', 'legend.key.size': robjects.r.unit(2, 'lines'), 'legend.title' : ggplot2.theme_text(size = 14, hjust=0), \ 'legend.text': ggplot2.theme_text(size = 12), 'title' : "Obama Vote Share and Distance to Railroads in IL", \ 'plot.title': ggplot2.theme_text(size = 24), 'plot.margin': robjects.r.unit(robjects.r.rep(0,4),'lines'), \ 'panel.background': ggplot2.theme_blank(), 'panel.grid.minor': ggplot2.theme_blank(), 'panel.grid.major': ggplot2.theme_blank(), \ 'axis.ticks': ggplot2.theme_blank(), 'axis.title.x': ggplot2.theme_blank(), 'axis.title.y': ggplot2.theme_blank(), \ 'axis.title.x': ggplot2.theme_blank(), 'axis.title.x': ggplot2.theme_blank(), 'axis.text.x': ggplot2.theme_blank(), \ 'axis.text.y': ggplot2.theme_blank()} ) + \ ggplot2.geom_line(ggplot2.aes(x='long', y='lat', group='group'), data=IL_railroads, color='grey', size=0.2) + \ ggplot2.coord_equal() p_map.plot() ## add the scatterplot ## define layout of subplot with viewports vp_sub = grid.viewport(x=0.19, y=0.2, width=0.32, height=0.4) p_sub = ggplot2.ggplot(RR_distance) + \ ggplot2.aes_string(x = 'OBAMA_SHAR', y= 'NEAR_DIST') + \ ggplot2.geom_point(ggplot2.aes(color='OBAMA_SHAR')) + \ ggplot2.stat_smooth(color="black") + \ ggplot2.opts(**{'legend.position': 'none'}) + \ ggplot2.scale_x_continuous("Obama Vote Share") + \ ggplot2.scale_y_continuous("Distance to nearest Railroad") p_sub.plot(vp=vp_sub) grdevices.dev_off()
# The length of the first column of `dataf` gives the number of rows (peaks).
number_of_peaks = len(dataf[0])
cvI = []
newRow = []
# R-style 1-based row access: apply cv() to everything after the first two
# entries of each row.
# NOTE(review): cv() is defined elsewhere; presumably the coefficient of
# variation (std / mean), as the commented-out line below suggests - confirm.
for i in range(1,number_of_peaks+1):
    row = dataf.rx(i,True)
    rowA = np.array(row)
    newRow.append(rowA[2:])
    cvI.append(cv(rowA[2:]))
    #cv.append(rowA[2:].std()/rowA[2:].mean())
# Convert the Python list to an R object and wrap it in a one-column frame.
cv_r=robjects.conversion.py2ri(cvI)
df_cv = {'CV' : cv_r}
dataf_cv = robjects.DataFrame(df_cv)
dtf_cv = robjects.r.melt(dataf_cv)
# Append the second column of the melted frame to the input data and rename
# that column from 'value' to 'CV' before writing the CSV.
d=dataf.cbind(dtf_cv.rx(2))
d.names[tuple(d.colnames).index('value')] = 'CV'
#d = base.merge_data_frame(dataf,dtf_cv.rx(2))
utilis.write_csv(d, options.csv_output)
# Add a running peak number (1..number_of_peaks) as the x variable.
dc = dtf_cv.cbind(n_peak = robjects.IntVector(range(1,number_of_peaks+1)))
#n_peak = robjects.IntVector(1,number_of_peaks)
gp = ggplot2.ggplot(dc)
pp=gp+ggplot2.aes_string(x='n_peak',y='value') + ggplot2.geom_point()+ggplot2.theme_bw()+ ggplot2.ggtitle('Coefficient of Variation')+ \
    ggplot2.scale_x_continuous("Number of Peaks")+ ggplot2.scale_y_continuous("CV")
# Open an X11 graphics device and draw the plot on it.
r.X11()
pp.plot()
def plot_collectors_curve(args, start_times, read_lengths):
    """
    Use rpy2 to create a collectors curve of the run

    The x axis is the start time of each read in hours relative to the first
    entry of ``start_times``; the y axis is the cumulative number of reads or
    of base pairs, drawn as a step function.

    Args:
        args: Options object with the attributes
            ``plot_type`` (``'reads'`` or ``'basepairs'``),
            ``savedf`` (falsy, or a file that receives the plotted data as a
            tab-separated table),
            ``extrapolate`` (falsy, or the upper x limit in hours up to which
            a fitted power law ``y ~ a * (x * 3600) ^ b`` is drawn),
            ``theme_bw`` (use the black-and-white theme) and
            ``saveas`` (None to draw on screen and wait for enter, or a file
            name ending in ``.pdf`` / ``.png``).
        start_times: Read start times in seconds; must not be empty.
        read_lengths: Read lengths, parallel to ``start_times``.

    Raises:
        ValueError: If ``args.plot_type`` is not a supported value.
        SystemExit: With status 1 if ``args.saveas`` has an unknown extension.
    """
    r = robjects.r
    r.library("ggplot2")
    grdevices = importr('grDevices')

    # set t_0 as the first measured time for the read.
    t_0 = start_times[0]

    # adjust times to be relative to t_0 and convert seconds to hours.
    # NOTE(review): the tiny offset presumably keeps x strictly positive for
    # the power-law fit below - confirm.
    r_start_times = robjects.FloatVector([float(t - t_0) / float(3600) + 0.00000001
                                          for t in start_times])
    r_read_lengths = robjects.IntVector(read_lengths)

    # compute the cumulative based on reads or total base pairs
    if args.plot_type == 'reads':
        y_label = "Total reads"
        cumulative = r.cumsum(robjects.IntVector([1] * len(start_times)))
    elif args.plot_type == 'basepairs':
        y_label = "Total base pairs"
        cumulative = r.cumsum(r_read_lengths)
    else:
        # fail with a clear message instead of an UnboundLocalError below
        raise ValueError("Unsupported plot type: %r" % (args.plot_type,))

    # make a data frame of the lists
    d = {'start': r_start_times,
         'lengths': r_read_lengths,
         'cumul': cumulative}
    df = robjects.DataFrame(d)

    if args.savedf:
        robjects.r("write.table")(df, file=args.savedf, sep="\t")

    # title
    total_reads = len(read_lengths)
    total_bp = sum(read_lengths)
    plot_title = "Yield: " \
        + str(total_reads) + " reads and " \
        + str(total_bp) + " base pairs."

    # plot
    gp = ggplot2.ggplot(df)
    pp = gp + ggplot2.aes_string(x='start', y='cumul') \
        + ggplot2.geom_step(size=2) \
        + ggplot2.scale_x_continuous('Time (hours)') \
        + ggplot2.scale_y_continuous(y_label) \
        + ggplot2.ggtitle(plot_title)

    # extrapolation
    if args.extrapolate:
        start = robjects.ListVector({'a': 1, 'b': 1})
        pp = pp + ggplot2.stat_smooth(fullrange='TRUE', method='nls',
                                      formula='y~a*I((x*3600)^b)',
                                      se='FALSE', start=start) \
            + ggplot2.xlim(0, float(args.extrapolate))

    if args.theme_bw:
        pp = pp + ggplot2.theme_bw()

    if args.saveas is not None:
        plot_file = args.saveas
        if plot_file.endswith(".pdf"):
            grdevices.pdf(plot_file, width=8.5, height=8.5)
        elif plot_file.endswith(".png"):
            grdevices.png(plot_file, width=8.5, height=8.5,
                          units="in", res=300)
        else:
            logger.error("Unrecognized extension for %s!", plot_file)
            # non-zero status so that calling scripts can detect the failure
            sys.exit(1)

        pp.plot()
        grdevices.dev_off()
    else:
        pp.plot()
        # keep the plot open until user hits enter
        print('Type enter to exit.')
        raw_input()
def main():
    """Plot smoothed main vs. control TSS coverage from a raw counts file.

    The single positional argument is a whitespace-separated text file with
    one row per position: position, main coverage, control coverage.  The
    number of rows has to equal ``upstream + downstream + 1``.  A loess
    smoothed curve per label is written to ``<out_prefix>_and.pdf``.
    """
    usage = 'usage: %prog [options] <raw file>'
    parser = OptionParser(usage)
    parser.add_option('-d', dest='downstream', default=2000, type='int',
                      help='TSS downstream [Default: %default]')
    parser.add_option('-o', dest='out_prefix', default='tss',
                      help='Output prefix [Default: %default]')
    parser.add_option('-u', dest='upstream', default=5000, type='int',
                      help='TSS upstream [Default: %default]')
    parser.add_option('--ymax', dest='ymax', default=None, type='float',
                      help='Y-coordinate limit [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide raw file')
    raw_file = args[0]

    # collect data; the file is closed as soon as it has been read
    coords = []
    main_cov = []
    control_cov = []
    with open(raw_file) as raw_in:
        for line in raw_in:
            a = line.split()
            coords.append(int(a[0]))
            main_cov.append(float(a[1]))
            control_cov.append(float(a[2]))

    # The x positions are derived from -u/-d, not from the file, so the row
    # count has to agree; otherwise R would recycle or reject the columns.
    num_positions = options.upstream + options.downstream + 1
    if len(coords) != num_positions:
        parser.error('%s has %d rows but -u/-d describe %d positions'
                     % (raw_file, len(coords), num_positions))

    # data structures: positions are listed twice, once per label, instead of
    # relying on R to recycle the shorter column
    positions = list(range(-options.upstream, options.downstream + 1))
    tss_i = ro.IntVector(positions * 2)
    labels = ro.StrVector(['Main'] * num_positions + ['Control'] * num_positions)
    cov = ro.FloatVector(main_cov + control_cov)

    df = ro.DataFrame({'tss_i': tss_i, 'cov': cov, 'label': labels})

    # plot
    gp = ggplot2.ggplot(df) + \
        ggplot2.aes_string(x='tss_i', y='cov', colour='label') + \
        ggplot2.geom_smooth(method='loess', size=1, span=0.2, se=False) + \
        ggplot2.scale_x_continuous('TSS Position') + \
        ggplot2.scale_colour_discrete('') + \
        ggplot2.theme_bw()

    if options.ymax is None:
        gp += ggplot2.scale_y_continuous('Coverage')
    else:
        gp += ggplot2.scale_y_continuous('Coverage',
                                         limits=ro.FloatVector([0, options.ymax]))

    # save to file
    grdevices.pdf(file='%s_and.pdf' % options.out_prefix)
    gp.plot()
    grdevices.dev_off()
def plot_collectors_curve(args, start_times, read_lengths):
    """
    Use rpy2 to create a collectors curve of the run

    The x axis is the start time of each read in hours relative to the first
    entry of ``start_times``; the y axis is the cumulative number of reads or
    of base pairs, drawn as a step function.  Only every ``args.skip``-th
    read is put into the plotted data; the title always reports the totals
    over all reads.

    Args:
        args: Options object with the attributes
            ``plot_type`` (``'reads'`` or ``'basepairs'``),
            ``skip`` (positive step used to thin out the plotted reads),
            ``savedf`` (falsy, or a file that receives the plotted data as a
            tab-separated table),
            ``extrapolate`` (falsy, or the upper x limit in hours up to which
            a fitted power law ``y ~ a * (x * 3600) ^ b`` is drawn),
            ``theme_bw`` (use the black-and-white theme) and
            ``saveas`` (None to draw on screen and wait for enter, or a file
            name ending in ``.pdf`` / ``.png``).
        start_times: Read start times in seconds; must not be empty.
        read_lengths: Read lengths, parallel to ``start_times``.

    Raises:
        ValueError: If ``args.plot_type`` is not a supported value.
        SystemExit: With status 1 if ``args.saveas`` has an unknown extension.
    """
    r = robjects.r
    r.library("ggplot2")
    grdevices = importr('grDevices')

    # set t_0 as the first measured time for the read.
    t_0 = start_times[0]

    # adjust times to be relative to t_0 and convert seconds to hours.
    # NOTE(review): the tiny offset presumably keeps x strictly positive for
    # the power-law fit below - confirm.
    r_start_times = robjects.FloatVector([float(t - t_0) / float(3600) + 0.00000001
                                          for t in start_times])
    r_read_lengths = robjects.IntVector(read_lengths)

    # compute the cumulative based on reads or total base pairs
    if args.plot_type == 'reads':
        y_label = "Total reads"
        cumulative = r.cumsum(robjects.IntVector([1] * len(start_times)))
    elif args.plot_type == 'basepairs':
        y_label = "Total base pairs"
        cumulative = r.cumsum(r_read_lengths)
    else:
        # fail with a clear message instead of an UnboundLocalError below
        raise ValueError("Unsupported plot type: %r" % (args.plot_type,))

    step = args.skip
    # make a data frame of the lists, keeping every step-th element
    d = {'start': robjects.FloatVector([r_start_times[n]
                                        for n in range(0, len(r_start_times), step)]),
         'lengths': robjects.IntVector([r_read_lengths[n]
                                        for n in range(0, len(r_read_lengths), step)]),
         'cumul': robjects.IntVector([cumulative[n]
                                      for n in range(0, len(cumulative), step)])}
    df = robjects.DataFrame(d)

    if args.savedf:
        robjects.r("write.table")(df, file=args.savedf, sep="\t")

    # title
    total_reads = len(read_lengths)
    total_bp = sum(read_lengths)
    plot_title = "Yield: " \
        + str(total_reads) + " reads and " \
        + str(total_bp) + " base pairs."

    # plot
    gp = ggplot2.ggplot(df)
    pp = gp + ggplot2.aes_string(x='start', y='cumul') \
        + ggplot2.geom_step(size=2) \
        + ggplot2.scale_x_continuous('Time (hours)') \
        + ggplot2.scale_y_continuous(y_label) \
        + ggplot2.ggtitle(plot_title)

    # extrapolation
    if args.extrapolate:
        start = robjects.ListVector({'a': 1, 'b': 1})
        pp = pp + ggplot2.stat_smooth(fullrange='TRUE', method='nls',
                                      formula='y~a*I((x*3600)^b)',
                                      se='FALSE', start=start) \
            + ggplot2.xlim(0, float(args.extrapolate))

    if args.theme_bw:
        pp = pp + ggplot2.theme_bw()

    if args.saveas is not None:
        plot_file = args.saveas
        if plot_file.endswith(".pdf"):
            grdevices.pdf(plot_file, width=8.5, height=8.5)
        elif plot_file.endswith(".png"):
            grdevices.png(plot_file, width=8.5, height=8.5,
                          units="in", res=300)
        else:
            logger.error("Unrecognized extension for %s!", plot_file)
            # non-zero status so that calling scripts can detect the failure
            sys.exit(1)

        pp.plot()
        grdevices.dev_off()
    else:
        pp.plot()
        # keep the plot open until user hits enter
        print('Type enter to exit.')
        raw_input()
d['n_loop'] = IntVector([x[-1] for x in combos]) + IntVector( [x[1] for x in combos_r]) d['group'] = StrVector( [d['code'][x] + ':' + d['sequence'][x] for x in xrange(len(d['n_loop']))]) dataf = DataFrame(d) from rpy2.robjects.lib import ggplot2 p = ggplot2.ggplot(dataf) + \ ggplot2.geom_line(ggplot2.aes_string(x="n_loop", y="time", colour="code")) + \ ggplot2.geom_point(ggplot2.aes_string(x="n_loop", y="time", colour="code")) + \ ggplot2.facet_wrap(Formula('~sequence')) + \ ggplot2.scale_y_continuous('running time') + \ ggplot2.scale_x_continuous('repeated n times', ) + \ ggplot2.xlim(0, max(n_loops)) + \ ggplot2.opts(title = "Benchmark (running time)") from rpy2.robjects.packages import importr grdevices = importr('grDevices') grdevices.png('../../_static/benchmark_sum.png', width=712, height=512) p.plot() grdevices.dev_off() #base = importr("base") stats = importr('stats') nlme = importr("nlme") fit = nlme.lmList(Formula('time ~ n_loop | group'), data=dataf,
def plot_volcano_with_r(
        data,
        xlabel='Estimated effect (change in H/L ratio)',
        title='',
        max_labels=20,
        color_background='#737373',
        color_significant='#252525',
        color_significant_muted='#252525',
        label_only_large_fc=False,
        special_labels=None,
        special_palette=None,
        base_size=12,
        label_size=3,
        x='logFC',
        y='neg_log10_p_adjust',
        special_labels_mode='all',
        xlim=None,
        skip_labels=None,
        nudges=None,
):
    """Draw a volcano plot with ggplot2 and ggrepel through rpy2.

    ``data`` is first converted by ``transform_data_for_ggplot``; the labelling
    options (``label_only_large_fc``, ``special_labels``, ``max_labels``,
    ``special_labels_mode``, ``skip_labels``, ``nudges``) are passed through
    to it unchanged.  Points are coloured by their ``group`` column, guide
    lines mark the FDR and fold-change thresholds, and repelled text labels
    are taken from the ``label`` column.

    Args:
        data: Input table handed to ``transform_data_for_ggplot``.
        xlabel: Label of the x axis.
        title: Plot title.
        color_background, color_significant, color_significant_muted,
        special_palette: Colours forwarded to ``r_label_palette``.
        base_size: Base font size of the minimal theme.
        label_size: Text size of the repelled labels.
        x, y: Column names mapped to the x and y axis.
        xlim: Optional pair of x axis limits.

    Returns:
        None.  The plot is drawn on the current R graphics device.
    """
    r_data, r_like_data = transform_data_for_ggplot(
        data,
        label_only_large_fc=label_only_large_fc,
        special_labels=special_labels,
        max_labels=max_labels,
        special_labels_mode=special_labels_mode,
        skip_labels=skip_labels,
        nudges=nudges)

    # --- base plot and theme -------------------------------------------------
    plot = r_ggplot2.ggplot(r_data)
    plot += r_ggplot2.theme_minimal(base_size=base_size)
    no_grid = r_ggplot2.element_blank()
    no_minor_grid = r_ggplot2.element_blank()
    frame = r_ggplot2.element_rect(fill=robjects.rinterface.NA, color="black")
    plot += r_ggplot2.theme(**{'panel.grid.major': no_grid,
                               'panel.grid.minor': no_minor_grid,
                               'panel.border': frame})
    plot += r_ggplot2.theme(
        text=r_ggplot2.element_text(family='Helvetica', face='plain'))
    plot += r_ggplot2.theme(
        **{'plot.title': r_ggplot2.element_text(hjust=0.5)})

    # --- point aesthetics and colour scale -----------------------------------
    point_aes = r_ggplot2.aes_string(x=x, y=y, color='group')
    palette = r_label_palette(r_like_data,
                              special_palette,
                              color_background=color_background,
                              color_significant=color_significant,
                              color_significant_muted=color_significant_muted)
    colour_scale = r_ggplot2.scale_colour_manual(point_aes, values=palette)
    plot += point_aes
    plot += colour_scale

    # --- axes: the x limits are only set when the caller supplied them -------
    x_scale_args = {'labels': r_custom.formatterFunTwoDigits}
    if xlim is not None:
        x_scale_args['limits'] = robjects.r.c(*xlim)
    plot += r_ggplot2.scale_x_continuous(**x_scale_args)
    plot += r_ggplot2.scale_y_continuous(labels=r_custom.formatterFunOneDigit)

    # --- threshold guide lines -----------------------------------------------
    guide_style = {'color': '#BDBDBD', 'alpha': .3}
    plot += r_ggplot2.geom_hline(
        yintercept=float(-np.log10(FDR_THRESHOLD_RESPONSE)), **guide_style)
    fc_cutoff = float(FC_THRESHOLD_RESPONSE)
    for xintercept in (fc_cutoff, -fc_cutoff):
        plot += r_ggplot2.geom_vline(xintercept=xintercept, **guide_style)

    # --- points and repelled labels ------------------------------------------
    plot += r_ggplot2.geom_point(**{'show.legend': False})
    label_aes = r_ggplot2.aes_string(label='label')
    plot += label_aes
    repel_options = {
        'show.legend': False,
        'point.padding': 0.25,
        'min.segment.length': 0,
        'segment.color': '#BDBDBD',
    }
    plot += r_ggrepel.geom_text_repel(
        label_aes,
        nudge_x=r_dollar(r_data, 'nudgex'),
        nudge_y=r_dollar(r_data, 'nudgey'),
        size=label_size,
        family='Helvetica',
        **repel_options)

    plot += r_ggplot2.labs(x=xlabel, y='Adjusted p value (-log10)', title=title)
    plot.plot()
##text_log+="average: "+str(rmean(test23)[0])+end ##text_log+="sum: "+str(rsum(test23)[0])+end # #roughbin= round(ma[0]/100) #bins=round(roughbin/100)*100 #ma2=rmax(ed) #dataf_subset = dataf.rx(dataf.rx2("contig").ro >= 18, true) scales = importr('scales') gp = ggplot2.ggplot(dataf) #geom_histogram(aes(y = ..density..)) # ggplot2.geom_density()+\ # pp = gp + ggplot2.aes_string(x='%s(contrrr)') + ggplot2.geom_histogram()+ggplot2.scale_y_sqrt() bins=10 teest3=robjects.r('theme(axis.text.x=element_text(angle=90))') pp = gp + \ ggplot2.aes_string(x='Length') + \ ggplot2.geom_histogram()+\ ggplot2.ggtitle("Found IS fragment lengths")+ \ ggplot2.scale_x_continuous(name="fragment lengths, bin="+str(bins),breaks=scales.pretty_breaks(20)) +\ ggplot2.scale_y_continuous(labels=scales.comma,name="Count",breaks=scales.pretty_breaks(10))+ \ teest3 pp.plot() robjects.r.ggsave("/Users/security/science/dna_subj_hist.pdf")