def interval(locus_table, interval_table, intervals, loci, boxplot = True):
    """Build a ggplot of phylogenetic informativeness ('pi') per interval.

    The query produced by ``get_interval_query`` is executed through R's
    ``dbGetQuery`` and stored in the R variable ``data``.  With ``boxplot``
    true the result is a box plot with jittered points coloured by locus;
    otherwise it is a bar chart faceted by locus.  The plot object is
    returned; nothing is drawn here.
    """
    query = get_interval_query(intervals, loci, locus_table, interval_table)
    fetched = robjects.r('''data <- dbGetQuery(con, {})'''.format(query))
    # 'interval' is a factor, so its levels are re-declared explicitly in the
    # order computed by order_intervals (per the original note: by the first
    # integer value of each interval) rather than relying on R's own sorting.
    relevel = '''data$interval <- factor(data$interval, {})'''.format(
        order_intervals(fetched[1]))
    robjects.r(relevel)
    canvas = ggplot2.ggplot(robjects.r('''data'''))
    if not boxplot:
        return (
            canvas
            + ggplot2.aes_string(x = 'interval', y = 'pi', fill='locus')
            + ggplot2.geom_bar()
            + ggplot2.facet_wrap(robjects.Formula('~ locus'))
            + ggplot2.opts(**{
                'axis.text.x': ggplot2.theme_text(angle = -90, hjust = 0),
                'legend.position': 'none',
            })
            + ggplot2.scale_y_continuous('phylogenetic informativeness')
            + ggplot2.scale_x_discrete('interval (years ago)')
        )
    return (
        canvas
        + ggplot2.aes_string(x = 'interval', y = 'pi')
        + ggplot2.geom_boxplot(**{'outlier.size': 0, 'alpha': 0.3})
        + ggplot2.geom_jitter(
            ggplot2.aes_string(color = 'locus'), size = 3, alpha = 0.6,
            position=ggplot2.position_jitter(width=0.25))
        + ggplot2.scale_y_continuous('phylogenetic informativeness')
        + ggplot2.scale_x_discrete('interval (years ago)')
    )
def _plt_distr(dat, col, title='', splitBy_pfill=True, pfill='label',
               independentpdf=False, fname='xdistr.pdf'):
    """Draw a count histogram and a density histogram of ``dat[col]``.

    Rows whose ``pfill`` column equals the string 'NA' are dropped first.
    When ``splitBy_pfill`` is true the bars are filled by the ``pfill``
    column.  With ``independentpdf`` both plots are written to the PDF
    ``fname``; otherwise they are drawn on the current R device.
    """
    valid = dat[dat[pfill] != 'NA']  # remove invalid pairs
    total = len(valid)
    frame = robjects.DataFrame({
        col: robjects.FloatVector(list(valid[col])),
        pfill: robjects.StrVector(list(valid[pfill])),
    })
    base = ggplot2.ggplot(frame) + \
        ggplot2.ggtitle('%s [Total = %s]' % (title, total))

    # Shared aesthetics; the density variant only adds the y mapping.
    count_aes = {'x': col}
    if splitBy_pfill:
        count_aes['fill'] = pfill
    density_aes = dict(count_aes)
    density_aes['y'] = '..density..'

    # Plot1: counts
    p1 = base + ggplot2.aes_string(**count_aes)
    # Plot2: density
    p2 = base + ggplot2.aes_string(**density_aes)
    p2 = p2 + ggplot2.geom_density(alpha=.5, origin=-500)

    # Column-specific binning and axis limits.
    hist_args = {'position': 'identity'}
    limits = None
    if col == 'distance':
        hist_args['binwidth'] = 1000
        hist_args['origin'] = -500
        limits = (-1000, 51000)
    elif col == 'correlation':
        limits = (-1.1, 1.1)

    p1 = p1 + ggplot2.geom_histogram(alpha=.5, **hist_args)
    p2 = p2 + ggplot2.geom_histogram(alpha=.33, **hist_args)
    if limits is not None:
        p1 = p1 + ggplot2.xlim(*limits)
        p2 = p2 + ggplot2.xlim(*limits)

    device = importr('grDevices') if independentpdf else None
    if device is not None:
        device.pdf(file=fname)
    p1.plot()
    p2.plot()
    if device is not None:
        device.dev_off()
def make_output(tss_cov, out_prefix, upstream, downstream):
    """Write TSS coverage to a text file and to two PDF scatter plots.

    Produces ``<out_prefix>_raw.txt`` (tab-separated index and coverage),
    ``<out_prefix>_full.pdf`` and ``<out_prefix>_zoom.pdf``; the zoomed plot
    limits the x axis to [-1000, 1000].

    :param tss_cov: coverage values indexed so that position ``i`` is stored
        at ``tss_cov[upstream + i]``
    :param out_prefix: prefix for all output file names
    :param upstream: number of positions before index 0
    :param downstream: number of positions after index 0
    """
    positions = list(range(-upstream, downstream + 1))

    # dump raw counts to file; the with-block closes the handle even when a
    # lookup or format error is raised half-way through
    with open('%s_raw.txt' % out_prefix, 'w') as raw_out:
        for i in positions:
            raw_out.write('%d\t%e\n' % (i, tss_cov[upstream + i]))

    # make plot data structures
    tss_i = ro.IntVector(positions)
    cov = ro.FloatVector(tss_cov)
    df = ro.DataFrame({'tss_i': tss_i, 'cov': cov})

    def _render(x_scale, pdf_path):
        # Build the scatter plot with the given x scale and write it to
        # pdf_path, always closing the R device afterwards.
        gp = ggplot2.ggplot(df) + \
            ggplot2.aes_string(x='tss_i', y='cov') + \
            ggplot2.geom_point() + \
            x_scale + \
            ggplot2.scale_y_continuous('Coverage')
        grdevices.pdf(file=pdf_path)
        try:
            gp.plot()
        finally:
            grdevices.dev_off()

    # full plot
    _render(ggplot2.scale_x_continuous('TSS index'),
            '%s_full.pdf' % out_prefix)

    # zoomed plot
    _render(ggplot2.scale_x_continuous('TSS index',
                                       limits=ro.IntVector([-1000, 1000])),
            '%s_zoom.pdf' % out_prefix)
def plot_squiggle(args, filename, start_times, mean_signals):
    """
    Use rpy2 to create a squiggle plot of the read

    :param args: options object; reads ``num_facets``, ``theme_bw`` and
        ``saveas`` ("pdf", "png" or None for on-screen display)
    :param filename: name of the read file; used in the title and, when
        saving, as the base name of the plot file
    :param start_times: event start times (first element is taken as t_0)
    :param mean_signals: mean signal per event
    :raises Exception: if the target plot file already exists or ``saveas``
        is neither "pdf" nor "png"
    """
    r = robjects.r
    r.library("ggplot2")
    grdevices = importr('grDevices')

    # set t_0 as the first measured time for the read.
    t_0 = start_times[0]
    total_time = start_times[-1] - start_times[0]

    # adjust times to be relative to t_0
    r_start_times = robjects.FloatVector([t - t_0 for t in start_times])
    r_mean_signals = robjects.FloatVector(mean_signals)

    # infer the appropriate number of events given the number of facets.
    # Floor division is explicit so the facet ids stay whole numbers on
    # Python 3 as well (true division would give every event its own facet).
    num_events = len(r_mean_signals)
    events_per_facet = (num_events // args.num_facets) + 1

    # dummy variable to control faceting
    facet_category = robjects.FloatVector(
        [(i // events_per_facet) + 1 for i in range(len(start_times))])

    # make a data frame of the start times and mean signals
    d = {'start': r_start_times, 'mean': r_mean_signals, 'cat': facet_category}
    df = robjects.DataFrame(d)

    gp = ggplot2.ggplot(df)
    pp = gp + ggplot2.aes_string(x='start', y='mean') \
        + ggplot2.geom_step(size=0.25) \
        + ggplot2.facet_wrap(robjects.Formula('~cat'), ncol=1, scales="free_x") \
        + ggplot2.scale_x_continuous('Time (seconds)') \
        + ggplot2.scale_y_continuous('Mean signal (picoamps)') \
        + ggplot2.ggtitle('Squiggle plot for read: ' + filename + "\nTotal time (sec): " + str(total_time)) \
        + ggplot2.theme(**{'plot.title': ggplot2.element_text(size=11)})
    if args.theme_bw:
        pp = pp + ggplot2.theme_bw()

    if args.saveas is not None:
        plot_file = os.path.basename(filename) + "." + args.saveas
        if os.path.isfile(plot_file):
            raise Exception('Cannot create plot for %s: plot file %s already exists' % (filename, plot_file))
        if args.saveas == "pdf":
            grdevices.pdf(plot_file, width = 8.5, height = 11)
        elif args.saveas == "png":
            grdevices.png(plot_file, width = 8.5, height = 11,
                          units = "in", res = 300)
        else:
            # Previously no device was opened here, so the plot silently went
            # to R's default device and plot_file was never written.
            raise Exception('Cannot create plot for %s: unsupported format %s' % (filename, args.saveas))
        try:
            pp.plot()
        finally:
            # always release the device, even if rendering fails
            grdevices.dev_off()
    else:
        pp.plot()
        # keep the plot open until user hits enter
        print('Type enter to exit.')
        try:
            wait_for_enter = raw_input  # Python 2
        except NameError:
            wait_for_enter = input      # Python 3
        wait_for_enter()
def gray_plot(data, min=0, max=1, name=""):
    """Return a black-to-white tile plot of ``data``.

    ``data`` is melted with the R ``reshape`` package; the melted columns
    'L1'/'L2' become the x/y axes and 'value' the fill.  ``min`` and ``max``
    are the limits of the fill gradient and ``name`` is the x-axis title.
    """
    reshape = importr('reshape')
    melted = reshape.melt(data, id_var=['x', 'y'])
    layers = [
        ggplot2.aes_string(x='L1', y='L2'),
        ggplot2.geom_tile(ggplot2.aes_string(fill='value')),
        ggplot2.scale_fill_gradient(low="black", high="white",
                                    limits=FloatVector((min, max))),
        ggplot2.coord_equal(),
        ggplot2.scale_x_continuous(name),
    ]
    plot = ggplot2.ggplot(melted)
    for layer in layers:
        plot = plot + layer
    return plot
def gray_plot(data, min=0, max=1, name=""):
    """Build a grayscale heat map from ``data`` and return the ggplot object.

    The input is reshaped with ``reshape.melt``; tiles are placed at the
    melted 'L1'/'L2' coordinates and shaded by 'value' on a black-to-white
    gradient limited to [``min``, ``max``].  ``name`` labels the x axis.
    """
    melt = importr('reshape').melt
    long_form = melt(data, id_var=['x', 'y'])
    tile_fill = ggplot2.aes_string(fill='value')
    gradient_limits = FloatVector((min, max))
    return (
        ggplot2.ggplot(long_form)
        + ggplot2.aes_string(x='L1', y='L2')
        + ggplot2.geom_tile(tile_fill)
        + ggplot2.scale_fill_gradient(low="black", high="white",
                                      limits=gradient_limits)
        + ggplot2.coord_equal()
        + ggplot2.scale_x_continuous(name)
    )
def line_plot(pdf_file, data, x, y, var, null_label="N/A", linetype=None,
              title=None, xlab=None, ylab=None, colorname=None, linename=None,
              **extra_aes_params):
    """Draw a point-and-path plot of ``y`` against ``x`` into ``pdf_file``.

    Points are coloured by ``var``; paths are grouped by ``var`` (combined
    with ``linetype`` when that names a different column).  If ``x`` holds
    nulls they are replaced by ``null_label``, which is placed first among
    the ordered x levels.  ``extra_aes_params`` is accepted but not used.

    NOTE: ``data`` is modified in place (columns ``x``, 'group' and possibly
    'sortcol' are written, and the frame may be re-sorted).
    """
    pdf(pdf_file, width=11.7, height=8.3, paper="a4r")

    labels = None
    if any(data[x].isnull()):
        known = sorted(set(data[data[x].notnull()][x]))
        labels = robjects.StrVector([null_label] + map(str, known))
        missing = data[x].isnull()
        # position of each label, used as the sort key for the rows
        rank_of = dict((label, rank) for rank, label in enumerate(labels))
        data[x] = data[x].astype("str")
        data[x][missing] = null_label
        data['sortcol'] = data[x].map(rank_of.__getitem__)
        data.sort('sortcol', inplace=True)

    if not linetype or linetype == var:
        data['group'] = data[var]
    else:
        data['group'] = data[var].map(str) + data[linetype].map(str)

    rdata = common.convert_to_r_dataframe(data)
    if labels:
        # turn the x column into an ordered factor following `labels`
        column = rdata.names.index(x)
        rdata[column] = ordered(rdata[column], levels=labels)

    layers = [
        gg2.geom_point(size=3),
        gg2.scale_colour_hue(name=(colorname or var)),
        #gg2.scale_colour_continuous(low="black"),
        gg2.aes_string(x=x, y=y, color=var, variable=var),
        ggtitle(title or ""),
        xlabel(xlab or x),
        ylabel(ylab or y),
        #gg2.scale_y_continuous(breaks=seq(0.0, 1.0, 0.05)),
    ]
    pp = gg2.ggplot(rdata)
    for layer in layers:
        pp = pp + layer

    # line type stuff
    if linetype:
        pp = pp + gg2.geom_path(
            gg2.aes_string(group='group', linetype=linetype), size=0.5)
        pp = pp + gg2.scale_linetype(name=(linename or linetype))
    else:
        pp = pp + gg2.geom_path(gg2.aes_string(group='group'), size=0.5)

    pp.plot()
    dev_off()
def _plot_with_rpy2(self, regions, filename):
    """Write one PDF page per region with intensity, p-value and q-value tracks.

    For each of the first ``self.num_regs`` regions three stacked line plots
    are drawn over the ``2 * self.num_bins + 1`` bins centred on the middle
    bin: ``region.weighted`` (log2 y scale), ``-log10(region.pvalues)`` and
    ``-log10(region.corrected_pvalues)`` with a dashed line at
    ``-log10(self.threshold)``.  Regions without any weighted data are
    skipped with a warning.  Output goes to ``filename + '.pdf'``.

    Exits the process with status 1 if a region has fewer bins than
    requested.
    """
    from rpy2 import robjects
    import rpy2.robjects.lib.ggplot2 as ggplot2
    from rpy2.robjects.lib import grid
    from rpy2.robjects.packages import importr
    grdevices = importr('grDevices')
    base = importr('base')
    grdevices.pdf(file=filename + '.pdf')
    try:
        t = list(range(-self.num_bins, self.num_bins + 1))
        for region in regions[:self.num_regs]:
            if not np.any(region.weighted):
                logger.warning(
                    "Warning: No data for region located on bin " + str(region.bin) + ". Not plotting this one.")
                continue
            # Floor division keeps `middle` an int; with true division the
            # slice bounds below would be floats and raise TypeError.
            middle = (len(region.weighted[0]) - 1) // 2
            if middle < self.num_bins:
                logger.error("Warning: There are less bins calculated for regions than you want to plot.")
                sys.exit(1)
            lo = middle - self.num_bins
            hi = middle + self.num_bins + 1
            n_maps = len(region.weighted)
            d = {
                # map index repeated once per plotted bin
                'map': robjects.StrVector(
                    [str(m) for m in range(n_maps) for _ in t]),
                't': robjects.FloatVector(t * n_maps),
                'e': robjects.FloatVector(
                    [i for sublist in region.weighted
                     for i in sublist[lo:hi]]),
                'p': robjects.FloatVector(
                    [-np.log10(x) for sublist in region.pvalues
                     for x in sublist[lo:hi]]),
                'c': robjects.FloatVector(
                    [-np.log10(x) for sublist in region.corrected_pvalues
                     for x in sublist[lo:hi]]),
            }
            dataf = robjects.DataFrame(d)
            gp = ggplot2.ggplot(dataf)  # first yellow second red
            p1 = gp + ggplot2.geom_line(
                mapping=ggplot2.aes_string(x='t', y='e', group='map', colour='map'),
                alpha=0.8) + \
                ggplot2.scale_y_continuous(trans='log2') + \
                ggplot2.ggtitle(
                    "\n".join(wrap("Bin " + str(region.bin) + " : " + str(region.positions)))) + \
                ggplot2.labs(y="log Intensity") + \
                ggplot2.theme_classic() + \
                ggplot2.theme(**{'axis.title.x': ggplot2.element_blank(),
                                 'axis.text.y': ggplot2.element_text(angle=45),
                                 'axis.text.x': ggplot2.element_blank(),
                                 'legend.position': 'none'}) + \
                ggplot2.scale_colour_brewer(palette="Set1")
            p2 = gp + ggplot2.geom_line(
                mapping=ggplot2.aes_string(x='t', y='p', group='map', colour='map'),
                alpha=0.8) + \
                ggplot2.labs(y="-log10(p-value)") + \
                ggplot2.theme_classic() + \
                ggplot2.theme(**{'axis.title.x': ggplot2.element_blank(),
                                 'axis.text.x': ggplot2.element_blank(),
                                 'legend.position': 'none'}) + \
                ggplot2.scale_colour_brewer(palette="Set1")
            p3 = gp + ggplot2.geom_line(
                mapping=ggplot2.aes_string(x='t', y='c', group='map', colour='map'),
                alpha=0.8) + \
                ggplot2.labs(y="-log10(q-value)",
                             x='bins (' + str(self.bin_res) + ' bp each)') + \
                ggplot2.geom_hline(
                    mapping=ggplot2.aes_string(yintercept=str(-np.log10(self.threshold))),
                    colour='black', alpha=0.8, linetype='dashed') + \
                ggplot2.theme_classic() + \
                ggplot2.theme(**{'legend.position': 'none'}) + \
                ggplot2.scale_colour_brewer(palette="Set1")
            g1 = ggplot2.ggplot2.ggplotGrob(p1)
            g2 = ggplot2.ggplot2.ggplotGrob(p2)
            g3 = ggplot2.ggplot2.ggplotGrob(p3)
            # stack the three grobs and draw them on the current page
            robjects.globalenv["g"] = base.rbind(g1, g2, g3, size='first')
            robjects.r("grid::grid.draw(g)")
            grid.newpage()
            logger.debug('Plotted region ' + str(region.bin))
    finally:
        # close the PDF even when exiting early or on error
        grdevices.dev_off()
def plot_total_bp(parser, args, tot_bp_per_pore):
    """
    Plot the pore performance

    Draws the flowcell layout returned by ``minion_flowcell_layout()`` as a
    16 x 32 grid of points coloured by log10 of the total base pairs per
    pore; pores that are missing (or have no base pairs) are plotted as 0.

    :param parser: unused
    :param args: options object; ``args.saveas`` is a ".pdf"/".png" file name
        or None to display the plot and wait for Enter
    :param tot_bp_per_pore: mapping of pore id to total base pairs
    """
    import math

    r = robjects.r
    r.library("ggplot2")
    grdevices = importr("grDevices")

    flowcell_layout = minion_flowcell_layout()

    pore_values = []
    for pore in flowcell_layout:
        # log10 is undefined for 0, so treat empty pores like missing ones
        if pore in tot_bp_per_pore and tot_bp_per_pore[pore] > 0:
            pore_values.append(math.log10(tot_bp_per_pore[pore]))
        else:
            pore_values.append(0.0)

    # make a data frame of the lists
    d = {
        "rownum": robjects.IntVector(list(range(1, 17)) * 32),
        "colnum": robjects.IntVector(sorted(list(range(1, 33)) * 16)),
        # log10 values are fractional; an IntVector cannot represent them
        "log10_tot_bp": robjects.FloatVector(pore_values),
        "labels": robjects.IntVector(flowcell_layout),
    }
    df = robjects.DataFrame(d)
    gp = gg.ggplot(df)
    pp = (
        gp
        + gg.aes_string(y="factor(rownum, rev(rownum))", x="factor(colnum)")
        + gg.geom_point(gg.aes_string(color="log10_tot_bp"), size=7)
        + gg.geom_text(gg.aes_string(label="labels"), colour="white", size=2)
        + gg.scale_colour_gradient2(low="black", mid="black", high="red")
        + gg.coord_fixed(ratio=1.4)
        + gg.labs(x=gg.NULL, y=gg.NULL)
    )

    if args.saveas is not None:
        plot_file = args.saveas
        if plot_file.endswith(".pdf"):
            grdevices.pdf(plot_file, width=11, height=8.5)
        elif plot_file.endswith(".png"):
            grdevices.png(plot_file, width=11, height=8.5, units="in", res=300)
        else:
            logger.error("Unrecognized extension for %s!" % (plot_file))
            sys.exit()
        try:
            pp.plot()
        finally:
            # always release the device, even if rendering fails
            grdevices.dev_off()
    else:
        pp.plot()
        # keep the plot open until user hits enter
        print("Type enter to exit.")
        try:
            wait_for_enter = raw_input  # Python 2
        except NameError:
            wait_for_enter = input      # Python 3
        wait_for_enter()
def plot_total_bp(parser, args, tot_bp_per_pore):
    """
    Plot the pore performance

    Draws the flowcell layout returned by ``minion_flowcell_layout()`` as a
    16 x 32 grid of points coloured by log10 of the total base pairs per
    pore; pores that are missing (or have no base pairs) are plotted as 0.

    :param parser: unused
    :param args: options object; ``args.saveas`` is a ".pdf"/".png" file name
        or None to display the plot and wait for Enter
    :param tot_bp_per_pore: mapping of pore id to total base pairs
    """
    import math
    r = robjects.r
    r.library("ggplot2")
    grdevices = importr('grDevices')

    flowcell_layout = minion_flowcell_layout()

    pore_values = []
    for pore in flowcell_layout:
        # log10 is undefined for 0, so treat empty pores like missing ones
        if pore in tot_bp_per_pore and tot_bp_per_pore[pore] > 0:
            pore_values.append(math.log10(tot_bp_per_pore[pore]))
        else:
            pore_values.append(0.0)

    # make a data frame of the lists
    d = {'rownum': robjects.IntVector(list(range(1, 17)) * 32),
         'colnum': robjects.IntVector(sorted(list(range(1, 33)) * 16)),
         # log10 values are fractional; an IntVector cannot represent them
         'log10_tot_bp': robjects.FloatVector(pore_values),
         'labels': robjects.IntVector(flowcell_layout)
         }
    df = robjects.DataFrame(d)
    gp = gg.ggplot(df)
    pp = gp + gg.aes_string(y = 'factor(rownum, rev(rownum))', \
                            x = 'factor(colnum)') \
        + gg.geom_point(gg.aes_string(color='log10_tot_bp'), size = 7) \
        + gg.geom_text(gg.aes_string(label ='labels'), colour="white", size = 2) \
        + gg.scale_colour_gradient2(low = "black", mid= "black", high="red") \
        + gg.coord_fixed(ratio=1.4) \
        + gg.labs(x=gg.NULL, y=gg.NULL)

    if args.saveas is not None:
        plot_file = args.saveas
        if plot_file.endswith(".pdf"):
            grdevices.pdf(plot_file, width = 11, height = 8.5)
        elif plot_file.endswith(".png"):
            grdevices.png(plot_file, width = 11, height = 8.5,
                          units = "in", res = 300)
        else:
            logger.error("Unrecognized extension for %s!" % (plot_file))
            sys.exit()
        try:
            pp.plot()
        finally:
            # always release the device, even if rendering fails
            grdevices.dev_off()
    else:
        pp.plot()
        # keep the plot open until user hits enter
        print('Type enter to exit.')
        try:
            wait_for_enter = raw_input  # Python 2
        except NameError:
            wait_for_enter = input      # Python 3
        wait_for_enter()
def create_plot(filename, data, performance_object):
    """Write a PNG of mean +/- standard error per dimension.

    Plots column ``mean.<performance_object>`` against ``dimension`` as
    points coloured by ``signature``, with error bars spanning the mean
    plus/minus column ``stderror.<performance_object>``.
    """
    mean_col = "mean.%s" % performance_object
    error_col = "stderror.%s" % performance_object

    grdevices.png(file=filename)
    axes = ggplot2.aes_string(x="dimension", y=mean_col)
    points = ggplot2.geom_point(ggplot2.aes_string(colour="signature"))
    bars = ggplot2.geom_errorbar(
        ggplot2.aes_string(
            ymax="%s+%s" % (mean_col, error_col),
            ymin="%s-%s" % (mean_col, error_col),
        ))
    figure = ggplot2.ggplot(data) + axes + points + bars
    figure.plot()
    grdevices.dev_off()
def plot_hist(sizes, args):
    """
    Use rpy2 to plot a histogram of the read sizes

    Only sizes strictly between ``args.min_length`` and ``args.max_length``
    are plotted, split into ``args.num_bins`` equal-width bins.

    :param sizes: iterable of read sizes
    :param args: options object; reads ``min_length``, ``max_length``,
        ``num_bins``, ``theme_bw`` and ``saveas`` (".pdf"/".png" file name or
        None to display the plot and wait for Enter)
    :raises ValueError: if no size falls inside the length window
    """
    r = robjects.r
    r.library("ggplot2")
    grdevices = importr('grDevices')

    kept = [s for s in sizes if args.min_length < s < args.max_length]
    if not kept:
        raise ValueError('No read sizes between %s and %s to plot'
                         % (args.min_length, args.max_length))

    # True division: integer division truncated the bin width and produced
    # a width of 0 whenever the size range was smaller than num_bins.
    binwidth = float(max(kept) - min(kept)) / args.num_bins
    if binwidth <= 0:
        # all sizes identical; fall back to a unit-wide bin
        binwidth = 1.0

    df = robjects.DataFrame({'sizes': robjects.IntVector(kept)})

    # plot
    pp = ggplot2.ggplot(df) + ggplot2.aes_string(x='sizes') \
        + ggplot2.geom_histogram(binwidth=binwidth)
    if args.theme_bw:
        pp = pp + ggplot2.theme_bw()

    if args.saveas is not None:
        plot_file = args.saveas
        if plot_file.endswith(".pdf"):
            grdevices.pdf(plot_file, width=8.5, height=8.5)
        elif plot_file.endswith(".png"):
            grdevices.png(plot_file, width=8.5, height=8.5,
                          units="in", res=300)
        else:
            logger.error("Unrecognized extension for %s!" % (plot_file))
            sys.exit()
        try:
            pp.plot()
        finally:
            # always release the device, even if rendering fails
            grdevices.dev_off()
    else:
        pp.plot()
        # keep the plot open until user hits enter
        print('Type enter to exit.')
        try:
            wait_for_enter = raw_input  # Python 2
        except NameError:
            wait_for_enter = input      # Python 3
        wait_for_enter()
def plot(data, filename, title, ggplotter, xid="N", yid="RunTime", factorid="Step"):
    """Render ``data`` to a 10x6 PDF using the geom returned by ``ggplotter``.

    The frame comes from ``make_dataframe(data, xid, yid, factorid)``;
    colour and fill are both mapped to ``factor(<factorid>)`` and use the
    "Set2" brewer palette.  ``ggplotter`` is called with no arguments and
    its result is added as a layer.
    """
    frame = make_dataframe(data, xid, yid, factorid)
    grdevices.pdf(file=filename, width=10, height=6)
    factor_expr = 'factor(%s)' % factorid
    layers = [
        ggplot2.aes_string(x=xid, y=yid),
        ggplot2.aes_string(size=.5),
        ggplotter(),
        ggplot2.aes_string(colour=factor_expr),
        ggplot2.aes_string(fill=factor_expr),
        ggplot2.opts(title=title),
        ggplot2.scale_fill_brewer(palette="Set2"),
        ggplot2.scale_colour_brewer(palette="Set2"),
    ]
    figure = ggplot2.ggplot(frame)
    for layer in layers:
        figure = figure + layer
    figure.plot()
    grdevices.dev_off()
def makeDistancePlot( alldata, figurename, feature="distance") :
    """Write a PNG density plot of ``feature`` filled by continent.

    A 'distance' column (``het + hom``) is added to ``alldata`` in place
    before the frame is converted to R.  The module-level ``mytheme`` dict
    supplies the theme settings.
    """
    alldata["distance"] = alldata.het + alldata.hom
    r_frame = com.convert_to_r_dataframe(alldata)

    density = ggplot2.geom_density(ggplot2.aes_string(fill="factor(continent)"))
    figure = (
        ggplot2.ggplot(r_frame)
        + ggplot2.aes_string(x=feature )
        + density
        + ggplot2.ggtitle("Distance from Reference by Continent")
        + ggplot2.theme(**mytheme)
        #+ ggplot2.theme(**{'axis.text.x': ggplot2.element_text(angle = 45)})
        #+ ggplot2.facet_grid( robjects.Formula('RVIS_type ~ .') )
    )

    grdevices.png(figurename)
    figure.plot()
    grdevices.dev_off()
def generate_histogram(subgroups_to_sses_to_n_count, tname, file_name):
    """Write a dodged bar chart of counts per subgroup to a PNG.

    :param subgroups_to_sses_to_n_count: mapping subgroup -> (mapping
        category -> count)
    :param tname: name of the category column (used for the fill)
    :param file_name: output file name, created under ``OUTPUT_PATH``

    The y axis runs from 0 to the next multiple of 1000 strictly above the
    floor-rounded maximum count (e.g. a maximum of 2500 gives a limit of
    3000).
    """
    columns_to_data = {'subgroup': [], tname: [], 'count': []}
    max_count = 0
    for subgroup, sses_to_n_count in subgroups_to_sses_to_n_count.items():
        for ss, n_count in sses_to_n_count.items():
            columns_to_data['subgroup'].append(subgroup)
            columns_to_data[tname].append(ss)
            columns_to_data['count'].append(n_count)
            if n_count > max_count:
                max_count = n_count

    subgroup_levels = ro.StrVector(
        _sort_subgroup(set(columns_to_data['subgroup'])))
    r_columns_to_data = {
        'subgroup': ro.FactorVector(columns_to_data['subgroup'],
                                    levels=subgroup_levels),
        tname: ro.StrVector(columns_to_data[tname]),
        'count': ro.IntVector(columns_to_data['count']),
    }
    df = ro.DataFrame(r_columns_to_data)

    # Floor division is required for the rounding: with true division
    # `max_count / 1000 * 1000` is just max_count again on Python 3.
    max_count = int(max_count // 1000 * 1000 + 1000)

    histogram_file_path = os.path.join(OUTPUT_PATH, file_name)
    logging.debug(
        str.format("The Data Frame for file {}: \n{}", histogram_file_path, df))

    grdevices.png(file=histogram_file_path, width=1200, height=800)
    try:
        gp = ggplot2.ggplot(df)
        pp = gp + \
            ggplot2.aes_string(x='subgroup', y='count', fill=tname) + \
            ggplot2.geom_bar(position="dodge", width=0.8, stat="identity") + \
            ggplot2.theme_bw() + \
            ggplot2.theme_classic() + \
            ggplot2.theme(**{'legend.title': ggplot2.element_blank()}) + \
            ggplot2.theme(**{'legend.text': ggplot2.element_text(size=40)}) + \
            ggplot2.theme(**{'axis.text.x': ggplot2.element_text(size=40, angle=45)}) + \
            ggplot2.theme(**{'axis.text.y': ggplot2.element_text(size=40)}) + \
            ggplot2.scale_y_continuous(expand=ro.IntVector([0, 0]),
                                       limits=ro.IntVector([0, max_count])) + \
            ggplot2.geom_text(ggplot2.aes_string(label='count'), size=6,
                              angle=35, hjust=-0.1,
                              position=ggplot2.position_dodge(width=0.8),
                              vjust=-0.2)
        pp.plot()
        logging.info(str.format("Output step3 file {}", histogram_file_path))
    finally:
        # always release the device, even if rendering fails
        grdevices.dev_off()
def make_output_and(cov, control_cov, out_prefix, window):
    """Write main and control splice-site coverage to text and PDF.

    Produces ``<out_prefix>_raw.txt`` (tab-separated position, main coverage,
    control coverage) and ``<out_prefix>.pdf`` (scatter plot coloured by
    "Main"/"Control").

    :param cov: main coverage; position ``i`` is read from
        ``cov[window // 2 + i]``
    :param control_cov: control coverage, indexed like ``cov``
    :param out_prefix: prefix for the output file names
    :param window: window size around the splice site
    """
    # Explicit floor division reproduces the Python 2 integer semantics of
    # the original `/` and also works on Python 3.
    # NOTE(review): for an odd window the first position maps to index -1
    # (wraps to the last element) -- confirm windows are always even.
    half = window // 2
    positions = list(range(-window // 2, half + 1))

    # dump raw counts to file; the with-block closes the handle on error too
    with open("%s_raw.txt" % out_prefix, "w") as raw_out:
        for i in positions:
            raw_out.write("%d\t%e\t%e\n" % (i, cov[half + i], control_cov[half + i]))

    # make plot data structures
    splice_i = ro.IntVector(2 * positions)
    cov_r = ro.FloatVector(cov + control_cov)
    labels = ro.StrVector(["Main"] * len(cov) + ["Control"] * len(control_cov))
    df = ro.DataFrame({"splice_i": splice_i, "cov": cov_r, "label": labels})

    # construct plot
    gp = (
        ggplot2.ggplot(df)
        + ggplot2.aes_string(x="splice_i", y="cov", colour="label")
        + ggplot2.geom_point()
        + ggplot2.scale_x_continuous("Position relative to splice site")
        + ggplot2.scale_y_continuous("Coverage")
        + ggplot2.scale_colour_discrete("")
    )

    # plot to file
    grdevices.pdf(file="%s.pdf" % out_prefix)
    try:
        gp.plot()
    finally:
        grdevices.dev_off()
def make_output(cov, out_prefix, window):
    """Write splice-site coverage to a text file and a PDF scatter plot.

    Produces ``<out_prefix>_raw.txt`` (tab-separated position and coverage)
    and ``<out_prefix>.pdf``.

    :param cov: coverage values; position ``i`` is read from
        ``cov[window // 2 + i]``
    :param out_prefix: prefix for the output file names
    :param window: window size around the splice site
    """
    # Explicit floor division reproduces the Python 2 integer semantics of
    # the original `/` and also works on Python 3.
    # NOTE(review): for an odd window the first position maps to index -1
    # (wraps to the last element) -- confirm windows are always even.
    half = window // 2
    positions = list(range(-window // 2, half + 1))

    # dump raw counts to file; the with-block closes the handle on error too
    with open("%s_raw.txt" % out_prefix, "w") as raw_out:
        for i in positions:
            raw_out.write("%d\t%e\n" % (i, cov[half + i]))

    # make plot data structures
    splice_i = ro.IntVector(positions)
    cov = ro.FloatVector(cov)
    df = ro.DataFrame({"splice_i": splice_i, "cov": cov})

    # construct plot
    gp = (
        ggplot2.ggplot(df)
        + ggplot2.aes_string(x="splice_i", y="cov")
        + ggplot2.geom_point()
        + ggplot2.scale_x_continuous("Position relative to splice site")
        + ggplot2.scale_y_continuous("Coverage")
    )

    # plot to file
    grdevices.pdf(file="%s.pdf" % out_prefix)
    try:
        gp.plot()
    finally:
        grdevices.dev_off()
def plot_fetch_status(self, data, row_filter, img_file, ratio=1.0):
    """Plot the percentage of fetch statuses per crawl as stacked bars.

    :param data: frame with at least 'crawl', 'percentage' and 'type'
    :param row_filter: 'type' values to keep; a leading 'fetcher:' /
        'fetcher:aggr:' prefix is stripped for display, and the (reversed)
        filter order defines the order of the categories
    :param img_file: file name of the image, saved under ``PLOTDIR``
    :param ratio: ignored -- the aspect ratio is recomputed from the number
        of distinct crawls
    :return: the ggplot object
    """
    prefix = '^fetcher:(?:aggr:)?'
    if row_filter:
        data = data[data['type'].isin(row_filter)]
    # work on a private copy so the in-place edits below never touch (or
    # warn about) the caller's frame
    data = data[['crawl', 'percentage', 'type']].copy()
    categories = []
    for value in row_filter or ():
        replacement = value
        if re.search(prefix, value):
            replacement = re.sub(prefix, '', value)
            data.replace(to_replace=value, value=replacement, inplace=True)
        # unmatched values are kept as categories too, otherwise their rows
        # would turn into NaN; duplicates are not allowed in a Categorical
        if replacement not in categories:
            categories.append(replacement)
    # list.reverse() returns None, which silently discarded the intended
    # ordering; pass an actual reversed list (or None to infer categories)
    data['type'] = pandas.Categorical(
        data['type'], ordered=True,
        categories=categories[::-1] if categories else None)
    ratio = 0.1 + len(data['crawl'].unique()) * .03
    # print(data)
    p = ggplot2.ggplot(data) \
        + ggplot2.aes_string(x='crawl', y='percentage', fill='type') \
        + ggplot2.geom_bar(stat='identity', position='stack', width=.9) \
        + ggplot2.coord_flip() \
        + ggplot2.scale_fill_brewer(palette='RdYlGn', type='sequential',
                                    guide=ggplot2.guide_legend(reverse=True)) \
        + GGPLOT2_THEME \
        + ggplot2.theme(**{'legend.position': 'bottom',
                           'aspect.ratio': ratio}) \
        + ggplot2.labs(title='Percentage of Fetch Status',
                       x='', y='', fill='')
    img_path = os.path.join(PLOTDIR, img_file)
    p.save(img_path, height=int(7 * ratio), width=7)
    return p
def plot_crawldb_status(self, data, row_filter, img_file, ratio=1.0):
    """Plot CrawlDb size and status counts per crawl as stacked bars.

    :param data: frame with at least 'crawl', 'size' and 'type'
    :param row_filter: 'type' values to keep; a leading
        'crawldb:status:db_' prefix is stripped for display, and the
        (reversed) filter order defines the order of the categories
    :param img_file: file name of the image, saved under ``PLOTDIR``
    :param ratio: ignored -- the aspect ratio is recomputed from the number
        of distinct crawls
    :return: the ggplot object
    """
    prefix = '^crawldb:status:db_'
    if row_filter:
        data = data[data['type'].isin(row_filter)]
    # work on a private copy so the in-place edits below never touch (or
    # warn about) the caller's frame
    data = data.copy()
    categories = []
    for value in row_filter or ():
        replacement = value
        if re.search(prefix, value):
            replacement = re.sub(prefix, '', value)
            data.replace(to_replace=value, value=replacement, inplace=True)
        # unmatched values are kept as categories too, otherwise their rows
        # would turn into NaN; duplicates are not allowed in a Categorical
        if replacement not in categories:
            categories.append(replacement)
    # list.reverse() returns None, which silently discarded the intended
    # ordering; pass an actual reversed list (or None to infer categories)
    data['type'] = pandas.Categorical(
        data['type'], ordered=True,
        categories=categories[::-1] if categories else None)
    data['size'] = data['size'].astype(float)
    ratio = 0.1 + len(data['crawl'].unique()) * .03
    print(data)
    p = ggplot2.ggplot(data) \
        + ggplot2.aes_string(x='crawl', y='size', fill='type') \
        + ggplot2.geom_bar(stat='identity', position='stack', width=.9) \
        + ggplot2.coord_flip() \
        + ggplot2.scale_fill_brewer(palette='Pastel1', type='sequential',
                                    guide=ggplot2.guide_legend(reverse=False)) \
        + GGPLOT2_THEME \
        + ggplot2.theme(**{'legend.position': 'bottom',
                           'aspect.ratio': ratio}) \
        + ggplot2.labs(title='CrawlDb Size and Status Counts\n(before crawling)',
                       x='', y='', fill='')
    img_path = os.path.join(PLOTDIR, img_file)
    p.save(img_path, height=int(7 * ratio), width=7)
    return p
def plot(request):
    """Return a PNG box plot of sentence similarity by grammaticality.

    Similarities of grammatical ("GRAM") and ungrammatical ("UNGRAM")
    sentences, excluding those rated 'N', are drawn as two box plots.  The
    image is rendered to ``data.png`` in the working directory and returned
    in an ``HttpResponse`` with content type image/png.

    NOTE(review): the fixed file name means concurrent requests can
    overwrite each other's image -- confirm this view is low-traffic.
    """
    r = robjects.r
    ungram = Sentence.objects.filter(grammatical=False).exclude(
        rating='N').values_list('similarity', flat=True)
    gram = Sentence.objects.filter(grammatical=True).exclude(
        rating='N').values_list('similarity', flat=True)
    gram_r = robjects.FloatVector(gram)
    ungram_r = robjects.FloatVector(ungram)
    df = robjects.r["data.frame"]
    gram_df = df(gram="GRAM", similarity=gram_r)
    ungram_df = df(gram="UNGRAM", similarity=ungram_r)
    rbind = r['rbind']
    data = rbind(gram_df, ungram_df)
    pp = ggplot2.ggplot(data) + \
        ggplot2.aes_string(x="gram", y="similarity") + \
        ggplot2.geom_boxplot()
    grdevices = importr('grDevices')
    grdevices.png(file="data.png", width=580, height=512)
    try:
        pp.plot()
    finally:
        # always release the device, even if rendering fails
        grdevices.dev_off()
    # read the image back with a context manager so the handle is closed
    with open("data.png", "rb") as image_file:
        image_data = image_file.read()
    # `content_type` replaces the removed `mimetype` keyword
    return HttpResponse(image_data, content_type="image/png")
def plot(request):
    """Return a PNG box plot of sentence similarity by grammaticality.

    Similarities of grammatical ("GRAM") and ungrammatical ("UNGRAM")
    sentences, excluding those rated 'N', are drawn as two box plots.  The
    image is rendered to ``data.png`` in the working directory and returned
    in an ``HttpResponse`` with content type image/png.

    NOTE(review): the fixed file name means concurrent requests can
    overwrite each other's image -- confirm this view is low-traffic.
    """
    r = robjects.r
    ungram = Sentence.objects.filter(grammatical=False).exclude(rating='N').values_list('similarity', flat=True)
    gram = Sentence.objects.filter(grammatical=True).exclude(rating='N').values_list('similarity', flat=True)
    gram_r = robjects.FloatVector(gram)
    ungram_r = robjects.FloatVector(ungram)
    df = robjects.r["data.frame"]
    gram_df = df(gram="GRAM", similarity=gram_r)
    ungram_df = df(gram="UNGRAM", similarity=ungram_r)
    rbind = r['rbind']
    data = rbind(gram_df, ungram_df)
    pp = ggplot2.ggplot(data) + \
        ggplot2.aes_string(x="gram", y="similarity") + \
        ggplot2.geom_boxplot()
    grdevices = importr('grDevices')
    grdevices.png(file="data.png", width=580, height=512)
    try:
        pp.plot()
    finally:
        # always release the device, even if rendering fails
        grdevices.dev_off()
    # read the image back with a context manager so the handle is closed
    with open("data.png", "rb") as image_file:
        image_data = image_file.read()
    # `content_type` replaces the removed `mimetype` keyword
    return HttpResponse(image_data, content_type="image/png")
def main():
    """Plot the distribution of alignment lengths in a BAM file.

    Takes exactly one positional argument (the BAM file), counts reads per
    ``qlen`` and writes a bar chart covering every length between the
    smallest and largest observed value to ``align_lengths.pdf``.
    """
    usage = 'usage: %prog [options] arg'
    parser = OptionParser(usage)
    #parser.add_option()
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide BAM file')
    bam_file = args[0]

    # tally reads per aligned query length
    length_counts = {}
    for read in pysam.Samfile(bam_file, 'rb'):
        if read.qlen in length_counts:
            length_counts[read.qlen] += 1
        else:
            length_counts[read.qlen] = 1

    shortest = min(length_counts)
    longest = max(length_counts)
    lengths = range(shortest, longest + 1)

    # construct data frame (lengths never observed get a count of 0)
    frame = ro.DataFrame({
        'length': ro.IntVector(lengths),
        'counts': ro.IntVector([length_counts.get(n, 0) for n in lengths]),
    })

    # construct full plot
    figure = (
        ggplot2.ggplot(frame)
        + ggplot2.aes_string(x='length', y='counts')
        + ggplot2.geom_bar(stat='identity')
        + ggplot2.scale_x_continuous('Alignment length')
        + ggplot2.scale_y_continuous('')
    )

    # plot to file
    grdevices.pdf(file='align_lengths.pdf')
    figure.plot()
    grdevices.dev_off()
def makePlot(grdevices, plotName, samp_set1_vals, samp_set2_vals, image_file_type):
    """Write a box plot with jittered points comparing two sample sets.

    Values are labelled "set1" / "set2" according to the list they came
    from.  The output is a PDF when ``image_file_type`` is "pdf", otherwise
    a 512x512 PNG; ``grdevices`` is the R grDevices package object used to
    open and close the device.
    """
    set_labels = ["set1"] * len(samp_set1_vals) + ["set2"] * len(samp_set2_vals)
    columns = {
        "sample": robjects.StrVector(set_labels),
        "value": robjects.FloatVector(samp_set1_vals + samp_set2_vals),
    }
    figure = (
        ggplot2.ggplot(robjects.DataFrame(columns))
        + ggplot2.aes_string(x="sample", y='value')
        + ggplot2.geom_boxplot()
        + ggplot2.geom_jitter()
        + ggplot2.theme_bw()
    )

    if image_file_type != "pdf":
        grdevices.png(file=plotName, width=512, height=512)
    else:
        grdevices.pdf(file=plotName)
    figure.plot()
    grdevices.dev_off()
def line_plot(self, data, title, ylabel, img_file, x='date', y='size', c='type', clabel=''):
    """Draw a line-and-point plot and save it under ``PLOTDIR``.

    The backend is selected by the module-level ``PLOTLIB`` ('ggplot' or
    'rpy2.ggplot2').  With the rpy2 backend the ``y`` column of ``data`` is
    converted to float in place.

    :param data: frame holding the ``x``, ``y`` and ``c`` columns
    :param title: plot title
    :param ylabel: y-axis label
    :param img_file: file name of the image, joined onto ``PLOTDIR``
    :param clabel: legend title for the colour scale (rpy2 backend only)
    :return: the plot object
    :raises ValueError: if ``PLOTLIB`` names an unsupported backend
    """
    if PLOTLIB == 'ggplot':
        # date_label = "%Y\n%b"
        date_label = "%Y\n%W"  # year + week number
        p = ggplot(data, aes(x=x, y=y, color=c)) \
            + ggtitle(title) \
            + ylab(ylabel) \
            + xlab(' ') \
            + scale_x_date(breaks=date_breaks('3 months'), labels=date_label) \
            + geom_line() + geom_point()
    elif PLOTLIB == 'rpy2.ggplot2':
        # convert y axis to float because R uses 32-bit signed integers,
        # values > 2 bln. (2^31) will overflow
        data[y] = data[y].astype(float)
        p = ggplot2.ggplot(data) \
            + ggplot2.aes_string(x=x, y=y, color=c) \
            + ggplot2.geom_line() + ggplot2.geom_point() \
            + GGPLOT2_THEME \
            + ggplot2.labs(title=title, x='', y=ylabel, color=clabel)
    else:
        # previously fell through and failed later with UnboundLocalError
        raise ValueError('Unsupported plotting library: %r' % (PLOTLIB,))
    img_path = os.path.join(PLOTDIR, img_file)
    p.save(img_path)
    # data.to_csv(img_path + '.csv')
    return p
def compare_sum_barplot(locus_table, interval_table, intervals, loci, names, rows):
    """Return a dodged bar plot of summed informativeness per interval and db.

    ``get_r_data_by_top`` is called first (its return value is not used
    here); the R variable ``data`` is then aggregated in R into
    ``agg_data`` (sum of pi by interval and db).  With more than one
    interval the interval factor is re-levelled via ``order_intervals``.
    ``loci`` is not used.
    """
    get_r_data_by_top(locus_table, interval_table, intervals, names, rows)
    #pdb.set_trace()
    aggregated = robjects.r(
        '''agg_data <- aggregate(pi ~ interval + db, data = data, sum)''')
    if len(intervals) > 1:
        relevel = '''agg_data$interval <- factor(agg_data$interval,{})'''.format(
            order_intervals(aggregated[0]))
        robjects.r(relevel)

    bar_style = {'position': 'dodge', 'colour': '#767676', 'alpha': 0.6}
    return (
        ggplot2.ggplot(robjects.r('''agg_data'''))
        + ggplot2.aes_string(x = 'interval', y = 'pi', fill='factor(db)')
        + ggplot2.geom_bar(**bar_style)
        + ggplot2.scale_y_continuous('net phylogenetic informativeness')
        + ggplot2.scale_x_discrete('interval (years ago)')
        + ggplot2.scale_fill_brewer("database", palette="Blues")
    )
def bargraph_language(results):
    """Write one lines-of-code bar chart PDF per language.

    ``results`` maps ``(language, problem, variation)`` to a line count.
    For every entry of the module-level ``languages`` a file
    ``bargraph-loc-lang-<language>.pdf`` is produced with problems on the x
    axis and bars dodged by variation (display names from ``pretty_varis``).
    """
    r = robjects.r
    for language in languages:
        # (variation label, problem, line count) for this language only
        rows = [
            (pretty_varis[var], prob, loc)
            for (lang, prob, var), loc in results.items()
            if lang == language
        ]
        r.pdf('bargraph-loc-lang-' + language + '.pdf',
              height=pdf_height(), width=pdf_width())
        frame = robjects.DataFrame({
            'Variation': StrVector([row[0] for row in rows]),
            'Problem': StrVector([row[1] for row in rows]),
            'Lines': IntVector([row[2] for row in rows]),
        })
        #print (frame)
        layers = [
            ggplot2.aes_string (x='Problem', y='Lines', fill='Variation'),
            ggplot2.geom_bar (position='dodge', stat='identity'),
            ggplot2_options (),
            ggplot2_colors (),
            robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))'),
            robjects.r('ylab("Lines of Code")'),
        ]
        figure = ggplot2.ggplot(frame)
        for layer in layers:
            figure = figure + layer
        figure.plot()
        r['dev.off']()
def render_plot(gp, args):
    """Render a plot using ggplot

    :gp: A base ggplot2 object
    :args: mapping wrapped in ``util.Namespace``; the fields read are
        ``x`` (the x value expression), ``y`` (the y value expression),
        ``type`` (the type of plot to make: 'points', 'lines' or 'boxplot')
        and ``facets`` (optional facet formula, or None)

    Faceting and drawing are best-effort: failures are logged as warnings
    instead of being raised.  An unknown ``type`` raises ``Exception``.
    """
    import logging
    log = logging.getLogger(__name__)

    args = util.Namespace(args)
    import rpy2.robjects.lib.ggplot2 as ggplot2
    pp = gp + ggplot2.aes_string(x=args.x, y=args.y)
    if args.type == 'points':
        pp += ggplot2.geom_point()
    elif args.type == 'lines':
        pp += ggplot2.geom_line()
    elif args.type == 'boxplot':
        pp += ggplot2.geom_boxplot()
    else:
        raise Exception("{0} not implemented".format(args.type))

    if args.facets is not None:
        try:
            pp += ggplot2.facet_grid(ro.Formula(args.facets))
        except Exception:
            # keep going without facets, but leave a trace of the failure
            log.warning("Could not apply facets %r; plotting without them",
                        args.facets, exc_info=True)
    try:
        pp.plot()
    except Exception:
        # drawing stays best-effort, but the error is no longer invisible
        log.warning("Could not render plot", exc_info=True)
def make_output_and(cov, control_cov, out_prefix, window):
    """Write main and control splice-site coverage to text and PDF.

    Produces ``<out_prefix>_raw.txt`` (tab-separated position, main coverage,
    control coverage) and ``<out_prefix>.pdf`` (scatter plot coloured by
    'Main'/'Control').

    :param cov: main coverage; position ``i`` is read from
        ``cov[window // 2 + i]``
    :param control_cov: control coverage, indexed like ``cov``
    :param out_prefix: prefix for the output file names
    :param window: window size around the splice site
    """
    # Explicit floor division reproduces the Python 2 integer semantics of
    # the original `/` and also works on Python 3.
    # NOTE(review): for an odd window the first position maps to index -1
    # (wraps to the last element) -- confirm windows are always even.
    half = window // 2
    positions = list(range(-window // 2, half + 1))

    # dump raw counts to file; the with-block closes the handle on error too
    with open('%s_raw.txt' % out_prefix, 'w') as raw_out:
        for i in positions:
            raw_out.write('%d\t%e\t%e\n' % (i, cov[half + i], control_cov[half + i]))

    # make plot data structures
    splice_i = ro.IntVector(2 * positions)
    cov_r = ro.FloatVector(cov + control_cov)
    labels = ro.StrVector(['Main'] * len(cov) + ['Control'] * len(control_cov))
    df = ro.DataFrame({'splice_i': splice_i, 'cov': cov_r, 'label': labels})

    # construct plot
    gp = ggplot2.ggplot(df) + \
        ggplot2.aes_string(x='splice_i', y='cov', colour='label') + \
        ggplot2.geom_point() + \
        ggplot2.scale_x_continuous('Position relative to splice site') + \
        ggplot2.scale_y_continuous('Coverage') + \
        ggplot2.scale_colour_discrete('')

    # plot to file
    grdevices.pdf(file='%s.pdf' % out_prefix)
    try:
        gp.plot()
    finally:
        grdevices.dev_off()
def make_output_and(te_tss_cov, control_te_tss_cov, out_prefix, upstream, downstream):
    """Write per-TE main/control TSS coverage to text files and PDFs.

    Only keys ``te`` of ``te_tss_cov`` whose ``te[0]`` and ``te[1]`` are both
    in the fixed selections below are processed.  For each one a raw table
    is written under ``<out_prefix>_raw/`` and a scatter plot under
    ``<out_prefix>_plot/``; both directories are deleted and recreated.

    :param te_tss_cov: mapping te -> coverage list, where position ``i`` is
        stored at index ``upstream + i``
    :param control_te_tss_cov: control coverage with the same keys/layout
    :param out_prefix: prefix of the two output directories
    :param upstream: number of positions before index 0
    :param downstream: number of positions after index 0
    """
    selected_first = ('n', '*', 'HERVH-int', 'L2a', 'AluSx', 'AluJb',
                      'MIRb', 'LTR7')
    selected_second = ('n', '*', 'LINE/L1', 'SINE/Alu', 'LTR/ERV1',
                       'LTR/ERVL-MaLR', 'LINE/L2', 'LTR/ERVL', 'SINE/MIR',
                       'DNA/hAT-Charlie', 'LTR/ERVK', 'DNA/TcMar-Tigger')
    # evaluate the (previously duplicated) filter once
    selected = [te for te in te_tss_cov
                if te[0] in selected_first and te[1] in selected_second]
    positions = list(range(-upstream, downstream + 1))

    def _file_stem(te):
        # '/' cannot appear in a file name, so it is replaced by '_'
        return '%s_%s' % (te[0].replace('/', '_'), te[1].replace('/', '_'))

    # clean raw counts dir
    if os.path.isdir('%s_raw' % out_prefix):
        shutil.rmtree('%s_raw' % out_prefix)
    os.mkdir('%s_raw' % out_prefix)

    # dump raw counts to file; the with-block closes each handle on error too
    for te in selected:
        with open('%s_raw/%s.txt' % (out_prefix, _file_stem(te)), 'w') as raw_out:
            for i in positions:
                raw_out.write('%d\t%e\t%e\n' % (
                    i, te_tss_cov[te][upstream + i],
                    control_te_tss_cov[te][upstream + i]))

    # clean plot dirs
    if os.path.isdir('%s_plot' % out_prefix):
        shutil.rmtree('%s_plot' % out_prefix)
    os.mkdir('%s_plot' % out_prefix)

    # make data structures
    tss_i = ro.IntVector(2 * positions)
    labels = ro.StrVector(['Main'] * (upstream + downstream + 1) +
                          ['Control'] * (upstream + downstream + 1))
    for te in selected:
        cov = ro.FloatVector(te_tss_cov[te] + control_te_tss_cov[te])
        df = ro.DataFrame({'tss_i': tss_i, 'cov': cov, 'label': labels})

        # construct full plot
        gp = ggplot2.ggplot(df) + \
            ggplot2.aes_string(x='tss_i', y='cov', colour='label') + \
            ggplot2.geom_point() + \
            ggplot2.scale_x_continuous('TSS index') + \
            ggplot2.scale_y_continuous('Coverage') + \
            ggplot2.scale_colour_discrete('')

        # plot to file
        grdevices.pdf(file='%s_plot/%s.pdf' % (out_prefix, _file_stem(te)))
        try:
            gp.plot()
        finally:
            grdevices.dev_off()
def main():
    """Compare two mutation profiles and write a heat map PDF.

    Expects two mutation files as positional arguments.  Both are parsed
    with ``parse_mutations``, combined by ``compute_relative_profile``,
    printed with ``print_table`` and plotted as a tile map (reference
    nucleotide vs. read nucleotide) to the file given by ``-o``.
    """
    usage = 'usage: %prog [options] <mut1 file> <mut2 file>'
    parser = OptionParser(usage)
    parser.add_option('-m', dest='mut_norm', action='store_true', default=False, help='Normalize by # mutations (as opposed to sequenced bp) [Default: %default]')
    parser.add_option('-o', dest='output_pdf', default='mut_cmp.pdf', help='Output pdf file for heatmap [Default: %default]')
    parser.add_option('-r', dest='raw', action='store_true', default=False, help='Use raw mutation counts (as opposed to normalized for ACGT content) [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error(usage)
    mut1_file, mut2_file = args

    mutation_profile1, seq_bp1 = parse_mutations(mut1_file, options.raw)
    mutation_profile2, seq_bp2 = parse_mutations(mut2_file, options.raw)

    relative_mutation_profile = compute_relative_profile(
        mutation_profile1, seq_bp1, mutation_profile2, seq_bp2)

    print_table(relative_mutation_profile)

    # make plotting data structures: one row per ordered nucleotide pair
    nts = ['_', 'A', 'C', 'G', 'T']
    pairs = [(nt1, nt2) for nt1 in nts for nt2 in nts]
    frame = ro.DataFrame({
        'nt1': ro.StrVector([first for first, _ in pairs]),
        'nt2': ro.StrVector([second for _, second in pairs]),
        'rel': ro.FloatVector([relative_mutation_profile[pair] for pair in pairs]),
    })

    # plot
    # Earlier variant, kept for reference:
    #   gp = ggplot2.ggplot(df) + \
    #       ggplot2.aes_string(x='nt2', y='nt1', fill='rel') + \
    #       ggplot2.geom_tile() + \
    #       ggplot2.scale_x_discrete(mut2_file, limits=nts) + \
    #       ggplot2.scale_y_discrete(mut1_file, limits=nts) + \
    #       ggplot2.scale_fill_gradient('Enrichment 1/2')
    heatmap = (
        ggplot2.ggplot(frame)
        + ggplot2.aes_string(x='nt2', y='nt1', fill='rel')
        + ggplot2.geom_tile()
        + ggplot2.scale_x_discrete('Read')
        + ggplot2.scale_y_discrete('Reference')
        + ggplot2.scale_fill_gradient2('log2 enrichment', low='darkblue',
                                       mid='white', high='darkred')
    )

    # save to file
    grdevices.pdf(file=options.output_pdf)
    heatmap.plot()
    grdevices.dev_off()
def BoxPlot_One(self, metabolite):
    """Draw a box plot of one metabolite against ``self.metadata``.

    All open R graphics devices are closed first.  The y column name is
    looked up in ``self.metabolite_dict`` and wrapped in backticks before
    being handed to ``aes_string``.
    """
    r('graphics.off()')
    base = ggplot2.ggplot(self.raw_data)
    y_column = '`' + self.metabolite_dict[metabolite] + '`'
    mapping = ggplot2.aes_string(x=self.metadata, y=y_column)
    figure = base + mapping + ggplot2.geom_boxplot()
    figure.plot()
def multiple_locus_net_informativeness_scatterplot(locus_table, net_pi_table, loci):
    """Scatter plot of net phylogenetic informativeness over time for several loci.

    Args:
        locus_table: name of the table providing ``locus`` and ``id``.
        net_pi_table: name of the table joined on ``id``; ``time`` and ``pi``
            are selected from the join.
        loci: sequence of locus names; if the first entry is ``'all'``
            (case-insensitive) no locus filter is applied.

    Returns:
        The ggplot object (points coloured by locus, reversed x axis).
    """
    if loci[0].lower() != 'all':
        # Build the IN (...) list explicitly.  Formatting tuple(loci) renders a
        # single locus as ('x',) -- a trailing comma SQL rejects -- and renders
        # Python 2 unicode names as u'x'.  For two or more plain strings this
        # produces exactly the text the tuple repr used to.
        in_list = '({})'.format(', '.join("'{}'".format(name) for name in loci))
        qry = '''"SELECT {0}.locus, time, pi FROM {0}, {1} WHERE {0}.id = {1}.id and locus in {2}"'''.format(locus_table, net_pi_table, in_list)
    else:
        qry = '''"SELECT {0}.locus, time, pi FROM {0}, {1} WHERE {0}.id = {1}.id"'''.format(locus_table, net_pi_table)
    # The query text already carries its own double quotes, so it is spliced
    # into the R call as a string literal.
    frame = robjects.r('''dbGetQuery(con, {})'''.format(qry))
    gg_frame = ggplot2.ggplot(frame)
    plot = gg_frame + ggplot2.aes_string(x='time', y='pi') + \
        ggplot2.geom_point(ggplot2.aes_string(colour='locus'),
                           size=3, alpha=0.4) + \
        ggplot2.scale_x_reverse('years ago') + \
        ggplot2.scale_y_continuous('phylogenetic informativeness')
    return plot
def plot(data, x, y, ylabel, color, filename):
    """Save a line chart of column ``y`` against column ``x`` to ``filename``.

    Args:
        data: data frame passed to ``ggplot2.ggplot``.
        x, y: column names for the aesthetics.
        ylabel: label for the y axis.
        color: colour of the line.
        filename: output path handed to ``ggsave``.
    """
    canvas = ggplot2.ggplot(data=data)
    line_layer = ggplot2.geom_line(ggplot2.aes_string(x=x, y=y), color=color)
    text_theme = ggplot2.theme(**{
        'axis.text.x': ggplot2.element_text(angle=90, hjust=1),
        'strip.text.y': ggplot2.element_text(size=6, angle=90),
    })
    figure = canvas + line_layer + text_theme + ggplot2.scale_y_continuous(ylabel)
    ggplot2.ggplot2.ggsave(filename, figure)
def barPlot(self, dataframe, filename, x_parm, y_parm):
    """Render a bar chart of ``y_parm`` against ``x_parm`` into a 512x512 PNG.

    Bars use ``stat="identity"``, i.e. the ``y_parm`` values are drawn as-is.
    """
    grdevices.png(file=filename, width=512, height=512)
    chart = (
        ggplot2.ggplot(dataframe)
        + ggplot2.aes_string(x=x_parm, y=y_parm)
        + ggplot2.geom_bar(stat="identity")
    )
    chart.plot()
    grdevices.dev_off()
def boxPlot(self, dataframe, filename, x_parm, y_parm):
    """Render a box plot of ``y_parm`` grouped by ``x_parm`` into a 512x512 PNG."""
    grdevices.png(file=filename, width=512, height=512)
    chart = (
        ggplot2.ggplot(dataframe)
        + ggplot2.aes_string(x=x_parm, y=y_parm)
        + ggplot2.geom_boxplot(alpha=0.7, fill="aquamarine3")
    )
    chart.plot()
    grdevices.dev_off()
def barPlot(self, dataframe, filename, x_parm, y_parm):
    """Write a 512x512 PNG bar chart (``stat="identity"``) to ``filename``."""
    grdevices.png(file=filename, width=512, height=512)
    layers = [
        ggplot2.ggplot(dataframe),
        ggplot2.aes_string(x=x_parm, y=y_parm),
        ggplot2.geom_bar(stat="identity"),
    ]
    # Compose the plot left to right, exactly like base + aes + geom.
    figure = layers[0]
    for layer in layers[1:]:
        figure = figure + layer
    figure.plot()
    grdevices.dev_off()
def plot_ROC(self, path):
    """Plot ``tpr`` against ``fpr`` from ``self.df`` into a PDF at ``path``.

    NOTE(review): the PDF device opened here is not closed in this method --
    presumably the caller invokes ``dev.off()``; verify, otherwise the file
    may not be finalized.
    """
    open_pdf = robjects.r["pdf"]
    open_pdf(path, width=14, height=8)
    r_frame = convert_to_r_dataframe(self.df, strings_as_factors=True)
    roc = (
        ggplot2.ggplot(r_frame)
        + ggplot2.aes_string(x="fpr", y="tpr")
        + ggplot2.geom_line(color="blue")
        + ggplot2.geom_point(size=2)
    )
    roc.plot()
def plot_start(x, y):
    """Scatter-plot columns ``x`` and ``y`` of a CSV file and save ``point.png``.

    The data comes from the first path returned by ``glob('*.csv')``.  The
    title and axis labels are fixed to "mtcars", 'wt' and 'mpg' regardless
    of ``x`` and ``y``.
    """
    # Imported here on purpose: this import emits warnings, so keeping it
    # local means they only show up when this function is actually called.
    import rpy2.robjects.lib.ggplot2 as ggplot2

    utils = importr('utils')
    csv_path = glob('*.csv')[0]
    data = utils.read_csv(csv_path)

    figure = ggplot2.ggplot(data)
    figure = figure + ggplot2.aes_string(x=x, y=y)
    figure = figure + ggplot2.geom_point()
    figure = figure + ggplot2.scale_colour_gradient(low="yellow", high="red")
    figure = figure + ggplot2.labs(title="mtcars", x='wt', y='mpg')
    figure.save('point.png')
def histogram(self, dataframe, filename, parm, group, units):
    """Write a 512x512 PNG histogram of ``parm``, filled by ``group``.

    The x axis is labelled ``"<parm> <units>"``.  Everything runs inside
    ``suppress_stdout()``.
    """
    with suppress_stdout():
        grdevices.png(file=filename, width=512, height=512)
        figure = (
            ggplot2.ggplot(dataframe)
            + ggplot2.aes_string(x=parm, fill=group)
            + ggplot2.geom_histogram(colour="black")
            + ggplot2.labs(x=parm + " " + units)
        )
        figure.plot()
        grdevices.dev_off()
def compare_mean_boxplot(locus_table, interval_table, intervals, loci, names, rows):
    """Box plot of mean phylogenetic informativeness per interval, filled by db.

    The plot is built from the R global ``data`` -- presumably populated by
    ``get_r_data_by_top``; verify.  With more than one interval, the
    ``interval`` factor is re-levelled using ``order_intervals``.  ``loci``
    is accepted but not used here.

    Returns:
        The ggplot object.
    """
    frame = get_r_data_by_top(locus_table, interval_table, intervals, names, rows)
    if len(intervals) > 1:
        levels = order_intervals(frame[1])
        robjects.r('''data$interval <- factor(data$interval, {})'''.format(levels))

    # Keys contain dots, so they must be passed through a dict.
    box_options = {
        'outlier.size': 3,
        'outlier.colour': '#767676',
        'outlier.alpha': 0.3,
        'alpha': 0.6,
    }
    gg_frame = ggplot2.ggplot(robjects.r('''data'''))
    boxes = ggplot2.geom_boxplot(ggplot2.aes_string(fill='factor(db)'), **box_options)
    return (
        gg_frame
        + ggplot2.aes_string(x='interval', y='pi')
        + boxes
        + ggplot2.scale_y_continuous('mean phylogenetic informativeness')
        + ggplot2.scale_x_discrete('interval (years ago)')
        + ggplot2.scale_fill_brewer("database", palette='Blues')
    )
def histogram(self, dataframe, filename, parm, group, units):
    """Draw a histogram of ``parm`` (fill = ``group``) into a 512x512 PNG file.

    The whole rendering happens under ``suppress_stdout()``; the x axis
    label is ``parm`` and ``units`` joined by a space.
    """
    x_label = parm + " " + units
    with suppress_stdout():
        grdevices.png(file=filename, width=512, height=512)
        base = ggplot2.ggplot(dataframe)
        mapping = ggplot2.aes_string(x=parm, fill=group)
        bars = ggplot2.geom_histogram(colour="black")
        figure = base + mapping + bars + ggplot2.labs(x=x_label)
        figure.plot()
        grdevices.dev_off()
def generate_step3_9_n_count_histogram(place_type_pos_type_to_count, file_name):
    """Write a dodged bar chart of counts per position type, filled by place type.

    Args:
        place_type_pos_type_to_count: mapping from ``"<place>_<pos>"`` keys
            (exactly one underscore) to integer counts.
        file_name: PNG file name, created under ``OUTPUT_PATH``.

    The y axis runs from 0 to the largest count rounded down to a multiple
    of 1000 (or of 100 when the largest count is at most 1000) plus one
    such step, leaving head-room for the count labels.
    """
    places, positions, counts = [], [], []
    for place_pos_type, n_count in place_type_pos_type_to_count.items():
        place_type, pos_type = place_pos_type.split('_')
        places.append(place_type)
        positions.append(pos_type)
        counts.append(n_count)
    # The leading 0 keeps the result at 0 for empty input or all-negative counts.
    max_count = max([0] + counts)

    df = ro.DataFrame({
        'place': ro.StrVector(places),
        'pos': ro.StrVector(positions),
        'count': ro.IntVector(counts),
    })

    # Floor division is required: with true division (Python 3) the
    # expression "x / step * step" is just x, so nothing is rounded down.
    step = 1000 if max_count > 1000 else 100
    max_count = int(max_count // step * step + step)

    histogram_file_path = os.path.join(OUTPUT_PATH, file_name)
    logging.debug(
        str.format("The Data Frame for file {}: \n{}", histogram_file_path, df))

    grdevices.png(file=histogram_file_path, width=1024, height=512)
    gp = ggplot2.ggplot(df)
    pp = gp + \
        ggplot2.aes_string(x='pos', y='count', fill='place') + \
        ggplot2.geom_bar(position="dodge", stat="identity") + \
        ggplot2.theme_bw() + \
        ggplot2.theme_classic() + \
        ggplot2.theme(**{'axis.text.x': ggplot2.element_text(size=35)}) + \
        ggplot2.theme(**{'axis.text.y': ggplot2.element_text(size=35)}) + \
        ggplot2.scale_y_continuous(expand=ro.IntVector([0, 0]),
                                   limits=ro.IntVector([0, max_count])) + \
        ggplot2.geom_text(ggplot2.aes_string(label='count'),
                          position=ggplot2.position_dodge(width=0.8),
                          size=10, angle=35, hjust=-0.2, vjust=-0.5)
    pp.plot()
    logging.info(str.format("Output step3 file {}", histogram_file_path))
    grdevices.dev_off()
def plot_ROC(self, path):
    """Print ``self.df`` and plot its ``tpr`` vs. ``fpr`` into a PDF at ``path``.

    NOTE(review): the PDF device opened here is not closed in this method --
    presumably the caller invokes ``dev.off()``; verify, otherwise the file
    may not be finalized.
    """
    robjects.r['pdf'](path, width=14, height=8)
    frame = self.df
    print(frame)
    layers = (
        ggplot2.aes_string(x='fpr', y='tpr'),
        ggplot2.geom_line(color='blue'),
        ggplot2.geom_point(size=2),
    )
    roc = ggplot2.ggplot(convert_to_r_dataframe(frame, strings_as_factors=True))
    for layer in layers:
        roc = roc + layer
    roc.plot()
def plot_hist(sizes, args):
    """Use rpy2 to plot a histogram of the read sizes.

    Args:
        sizes: iterable of integer read sizes.
        args: options object providing ``min_length``, ``max_length``
            (both exclusive bounds), ``num_bins``, ``theme_bw`` and
            ``saveas`` (output path ending in ``.pdf`` or ``.png``, or
            ``None`` to show the plot and wait for Enter).

    Raises:
        ValueError: if no size lies strictly between the two bounds.
        SystemExit: with status 1 if ``saveas`` has an unsupported extension.
    """
    r = robjects.r
    r.library("ggplot2")
    grdevices = importr("grDevices")

    kept = [s for s in sizes if args.min_length < s < args.max_length]
    if not kept:
        # min()/max() would fail anyway; say why instead.
        raise ValueError("No read sizes strictly between %s and %s"
                         % (args.min_length, args.max_length))

    # float() avoids Python 2 integer division, which truncates the bin
    # width (down to 0 whenever the size range is smaller than num_bins).
    binwidth = float(max(kept) - min(kept)) / args.num_bins

    df = robjects.DataFrame({"sizes": robjects.IntVector(kept)})

    # plot
    pp = ggplot2.ggplot(df) + ggplot2.aes_string(x="sizes") + \
        ggplot2.geom_histogram(binwidth=binwidth)
    if args.theme_bw:
        pp = pp + ggplot2.theme_bw()

    if args.saveas is not None:
        plot_file = args.saveas
        if plot_file.endswith(".pdf"):
            grdevices.pdf(plot_file, width=8.5, height=8.5)
        elif plot_file.endswith(".png"):
            grdevices.png(plot_file, width=8.5, height=8.5, units="in", res=300)
        else:
            logger.error("Unrecognized extension for %s!" % (plot_file))
            # Non-zero status so calling scripts can detect the failure.
            sys.exit(1)
        try:
            pp.plot()
        finally:
            # Always release the device, even if rendering fails.
            grdevices.dev_off()
    else:
        pp.plot()
        # keep the plot open until user hits enter
        print("Type enter to exit.")
        try:
            wait_for_enter = raw_input  # Python 2
        except NameError:
            wait_for_enter = input  # Python 3
        wait_for_enter()
def rank_abundance_plot(counter, name):
    """Plot hit fraction against rank on a log10 y axis.

    The PNG is written to ``analytics_out/<name>_rank_abundance.png``; the
    data comes from ``rank_abundance_data(counter)``.
    """
    out_path = 'analytics_out/{0}_rank_abundance.png'.format(name)
    grdevices.png(out_path)
    ranks, fracs = rank_abundance_data(counter)
    table = robjects.DataFrame({'rank': ranks, 'f': fracs})
    figure = (
        ggplot.ggplot(table)
        + ggplot.aes_string(x='rank', y='f')
        + ggplot.geom_point()
        + ggplot.scale_y_log10(name='fraction of hits')
    )
    figure.plot()
    grdevices.dev_off()
def main():
    """Plot smoothed main vs. control TSS coverage from a raw counts file.

    The raw file has three whitespace-separated columns per line: position,
    main coverage and control coverage.  The plot is written to
    ``<out_prefix>_and.pdf``.
    """
    usage = 'usage: %prog [options] <raw file>'
    parser = OptionParser(usage)
    parser.add_option('-d', dest='downstream', default=2000, type='int',
                      help='TSS downstream [Default: %default]')
    parser.add_option('-o', dest='out_prefix', default='tss',
                      help='Output prefix [Default: %default]')
    parser.add_option('-u', dest='upstream', default=5000, type='int',
                      help='TSS upstream [Default: %default]')
    parser.add_option('--ymax', dest='ymax', default=None, type='float',
                      help='Y-coordinate limit [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide raw file')
    else:
        raw_file = args[0]

    # collect data
    # coords is parsed but not used below: the x positions are regenerated
    # from -u/-d instead.
    coords = []
    main_cov = []
    control_cov = []
    # 'with' closes the input file; the bare open() in a for loop never did.
    with open(raw_file) as raw_in:
        for line in raw_in:
            a = line.split()
            coords.append(int(a[0]))
            main_cov.append(float(a[1]))
            control_cov.append(float(a[2]))

    # data structures
    n_positions = options.upstream + options.downstream + 1
    tss_i = ro.IntVector(range(-options.upstream, options.downstream + 1))
    labels = ro.StrVector(['Main'] * n_positions + ['Control'] * n_positions)
    cov = ro.FloatVector(main_cov + control_cov)

    df = ro.DataFrame({'tss_i': tss_i, 'cov': cov, 'label': labels})

    # plot
    # (An earlier variant drew raw points with geom_point() and an x axis
    # titled 'TSS index' instead of the loess smooth used here.)
    gp = ggplot2.ggplot(df) + \
        ggplot2.aes_string(x='tss_i', y='cov', colour='label') + \
        ggplot2.geom_smooth(method='loess', size=1, span=0.2, se=False) + \
        ggplot2.scale_x_continuous('TSS Position') + \
        ggplot2.scale_colour_discrete('') + \
        ggplot2.theme_bw()

    if options.ymax is None:
        gp += ggplot2.scale_y_continuous('Coverage')
    else:
        gp += ggplot2.scale_y_continuous('Coverage', limits=ro.FloatVector([0, options.ymax]))

    # save to file; release the device even if rendering fails
    grdevices.pdf(file='%s_and.pdf' % options.out_prefix)
    try:
        gp.plot()
    finally:
        grdevices.dev_off()
def boxPlot(self, dataframe, filename, x_parm, y_parm):
    """Write a 512x512 PNG box plot of ``y_parm`` by ``x_parm`` to ``filename``."""
    grdevices.png(file=filename, width=512, height=512)
    base = ggplot2.ggplot(dataframe)
    mapping = ggplot2.aes_string(x=x_parm, y=y_parm)
    boxes = ggplot2.geom_boxplot(alpha=0.7, fill="aquamarine3")
    (base + mapping + boxes).plot()
    grdevices.dev_off()
def plot_similarity_matrix(self, item_type, image_file, title):
    '''Plot similarities of crawls (overlap of unique items) as heat map matrix.

    Reads ``self.similarity[item_type]`` (crawl -> crawl -> similarity),
    saves the figure as ``image_file`` under ``PLOTDIR`` and returns the
    plot object.
    '''
    data = defaultdict(dict)
    row_id = 1
    for crawl1, row in self.similarity[item_type].items():
        for crawl2, similarity in row.items():
            data['crawl1'][row_id] = MonthlyCrawl.short_name(crawl1)
            data['crawl2'][row_id] = MonthlyCrawl.short_name(crawl2)
            data['similarity'][row_id] = similarity
            data['sim_rounded'][row_id] = similarity  # to be rounded
            row_id += 1
    data = pandas.DataFrame(data)
    print(data)

    # select median of similarity values as midpoint of similarity scale
    midpoint = data['similarity'].median()

    # Wider value ranges get fewer decimals and a larger label font.
    spread = data['similarity'].max() - data['similarity'].min()
    if spread > .2:
        decimals, textsize, minshown = 2, 2.8, .005
    else:
        decimals, textsize, minshown = 3, 2, .0005

    label_format = '{0:.' + str(decimals) + 'f}'

    def _tile_label(value):
        # Values below the display threshold collapse to '0'; others lose
        # their leading zero(s), e.g. 0.25 -> '.25'.
        if value >= minshown:
            return label_format.format(value).lstrip('0')
        return '0'

    data['sim_rounded'] = data['sim_rounded'].apply(_tile_label)
    print('Median of similarities for', item_type, '=', midpoint)

    if len(self.similarity[item_type]) > self.MAX_MATRIX_SIZE:
        # Walk crawls in descending order and drop those past the size limit.
        descending = sorted(self.similarity[item_type], reverse=True)
        for position, crawl in enumerate(descending):
            short_name = MonthlyCrawl.short_name(crawl)
            if position > self.MAX_MATRIX_SIZE:
                data = data[data['crawl1'] != short_name]
                data = data[data['crawl2'] != short_name]

    p = ggplot2.ggplot(data)
    p = p + ggplot2.aes_string(x='crawl2', y='crawl1',
                               fill='similarity', label='sim_rounded')
    p = p + ggplot2.geom_tile(color="white")
    p = p + ggplot2.scale_fill_gradient2(low="red", high="blue", mid="white",
                                         midpoint=midpoint, space="Lab")
    p = p + GGPLOT2_THEME
    p = p + ggplot2.coord_fixed()
    p = p + ggplot2.theme(**{'axis.text.x':
                             ggplot2.element_text(angle=45, vjust=1, hjust=1)})
    p = p + ggplot2.labs(title=title, x='', y='')
    p = p + ggplot2.geom_text(color='black', size=textsize)

    img_path = os.path.join(PLOTDIR, image_file)
    p.save(img_path)
    return p
def plot_all_errors(self, path):
    """Plot every column of ``self.df`` against ``iteration`` into a PDF.

    ``self.df`` is melted with ``iteration`` as the id column, so each of the
    remaining columns becomes one coloured series of points and lines.

    NOTE(review): the PDF device opened here is not closed in this method --
    presumably the caller invokes ``dev.off()``; verify.
    """
    robjects.r["pdf"](path, width=14, height=8)
    long_form = pandas.melt(self.df, id_vars="iteration")
    x_col = "iteration"
    figure = (
        ggplot2.ggplot(convert_to_r_dataframe(long_form, strings_as_factors=True))
        + ggplot2.aes_string(x=x_col, y="value", color="variable")
        + ggplot2.geom_point(size=2)
        + ggplot2.geom_line()
    )
    figure.plot()
def plot_coef(feat_mat_dir, model_dir, expt_names, pref, outfile=None, height=120, fsize=12):
    """Box-plot classification and regression coefficients across experiments.

    For every experiment the model (``<pref><expt>_model.pkl``) and feature
    matrix (``<expt>_feat_mat.npz``) are loaded; the coefficient arrays are
    concatenated along axis 1 and plotted per feature, one facet per
    coefficient type.

    Args:
        feat_mat_dir: directory holding the feature-matrix files.
        model_dir: directory holding the pickled models.
        expt_names: experiment names; all must share the same feature names.
        pref: prefix of the model file names.
        outfile: if given, the plot is saved there with ``ggsave``; otherwise
            it is drawn on the current device.
        height: plot height in mm when saving.
        fsize: font size for the axis text.

    Returns:
        The wide pandas DataFrame with columns ``feature``,
        ``Classification`` and ``Regression``.
    """
    for expt_idx, ex in enumerate(expt_names):
        model = read_model(os.path.join(model_dir, pref + ex + '_model.pkl'))
        feat_mat = read_feat_mat(os.path.join(feat_mat_dir, ex + '_feat_mat.npz'))
        # Only the feature names (second-to-last field) are needed here.
        (_unused_feat, _unused_y, tmp_feat_names, _unused_genes) = feat_mat

        if expt_idx == 0:
            feat_names = tmp_feat_names
            clf_coef = model.clf_coef()
            reg_coef = model.reg_coef()
            continue

        assert all(known == new for known, new in zip(feat_names, tmp_feat_names))
        clf_coef = np.concatenate((clf_coef, model.clf_coef()), axis=1)
        reg_coef = np.concatenate((reg_coef, model.reg_coef()), axis=1)

    nexpt = expt_idx + 1

    # Now clf_coef has one row per coefficient and one column per experiment.
    # The reshape below will read the data row-first.
    wide = {
        'feature': np.repeat(feat_names, nexpt),
        'Classification': np.reshape(clf_coef, (clf_coef.size, )),
        'Regression': np.reshape(reg_coef, (reg_coef.size, )),
    }
    df = pd.DataFrame(wide)

    long_df = pd.melt(df, id_vars='feature', var_name='fun')
    r_df = com.convert_to_r_dataframe(long_df)

    text_theme = {
        'axis.text.x': ggplot2.element_text(size=fsize, angle=65, vjust=1, hjust=1),
        'axis.text.y': ggplot2.element_text(size=fsize),
        'strip.text.x': ggplot2.element_text(size=fsize + 1),
    }
    gp = (
        ggplot2.ggplot(r_df)
        + ggplot2.aes_string(x='factor(feature)', y='value')
        + ggplot2.facet_wrap('fun', scales='free_y')
        + ggplot2.geom_boxplot()
        + ggplot2.scale_y_continuous('Importance')
        + ggplot2.scale_x_discrete('')
        + ggplot2.theme_bw()
        + ggplot2.theme(**text_theme)
    )

    # Width grows with the number of experiments but never drops below 80 mm.
    w = max(22 * nexpt, 80)
    if outfile is not None:
        ro.r.ggsave(filename=outfile, plot=gp, width=w, height=height, unit='mm')
    else:
        gp.plot()
    return df
def plotStats(data, outFolder, tiles, prop="qual", prefix="", high="yellow", low="blue", pdf=False, detail=True):
    """Save an overview plot of ``prop`` over all tiles, plus per-tile plots.

    Args:
        data: R data frame with ``x``, ``y``, ``tile`` and ``prop`` columns.
        outFolder: directory for the overview PNG; detail plots go to its
            ``detailPlots`` subfolder (not created here).
        tiles: tile numbers to draw detail plots for.
        prop: column mapped to colour.
        prefix: optional file-name prefix.
        high, low: ends of the colour gradient.
        pdf: additionally save each detail plot as PDF.
        detail: whether to draw the per-tile plots at all.
    """
    def _scatter(frame, point_size, heading):
        # Shared layout: points coloured by prop, one facet per tile.
        return (
            ggplot.ggplot(frame)
            + ggplot.aes_string(x="x", y="y", col=prop)
            + ggplot.geom_point(size=point_size)
            + ggplot.facet_wrap(robjects.Formula("~ tile"))
            + ggplot.scale_colour_gradient(high=high, low=low)
            + ggplot.ggtitle(heading)
        )

    # overview plot
    overview = _scatter(data, 0.1, "Overview %s" % (prop))
    fileName = ("%s_overview_%s.png" % (prefix, prop) if prefix
                else "overview_%s.png" % (prop))
    overview.save(os.path.join(outFolder, fileName), scale=2)

    if not detail:
        return

    # detail plots
    detailFolder = os.path.join(outFolder, "detailPlots")
    for t in tiles:
        tile_rows = data.rx(data.rx2("tile").ro == t, True)
        tile_plot = _scatter(tile_rows, 1, "%i %s" % (t, prop))
        fileName = ("%s_%i_%s.png" % (prefix, t, prop) if prefix
                    else "%i_%s.png" % (t, prop))
        tile_plot.save(os.path.join(detailFolder, fileName), scale=2)
        if pdf:
            fileName = "%s%i_%s.pdf" % (prefix, t, prop)
            tile_plot.save(os.path.join(detailFolder, fileName), scale=2)
def plot_cv_r2(pandas_df, outfile, fsize=10, height=120, max_width=50, xlab='Parameters'):
    """Makes boxplots of cross-validation results for different parameter settings.

    One box of ``r2`` is drawn per distinct ``title`` value and the figure
    is saved to ``outfile``.  The width is 5 mm per distinct title, but at
    least ``max_width`` mm.
    """
    distinct_titles = set(pandas_df['title'])
    ncv = len(distinct_titles)

    r_df = com.convert_to_r_dataframe(pandas_df)
    axis_text = {
        'axis.text.x': ggplot2.element_text(size=fsize, angle=65, vjust=1, hjust=1),
        'axis.text.y': ggplot2.element_text(size=fsize),
    }
    gp = (
        ggplot2.ggplot(r_df)
        + ggplot2.aes_string(x='factor(title)', y='r2')
        + ggplot2.geom_boxplot()
        + ggplot2.scale_y_continuous('R-squared')
        + ggplot2.scale_x_discrete(xlab)
        + ggplot2.theme_bw()
        + ggplot2.theme(**axis_text)
    )
    w = max(5 * ncv, max_width)
    ro.r.ggsave(filename=outfile, plot=gp, width=w, height=height, unit='mm')
def runBoruta():
    """Run the R ``findRelevant`` routine and box-plot the selected mass z-scores.

    Loads ``Rcode/zscores.RData``, sources the Boruta relevance script from
    its hard-coded path, calls ``findRelevant`` on the mass and non-mass
    feature tables from the R global environment and plots the
    ``masszscore_selected`` element of its result.
    """
    base.load("Rcode/zscores.RData")
    base.source('Z:/Cristina/MassNonmass/codeProject/codeBase/trainClassifier/Rcode/borutaRelevance.R')

    find_relevant = globalenv['findRelevant']
    outputBoruta = find_relevant(globalenv['massallfeatures'],
                                 globalenv['nonmassallfeatures'])

    # generate boxplot comparison of relevant mass features vs. the same non-mass feature
    selected = outputBoruta.rx2("masszscore_selected")
    plotgp = (
        ggplot2.ggplot(selected)
        + ggplot2.aes_string(x='MorN', y='zscores', fill='factor(MorN)')
        + ggplot2.geom_boxplot()
        + ggplot2.opts(title="Comparison of Z-scores for Mass confirmed features",
                       y="Z-scores")
    )
    plotgp.plot()
    return
def single_locus_net_informativeness(locus_table, net_pi_table, locus):
    """Scatter plot of net phylogenetic informativeness over time for one locus.

    Args:
        locus_table: name of the table providing ``locus`` and ``id``.
        net_pi_table: name of the table joined on ``id``.
        locus: locus name to filter on; also used as the plot title.

    Returns:
        The ggplot object (reversed x axis labelled 'years ago').
    """
    sql_template = '''"SELECT {0}.locus, time, pi FROM {0}, {1} WHERE {0}.id = {1}.id AND locus = '{2}'"'''
    qry = sql_template.format(locus_table, net_pi_table, locus)
    # qry already carries its own double quotes, so it becomes an R string literal.
    frame = robjects.r('''dbGetQuery(con, {})'''.format(qry))
    return (
        ggplot2.ggplot(frame)
        + ggplot2.aes_string(x='time', y='pi')
        + ggplot2.geom_point(size=3, alpha=0.4)
        + ggplot2.scale_x_reverse('years ago')
        + ggplot2.scale_y_continuous('phylogenetic informativeness')
        + ggplot2.opts(title=locus)
    )
def compute(x0, y0, x1, y1):
    """Fit two Gaussians to the intensity profile of an image rectangle.

    The rectangle ``img[y0:y1, x0:x1]`` is inverted (255 - value), averaged
    over axis 2 and then over axis 1, giving one value per row.  That
    profile is fitted in R with ``nls`` as an offset plus two Gaussians; the
    data, the full fit and both single Gaussians are plotted, and the share
    of the first Gaussian (c1*sg1 relative to c1*sg1 + c2*sg2) is shown in
    the figure title.

    Args:
        x0, y0: first corner of the rectangle (slice start indices).
        x1, y1: second corner of the rectangle (slice stop indices).
    """
    # Selected square
    # NOTE(review): axis 2 is presumably the colour-channel axis of img -- confirm.
    sel = 255 - average(img[y0:y1, x0:x1], 2)
    # Average across x's
    ysel = average(sel, 1)
    line = ysel
    xs = mgrid[0:line.shape[0]]

    # Pass the data to R
    rxs = robjects.FloatVector(xs)
    rys = robjects.FloatVector(line)
    rdf = robjects.DataFrame({'x': rxs, 'y': rys})
    robjects.globalenv['xs'] = rxs
    robjects.globalenv['df'] = rdf
    #print(rys.r_repr())

    # Fit an R model
    robjects.r('''fit <- nls(y ~ (off
                  + c1 * exp(-(x-mu1)**2/(2*sg1**2))
                  + c2 * exp(-(x-mu2)**2/(2*sg2**2))),
          data=df,
          start=list(off = 90,
                     c1=120, mu1=30, sg1=10,
                     c2=120, mu2=60, sg2=10),
          algorithm='port')''')

    # Get fit results
    # Each R statement must sit on its own line: without separators the
    # snippet does not parse, and the '##' comment would swallow everything
    # written after it on the same line.
    robjects.r('''k <- coef(fit)
fitdat <- data.frame(x=xs)
fitdat$y <- predict(fit, newdata=fitdat)
## Independent Gaussians
fitg1 <- data.frame(x=xs)
fitg1$y <- k[['off']] + k[['c1']] * exp(-(xs-k[['mu1']])**2/(2*k[['sg1']]**2))
fitg2 <- data.frame(x=xs)
fitg2$y <- k[['off']] + k[['c2']] * exp(-(xs-k[['mu2']])**2/(2*k[['sg2']]**2))''')

    # Plot R fits
    fitdat = robjects.globalenv['fitdat']
    fitg1 = robjects.globalenv['fitg1']
    fitg2 = robjects.globalenv['fitg2']

    pp = ggplot2.ggplot(rdf) \
        + ggplot2.aes_string(x='x', y='y') \
        + ggplot2.geom_point() \
        + ggplot2.geom_smooth(data=fitdat, stat="identity", size=1.5) \
        + ggplot2.geom_smooth(data=fitg1, stat="identity") \
        + ggplot2.geom_smooth(data=fitg2, stat="identity")
    pp.plot()

    # Compute the ratio of the gaussian integrals
    ratio = robjects.r('''k[['c1']]*k[['sg1']]/(k[['c1']]*k[['sg1']]+k[['c2']]*k[['sg2']])''')
    title('ratio='+str(ratio))
    show()
def bargraph_variation_diff():
    """Write one bar chart per variant pair comparing expert and standard times.

    For ('seq', 'expertseq') and ('par', 'expertpar') a PDF named
    ``bargraph-codingtime-diff-<standard>.pdf`` is produced with one bar per
    language and problem.  The plotted value is
    ``(time_expert + time) / time - 1``; it is 0 when either timing is
    missing from ``result``.
    """
    r = robjects.r
    variant_pairs = [('seq', 'expertseq'), ('par', 'expertpar')]
    for (standard, expert) in variant_pairs:
        langs = []
        probs = []
        diffs = []
        for lang in languages:
            for prob in problems:
                try:
                    time = result[lang][prob][standard]
                    time_expert = result[lang][prob][expert]
                except KeyError:
                    # Either timing is missing: plot a zero-height bar.
                    diff = 0
                else:
                    diff = (float(time_expert + time) / float(time) - 1)

                langs.append(pretty_langs[lang])
                probs.append(prob)
                diffs.append(diff)

        pdf_name = 'bargraph-codingtime-diff-' + standard + '.pdf'
        r.pdf(pdf_name, height=pdf_height(), width=pdf_width())
        columns = {
            'Language': StrVector(langs),
            'Problem': StrVector(probs),
            'Difference': FloatVector(diffs),
        }
        df = robjects.DataFrame(columns)

        #print (df)
        layers = [
            ggplot2.aes_string(x='Problem', y='Difference', fill='Language'),
            ggplot2.geom_bar(position='dodge', stat='identity'),
            ggplot2_options(),
            ggplot2_colors(),
            robjects.r('ylab("Coding time difference (in percent)")'),
            robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))'),
            robjects.r('scale_y_continuous(labels = percent_format())'),
        ]
        pp = ggplot2.ggplot(df)
        for layer in layers:
            pp = pp + layer
        pp.plot()
        r['dev.off']()