def _plt_distr(dat, col, title='', splitBy_pfill=True, pfill='label', independentpdf=False, fname='xdistr.pdf'): df = dat[dat[pfill] != 'NA'] ## remove invalid pairs n = len(df) df = { col: robjects.FloatVector(list(df[col])), pfill: robjects.StrVector(list(df[pfill])) } df = robjects.DataFrame(df) pp = ggplot2.ggplot(df) + \ ggplot2.ggtitle('%s [Total = %s]' % (title, n)) ## Plot1: counts if splitBy_pfill: p1 = pp + ggplot2.aes_string(x=col, fill=pfill) else: p1 = pp + ggplot2.aes_string(x=col) ## Plot2: density if splitBy_pfill: p2 = pp + ggplot2.aes_string(x=col, fill=pfill, y='..density..') else: p2 = pp + ggplot2.aes_string(x=col, y='..density..') p2 = p2 + ggplot2.geom_density(alpha=.5, origin=-500) if col == 'distance': p1 = p1 + \ ggplot2.geom_histogram(binwidth=1000, alpha=.5, position='identity', origin=-500) + \ ggplot2.xlim(-1000, 51000) p2 = p2 + \ ggplot2.geom_histogram(binwidth=1000, alpha=.33, position='identity', origin=-500) + \ ggplot2.xlim(-1000, 51000) else: p1 = p1 + \ ggplot2.geom_histogram(alpha=.5, position='identity') p2 = p2 + \ ggplot2.geom_histogram(alpha=.33, position='identity') if col == 'correlation': p1 = p1 + ggplot2.xlim(-1.1, 1.1) p2 = p2 + ggplot2.xlim(-1.1, 1.1) if independentpdf: grdevices = importr('grDevices') grdevices.pdf(file=fname) p1.plot() p2.plot() grdevices.dev_off() else: p1.plot() p2.plot() return
## Benchmark plotting + model fit.
## Assumes `d` (dict of result vectors), `DataFrame`, `Formula` and `n_loops`
## are defined earlier in the file -- TODO confirm against the surrounding code.
dataf = DataFrame(d)

from rpy2.robjects.lib import ggplot2

## Running-time curves: one line/point series per code variant,
## facetted by sequence type.
p = ggplot2.ggplot(dataf) + \
    ggplot2.geom_line(ggplot2.aes_string(x="n_loop", y="time", colour="code")) + \
    ggplot2.geom_point(ggplot2.aes_string(x="n_loop", y="time", colour="code")) + \
    ggplot2.facet_wrap(Formula('~sequence')) + \
    ggplot2.scale_y_continuous('running time') + \
    ggplot2.scale_x_continuous('repeated n times', ) + \
    ggplot2.xlim(0, max(n_loops)) + \
    ggplot2.labs(title = "Benchmark (running time)")

## Render the figure to a PNG for the docs.
from rpy2.robjects.packages import importr
grdevices = importr('grDevices')
grdevices.png('../../_static/benchmark_sum.png',
              width = 712, height = 512)
p.plot()
grdevices.dev_off()

#base = importr("base")
## Fit one linear model of time ~ n_loop per group with nlme's lmList,
## excluding missing values.
stats = importr('stats')
nlme = importr("nlme")
fit = nlme.lmList(Formula('time ~ n_loop | group'), data = dataf,
                  na_action = stats.na_exclude)
def plot_collectors_curve(args, start_times, read_lengths):
    """
    Use rpy2 to create a collectors curve of the run
    """
    rr = robjects.r
    rr.library("ggplot2")
    grdevices = importr('grDevices')

    # Shift every timestamp so the first read sits at t = 0, expressed in
    # hours; the tiny epsilon keeps the first point strictly positive.
    first_time = start_times[0]
    rel_hours = robjects.FloatVector(
        [float(t - first_time) / float(3600) + 0.00000001 for t in start_times])
    lengths_vec = robjects.IntVector(read_lengths)

    # Cumulative y-series: running read count, or running base-pair total.
    if args.plot_type == 'reads':
        y_label = "Total reads"
        cumulative = rr.cumsum(robjects.IntVector([1] * len(start_times)))
    elif args.plot_type == 'basepairs':
        y_label = "Total base pairs"
        cumulative = rr.cumsum(lengths_vec)

    # Assemble the R data.frame backing the plot.
    frame = robjects.DataFrame({'start': rel_hours,
                                'lengths': lengths_vec,
                                'cumul': cumulative})

    # Optionally dump the underlying table as TSV.
    if args.savedf:
        robjects.r("write.table")(frame, file=args.savedf, sep="\t")

    # Title summarising the run.
    n_reads = len(read_lengths)
    n_bases = sum(read_lengths)
    plot_title = "Yield: " \
        + str(n_reads) + " reads and " \
        + str(n_bases) + " base pairs."

    # Step plot of cumulative yield over time.
    curve = ggplot2.ggplot(frame) \
        + ggplot2.aes_string(x='start', y='cumul') \
        + ggplot2.geom_step(size=2) \
        + ggplot2.scale_x_continuous('Time (hours)') \
        + ggplot2.scale_y_continuous(y_label) \
        + ggplot2.ggtitle(plot_title)

    # Extrapolate by fitting y = a * (seconds^b) and widening the x axis.
    if args.extrapolate:
        nls_start = robjects.ListVector({'a': 1, 'b': 1})
        curve = curve \
            + ggplot2.stat_smooth(fullrange='TRUE', method='nls',
                                  formula='y~a*I((x*3600)^b)',
                                  se='FALSE', start=nls_start) \
            + ggplot2.xlim(0, float(args.extrapolate))

    if args.theme_bw:
        curve = curve + ggplot2.theme_bw()

    if args.saveas is None:
        # Interactive mode: draw on screen and wait for the user.
        curve.plot()
        # keep the plot open until user hits enter
        print('Type enter to exit.')
        raw_input()
    else:
        # File mode: pick the device from the extension, then render.
        out_path = args.saveas
        if out_path.endswith(".pdf"):
            grdevices.pdf(out_path, width = 8.5, height = 8.5)
        elif out_path.endswith(".png"):
            grdevices.png(out_path, width = 8.5, height = 8.5,
                          units = "in", res = 300)
        else:
            logger.error("Unrecognized extension for %s!" % (out_path))
            sys.exit()
        curve.plot()
        grdevices.dev_off()
## Benchmark plotting + per-group model fit.
## `group` combines code variant and sequence type so lmList below can fit
## one line per (code, sequence) pair.
d['group'] = StrVector(
    [d['code'][x] + ':' + d['sequence'][x] for x in xrange(len(d['n_loop']))])
dataf = DataFrame(d)

from rpy2.robjects.lib import ggplot2

## Running-time curves: one line/point series per code variant,
## facetted by sequence type.
## FIX: ggplot2.opts() was deprecated and then removed from ggplot2;
## labs(title=...) is the supported way to set the title (and matches the
## other benchmark snippet in this file).
p = ggplot2.ggplot(dataf) + \
    ggplot2.geom_line(ggplot2.aes_string(x="n_loop", y="time", colour="code")) + \
    ggplot2.geom_point(ggplot2.aes_string(x="n_loop", y="time", colour="code")) + \
    ggplot2.facet_wrap(Formula('~sequence')) + \
    ggplot2.scale_y_continuous('running time') + \
    ggplot2.scale_x_continuous('repeated n times', ) + \
    ggplot2.xlim(0, max(n_loops)) + \
    ggplot2.labs(title = "Benchmark (running time)")

## Render the figure to a PNG for the docs.
from rpy2.robjects.packages import importr
grdevices = importr('grDevices')
grdevices.png('../../_static/benchmark_sum.png', width=712, height=512)
p.plot()
grdevices.dev_off()

#base = importr("base")
## Fit one linear model of time ~ n_loop per group, excluding missing values.
stats = importr('stats')
nlme = importr("nlme")
fit = nlme.lmList(Formula('time ~ n_loop | group'), data=dataf,
                  na_action=stats.na_exclude)
def _plt_percountr(dat, independentpdf=False, fname='xpercount.pdf'):
    """Plot count distributions for miRNA/TSS pairings via rpy2/ggplot2:
    miRNA-per-TSS (pt, pt_den) and TSS-per-miRNA (pm*, pm_den*), the latter
    both with and without splitting by label.

    dat            : table with 'tss', 'mirna' and 'label' columns; rows whose
                     label is the string 'NA' are dropped first
    independentpdf : if True, write all six plots into their own PDF `fname`;
                     otherwise draw on the currently open device
    fname          : output PDF path used only when independentpdf is True

    Returns None; plotting is a side effect.
    """
    def _filt_dat(dat, item, getlabel=True):
        # Count occurrences of each value of `item`; optionally attach the
        # first label seen for each value. Returns (frame, n_rows, max_count).
        df = pd.DataFrame(dat[item].value_counts())
        df.columns = ['count']
        if getlabel:
            df['label'] = [list(dat[dat[item] == i]['label'])[0] for i in df.index]
        n = len(df)
        mx = max(df['count'])
        return df, n, mx

    dat = dat[dat['label'] != 'NA']

    ## NUMBER OF MIRNA PER TSS
    df, n, mx = _filt_dat(dat, 'tss', False)
    df = {'count': robjects.IntVector(df['count'])}
    df = robjects.DataFrame(df)

    # Histogram of counts (unit-wide bins centred on integers).
    pt = ggplot2.ggplot(df) + \
        ggplot2.geom_histogram(binwidth=1, origin=-.5, alpha=.5, position="identity") + \
        ggplot2.xlim(-.5, mx+1) + \
        ggplot2.aes_string(x='count') + \
        ggplot2.ggtitle('TSS [Total = %s]' % n) + \
        ggplot2.labs(x='Number of miRNA per TSS (max = %s)' % mx)

    # Density overlaid on a scaled histogram of the same counts.
    pt_den = ggplot2.ggplot(df) + \
        ggplot2.aes_string(x='count', y='..density..') + \
        ggplot2.geom_density(binwidth=1, alpha=.5, origin=-.5) + \
        ggplot2.geom_histogram(binwidth=1, alpha=.33, position='identity', origin=-.5) + \
        ggplot2.ggtitle('TSS [Total = %s]' % n) + \
        ggplot2.labs(x='Number of miRNA per TSS (max = %s)' % mx)

    ## NUMBER OF TSS PER MIRNA
    df, n, mx = _filt_dat(dat, 'mirna')
    df = {'count': robjects.IntVector(df['count']),
          'label': robjects.StrVector(df['label'])}
    df = robjects.DataFrame(df)

    # Base plots without aesthetics; aes_string variants are derived below.
    _pm = ggplot2.ggplot(df) + \
        ggplot2.geom_histogram(binwidth=1, origin=-.5, alpha=.5, position="identity") + \
        ggplot2.xlim(-.5, mx+1) + \
        ggplot2.ggtitle('miRNA [Total = %s]' % n)
    _pm_den = ggplot2.ggplot(df) + \
        ggplot2.geom_density(binwidth=1, alpha=.5, origin=-.5) + \
        ggplot2.geom_histogram(binwidth=1, alpha=.33, position='identity', origin=-.5) + \
        ggplot2.ggtitle('miRNA [Total = %s]' % n)

    ## not split by label
    pm = _pm + ggplot2.aes_string(x='count')
    pm_den = _pm_den + ggplot2.aes_string(x='count', y='..density..')

    ## split by label
    pms = _pm + ggplot2.aes_string(x='count', fill='label')
    pm_dens = _pm_den + ggplot2.aes_string(x='count', fill='label', y='..density..')

    ## add xlabelling (need to be added after aes_string)
    _xlab = ggplot2.labs(x='Number of TSS per miRNA (max = %s)' % mx)
    pm += _xlab
    pm_den += _xlab
    pms += _xlab
    pm_dens += _xlab

    if independentpdf:
        # Write all six plots into one dedicated PDF device.
        grdevices = importr('grDevices')
        grdevices.pdf(fname)
        pt.plot()
        pt_den.plot()
        pm.plot()
        pm_den.plot()
        pms.plot()
        pm_dens.plot()
        grdevices.dev_off()
    else:
        pt.plot()
        pt_den.plot()
        pm.plot()
        pm_den.plot()
        pms.plot()
        pm_dens.plot()
    return
def plot_collectors_curve(args, start_times, read_lengths):
    """
    Use rpy2 to create a collectors curve of the run

    args         : parsed options; uses plot_type ('reads'|'basepairs'),
                   skip (subsampling step), savedf, extrapolate, theme_bw,
                   saveas
    start_times  : per-read start timestamps (seconds); first entry is t_0
    read_lengths : per-read lengths in base pairs, parallel to start_times
    """
    r = robjects.r
    r.library("ggplot2")
    grdevices = importr('grDevices')

    # set t_0 as the first measured time for the read.
    t_0 = start_times[0]

    # adjust times to be relative to t_0 (in hours; the epsilon keeps the
    # first point strictly positive)
    r_start_times = robjects.FloatVector([float(t - t_0) / float(3600) + 0.00000001 \
                                          for t in start_times])
    r_read_lengths = robjects.IntVector(read_lengths)

    # compute the cumulative based on reads or total base pairs
    if args.plot_type == 'reads':
        y_label = "Total reads"
        cumulative = \
            r.cumsum(robjects.IntVector([1] * len(start_times)))
    elif args.plot_type == 'basepairs':
        y_label = "Total base pairs"
        cumulative = r.cumsum(r_read_lengths)

    # Subsample every `step`-th point to keep the plotted frame small.
    step = args.skip

    # make a data frame of the lists
    d = {'start': robjects.FloatVector(
             [r_start_times[n] for n in xrange(0, len(r_start_times), step)]),
         'lengths': robjects.IntVector(
             [r_read_lengths[n] for n in xrange(0, len(r_read_lengths), step)]),
         'cumul': robjects.IntVector(
             [cumulative[n] for n in xrange(0, len(cumulative), step)])}
    df = robjects.DataFrame(d)

    # Optionally dump the (subsampled) table as TSV.
    if args.savedf:
        robjects.r("write.table")(df, file=args.savedf, sep="\t")

    # title (totals are computed from the full, unsubsampled data)
    total_reads = len(read_lengths)
    total_bp = sum(read_lengths)
    plot_title = "Yield: " \
        + str(total_reads) + " reads and " \
        + str(total_bp) + " base pairs."

    # plot: step curve of cumulative yield over time
    gp = ggplot2.ggplot(df)
    pp = gp + ggplot2.aes_string(x='start', y='cumul') \
        + ggplot2.geom_step(size=2) \
        + ggplot2.scale_x_continuous('Time (hours)') \
        + ggplot2.scale_y_continuous(y_label) \
        + ggplot2.ggtitle(plot_title)

    # extrapolation: fit y = a * (seconds^b) and widen the x axis
    if args.extrapolate:
        start = robjects.ListVector({'a': 1, 'b': 1})
        pp = pp + ggplot2.stat_smooth(fullrange='TRUE', method='nls',
                                      formula='y~a*I((x*3600)^b)',
                                      se='FALSE', start=start) \
            + ggplot2.xlim(0, float(args.extrapolate))

    if args.theme_bw:
        pp = pp + ggplot2.theme_bw()

    if args.saveas is not None:
        # File mode: pick the device from the extension, then render.
        plot_file = args.saveas
        if plot_file.endswith(".pdf"):
            grdevices.pdf(plot_file, width=8.5, height=8.5)
        elif plot_file.endswith(".png"):
            grdevices.png(plot_file, width=8.5, height=8.5,
                          units="in", res=300)
        else:
            logger.error("Unrecognized extension for %s!" % (plot_file))
            sys.exit()
        pp.plot()
        grdevices.dev_off()
    else:
        # Interactive mode: draw on screen and wait for the user.
        pp.plot()
        # keep the plot open until user hits enter
        print('Type enter to exit.')
        raw_input()
## Average speed in mph; assumes trip_distance is in miles and
## trip_time_in_secs in seconds -- TODO confirm against the data dictionary.
hour_speed['mph'] = 3600 * (hour_speed.trip_distance / hour_speed.trip_time_in_secs)
hour_speed  # notebook-style cell output

# Pickups by hour (aggregated across all days)
# Fix the colour scale across all 24 frames using log1p counts so the
# animation's shading is comparable hour to hour.
max_passenger_count = np.max(np.log1p(pickups_hr['passenger_count']))
min_passenger_count = np.log1p(1)

for i in range(0, 24):
    # One scatter map of pickup locations per hour-of-day.
    temp = pickups_hr[(pickups_hr.hour == i)].reset_index()
    temp.passenger_count = np.log1p(temp.passenger_count)
    # Convert the pandas frame to an R data.frame for ggplot2.
    temp_r = pandas2ri.py2ri(temp)
    p = ggplot2.ggplot(temp_r) + \
        ggplot2.aes_string(x='rounded_lon', y='rounded_lat', color='passenger_count') + \
        ggplot2.geom_point(size=0.5) + \
        ggplot2.scale_color_gradient(low='black', high='white',
                                     limits=np.array([min_passenger_count, max_passenger_count])) + \
        ggplot2.xlim(-74.2, -73.7) + ggplot2.ylim(40.56, 40.93) + \
        ggplot2.labs(x=' ', y=' ', title='NYC taxi pickups %02i:00' % i) + \
        ggplot2.guides(color=False)
    p.save('./plots/taxi_pickups%02i.png' % i, width=4, height=4.5)

# Create animated .gif of pickups by hour
import imageio
# Sorted so frames run 00:00 -> 23:00.
file_names = sorted((fn for fn in os.listdir('./plots') if fn.startswith('taxi_pickups')))
file_names = ['plots/' + s for s in file_names]
images = []
for filename in file_names:
    images.append(imageio.imread(filename))
imageio.mimsave('./plots/pickups_movie.gif', images, duration=0.4)

# total pickups by date, color