def interval(locus_table, interval_table, intervals, loci, boxplot = True):
    """Plot phylogenetic informativeness ('pi') per interval for a set of loci.

    The query built by ``get_interval_query`` is executed on the R side with
    ``dbGetQuery`` (through the R object ``con``, which must already exist in
    the R session) and stored in the R variable ``data``.

    boxplot -- when true (the default), draw one boxplot per interval with the
               individual loci jittered on top and coloured by locus;
               otherwise draw bars faceted by locus.

    Returns the assembled ggplot object; nothing is rendered here.
    """
    query = get_interval_query(intervals, loci, locus_table, interval_table)
    frame = robjects.r('''data <- dbGetQuery(con, {})'''.format(query))
    # `interval` is a factor, so its level order is set explicitly rather than
    # left to R; order_intervals() supplies the ordering (by the first integer
    # of each interval, per the original author's note).
    # frame[1] is the second column of the query result -- presumably the
    # interval column; confirm against get_interval_query.
    levels = order_intervals(frame[1])
    robjects.r('''data$interval <- factor(data$interval, {})'''.format(levels))
    canvas = ggplot2.ggplot(robjects.r('''data'''))

    if not boxplot:
        # Bar chart: one facet per locus, rotated x labels, no legend.
        chart = canvas + ggplot2.aes_string(x = 'interval', y = 'pi', fill = 'locus')
        chart = chart + ggplot2.geom_bar()
        chart = chart + ggplot2.facet_wrap(robjects.Formula('~ locus'))
        chart = chart + ggplot2.opts(**{
            'axis.text.x': ggplot2.theme_text(angle = -90, hjust = 0),
            'legend.position': 'none'
        })
        chart = chart + ggplot2.scale_y_continuous('phylogenetic informativeness')
        chart = chart + ggplot2.scale_x_discrete('interval (years ago)')
        return chart

    # Boxplot: outliers are given size 0 because every point is drawn by the
    # jitter layer anyway.
    chart = canvas + ggplot2.aes_string(x = 'interval', y = 'pi')
    chart = chart + ggplot2.geom_boxplot(**{
        'outlier.size': 0,
        'alpha': 0.3
    })
    chart = chart + ggplot2.geom_jitter(
        ggplot2.aes_string(color = 'locus'),
        size = 3,
        alpha = 0.6,
        position = ggplot2.position_jitter(width = 0.25)
    )
    chart = chart + ggplot2.scale_y_continuous('phylogenetic informativeness')
    chart = chart + ggplot2.scale_x_discrete('interval (years ago)')
    return chart
def compare_sum_barplot(locus_table, interval_table, intervals, loci, names, rows):
    """Bar chart of summed phylogenetic informativeness per interval and database.

    ``pi`` is summed over each (interval, db) pair with R's ``aggregate`` and
    the result, stored in the R variable ``agg_data``, is drawn as dodged bars
    filled by database.

    The ``loci`` argument is accepted but not used by this function.

    Returns the assembled ggplot object; nothing is rendered here.
    """
    # Called for its effect on the R session: the aggregate() call below reads
    # the R variable `data`, which this helper is expected to populate
    # (confirm against get_r_data_by_top). Its return value is not needed.
    get_r_data_by_top(locus_table, interval_table, intervals, names, rows)
    summed = robjects.r('''agg_data <- aggregate(pi ~ interval + db, data = data, sum)''')

    # With more than one interval, fix the factor level order explicitly;
    # summed[0] is the first column of the aggregate result.
    if len(intervals) > 1:
        levels = order_intervals(summed[0])
        robjects.r('''agg_data$interval <- factor(agg_data$interval,{})'''.format(levels))

    chart = ggplot2.ggplot(robjects.r('''agg_data'''))
    chart = chart + ggplot2.aes_string(x = 'interval', y = 'pi', fill = 'factor(db)')
    chart = chart + ggplot2.geom_bar(**{
        'position': 'dodge',
        'colour': '#767676',
        'alpha': 0.6
    })
    chart = chart + ggplot2.scale_y_continuous('net phylogenetic informativeness')
    chart = chart + ggplot2.scale_x_discrete('interval (years ago)')
    chart = chart + ggplot2.scale_fill_brewer("database", palette = "Blues")
    return chart
def main():
    """Compare two mutation profiles and save a heatmap of the result as a PDF.

    Command line: ``[options] <mut1 file> <mut2 file>``.

    Both files are read with ``parse_mutations``; ``compute_relative_profile``
    combines them into a table keyed by ``(nt1, nt2)`` pairs over
    ``'_', 'A', 'C', 'G', 'T'``. The table is printed with ``print_table`` and
    drawn as a tile plot written to the ``-o`` path.

    Exits with status 2 (via ``parser.error``) unless exactly two positional
    arguments are given.
    """
    usage = 'usage: %prog [options] <mut1 file> <mut2 file>'
    parser = OptionParser(usage)
    # NOTE: -m is parsed but options.mut_norm is not read anywhere in this
    # function.
    parser.add_option('-m', dest='mut_norm', action='store_true', default=False,
                      help='Normalize by # mutations (as opposed to sequenced bp) [Default: %default]')
    parser.add_option('-o', dest='output_pdf', default='mut_cmp.pdf',
                      help='Output pdf file for heatmap [Default: %default]')
    parser.add_option('-r', dest='raw', action='store_true', default=False,
                      help='Use raw mutation counts (as opposed to normalized for ACGT content) [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # parser.error() prints the usage line itself and then exits, so the
        # message must be a plain description: "%prog" is only expanded in
        # the usage string, never in the error message.
        parser.error('Must provide two mutation files')
    mut1_file, mut2_file = args

    mutation_profile1, seq_bp1 = parse_mutations(mut1_file, options.raw)
    mutation_profile2, seq_bp2 = parse_mutations(mut2_file, options.raw)

    relative_mutation_profile = compute_relative_profile(
        mutation_profile1, seq_bp1, mutation_profile2, seq_bp2)

    print_table(relative_mutation_profile)

    # Flatten the 5x5 table into three parallel columns for the R data frame.
    nts = ['_', 'A', 'C', 'G', 'T']
    pairs = [(nt1, nt2) for nt1 in nts for nt2 in nts]
    nts1_r = ro.StrVector([nt1 for nt1, _ in pairs])
    nts2_r = ro.StrVector([nt2 for _, nt2 in pairs])
    rel_r = ro.FloatVector([relative_mutation_profile[pair] for pair in pairs])

    df = ro.DataFrame({'nt1': nts1_r, 'nt2': nts2_r, 'rel': rel_r})

    gp = ggplot2.ggplot(df) + \
        ggplot2.aes_string(x='nt2', y='nt1', fill='rel') + \
        ggplot2.geom_tile() + \
        ggplot2.scale_x_discrete('Read') + \
        ggplot2.scale_y_discrete('Reference') + \
        ggplot2.scale_fill_gradient2('log2 enrichment', low='darkblue', mid='white', high='darkred')

    # Always close the PDF device, even if rendering fails, so the R session
    # is not left with a dangling open device.
    grdevices.pdf(file=options.output_pdf)
    try:
        gp.plot()
    finally:
        grdevices.dev_off()
def plot_coef(feat_mat_dir, model_dir, expt_names, pref, outfile=None, height=120, fsize=12):
    """Draw boxplots of per-feature model coefficients across experiments.

    For each experiment name ``ex`` the model is read from
    ``<model_dir>/<pref><ex>_model.pkl`` and the feature names from
    ``<feat_mat_dir>/<ex>_feat_mat.npz``. Classification and regression
    coefficients of all experiments are joined column-wise and plotted as one
    box per feature, faceted by coefficient type.

    Args:
        feat_mat_dir: Directory holding the ``*_feat_mat.npz`` files.
        model_dir: Directory holding the ``*_model.pkl`` files.
        expt_names: Iterable of experiment names; must not be empty.
        pref: Prefix put in front of the experiment name in model file names.
        outfile: Path handed to ``ggsave``; when None the plot is drawn on
            the current device instead.
        height: Plot height in mm (only used when saving).
        fsize: Font size for the axis text; strip text uses ``fsize + 1``.

    Returns:
        The wide (un-melted) pandas DataFrame with columns ``feature``,
        ``Classification`` and ``Regression``.

    Raises:
        ValueError: If ``expt_names`` is empty, or an experiment's feature
            names differ from those of the first experiment.
    """
    feat_names = None
    clf_coef = None
    reg_coef = None
    nexpt = 0
    for ex in expt_names:
        feat_mat_file = os.path.join(feat_mat_dir, ex + '_feat_mat.npz')
        model_file = os.path.join(model_dir, pref + ex + '_model.pkl')
        model = read_model(model_file)
        # Only the feature names are needed from the feature-matrix file.
        _, _, tmp_feat_names, _ = read_feat_mat(feat_mat_file)

        if nexpt == 0:
            feat_names = tmp_feat_names
            clf_coef = model.clf_coef()
            reg_coef = model.reg_coef()
        else:
            # Every experiment must list the same features in the same order,
            # otherwise the rows of the joined matrices would not line up.
            # An explicit raise is used so the check survives ``python -O``.
            if not all(a == b for a, b in zip(feat_names, tmp_feat_names)):
                raise ValueError(
                    'feature names of experiment %r do not match the first experiment' % (ex,))
            clf_coef = np.concatenate((clf_coef, model.clf_coef()), axis=1)
            reg_coef = np.concatenate((reg_coef, model.reg_coef()), axis=1)
        nexpt += 1

    if nexpt == 0:
        raise ValueError('expt_names must contain at least one experiment')

    # The coefficient matrices are expected to have one row per coefficient
    # and one column per experiment (assumes clf_coef()/reg_coef() return
    # column arrays -- confirm against the model class). The reshape reads
    # row-first, so each feature name is repeated once per experiment.
    df = pd.DataFrame({
        'feature': np.repeat(feat_names, nexpt),
        'Classification': np.reshape(clf_coef, (clf_coef.size,)),
        'Regression': np.reshape(reg_coef, (reg_coef.size,))
    })
    df2 = pd.melt(df, id_vars='feature', var_name='fun')
    r_df = com.convert_to_r_dataframe(df2)
    gp = ggplot2.ggplot(r_df) + ggplot2.aes_string(x = 'factor(feature)', y = 'value') + \
        ggplot2.facet_wrap('fun', scales = 'free_y') + \
        ggplot2.geom_boxplot() + ggplot2.scale_y_continuous('Importance') + \
        ggplot2.scale_x_discrete('') + ggplot2.theme_bw() + \
        ggplot2.theme(**{'axis.text.x':ggplot2.element_text(size = fsize, angle = 65, vjust = 1, hjust = 1),
                         'axis.text.y':ggplot2.element_text(size = fsize),
                         'strip.text.x':ggplot2.element_text(size = fsize + 1)})
    # Width grows with the number of experiments but never drops below 80 mm.
    w = max(22 * nexpt, 80)
    if outfile is None:
        gp.plot()
    else:
        ro.r.ggsave(filename=outfile, plot=gp, width=w, height=height, unit='mm')
    return df
def plot_cv_r2(pandas_df, outfile, fsize = 10, height = 120, max_width = 50, xlab = 'Parameters'):
    """Save boxplots of cross-validation R-squared, one box per 'title' value.

    pandas_df -- frame with at least the columns 'title' (x axis, treated as
                 a factor) and 'r2' (y axis).
    outfile   -- path handed to ggsave.
    fsize     -- font size of both axis texts.
    height    -- plot height in mm.
    max_width -- despite the name, this is the *minimum* width in mm: the
                 saved width is max(5 * number of distinct titles, max_width).
    xlab      -- x axis label.
    """
    n_titles = len(set(pandas_df['title']))
    r_frame = com.convert_to_r_dataframe(pandas_df)

    chart = ggplot2.ggplot(r_frame) + ggplot2.aes_string(x = 'factor(title)', y = 'r2')
    chart = chart + ggplot2.geom_boxplot()
    chart = chart + ggplot2.scale_y_continuous('R-squared')
    chart = chart + ggplot2.scale_x_discrete(xlab)
    chart = chart + ggplot2.theme_bw()
    # x labels are rotated by 65 degrees so neighbouring labels do not overlap.
    chart = chart + ggplot2.theme(**{
        'axis.text.x': ggplot2.element_text(size = fsize, angle = 65, vjust = 1, hjust = 1),
        'axis.text.y': ggplot2.element_text(size = fsize)
    })

    width_mm = max(5 * n_titles, max_width)
    ro.r.ggsave(filename = outfile, plot = chart, width = width_mm, height = height, unit = 'mm')
def plot_cv_r2(pandas_df, outfile, fsize=10, height=120, max_width=50, xlab='Parameters'):
    """Save boxplots of cross-validation R-squared, one box per 'title' value.

    pandas_df -- frame with at least the columns 'title' (x axis, treated as
                 a factor) and 'r2' (y axis).
    outfile   -- path handed to ggsave.
    fsize     -- font size of both axis texts.
    height    -- plot height in mm.
    max_width -- despite the name, this is the *minimum* width in mm: the
                 saved width is max(5 * number of distinct titles, max_width).
    xlab      -- x axis label.
    """
    n_titles = len(set(pandas_df['title']))
    r_frame = com.convert_to_r_dataframe(pandas_df)

    chart = ggplot2.ggplot(r_frame) + ggplot2.aes_string(x='factor(title)', y='r2')
    chart = chart + ggplot2.geom_boxplot()
    chart = chart + ggplot2.scale_y_continuous('R-squared')
    chart = chart + ggplot2.scale_x_discrete(xlab)
    chart = chart + ggplot2.theme_bw()
    # x labels are rotated by 65 degrees so neighbouring labels do not overlap.
    chart = chart + ggplot2.theme(**{
        'axis.text.x': ggplot2.element_text(size=fsize, angle=65, vjust=1, hjust=1),
        'axis.text.y': ggplot2.element_text(size=fsize)
    })

    width_mm = max(5 * n_titles, max_width)
    ro.r.ggsave(filename=outfile, plot=chart, width=width_mm, height=height, unit='mm')
def plot_thresh_distr(motif_names, thresh, out_dir, width = 350):
    """Write the count thresholds per motif as a table and as boxplots.

    Only entries with a threshold greater than 1 are kept. The kept rows are
    written tab-separated to <out_dir>/count_thresh.txt, and a horizontal
    boxplot (one box per motif, y axis limited to 0..70) is saved as
    <out_dir>/count_thresh_bar.pdf and .png, `width` x 300 mm.
    """
    fsize = 10
    table = pd.DataFrame({'motif': motif_names, 'thresh': thresh})
    table = table[table['thresh'] > 1]
    table.to_csv(os.path.join(out_dir, 'count_thresh.txt'), sep = '\t', index = False)

    r_frame = com.convert_to_r_dataframe(table)
    chart = ggplot2.ggplot(r_frame) + ggplot2.aes_string(x = 'factor(motif)', y = 'thresh')
    chart = chart + ggplot2.geom_boxplot()
    chart = chart + ggplot2.scale_y_continuous('Threshold counts', limits = ro.IntVector([0, 70]))
    chart = chart + ggplot2.scale_x_discrete('')
    chart = chart + ggplot2.theme_bw()
    # Flip the axes so the motif names run down the side of the plot.
    chart = chart + ggplot2.coord_flip()
    chart = chart + ggplot2.theme(**{
        'axis.text.x': ggplot2.element_text(size = fsize),
        'axis.text.y': ggplot2.element_text(size = fsize, hjust = 1),
        'strip.text.x': ggplot2.element_text(size = fsize + 1)
    })

    # Same figure in both formats; only the extension differs.
    out_base = os.path.join(out_dir, 'count_thresh_bar')
    for ext in ('.pdf', '.png'):
        ro.r.ggsave(filename = out_base + ext, plot = chart, width = width, height = 300, unit = 'mm')
def compare_mean_boxplot(locus_table, interval_table, intervals, loci, names, rows):
    """Boxplots of phylogenetic informativeness per interval, filled by database.

    The data come from ``get_r_data_by_top`` and are plotted from the R
    variable ``data`` (which that helper is expected to populate -- confirm
    against its definition).

    The ``loci`` argument is accepted but not used by this function.

    Returns the assembled ggplot object; nothing is rendered here.
    """
    frame = get_r_data_by_top(locus_table, interval_table, intervals, names, rows)

    # With more than one interval, fix the factor level order explicitly;
    # frame[1] is the second column of the returned frame (presumably the
    # interval column -- confirm against get_r_data_by_top).
    if len(intervals) > 1:
        levels = order_intervals(frame[1])
        robjects.r('''data$interval <- factor(data$interval, {})'''.format(levels))

    chart = ggplot2.ggplot(robjects.r('''data'''))
    chart = chart + ggplot2.aes_string(x = 'interval', y = 'pi')
    chart = chart + ggplot2.geom_boxplot(
        ggplot2.aes_string(fill = 'factor(db)'),
        **{
            'outlier.size': 3,
            'outlier.colour': '#767676',
            'outlier.alpha': 0.3,
            'alpha': 0.6
        }
    )
    chart = chart + ggplot2.scale_y_continuous('mean phylogenetic informativeness')
    chart = chart + ggplot2.scale_x_discrete('interval (years ago)')
    chart = chart + ggplot2.scale_fill_brewer("database", palette = 'Blues')
    return chart
def plot_thresh_distr(motif_names, thresh, out_dir, width=350):
    """Write the count thresholds per motif as a table and as boxplots.

    Only entries with a threshold greater than 1 are kept. The kept rows are
    written tab-separated to <out_dir>/count_thresh.txt, and a horizontal
    boxplot (one box per motif, y axis limited to 0..70) is saved as
    <out_dir>/count_thresh_bar.pdf and .png, `width` x 300 mm.
    """
    fsize = 10
    table = pd.DataFrame({'motif': motif_names, 'thresh': thresh})
    table = table[table['thresh'] > 1]
    table.to_csv(os.path.join(out_dir, 'count_thresh.txt'), sep='\t', index=False)

    r_frame = com.convert_to_r_dataframe(table)
    chart = ggplot2.ggplot(r_frame) + ggplot2.aes_string(x='factor(motif)', y='thresh')
    chart = chart + ggplot2.geom_boxplot()
    chart = chart + ggplot2.scale_y_continuous('Threshold counts', limits=ro.IntVector([0, 70]))
    chart = chart + ggplot2.scale_x_discrete('')
    chart = chart + ggplot2.theme_bw()
    # Flip the axes so the motif names run down the side of the plot.
    chart = chart + ggplot2.coord_flip()
    chart = chart + ggplot2.theme(**{
        'axis.text.x': ggplot2.element_text(size=fsize),
        'axis.text.y': ggplot2.element_text(size=fsize, hjust=1),
        'strip.text.x': ggplot2.element_text(size=fsize + 1)
    })

    # Same figure in both formats; only the extension differs.
    out_base = os.path.join(out_dir, 'count_thresh_bar')
    for ext in ('.pdf', '.png'):
        ro.r.ggsave(filename=out_base + ext, plot=chart, width=width, height=300, unit='mm')
def plot_coef(feat_mat_dir, model_dir, expt_names, pref, outfile = None, height = 120, fsize = 12):
    """Draw boxplots of per-feature model coefficients across experiments.

    For each experiment name ``ex`` the model is read from
    ``<model_dir>/<pref><ex>_model.pkl`` and the feature names from
    ``<feat_mat_dir>/<ex>_feat_mat.npz``. Classification and regression
    coefficients of all experiments are joined column-wise and plotted as one
    box per feature, faceted by coefficient type.

    Args:
        feat_mat_dir: Directory holding the ``*_feat_mat.npz`` files.
        model_dir: Directory holding the ``*_model.pkl`` files.
        expt_names: Iterable of experiment names; must not be empty.
        pref: Prefix put in front of the experiment name in model file names.
        outfile: Path handed to ``ggsave``; when None the plot is drawn on
            the current device instead.
        height: Plot height in mm (only used when saving).
        fsize: Font size for the axis text; strip text uses ``fsize + 1``.

    Returns:
        The wide (un-melted) pandas DataFrame with columns ``feature``,
        ``Classification`` and ``Regression``.

    Raises:
        ValueError: If ``expt_names`` is empty, or an experiment's feature
            names differ from those of the first experiment.
    """
    feat_names = None
    clf_coef = None
    reg_coef = None
    nexpt = 0
    for ex in expt_names:
        feat_mat_file = os.path.join(feat_mat_dir, ex + '_feat_mat.npz')
        model_file = os.path.join(model_dir, pref + ex + '_model.pkl')
        model = read_model(model_file)
        # Only the feature names are needed from the feature-matrix file.
        _, _, tmp_feat_names, _ = read_feat_mat(feat_mat_file)

        if nexpt == 0:
            feat_names = tmp_feat_names
            clf_coef = model.clf_coef()
            reg_coef = model.reg_coef()
        else:
            # Every experiment must list the same features in the same order,
            # otherwise the rows of the joined matrices would not line up.
            # An explicit raise is used so the check survives ``python -O``.
            if not all(a == b for a, b in zip(feat_names, tmp_feat_names)):
                raise ValueError(
                    'feature names of experiment %r do not match the first experiment' % (ex,))
            clf_coef = np.concatenate((clf_coef, model.clf_coef()), axis = 1)
            reg_coef = np.concatenate((reg_coef, model.reg_coef()), axis = 1)
        nexpt += 1

    if nexpt == 0:
        raise ValueError('expt_names must contain at least one experiment')

    # The coefficient matrices are expected to have one row per coefficient
    # and one column per experiment (assumes clf_coef()/reg_coef() return
    # column arrays -- confirm against the model class). The reshape reads
    # row-first, so each feature name is repeated once per experiment.
    df = pd.DataFrame({'feature':np.repeat(feat_names, nexpt),
                       'Classification':np.reshape(clf_coef, (clf_coef.size,)),
                       'Regression':np.reshape(reg_coef, (reg_coef.size,))})
    df2 = pd.melt(df, id_vars = 'feature', var_name = 'fun')
    r_df = com.convert_to_r_dataframe(df2)
    gp = ggplot2.ggplot(r_df) + ggplot2.aes_string(x = 'factor(feature)', y = 'value') + \
        ggplot2.facet_wrap('fun', scales = 'free_y') + \
        ggplot2.geom_boxplot() + ggplot2.scale_y_continuous('Importance') + \
        ggplot2.scale_x_discrete('') + ggplot2.theme_bw() + \
        ggplot2.theme(**{'axis.text.x':ggplot2.element_text(size = fsize, angle = 65, vjust = 1, hjust = 1),
                         'axis.text.y':ggplot2.element_text(size = fsize),
                         'strip.text.x':ggplot2.element_text(size = fsize + 1)})
    # Width grows with the number of experiments but never drops below 80 mm.
    w = max(22 * nexpt, 80)
    if outfile is None:
        gp.plot()
    else:
        ro.r.ggsave(filename = outfile, plot = gp, width = w, height = height, unit = 'mm')
    return df
def main():
    """Compare two mutation profiles and save a heatmap of the result as a PDF.

    Command line: ``[options] <mut1 file> <mut2 file>``.

    Both files are read with ``parse_mutations``; ``compute_relative_profile``
    combines them into a table keyed by ``(nt1, nt2)`` pairs over
    ``'_', 'A', 'C', 'G', 'T'``. The table is printed with ``print_table`` and
    drawn as a tile plot written to the ``-o`` path.

    Exits with status 2 (via ``parser.error``) unless exactly two positional
    arguments are given.
    """
    usage = 'usage: %prog [options] <mut1 file> <mut2 file>'
    parser = OptionParser(usage)
    # NOTE: -m is parsed but options.mut_norm is not read anywhere in this
    # function.
    parser.add_option(
        '-m',
        dest='mut_norm',
        action='store_true',
        default=False,
        help='Normalize by # mutations (as opposed to sequenced bp) [Default: %default]'
    )
    parser.add_option(
        '-o',
        dest='output_pdf',
        default='mut_cmp.pdf',
        help='Output pdf file for heatmap [Default: %default]'
    )
    parser.add_option(
        '-r',
        dest='raw',
        action='store_true',
        default=False,
        help='Use raw mutation counts (as opposed to normalized for ACGT content) [Default: %default]'
    )
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # parser.error() prints the usage line itself and then exits, so the
        # message must be a plain description: "%prog" is only expanded in
        # the usage string, never in the error message.
        parser.error('Must provide two mutation files')
    mut1_file, mut2_file = args

    mutation_profile1, seq_bp1 = parse_mutations(mut1_file, options.raw)
    mutation_profile2, seq_bp2 = parse_mutations(mut2_file, options.raw)

    relative_mutation_profile = compute_relative_profile(
        mutation_profile1, seq_bp1, mutation_profile2, seq_bp2)

    print_table(relative_mutation_profile)

    # Flatten the 5x5 table into three parallel columns for the R data frame.
    nts = ['_', 'A', 'C', 'G', 'T']
    pairs = [(nt1, nt2) for nt1 in nts for nt2 in nts]
    nts1_r = ro.StrVector([nt1 for nt1, _ in pairs])
    nts2_r = ro.StrVector([nt2 for _, nt2 in pairs])
    rel_r = ro.FloatVector([relative_mutation_profile[pair] for pair in pairs])

    df = ro.DataFrame({'nt1': nts1_r, 'nt2': nts2_r, 'rel': rel_r})

    gp = ggplot2.ggplot(df) + \
        ggplot2.aes_string(x='nt2', y='nt1', fill='rel') + \
        ggplot2.geom_tile() + \
        ggplot2.scale_x_discrete('Read') + \
        ggplot2.scale_y_discrete('Reference') + \
        ggplot2.scale_fill_gradient2('log2 enrichment', low='darkblue', mid='white', high='darkred')

    # Always close the PDF device, even if rendering fails, so the R session
    # is not left with a dangling open device.
    grdevices.pdf(file=options.output_pdf)
    try:
        gp.plot()
    finally:
        grdevices.dev_off()