def main():
    """Compare gene FPKMs between two Cufflinks FPKM tracking files.

    Command line: ``[options] <fpkm1_file> <fpkm2_file>``

    For each experiment listed in the first tracking file, two files are
    written to the output directory:

    * ``<sample>_scatter.pdf`` -- made by handing the log2-transformed FPKM
      pairs to ``ggplot.plot`` together with ``$RDIR/fpkm_fpkm_scatter.r``.
    * ``<sample>_report.txt`` -- Spearman correlation (and p-value) of the
      same pairs.

    Options:
        -g  GTF file; when its gene set is non-empty, only those genes are
            compared.
        -o  Output directory (default ``.``); created, including missing
            parent directories, if it does not exist.
        -p  Pseudocount added to every FPKM before log2 (default 0.125).

    Raises:
        KeyError: if the ``RDIR`` environment variable is unset when a plot
            is about to be made.
    """
    usage = 'usage: %prog [options] <fpkm1_file> <fpkm2_file>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='gtf')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('-p', dest='pseudocount', default=0.125, type='float')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # parser.error() exits, so no else branch is needed.
        parser.error('Must provide two FPKM tracking files')
    fpkm1_file, fpkm2_file = args

    cuff1 = cufflinks.fpkm_tracking(fpkm1_file)
    cuff2 = cufflinks.fpkm_tracking(fpkm2_file)

    gtf_genes = set()
    if options.gtf:
        gtf_genes = gff.gtf_gene_set(options.gtf)

    if not os.path.isdir(options.out_dir):
        # makedirs (rather than mkdir) so a nested -o path also works.
        os.makedirs(options.out_dir)

    for sample in cuff1.experiments:
        # Collect log2(pseudocount + FPKM) for genes measured in both files.
        log_fpkm1 = []
        log_fpkm2 = []
        for i, gene_id in enumerate(cuff1.genes):
            # An empty gene set means "no filtering".
            if gtf_genes and gene_id not in gtf_genes:
                continue

            # NOTE(review): the gene index comes from cuff1 but is also used
            # to query cuff2; this assumes both tracking files list genes in
            # the same order -- confirm against cufflinks.fpkm_tracking.
            fpkm1 = cuff1.gene_expr_exp(i, sample)
            fpkm2 = cuff2.gene_expr_exp(i, sample)
            if math.isnan(fpkm1) or math.isnan(fpkm2):
                continue

            log_fpkm1.append(math.log(options.pseudocount + fpkm1, 2))
            log_fpkm2.append(math.log(options.pseudocount + fpkm2, 2))

        df = {'fpkm1': log_fpkm1, 'fpkm2': log_fpkm2}

        # scatter plot fpkm
        r_script = '%s/fpkm_fpkm_scatter.r' % os.environ['RDIR']
        out_pdf = '%s/%s_scatter.pdf' % (options.out_dir, sample)
        ggplot.plot(r_script, df, [out_pdf])

        # compute correlation
        cor, p = spearmanr(df['fpkm1'], df['fpkm2'])
        report_file = '%s/%s_report.txt' % (options.out_dir, sample)
        # Context manager so the handle is closed even if writing fails.
        with open(report_file, 'w') as report_out:
            report_out.write('Spearman correlation: %f (%e)\n' % (cor, p))
def main():
    """Compare gene FPKMs between two Cufflinks FPKM tracking files.

    Command line: ``[options] <fpkm1_file> <fpkm2_file>``

    For each experiment listed in the first tracking file, two files are
    written to the output directory:

    * ``<sample>_scatter.pdf`` -- made by handing the log2-transformed FPKM
      pairs to ``ggplot.plot`` together with ``$RDIR/fpkm_fpkm_scatter.r``.
    * ``<sample>_report.txt`` -- Spearman correlation (and p-value) of the
      same pairs.

    Options:
        -g  GTF file; when its gene set is non-empty, only those genes are
            compared.
        -o  Output directory (default ``.``); created, including missing
            parent directories, if it does not exist.
        -p  Pseudocount added to every FPKM before log2 (default 0.125).

    Raises:
        KeyError: if the ``RDIR`` environment variable is unset when a plot
            is about to be made.
    """
    usage = 'usage: %prog [options] <fpkm1_file> <fpkm2_file>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='gtf')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('-p', dest='pseudocount', default=0.125, type='float')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # parser.error() exits, so no else branch is needed.
        parser.error('Must provide two FPKM tracking files')
    fpkm1_file, fpkm2_file = args

    cuff1 = cufflinks.fpkm_tracking(fpkm1_file)
    cuff2 = cufflinks.fpkm_tracking(fpkm2_file)

    gtf_genes = set()
    if options.gtf:
        gtf_genes = gff.gtf_gene_set(options.gtf)

    if not os.path.isdir(options.out_dir):
        # makedirs (rather than mkdir) so a nested -o path also works.
        os.makedirs(options.out_dir)

    for sample in cuff1.experiments:
        # Collect log2(pseudocount + FPKM) for genes measured in both files.
        log_fpkm1 = []
        log_fpkm2 = []
        for i, gene_id in enumerate(cuff1.genes):
            # An empty gene set means "no filtering".
            if gtf_genes and gene_id not in gtf_genes:
                continue

            # NOTE(review): the gene index comes from cuff1 but is also used
            # to query cuff2; this assumes both tracking files list genes in
            # the same order -- confirm against cufflinks.fpkm_tracking.
            fpkm1 = cuff1.gene_expr_exp(i, sample)
            fpkm2 = cuff2.gene_expr_exp(i, sample)
            if math.isnan(fpkm1) or math.isnan(fpkm2):
                continue

            log_fpkm1.append(math.log(options.pseudocount + fpkm1, 2))
            log_fpkm2.append(math.log(options.pseudocount + fpkm2, 2))

        df = {'fpkm1': log_fpkm1, 'fpkm2': log_fpkm2}

        # scatter plot fpkm
        r_script = '%s/fpkm_fpkm_scatter.r' % os.environ['RDIR']
        out_pdf = '%s/%s_scatter.pdf' % (options.out_dir, sample)
        ggplot.plot(r_script, df, [out_pdf])

        # compute correlation
        cor, p = spearmanr(df['fpkm1'], df['fpkm2'])
        report_file = '%s/%s_report.txt' % (options.out_dir, sample)
        # Context manager so the handle is closed even if writing fails.
        with open(report_file, 'w') as report_out:
            report_out.write('Spearman correlation: %f (%e)\n' % (cor, p))
def main():
    """Compare CLIP peak-bound genes with RIP statistics from a cuffdiff run.

    Command line: ``[options] <peaks gff> <diff>``

    Outputs, all prefixed with the ``-o`` value:

    * a plot made by ``ggplot.plot`` with the ``--ggplot`` script,
    * ``<prefix>_stats.txt`` -- Mann-Whitney U test of RIP values for
      CLIP-bound versus unbound genes,
    * ``<prefix>_hyper.txt`` -- hypergeometric tests of the overlap between
      CLIP-bound and RIP-bound genes,
    * ``<prefix>_venn.pdf`` -- Venn diagram, only when both the CLIP-only and
      the RIP-only sets are non-empty.

    With ``-s`` the reference GTF is first reduced by ``filter_single``; the
    temporary file it returns is always removed on exit, including when the
    analysis raises.

    ``--sample1`` / ``--sample2`` are accepted but not read by this function.

    Raises:
        KeyError: if the ``GENCODE`` or ``RDIR`` environment variable is
            unset (both are read while building option defaults).
        OSError: if ``intersectBed`` cannot be executed.
    """
    usage = 'usage: %prog [options] <peaks gff> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='clip_fpkm_file',
                      help='Control FPKM tracking file')
    parser.add_option('-g', dest='ref_gtf',
                      default='%s/gencode.v18.annotation.gtf' % os.environ['GENCODE'])
    parser.add_option('--ggplot', dest='ggplot_script',
                      default='%s/peaks_diff_compare.r' % os.environ['RDIR'],
                      help='Script to make plots with [Default: %default]')
    parser.add_option('-m', dest='max_stat', default=10, type='float',
                      help='Max cuffdiff stat [Default: %default]')
    parser.add_option('-o', dest='output_pre', default='',
                      help='Output prefix [Default: %default]')
    parser.add_option('-r', dest='rbp', default='RBP',
                      help='RBP name [Default: %default]')
    parser.add_option('-s', dest='single_gene_loci', default=False,
                      action='store_true',
                      help='Only use single gene loci [Default: %default]')
    parser.add_option('-t', dest='test_stat', default=False,
                      action='store_true',
                      help='Use test statistic rather than fold change [Default: %default]')
    parser.add_option('--sample1', dest='sample1',
                      help='Sample_1 name in cuffdiff')
    parser.add_option('--sample2', dest='sample2',
                      help='Sample_2 name in cuffdiff')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # parser.error() exits, so no else branch is needed.
        parser.error('Must provide peaks GFF and .diff file')
    peaks_gff, diff_file = args

    ##################################################
    # process GTF
    ##################################################
    single_gtf_fd = None
    single_gtf_file = None
    if options.single_gene_loci:
        single_gtf_fd, single_gtf_file = filter_single(options.ref_gtf)
        options.ref_gtf = single_gtf_file

    try:
        _compare_peaks_to_rip(options, peaks_gff, diff_file)
    finally:
        ##################################################
        # clean
        ##################################################
        # Runs on failure too, so the temporary GTF never outlives the run.
        if single_gtf_file is not None:
            os.close(single_gtf_fd)
            os.remove(single_gtf_file)


def _peak_bound_genes(ref_gtf, peaks_gff):
    """Return the set of gene_ids whose GTF entries overlap a peak."""
    peak_genes = set()
    # Argument list instead of a shell string: file names containing spaces
    # or shell metacharacters are passed through untouched.
    proc = subprocess.Popen(
        ['intersectBed', '-s', '-u', '-a', ref_gtf, '-b', peaks_gff],
        stdout=subprocess.PIPE)
    for line in proc.stdout:
        # Column 9 of a GTF line holds the key/value attributes.
        peak_genes.add(gff.gtf_kv(line.split('\t')[8])['gene_id'])
    proc.communicate()
    return peak_genes


def _compare_peaks_to_rip(options, peaks_gff, diff_file):
    """Run the CLIP-versus-RIP comparison and write every output file."""
    gtf_genes = gff.gtf_gene_set(options.ref_gtf)

    ##################################################
    # collect CLIP peak bound genes
    ##################################################
    peak_genes = _peak_bound_genes(options.ref_gtf, peaks_gff)

    # find expressed genes in peak calls
    silent_genes = set()
    if options.clip_fpkm_file:
        silent_genes = find_silent(options.clip_fpkm_file)

    ##################################################
    # collect RIP stats
    ##################################################
    if options.test_stat:
        rip_fold, rip_bound = ripseq.hash_rip(
            diff_file, just_ok=True, use_fold=False,
            max_stat=options.max_stat, one_rbp=True)
    else:
        rip_fold, rip_bound = ripseq.hash_rip(
            diff_file, use_fold=True,
            max_stat=options.max_stat, one_rbp=True)
        # NOTE(review): placed in the fold-change branch; if it ran
        # unconditionally, -t would have no effect on the values compared.
        # Confirm against the original indentation.
        rip_fold = ripseq.hash_rip_fold(
            diff_file, min_fpkm=0.125, max_fold=10, one_rbp=True)

    ##################################################
    # plot bound and unbound distributions
    ##################################################
    # construct data frame
    df_dict = {'Gene': [], 'CLIP': [], 'RIP': []}
    for gene_id in rip_fold:
        # With no silent genes collected, the second test is always true.
        if gene_id in gtf_genes and gene_id not in silent_genes:
            df_dict['Gene'].append(gene_id)
            df_dict['RIP'].append(rip_fold[gene_id])
            if gene_id in peak_genes:
                df_dict['CLIP'].append('Bound')
            else:
                df_dict['CLIP'].append('Unbound')

    ggplot.plot(options.ggplot_script, df_dict,
                [options.output_pre, options.rbp, options.test_stat])

    ##################################################
    # compute stats on bound and unbound distributions
    ##################################################
    rip_clip = list(zip(df_dict['RIP'], df_dict['CLIP']))
    bound_fold = [fold for fold, clip in rip_clip if clip == 'Bound']
    unbound_fold = [fold for fold, clip in rip_clip if clip == 'Unbound']

    # perform statistical test
    z, p = stats.mannwhitneyu(bound_fold, unbound_fold)

    cols = (options.rbp,
            len(bound_fold), stats.mean(bound_fold),
            len(unbound_fold), stats.mean(unbound_fold),
            z, p)
    with open('%s_stats.txt' % options.output_pre, 'w') as stats_out:
        stats_out.write('%-10s %5d %6.2f %5d %6.2f %6.2f %9.2e\n' % cols)

    ##################################################
    # plot venn diagram
    ##################################################
    rip_genes = set(gene_id for gene_id in df_dict['Gene']
                    if rip_bound.get(gene_id, False))

    clip_only = len(peak_genes - rip_genes)
    rip_only = len(rip_genes - peak_genes)
    both = len(peak_genes & rip_genes)

    if options.clip_fpkm_file:
        sys.stderr.write('Ignoring silent genes for hypergeometric test\n')

    # Mapping from textbook notation to scipy's hypergeom.sf(x, M, n, N):
    #   k is x, K is n, N is M, n is N
    p1 = hypergeom.sf(both - 1, len(gtf_genes), len(peak_genes), len(rip_genes))
    p2 = hypergeom.sf(both - 1, len(gtf_genes), len(rip_genes), len(peak_genes))

    cols = (p1, p2, both, clip_only, rip_only,
            len(peak_genes), len(rip_genes), len(gtf_genes))
    with open('%s_hyper.txt' % options.output_pre, 'w') as hyper_out:
        hyper_out.write('%7.2e %7.2e %5d %5d %5d %5d %5d %5d\n' % cols)

    if clip_only > 0 and rip_only > 0:
        plt.figure()
        # Second colour tried previously: '#377eb8', '#1ae47d'.
        venn2(subsets=(clip_only, rip_only, both),
              set_labels=['CLIP', 'fRIP'],
              set_colors=['#e41a1c', '#A1A838'])
        plt.savefig('%s_venn.pdf' % options.output_pre)
def main():
    """Compare CLIP peak-bound genes with RIP statistics from a cuffdiff run.

    Command line: ``[options] <peaks gff> <diff>``

    Outputs, all prefixed with the ``-o`` value:

    * a plot made by ``ggplot.plot`` with the ``--ggplot`` script,
    * ``<prefix>_stats.txt`` -- Mann-Whitney U test of RIP values for
      CLIP-bound versus unbound genes,
    * ``<prefix>_hyper.txt`` -- hypergeometric tests of the overlap between
      CLIP-bound and RIP-bound genes,
    * ``<prefix>_venn.pdf`` -- Venn diagram, only when both the CLIP-only and
      the RIP-only sets are non-empty.

    With ``-s`` the reference GTF is first reduced by ``filter_single``; the
    temporary file it returns is always removed on exit, including when the
    analysis raises.

    ``--sample1`` / ``--sample2`` are accepted but not read by this function.

    Raises:
        KeyError: if the ``GENCODE`` or ``RDIR`` environment variable is
            unset (both are read while building option defaults).
        OSError: if ``intersectBed`` cannot be executed.
    """
    usage = 'usage: %prog [options] <peaks gff> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='clip_fpkm_file',
                      help='Control FPKM tracking file')
    parser.add_option('-g', dest='ref_gtf',
                      default='%s/gencode.v18.annotation.gtf' % os.environ['GENCODE'])
    parser.add_option('--ggplot', dest='ggplot_script',
                      default='%s/peaks_diff_compare.r' % os.environ['RDIR'],
                      help='Script to make plots with [Default: %default]')
    parser.add_option('-m', dest='max_stat', default=10, type='float',
                      help='Max cuffdiff stat [Default: %default]')
    parser.add_option('-o', dest='output_pre', default='',
                      help='Output prefix [Default: %default]')
    parser.add_option('-r', dest='rbp', default='RBP',
                      help='RBP name [Default: %default]')
    parser.add_option('-s', dest='single_gene_loci', default=False,
                      action='store_true',
                      help='Only use single gene loci [Default: %default]')
    parser.add_option('-t', dest='test_stat', default=False,
                      action='store_true',
                      help='Use test statistic rather than fold change [Default: %default]')
    parser.add_option('--sample1', dest='sample1',
                      help='Sample_1 name in cuffdiff')
    parser.add_option('--sample2', dest='sample2',
                      help='Sample_2 name in cuffdiff')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # parser.error() exits, so no else branch is needed.
        parser.error('Must provide peaks GFF and .diff file')
    peaks_gff, diff_file = args

    ##################################################
    # process GTF
    ##################################################
    single_gtf_fd = None
    single_gtf_file = None
    if options.single_gene_loci:
        single_gtf_fd, single_gtf_file = filter_single(options.ref_gtf)
        options.ref_gtf = single_gtf_file

    try:
        _compare_peaks_to_rip(options, peaks_gff, diff_file)
    finally:
        ##################################################
        # clean
        ##################################################
        # Runs on failure too, so the temporary GTF never outlives the run.
        if single_gtf_file is not None:
            os.close(single_gtf_fd)
            os.remove(single_gtf_file)


def _peak_bound_genes(ref_gtf, peaks_gff):
    """Return the set of gene_ids whose GTF entries overlap a peak."""
    peak_genes = set()
    # Argument list instead of a shell string: file names containing spaces
    # or shell metacharacters are passed through untouched.
    proc = subprocess.Popen(
        ['intersectBed', '-s', '-u', '-a', ref_gtf, '-b', peaks_gff],
        stdout=subprocess.PIPE)
    for line in proc.stdout:
        # Column 9 of a GTF line holds the key/value attributes.
        peak_genes.add(gff.gtf_kv(line.split('\t')[8])['gene_id'])
    proc.communicate()
    return peak_genes


def _compare_peaks_to_rip(options, peaks_gff, diff_file):
    """Run the CLIP-versus-RIP comparison and write every output file."""
    gtf_genes = gff.gtf_gene_set(options.ref_gtf)

    ##################################################
    # collect CLIP peak bound genes
    ##################################################
    peak_genes = _peak_bound_genes(options.ref_gtf, peaks_gff)

    # find expressed genes in peak calls
    silent_genes = set()
    if options.clip_fpkm_file:
        silent_genes = find_silent(options.clip_fpkm_file)

    ##################################################
    # collect RIP stats
    ##################################################
    if options.test_stat:
        rip_fold, rip_bound = ripseq.hash_rip(
            diff_file, just_ok=True, use_fold=False,
            max_stat=options.max_stat, one_rbp=True)
    else:
        rip_fold, rip_bound = ripseq.hash_rip(
            diff_file, use_fold=True,
            max_stat=options.max_stat, one_rbp=True)
        # NOTE(review): placed in the fold-change branch; if it ran
        # unconditionally, -t would have no effect on the values compared.
        # Confirm against the original indentation.
        rip_fold = ripseq.hash_rip_fold(
            diff_file, min_fpkm=0.125, max_fold=10, one_rbp=True)

    # TEMP: print bound genes
    # genes_out = open('%s_genes.txt' % options.output_pre, 'w')
    # for gene_id in rip_bound:
    #     if rip_bound[gene_id]:
    #         print >> genes_out, gene_id, rip_fold[gene_id]
    # genes_out.close()

    ##################################################
    # plot bound and unbound distributions
    ##################################################
    # construct data frame
    df_dict = {'Gene': [], 'CLIP': [], 'RIP': []}
    for gene_id in rip_fold:
        # With no silent genes collected, the second test is always true.
        if gene_id in gtf_genes and gene_id not in silent_genes:
            df_dict['Gene'].append(gene_id)
            df_dict['RIP'].append(rip_fold[gene_id])
            if gene_id in peak_genes:
                df_dict['CLIP'].append('Bound')
            else:
                df_dict['CLIP'].append('Unbound')

    ggplot.plot(options.ggplot_script, df_dict,
                [options.output_pre, options.rbp, options.test_stat])

    ##################################################
    # compute stats on bound and unbound distributions
    ##################################################
    rip_clip = list(zip(df_dict['RIP'], df_dict['CLIP']))
    bound_fold = [fold for fold, clip in rip_clip if clip == 'Bound']
    unbound_fold = [fold for fold, clip in rip_clip if clip == 'Unbound']

    # perform statistical test
    z, p = stats.mannwhitneyu(bound_fold, unbound_fold)

    cols = (options.rbp,
            len(bound_fold), stats.mean(bound_fold),
            len(unbound_fold), stats.mean(unbound_fold),
            z, p)
    with open('%s_stats.txt' % options.output_pre, 'w') as stats_out:
        stats_out.write('%-10s %5d %6.2f %5d %6.2f %6.2f %9.2e\n' % cols)

    ##################################################
    # plot venn diagram
    ##################################################
    rip_genes = set(gene_id for gene_id in df_dict['Gene']
                    if rip_bound.get(gene_id, False))

    clip_only = len(peak_genes - rip_genes)
    rip_only = len(rip_genes - peak_genes)
    both = len(peak_genes & rip_genes)

    if options.clip_fpkm_file:
        sys.stderr.write('Ignoring silent genes for hypergeometric test\n')

    # Mapping from textbook notation to scipy's hypergeom.sf(x, M, n, N):
    #   k is x, K is n, N is M, n is N
    p1 = hypergeom.sf(both - 1, len(gtf_genes), len(peak_genes), len(rip_genes))
    p2 = hypergeom.sf(both - 1, len(gtf_genes), len(rip_genes), len(peak_genes))

    cols = (p1, p2, both, clip_only, rip_only,
            len(peak_genes), len(rip_genes), len(gtf_genes))
    with open('%s_hyper.txt' % options.output_pre, 'w') as hyper_out:
        hyper_out.write('%7.2e %7.2e %5d %5d %5d %5d %5d %5d\n' % cols)

    if clip_only > 0 and rip_only > 0:
        plt.figure()
        # Second colour tried previously: '#377eb8', '#1ae47d'.
        venn2(subsets=(clip_only, rip_only, both),
              set_labels=['CLIP', 'fRIP'],
              set_colors=['#e41a1c', '#A1A838'])
        plt.savefig('%s_venn.pdf' % options.output_pre)
def main():
    """Compare two cuffdiff ``.diff`` files comparison by comparison.

    Command line: ``[options] <diff1_file> <diff2_file>``

    For every ``(sample1, sample2)`` key returned by ``cuffdiff.hash_stat``
    for the first file, three files are written to the output directory:

    * ``<s1>-<s2>_report.txt`` -- counts of genes quantified, of genes whose
      ``cuffdiff.hash_sig`` value is true ("upregulated") or false
      ("downregulated"), their overlaps, and Spearman/Pearson correlations
      of the chosen statistic over the genes common to both files.
    * ``<s1>-<s2>_scatter.pdf`` (plus a ``.df`` data file) -- statistic in
      file 1 versus file 2.
    * ``<s1>-<s2>_ma.pdf`` -- difference versus average of the statistic.

    Options:
        -s  Statistic name passed to ``cuffdiff.hash_stat``
            (default ``test_stat``).
        -g  GTF file restricting the genes considered.
        -m  Minimum FPKM passed to ``cuffdiff.hash_stat`` (default 0).
        -o  Output directory (default ``.``); created, including missing
            parent directories, if it does not exist.
        -p  Pseudocount passed to ``cuffdiff.hash_stat`` (default 0.125).

    Raises:
        KeyError: if ``RDIR`` is unset when a plot is about to be made, or
            (presumably, the hashes being dict-like) if a comparison in the
            first file is missing from one of the other three hashes.
    """
    usage = 'usage: %prog [options] <diff1_file> <diff2_file>'
    parser = OptionParser(usage)
    parser.add_option('-s', dest='stat', default='test_stat')
    parser.add_option('-g', dest='genes_gtf', default=None)
    parser.add_option('-m', dest='min_fpkm', default=0, type='float')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('-p', dest='pseudocount', default=0.125, type='float')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # parser.error() exits, so no else branch is needed.
        parser.error('Must provide two diff files')
    diff1_file, diff2_file = args

    gtf_genes = None
    if options.genes_gtf:
        gtf_genes = gff.gtf_gene_set(options.genes_gtf)

    diff1_stats = cuffdiff.hash_stat(
        diff1_file, stat=options.stat, min_fpkm=options.min_fpkm,
        pseudocount=options.pseudocount, gene_set=gtf_genes)
    diff1_sig = cuffdiff.hash_sig(diff1_file, gene_set=gtf_genes)

    diff2_stats = cuffdiff.hash_stat(
        diff2_file, stat=options.stat, min_fpkm=options.min_fpkm,
        pseudocount=options.pseudocount, gene_set=gtf_genes)
    diff2_sig = cuffdiff.hash_sig(diff2_file, gene_set=gtf_genes)

    if not os.path.isdir(options.out_dir):
        # makedirs (rather than mkdir) so a nested -o path also works.
        os.makedirs(options.out_dir)

    for diff_key in diff1_stats:
        sample1, sample2 = diff_key

        gene_stats1 = diff1_stats[diff_key]
        gene_sig1 = diff1_sig[diff_key]
        gene_stats2 = diff2_stats[diff_key]
        gene_sig2 = diff2_sig[diff_key]

        common_genes = set(gene_stats1.keys()) & set(gene_stats2.keys())

        up1 = set(g for g in gene_sig1 if gene_sig1[g])
        up2 = set(g for g in gene_sig2 if gene_sig2[g])
        down1 = set(g for g in gene_sig1 if not gene_sig1[g])
        down2 = set(g for g in gene_sig2 if not gene_sig2[g])

        report_file = '%s/%s-%s_report.txt' % (options.out_dir, sample1, sample2)
        # Context manager: the report is closed even if plotting or a
        # correlation raises part-way through.
        with open(report_file, 'w') as report_out:
            # compare numbers of genes quantified
            report_out.write('Genes quantified\n')
            report_out.write('%s\t%d\n' % (diff1_file, len(gene_stats1)))
            report_out.write('%s\t%d\n' % (diff2_file, len(gene_stats2)))
            report_out.write('Common\t%d\n' % len(common_genes))
            report_out.write('\n')

            report_out.write('Genes upregulated\n')
            report_out.write('%s\t%d\n' % (diff1_file, len(up1)))
            report_out.write('%s\t%d\n' % (diff2_file, len(up2)))
            report_out.write('Common\t%d\n' % len(up1 & up2))
            report_out.write('\n')

            report_out.write('Genes downregulated\n')
            report_out.write('%s\t%d\n' % (diff1_file, len(down1)))
            report_out.write('%s\t%d\n' % (diff2_file, len(down2)))
            report_out.write('Common\t%d\n' % len(down1 & down2))
            report_out.write('\n')

            # scatter plot test stat
            df = {'diff1': [], 'diff2': []}
            for gene_id in common_genes:
                df['diff1'].append(gene_stats1[gene_id])
                df['diff2'].append(gene_stats2[gene_id])

            r_script = '%s/diff_diff_scatter.r' % os.environ['RDIR']
            out_pdf = '%s/%s-%s_scatter.pdf' % (options.out_dir, sample1, sample2)
            ggplot.plot(r_script, df, [out_pdf], df_file='%s.df' % out_pdf[:-4])

            # compute correlation; p-values use %e because %f would print
            # anything below 5e-7 as 0.000000.
            cor, p = spearmanr(df['diff1'], df['diff2'])
            report_out.write('Spearman correlation: %f (%e)\n' % (cor, p))
            cor, p = pearsonr(df['diff1'], df['diff2'])
            report_out.write('Pearson correlation: %f (%e)\n' % (cor, p))

        # plot test_stat versus test_stat difference
        df = {'minus': [], 'avg': []}
        for gene_id in common_genes:
            stat1 = gene_stats1[gene_id]
            stat2 = gene_stats2[gene_id]
            df['minus'].append(stat1 - stat2)
            df['avg'].append(0.5 * stat1 + 0.5 * stat2)

        r_script = '%s/diff_diff_ma.r' % os.environ['RDIR']
        out_pdf = '%s/%s-%s_ma.pdf' % (options.out_dir, sample1, sample2)
        ggplot.plot(r_script, df, [out_pdf])
def main():
    """Compare two cuffdiff ``.diff`` files comparison by comparison.

    Command line: ``[options] <diff1_file> <diff2_file>``

    For every ``(sample1, sample2)`` key returned by ``cuffdiff.hash_stat``
    for the first file, three files are written to the output directory:

    * ``<s1>-<s2>_report.txt`` -- counts of genes quantified, of genes whose
      ``cuffdiff.hash_sig`` value is true ("upregulated") or false
      ("downregulated"), their overlaps, and Spearman/Pearson correlations
      of the chosen statistic over the genes common to both files.
    * ``<s1>-<s2>_scatter.pdf`` (plus a ``.df`` data file) -- statistic in
      file 1 versus file 2.
    * ``<s1>-<s2>_ma.pdf`` -- difference versus average of the statistic.

    Options:
        -s  Statistic name passed to ``cuffdiff.hash_stat``
            (default ``test_stat``).
        -g  GTF file restricting the genes considered.
        -m  Minimum FPKM passed to ``cuffdiff.hash_stat`` (default 0).
        -o  Output directory (default ``.``); created, including missing
            parent directories, if it does not exist.
        -p  Pseudocount passed to ``cuffdiff.hash_stat`` (default 0.125).

    Raises:
        KeyError: if ``RDIR`` is unset when a plot is about to be made, or
            (presumably, the hashes being dict-like) if a comparison in the
            first file is missing from one of the other three hashes.
    """
    usage = 'usage: %prog [options] <diff1_file> <diff2_file>'
    parser = OptionParser(usage)
    parser.add_option('-s', dest='stat', default='test_stat')
    parser.add_option('-g', dest='genes_gtf', default=None)
    parser.add_option('-m', dest='min_fpkm', default=0, type='float')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('-p', dest='pseudocount', default=0.125, type='float')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # parser.error() exits, so no else branch is needed.
        parser.error('Must provide two diff files')
    diff1_file, diff2_file = args

    gtf_genes = None
    if options.genes_gtf:
        gtf_genes = gff.gtf_gene_set(options.genes_gtf)

    diff1_stats = cuffdiff.hash_stat(
        diff1_file, stat=options.stat, min_fpkm=options.min_fpkm,
        pseudocount=options.pseudocount, gene_set=gtf_genes)
    diff1_sig = cuffdiff.hash_sig(diff1_file, gene_set=gtf_genes)

    diff2_stats = cuffdiff.hash_stat(
        diff2_file, stat=options.stat, min_fpkm=options.min_fpkm,
        pseudocount=options.pseudocount, gene_set=gtf_genes)
    diff2_sig = cuffdiff.hash_sig(diff2_file, gene_set=gtf_genes)

    if not os.path.isdir(options.out_dir):
        # makedirs (rather than mkdir) so a nested -o path also works.
        os.makedirs(options.out_dir)

    for diff_key in diff1_stats:
        sample1, sample2 = diff_key

        gene_stats1 = diff1_stats[diff_key]
        gene_sig1 = diff1_sig[diff_key]
        gene_stats2 = diff2_stats[diff_key]
        gene_sig2 = diff2_sig[diff_key]

        common_genes = set(gene_stats1.keys()) & set(gene_stats2.keys())

        up1 = set(g for g in gene_sig1 if gene_sig1[g])
        up2 = set(g for g in gene_sig2 if gene_sig2[g])
        down1 = set(g for g in gene_sig1 if not gene_sig1[g])
        down2 = set(g for g in gene_sig2 if not gene_sig2[g])

        report_file = '%s/%s-%s_report.txt' % (options.out_dir, sample1, sample2)
        # Context manager: the report is closed even if plotting or a
        # correlation raises part-way through.
        with open(report_file, 'w') as report_out:
            # compare numbers of genes quantified
            report_out.write('Genes quantified\n')
            report_out.write('%s\t%d\n' % (diff1_file, len(gene_stats1)))
            report_out.write('%s\t%d\n' % (diff2_file, len(gene_stats2)))
            report_out.write('Common\t%d\n' % len(common_genes))
            report_out.write('\n')

            report_out.write('Genes upregulated\n')
            report_out.write('%s\t%d\n' % (diff1_file, len(up1)))
            report_out.write('%s\t%d\n' % (diff2_file, len(up2)))
            report_out.write('Common\t%d\n' % len(up1 & up2))
            report_out.write('\n')

            report_out.write('Genes downregulated\n')
            report_out.write('%s\t%d\n' % (diff1_file, len(down1)))
            report_out.write('%s\t%d\n' % (diff2_file, len(down2)))
            report_out.write('Common\t%d\n' % len(down1 & down2))
            report_out.write('\n')

            # scatter plot test stat
            df = {'diff1': [], 'diff2': []}
            for gene_id in common_genes:
                df['diff1'].append(gene_stats1[gene_id])
                df['diff2'].append(gene_stats2[gene_id])

            r_script = '%s/diff_diff_scatter.r' % os.environ['RDIR']
            out_pdf = '%s/%s-%s_scatter.pdf' % (options.out_dir, sample1, sample2)
            ggplot.plot(r_script, df, [out_pdf], df_file='%s.df' % out_pdf[:-4])

            # compute correlation; p-values use %e because %f would print
            # anything below 5e-7 as 0.000000.
            cor, p = spearmanr(df['diff1'], df['diff2'])
            report_out.write('Spearman correlation: %f (%e)\n' % (cor, p))
            cor, p = pearsonr(df['diff1'], df['diff2'])
            report_out.write('Pearson correlation: %f (%e)\n' % (cor, p))

        # plot test_stat versus test_stat difference
        df = {'minus': [], 'avg': []}
        for gene_id in common_genes:
            stat1 = gene_stats1[gene_id]
            stat2 = gene_stats2[gene_id]
            df['minus'].append(stat1 - stat2)
            df['avg'].append(0.5 * stat1 + 0.5 * stat2)

        r_script = '%s/diff_diff_ma.r' % os.environ['RDIR']
        out_pdf = '%s/%s-%s_ma.pdf' % (options.out_dir, sample1, sample2)
        ggplot.plot(r_script, df, [out_pdf])