def main():
    """Print Cufflinks expression for a single gene or transcript id.

    The positional argument is treated as a gene id when it contains the
    substring 'XLOC', and as a transcript id otherwise.

    With -t, isoform-level expression is read from
    <cuff_dir>/isoforms.fpkm_tracking; a gene id is expanded to every
    transcript_id in the lncRNA GTF whose gene_id matches. Without -t,
    gene-level expression is read from <cuff_dir>/genes.fpkm_tracking; a
    transcript id is mapped to its gene through gff.t2g.
    """
    usage = 'usage: %prog [options] <gene/transcript id>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='cuff_dir',
                      default='%s/research/common/data/lncrna' % os.environ['HOME'],
                      help='Cufflinks output directory with .fpkm_tracking files [Default: %default]')
    parser.add_option('-l', dest='lnc_gtf',
                      default='%s/research/common/data/lncrna/lnc_catalog.gtf' % os.environ['HOME'],
                      help='lncRNA catalog gtf file [Default: %default]')
    parser.add_option('-t', dest='transcript_expr', default=False, action='store_true',
                      help='Return transcript expression rather than gene [Default: %default]')
    (options, args) = parser.parse_args()

    # Fail with a usage message instead of an IndexError on args[0].
    if not args:
        parser.error('Must provide a gene or transcript id')
    query_id = args[0]
    is_gene_id = 'XLOC' in query_id

    if options.transcript_expr:
        cuff = cufflinks.fpkm_tracking('%s/isoforms.fpkm_tracking' % options.cuff_dir)

        if is_gene_id:
            # Collect every transcript annotated under the requested gene.
            trans_ids = set()
            with open(options.lnc_gtf) as gtf_in:
                for line in gtf_in:
                    a = line.split('\t')
                    kv = gff.gtf_kv(a[8])
                    if kv['gene_id'] == query_id:
                        trans_ids.add(kv['transcript_id'])
        else:
            trans_ids = [query_id]

        for trans_id in trans_ids:
            print('%s:' % trans_id)
            cuff.gene_expr_print(trans_id)

    else:
        cuff = cufflinks.fpkm_tracking('%s/genes.fpkm_tracking' % options.cuff_dir)

        if is_gene_id:
            gene_id = query_id
        else:
            # Transcript id given: look up its gene in the catalog.
            t2g = gff.t2g(options.lnc_gtf)
            gene_id = t2g[query_id]

        cuff.gene_expr_print(gene_id)
def main():
    """Compare two fpkm_tracking files sample by sample.

    For every experiment listed in the first file, the log2 of
    (pseudocount + FPKM) from both files is collected for each gene
    (optionally restricted to the genes of the -g GTF), skipping genes
    whose value is NaN in either file. The paired values are plotted via
    the fpkm_fpkm_scatter.r script under $RDIR, and their Spearman
    correlation is written to <out_dir>/<sample>_report.txt.

    NOTE(review): the gene index taken from the first file is also used to
    query the second file, which presumably requires both files to list
    genes in the same order - confirm against cufflinks.gene_expr_exp.
    """
    usage = 'usage: %prog [options] <fpkm1_file> <fpkm2_file>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='gtf')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('-p', dest='pseudocount', default=0.125, type='float')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # The arguments are fpkm_tracking files, not diff files.
        parser.error('Must provide two fpkm_tracking files')
    fpkm1_file, fpkm2_file = args

    cuff1 = cufflinks.fpkm_tracking(fpkm1_file)
    cuff2 = cufflinks.fpkm_tracking(fpkm2_file)

    # An empty set means "no restriction".
    gtf_genes = set()
    if options.gtf:
        gtf_genes = gff.gtf_gene_set(options.gtf)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    # Same script for every sample, so resolve it once.
    r_script = '%s/fpkm_fpkm_scatter.r' % os.environ['RDIR']

    for sample in cuff1.experiments:
        # scatter plot fpkm
        df = {'fpkm1': [], 'fpkm2': []}
        for i, gene_id in enumerate(cuff1.genes):
            if gtf_genes and gene_id not in gtf_genes:
                continue

            fpkm1 = cuff1.gene_expr_exp(i, sample)
            fpkm2 = cuff2.gene_expr_exp(i, sample)
            if math.isnan(fpkm1) or math.isnan(fpkm2):
                continue

            df['fpkm1'].append(math.log(options.pseudocount + fpkm1, 2))
            df['fpkm2'].append(math.log(options.pseudocount + fpkm2, 2))

        out_pdf = '%s/%s_scatter.pdf' % (options.out_dir, sample)
        ggplot.plot(r_script, df, [out_pdf])

        # compute correlation
        cor, p = spearmanr(df['fpkm1'], df['fpkm2'])

        report_path = '%s/%s_report.txt' % (options.out_dir, sample)
        with open(report_path, 'w') as report_out:
            report_out.write('Spearman correlation: %f (%e)\n' % (cor, p))
def main():
    """Echo the GTF lines whose gene is expressed in a given cell type.

    A line is written to stdout, unchanged, when the expression of its
    gene_id in the experiment named <cell type> is strictly greater than
    the -t threshold.
    """
    # Local import: stdout is written directly so each GTF line (which
    # already carries its newline) is emitted verbatim.
    import sys

    usage = 'usage: %prog [options] <gtf file> <cell type>'
    parser = OptionParser(usage)
    parser.add_option('-t', dest='expr_t', type='float', default=.1,
                      help='Minimum allowed fpkm value')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide gtf file and cell type')
    gtf_file, cell_type = args

    # get expression data
    # NOTE(review): no file is passed, so whatever default
    # cufflinks.fpkm_tracking defines is loaded - confirm that is intended.
    cuff = cufflinks.fpkm_tracking()

    # find cell type experiment index (first match wins)
    cell_indexes = [i for i, experiment in enumerate(cuff.experiments)
                    if experiment == cell_type]
    if not cell_indexes:
        parser.error('Cell type %s does not match any quantified experiments' % cell_type)
    cell_i = cell_indexes[0]

    # parse gtf file
    with open(gtf_file) as gtf_in:
        for line in gtf_in:
            a = line.split('\t')
            gene_id = gff.gtf_kv(a[8])['gene_id']
            expr_vec = cuff.gene_expr(gene_id)
            if expr_vec[cell_i] > options.expr_t:
                sys.stdout.write(line)
def main():
    """Scatter-plot and correlate FPKM values from two fpkm_tracking files.

    Each experiment named in the first file gets one scatter PDF
    (<out_dir>/<sample>_scatter.pdf, drawn by $RDIR/fpkm_fpkm_scatter.r)
    and one text report (<out_dir>/<sample>_report.txt) holding the
    Spearman correlation of the log2(pseudocount + FPKM) pairs. Genes with
    a NaN value in either file are left out; -g limits the comparison to
    the genes of a GTF file.

    NOTE(review): the second file is queried with gene indexes taken from
    the first, so both files presumably must share gene order - confirm.
    """
    usage = 'usage: %prog [options] <fpkm1_file> <fpkm2_file>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='gtf')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('-p', dest='pseudocount', default=0.125, type='float')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # Message now names the inputs the usage string asks for.
        parser.error('Must provide two fpkm_tracking files')
    fpkm1_file = args[0]
    fpkm2_file = args[1]

    cuff1 = cufflinks.fpkm_tracking(fpkm1_file)
    cuff2 = cufflinks.fpkm_tracking(fpkm2_file)

    # Empty set == compare every gene.
    gtf_genes = gff.gtf_gene_set(options.gtf) if options.gtf else set()

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    # Loop-invariant: the plotting script does not depend on the sample.
    r_script = '%s/fpkm_fpkm_scatter.r' % os.environ['RDIR']
    pseudocount = options.pseudocount

    for sample in cuff1.experiments:
        # scatter plot fpkm
        df = {'fpkm1': [], 'fpkm2': []}
        for i, gene_id in enumerate(cuff1.genes):
            if not gtf_genes or gene_id in gtf_genes:
                fpkm1 = cuff1.gene_expr_exp(i, sample)
                fpkm2 = cuff2.gene_expr_exp(i, sample)

                if not (math.isnan(fpkm1) or math.isnan(fpkm2)):
                    df['fpkm1'].append(math.log(pseudocount + fpkm1, 2))
                    df['fpkm2'].append(math.log(pseudocount + fpkm2, 2))

        out_pdf = '%s/%s_scatter.pdf' % (options.out_dir, sample)
        ggplot.plot(r_script, df, [out_pdf])

        # compute correlation
        cor, p = spearmanr(df['fpkm1'], df['fpkm2'])

        # The context manager closes the report even if the write fails.
        with open('%s/%s_report.txt' % (options.out_dir, sample), 'w') as report_out:
            report_out.write('Spearman correlation: %f (%e)\n' % (cor, p))
def cuff_fpkm(fpkm_file, pseudocount):
    """Return the mean log2 expression of every gene in an fpkm_tracking file.

    Args:
        fpkm_file: path handed to cufflinks.fpkm_tracking.
        pseudocount: value added to each FPKM before taking log2.

    Returns:
        dict mapping gene id -> mean over the gene's expression values of
        log2(pseudocount + value). Expression is fetched with
        not_found=0 and fail=0.
    """
    # Use the parameter; the previous body referenced an undefined
    # 'fpkm_tracking_file' name.
    cuff = cufflinks.fpkm_tracking(fpkm_file)

    gene_fpkm = {}
    for gene_id in cuff.genes:
        expr = cuff.gene_expr(gene_id, not_found=0, fail=0)
        gene_fpkm[gene_id] = stats.mean([math.log(pseudocount + e, 2) for e in expr])

    return gene_fpkm
def cuff_fpkm(fpkm_file, pseudocount):
    """Compute per-gene mean log2(pseudocount + FPKM).

    Args:
        fpkm_file: fpkm_tracking file to load through cufflinks.fpkm_tracking.
        pseudocount: added to every expression value before the log2.

    Returns:
        dict keyed by gene id; each value is stats.mean of the log2-scaled
        expression values returned by gene_expr(gene_id, not_found=0,
        fail=0).
    """
    # The file to load is the 'fpkm_file' argument (the original code read
    # a name, 'fpkm_tracking_file', that this function never defines).
    cuff = cufflinks.fpkm_tracking(fpkm_file)

    def mean_log_fpkm(gene_id):
        values = cuff.gene_expr(gene_id, not_found=0, fail=0)
        return stats.mean([math.log(pseudocount + e, 2) for e in values])

    return dict((gene_id, mean_log_fpkm(gene_id)) for gene_id in cuff.genes)
def main():
    """Plot a heatmap of log2 FPKM for a set of genes.

    Genes default to every gene in the fpkm_tracking file; -g replaces
    that set with the gene_ids of a GTF file and -d intersects it with the
    result of find_diff. If more than -s genes remain, a random sample of
    that size is displayed. Genes whose first expression value is NaN are
    skipped. The data frame is rendered by $RDIR/cuff_heat.r into the -o
    PDF.
    """
    usage = "usage: %prog [options] <fpkm_tracking>"
    parser = OptionParser(usage)
    parser.add_option("-d", dest="diff_file",
                      help="Limit to significantly differentially expressed genes")
    parser.add_option("-g", dest="gtf", help="GTF file of genes to display")
    # type= is required: without it a value given on the command line
    # stays a string and breaks the arithmetic/sampling below.
    parser.add_option("-m", dest="min_fpkm", default=0.125, type="float",
                      help="Minimum FPKM (for logs) [Default: %default]")
    parser.add_option("-o", dest="out_pdf", default="cuff_heat.pdf",
                      help="Output PDF [Default: %default]")
    parser.add_option("-s", dest="sample", default=1000, type="int",
                      help="Sample genes rather than use all [Default: %default]")
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("Must provide fpkm_tracking")
    fpkm_tracking = args[0]

    # load expression data
    cuff = cufflinks.fpkm_tracking(fpkm_file=fpkm_tracking)

    # determine genes
    all_genes = set(cuff.genes)
    if options.gtf:
        with open(options.gtf) as gtf_in:
            all_genes = set(gff.gtf_kv(line.split("\t")[8])["gene_id"]
                            for line in gtf_in)

    if options.diff_file:
        # limit to differentially expressed genes
        diff_genes = find_diff(options.diff_file)
        all_genes &= diff_genes

    # sample genes to display
    if len(all_genes) <= options.sample:
        display_genes = all_genes
    else:
        # random.sample needs a sequence; sets are rejected by newer Pythons.
        display_genes = random.sample(list(all_genes), options.sample)

    # build data frame
    df = {"Gene": [], "FPKM": [], "Sample": []}
    for gene_id in display_genes:
        ge = cuff.gene_expr(gene_id)
        if math.isnan(ge[0]):
            continue

        for i, experiment in enumerate(cuff.experiments):
            df["Gene"].append(gene_id)
            df["Sample"].append(experiment)
            df["FPKM"].append(math.log(ge[i] + options.min_fpkm, 2))

    # plot
    ggplot.plot("%s/cuff_heat.r" % os.environ["RDIR"], df, [options.out_pdf])
def main():
    """Plot a histogram of each gene's maximum log2 FPKM.

    Gene ids are read from the GTF file; for each one the maximum of its
    expression values in the fpkm_tracking file is taken, and genes whose
    maximum is not positive are dropped. The log2 values are drawn as a
    ggplot2 histogram (bin width 0.2) into <gtf prefix>_fpkmhist.pdf.
    """
    usage = 'usage: %prog [options] <gtf file> <fpkm tracking>'
    parser = OptionParser(usage)
    #parser.add_option('-m', dest='fpkm_min', type='float', default=0.25, help='Minimum FPKM [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error(usage)
    gtf_file, fpkm_tracking_file = args

    # get genes
    genes = set()
    with open(gtf_file) as gtf_in:
        for line in gtf_in:
            a = line.split('\t')
            genes.add(gff.gtf_kv(a[8])['gene_id'])

    # get expression
    cuff = cufflinks.fpkm_tracking(fpkm_tracking_file)
    log_fpkms = []
    for gene_id in genes:
        max_fpkm = max(cuff.gene_expr(gene_id))
        if max_fpkm > 0:  # log2 is undefined for non-positive values
            log_fpkms.append(math.log(max_fpkm, 2))

    # construct R data objects
    fpkms_r = ro.FloatVector(log_fpkms)
    df = ro.DataFrame({'fpkm': fpkms_r})

    # construct plot
    gp = ggplot2.ggplot(df) + \
        ggplot2.aes_string(x='fpkm') + \
        ggplot2.geom_histogram(binwidth=0.2)

    # save to file
    gtf_pre = os.path.splitext(gtf_file)[0]
    grdevices.pdf(file='%s_fpkmhist.pdf' % gtf_pre)
    try:
        gp.plot()
    finally:
        # Always close the PDF device, even if plotting fails.
        grdevices.dev_off()
def main():
    """Write a histogram PDF of per-gene maximum log2 FPKM.

    For every gene_id found in the GTF file, the largest expression value
    reported by the fpkm_tracking file is log2-transformed (genes whose
    largest value is <= 0 are skipped) and the distribution is plotted
    with ggplot2 to "<gtf file without extension>_fpkmhist.pdf".
    """
    usage = "usage: %prog [options] <gtf file> <fpkm tracking>"
    parser = OptionParser(usage)
    # parser.add_option('-m', dest='fpkm_min', type='float', default=0.25, help='Minimum FPKM [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error(usage)
    gtf_file = args[0]
    fpkm_tracking_file = args[1]

    # get genes
    with open(gtf_file) as gtf_in:
        genes = set(gff.gtf_kv(line.split("\t")[8])["gene_id"] for line in gtf_in)

    # get expression
    cuff = cufflinks.fpkm_tracking(fpkm_tracking_file)
    max_fpkms = (max(cuff.gene_expr(gene_id)) for gene_id in genes)
    # Only positive values have a log2.
    log_fpkms = [math.log(max_fpkm, 2) for max_fpkm in max_fpkms if max_fpkm > 0]

    # construct R data objects
    fpkms_r = ro.FloatVector(log_fpkms)
    df = ro.DataFrame({"fpkm": fpkms_r})

    # construct plot
    gp = (ggplot2.ggplot(df)
          + ggplot2.aes_string(x="fpkm")
          + ggplot2.geom_histogram(binwidth=0.2))

    # save to file
    gtf_pre = os.path.splitext(gtf_file)[0]
    grdevices.pdf(file="%s_fpkmhist.pdf" % gtf_pre)
    try:
        gp.plot()
    finally:
        # Release the graphics device whether or not the plot succeeded.
        grdevices.dev_off()
def main():
    """Filter a GTF file down to genes expressed in one cell type.

    Every input line whose gene_id has an expression value strictly above
    the -t threshold, in the experiment whose name equals <cell type>, is
    copied to stdout as-is. An unknown cell type is reported through
    parser.error.
    """
    # Local import so the block does not rely on a module-level 'sys';
    # lines are written verbatim because they already end with a newline.
    import sys

    usage = 'usage: %prog [options] <gtf file> <cell type>'
    parser = OptionParser(usage)
    parser.add_option('-t', dest='expr_t', type='float', default=.1,
                      help='Minimum allowed fpkm value')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide gtf file and cell type')
    gtf_file = args[0]
    cell_type = args[1]

    # get expression data
    # NOTE(review): called without a file, so the default defined by
    # cufflinks.fpkm_tracking is used - confirm that is the desired input.
    cuff = cufflinks.fpkm_tracking()

    # find cell type experiment index; the first matching experiment is used
    cell_i = None
    for i, experiment in enumerate(cuff.experiments):
        if experiment == cell_type:
            cell_i = i
            break
    if cell_i is None:
        parser.error('Cell type %s does not match any quantified experiments' % cell_type)

    # parse gtf file
    with open(gtf_file) as gtf_in:
        for line in gtf_in:
            fields = line.split('\t')
            gene_id = gff.gtf_kv(fields[8])['gene_id']
            if cuff.gene_expr(gene_id)[cell_i] > options.expr_t:
                sys.stdout.write(line)
def main():
    """Print one promoter GTF record per gene of a reference GTF.

    For each gene a single transcript is chosen: with -f, the isoform with
    the highest geometric mean of (1 + FPKM) in the given fpkm_tracking
    file; otherwise the isoform with the most upstream TSS. The TSS is the
    start of the first exon on '+' and the end of the last exon on '-'.
    The promoter spans -u bp upstream to -d bp downstream of the TSS
    (relative to strand) and is printed to stdout with feature 'promoter'
    and the chosen transcript's attributes.

    Genes without a '+'/'-' strand, or whose promoter would start before
    position 1, are skipped with a warning on stderr.
    """
    usage = 'usage: %prog [options] <ref_gtf>'
    parser = OptionParser(usage)
    parser.add_option('-d', dest='downstream', type='int', default=1000,
                      help='Downstream bp for promoters [Default: %default]')
    parser.add_option('-f', dest='fpkm_tracking',
                      help='Use cufflinks FPKM estimates to choose the most expressed isoform')
    parser.add_option('-u', dest='upstream', type='int', default=1000,
                      help='Upstream bp for promoters [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide reference GTF')
    ref_gtf = args[0]

    g2t = gff.g2t(ref_gtf)
    transcripts = gff.read_genes(ref_gtf)

    # The source column of the first GTF line is reused for every record.
    with open(ref_gtf) as gtf_in:
        source = gtf_in.readline().split()[1]

    if options.fpkm_tracking:
        iso_fpkm_tracking = cufflinks.fpkm_tracking(options.fpkm_tracking)

    def transcript_tss(transcript_id, strand):
        # TSS of a transcript: first exon start on '+', last exon end on '-'.
        exons = transcripts[transcript_id].exons
        if strand == '+':
            return exons[0].start
        return exons[-1].end

    def isoform_expr(transcript_id):
        # Geometric mean of (1 + FPKM) across the tracked values.
        return stats.geo_mean([1+fpkm for fpkm in iso_fpkm_tracking.gene_expr(transcript_id)])

    for gene_id in g2t:
        gene_transcripts = list(g2t[gene_id])
        gene_strand = transcripts[gene_transcripts[0]].strand
        if gene_strand not in ['+', '-']:
            print('WARNING: %s discluded for lack of strand' % gene_id, file=sys.stderr)
            continue

        # choose TSS
        promoter_tid = gene_transcripts[0]
        if options.fpkm_tracking:
            # find most expressed isoform
            max_fpkm = isoform_expr(promoter_tid)
            for transcript_id in gene_transcripts[1:]:
                transcript_fpkm = isoform_expr(transcript_id)
                # A NaN running maximum is replaced by any later isoform.
                if math.isnan(max_fpkm) or transcript_fpkm > max_fpkm:
                    promoter_tid = transcript_id
                    max_fpkm = transcript_fpkm

            tss = transcript_tss(promoter_tid, gene_strand)

        else:
            # find most upstream tss: smallest on '+', largest on '-'
            tss = transcript_tss(promoter_tid, gene_strand)
            for transcript_id in gene_transcripts[1:]:
                transcript_pos = transcript_tss(transcript_id, gene_strand)
                if gene_strand == '+':
                    more_upstream = transcript_pos < tss
                else:
                    more_upstream = transcript_pos > tss
                if more_upstream:
                    promoter_tid = transcript_id
                    tss = transcript_pos

        # promoter interval around the tss, oriented by strand
        if gene_strand == '+':
            prom_start = tss - options.upstream
            prom_end = tss + options.downstream
        else:
            prom_start = tss - options.downstream
            prom_end = tss + options.upstream

        if prom_start < 1:
            print('WARNING: %s discluded for nearness to chromosome end' % gene_id, file=sys.stderr)
        else:
            tx = transcripts[promoter_tid]
            cols = [tx.chrom, source, 'promoter', str(prom_start), str(prom_end),
                    '.', tx.strand, '.', gff.kv_gtf(tx.kv)]
            print('\t'.join(cols))
def main():
    """Plot a heatmap of log2(FPKM + pseudocount) for a set of genes.

    Gene selection, in order:
      * start from every gene in the fpkm_tracking file, or from the
        gene_ids of the -g GTF file;
      * with -d, intersect with find_diff(diff_file); otherwise keep only
        genes with no NaN expression value;
      * with -m > 0, keep genes whose maximum FPKM exceeds the threshold;
      * if more than -s genes remain, take a random sample of that size.

    Values are optionally capped at -a (in log2 space) and sample labels
    optionally uppercased (-u). The data frame is saved next to the output
    PDF with a '.df' extension and rendered by $RDIR/cuff_heat.r.
    """
    usage = 'usage: %prog [options] <fpkm_tracking>'
    parser = OptionParser(usage)
    # optparse only expands '%default'; the previous '%d' was printed literally.
    parser.add_option('-a', dest='max_fpkm', type='float',
                      help='Maximum log2 FPKM to plot [Default: %default]')
    parser.add_option('-d', dest='diff_file',
                      help='Limit to significantly differentially expressed genes')
    parser.add_option('-g', dest='gtf', help='GTF file of genes to display')
    parser.add_option('-m', dest='min_fpkm', default=0, type='float',
                      help='Minimum FPKM [Default: %default]')
    parser.add_option('-p', dest='pseudocount', default=.125, type='float',
                      help='Pseudocount for log FPKM [Default: %default]')
    parser.add_option('-o', dest='out_pdf', default='cuff_heat.pdf',
                      help='Output PDF [Default: %default]')
    parser.add_option('-s', dest='sample', default=1000, type='int',
                      help='Sample genes rather than use all [Default: %default]')
    parser.add_option('-u', dest='uppercase', default=False, action='store_true',
                      help='Uppercase sample labels [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide fpkm_tracking')
    fpkm_tracking = args[0]

    # load expression data
    cuff = cufflinks.fpkm_tracking(fpkm_file=fpkm_tracking)

    # determine genes
    all_genes = set(cuff.genes)
    if options.gtf:
        all_genes = set()
        with open(options.gtf) as gtf_in:
            for line in gtf_in:
                a = line.split('\t')
                all_genes.add(gff.gtf_kv(a[8])['gene_id'])

    if options.diff_file:
        # limit to differentially expressed genes
        diff_genes = find_diff(options.diff_file)
        all_genes &= diff_genes
    else:
        # at least limit to clean genes (no NaN in any experiment)
        clean_genes = set()
        for gene_id in all_genes:
            ge = cuff.gene_expr(gene_id)
            if not any(math.isnan(value) for value in ge):
                clean_genes.add(gene_id)
        all_genes &= clean_genes

    if options.min_fpkm > 0:
        expressed_genes = set()
        for gene_id in all_genes:
            ge = cuff.gene_expr(gene_id, not_found=0, fail=0)
            if max(ge) > options.min_fpkm:
                expressed_genes.add(gene_id)
        all_genes &= expressed_genes

    # sample genes to display
    if len(all_genes) <= options.sample:
        display_genes = all_genes
    else:
        # random.sample needs a sequence; sets are rejected by newer Pythons.
        display_genes = random.sample(list(all_genes), options.sample)

    # build data frame
    df = {'Gene': [], 'FPKM': [], 'Sample': []}
    for gene_id in display_genes:
        ge = cuff.gene_expr(gene_id, not_found=0, fail=0)

        for i, experiment in enumerate(cuff.experiments):
            df['Gene'].append(gene_id)
            df['Sample'].append(experiment.upper() if options.uppercase else experiment)

            logfpkm = np.log2(ge[i] + options.pseudocount)
            # 'is not None' so that a cap of 0.0 is honored.
            if options.max_fpkm is not None:
                logfpkm = min(options.max_fpkm, logfpkm)

            df['FPKM'].append(logfpkm)

    # plot
    # splitext instead of [:-4] so non-'.pdf' output names keep their stem.
    out_df = '%s.df' % os.path.splitext(options.out_pdf)[0]
    ggplot.plot('%s/cuff_heat.r' % os.environ['RDIR'], df, [options.out_pdf], df_file=out_df)
def main():
    """Render a gene-by-sample heatmap of log2(FPKM + pseudocount).

    The gene set starts as all genes of the fpkm_tracking file (or the
    gene_ids of the -g GTF), is intersected with find_diff(-d) when a diff
    file is given or otherwise reduced to genes without NaN expression,
    is optionally filtered to genes whose maximum FPKM exceeds -m, and is
    randomly down-sampled to -s genes if larger. Log values can be capped
    with -a and sample names uppercased with -u. The data frame is written
    to '<out_pdf stem>.df' and plotted by $RDIR/cuff_heat.r.
    """
    usage = 'usage: %prog [options] <fpkm_tracking>'
    parser = OptionParser(usage)
    # '%default' is the placeholder optparse substitutes ('%d' is not).
    parser.add_option('-a',
                      dest='max_fpkm',
                      type='float',
                      help='Maximum log2 FPKM to plot [Default: %default]')
    parser.add_option(
        '-d',
        dest='diff_file',
        help='Limit to significantly differentially expressed genes')
    parser.add_option('-g', dest='gtf', help='GTF file of genes to display')
    parser.add_option('-m',
                      dest='min_fpkm',
                      default=0,
                      type='float',
                      help='Minimum FPKM [Default: %default]')
    parser.add_option('-p',
                      dest='pseudocount',
                      default=.125,
                      type='float',
                      help='Pseudocount for log FPKM [Default: %default]')
    parser.add_option('-o',
                      dest='out_pdf',
                      default='cuff_heat.pdf',
                      help='Output PDF [Default: %default]')
    parser.add_option(
        '-s',
        dest='sample',
        default=1000,
        type='int',
        help='Sample genes rather than use all [Default: %default]')
    parser.add_option('-u',
                      dest='uppercase',
                      default=False,
                      action='store_true',
                      help='Uppercase sample labels [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide fpkm_tracking')
    fpkm_tracking = args[0]

    # load expression data
    cuff = cufflinks.fpkm_tracking(fpkm_file=fpkm_tracking)

    # determine genes
    if options.gtf:
        with open(options.gtf) as gtf_in:
            all_genes = set(
                gff.gtf_kv(line.split('\t')[8])['gene_id'] for line in gtf_in)
    else:
        all_genes = set(cuff.genes)

    if options.diff_file:
        # limit to differentially expressed genes
        all_genes &= find_diff(options.diff_file)
    else:
        # at least limit to clean genes: drop any gene with a NaN value
        clean_genes = set()
        for gene_id in all_genes:
            if not any(math.isnan(v) for v in cuff.gene_expr(gene_id)):
                clean_genes.add(gene_id)
        all_genes &= clean_genes

    if options.min_fpkm > 0:
        expressed_genes = set()
        for gene_id in all_genes:
            ge = cuff.gene_expr(gene_id, not_found=0, fail=0)
            if max(ge) > options.min_fpkm:
                expressed_genes.add(gene_id)
        all_genes &= expressed_genes

    # sample genes to display
    if len(all_genes) > options.sample:
        # Convert to a list: random.sample no longer accepts sets.
        display_genes = random.sample(list(all_genes), options.sample)
    else:
        display_genes = all_genes

    # build data frame
    df = {'Gene': [], 'FPKM': [], 'Sample': []}
    for gene_id in display_genes:
        ge = cuff.gene_expr(gene_id, not_found=0, fail=0)

        for i, experiment in enumerate(cuff.experiments):
            label = experiment.upper() if options.uppercase else experiment

            logfpkm = np.log2(ge[i] + options.pseudocount)
            # Compare against None so a cap of exactly 0 is still applied.
            if options.max_fpkm is not None:
                logfpkm = min(options.max_fpkm, logfpkm)

            df['Gene'].append(gene_id)
            df['Sample'].append(label)
            df['FPKM'].append(logfpkm)

    # plot
    # Strip the real extension rather than assuming it is 4 characters long.
    out_df = '%s.df' % os.path.splitext(options.out_pdf)[0]
    ggplot.plot('%s/cuff_heat.r' % os.environ['RDIR'],
                df, [options.out_pdf],
                df_file=out_df)
def main():
    """Emit a promoter GTF line for every gene in a reference GTF.

    One transcript is picked per gene: the isoform with the largest
    geometric mean of (1 + FPKM) when -f supplies an fpkm_tracking file,
    else the isoform whose TSS lies furthest upstream. The TSS is the
    first exon's start on the '+' strand and the last exon's end on the
    '-' strand. The promoter reaches -u bp upstream and -d bp downstream
    of the TSS with respect to strand and is written to stdout using the
    chosen transcript's chromosome, strand and attributes.

    Warnings go to stderr for genes lacking a '+'/'-' strand and for
    promoters that would begin before position 1; those genes are skipped.
    """
    usage = 'usage: %prog [options] <ref_gtf>'
    parser = OptionParser(usage)
    #parser.add_option()
    parser.add_option('-d',
                      dest='downstream',
                      type='int',
                      default=1000,
                      help='Downstream bp for promoters [Default: %default]')
    parser.add_option(
        '-f',
        dest='fpkm_tracking',
        help='Use cufflinks FPKM estimates to choose the most expressed isoform'
    )
    parser.add_option('-u',
                      dest='upstream',
                      type='int',
                      default=1000,
                      help='Upstream bp for promoters [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide reference GTF')
    ref_gtf = args[0]

    g2t = gff.g2t(ref_gtf)
    transcripts = gff.read_genes(ref_gtf)

    # Source column of the first GTF line, reused for all output records.
    with open(ref_gtf) as gtf_in:
        source = gtf_in.readline().split()[1]

    if options.fpkm_tracking:
        iso_fpkm_tracking = cufflinks.fpkm_tracking(options.fpkm_tracking)

    def warn(message):
        # write() is used so the block is valid under Python 2 and 3 alike.
        sys.stderr.write(message + '\n')

    def transcript_tss(transcript_id, strand):
        # First exon start on '+', last exon end on '-'.
        exons = transcripts[transcript_id].exons
        if strand == '+':
            return exons[0].start
        return exons[-1].end

    def isoform_expr(transcript_id):
        # Geometric mean of (1 + FPKM) over the isoform's tracked values.
        return stats.geo_mean([
            1 + fpkm for fpkm in iso_fpkm_tracking.gene_expr(transcript_id)
        ])

    for gene_id in g2t:
        gene_transcripts = list(g2t[gene_id])
        gene_strand = transcripts[gene_transcripts[0]].strand
        if gene_strand not in ['+', '-']:
            warn('WARNING: %s discluded for lack of strand' % gene_id)
            continue

        # choose TSS
        promoter_tid = gene_transcripts[0]
        if options.fpkm_tracking:
            # find most expressed isoform
            max_fpkm = isoform_expr(promoter_tid)
            for transcript_id in gene_transcripts[1:]:
                transcript_fpkm = isoform_expr(transcript_id)
                # Any later isoform replaces a NaN running maximum.
                if math.isnan(max_fpkm) or transcript_fpkm > max_fpkm:
                    promoter_tid = transcript_id
                    max_fpkm = transcript_fpkm

            tss = transcript_tss(promoter_tid, gene_strand)

        else:
            # find most upstream tss: minimum on '+', maximum on '-'
            tss = transcript_tss(promoter_tid, gene_strand)
            for transcript_id in gene_transcripts[1:]:
                transcript_pos = transcript_tss(transcript_id, gene_strand)
                if gene_strand == '+':
                    more_upstream = transcript_pos < tss
                else:
                    more_upstream = transcript_pos > tss
                if more_upstream:
                    promoter_tid = transcript_id
                    tss = transcript_pos

        # promoter interval around the tss, oriented by strand
        if gene_strand == '+':
            prom_start = tss - options.upstream
            prom_end = tss + options.downstream
        else:
            prom_start = tss - options.downstream
            prom_end = tss + options.upstream

        if prom_start < 1:
            warn('WARNING: %s discluded for nearness to chromosome end' % gene_id)
        else:
            tx = transcripts[promoter_tid]
            cols = [
                tx.chrom, source, 'promoter',
                str(prom_start),
                str(prom_end), '.', tx.strand, '.',
                gff.kv_gtf(tx.kv)
            ]
            print('\t'.join(cols))