def map_genes(gtf_file, fpkm_file, pseudocount=0.125, all_isoforms=False, random_zeros=False):
    """Map each gene_id to its most highly expressed isoform.

    Args:
        gtf_file: Path to the GTF file defining genes and transcripts.
        fpkm_file: Path to an expression file. A name ending in '.diff' is
            parsed with diff_fpkm(); anything else with cuff_fpkm().
        pseudocount: Passed to the expression parser; log2(pseudocount) is
            also the floor an isoform must exceed to be chosen.
        all_isoforms: If True, consider every transcript returned by
            gff.g2t(). Otherwise drop transcripts whose 'transcript_type'
            is intron, prerna, nonsense_mediated_decay, retained_intron or
            non_stop_decay.
        random_zeros: If True, genes with no isoform above the floor get a
            randomly chosen isoform instead of None.

    Returns:
        dict mapping gene_id to the chosen transcript_id, or to None when no
        isoform exceeds the floor and random_zeros is False.
    """
    # get expression data
    # NOTE(review): values are presumably log2-scale, since they are compared
    # against log2(pseudocount) below - confirm against diff_fpkm/cuff_fpkm.
    if fpkm_file.endswith('.diff'):
        transcript_fpkm = diff_fpkm(fpkm_file, pseudocount)
    else:
        transcript_fpkm = cuff_fpkm(fpkm_file, pseudocount)

    # get genes
    if all_isoforms:
        g2t = gff.g2t(gtf_file)
    else:
        excluded_types = ('intron', 'prerna', 'nonsense_mediated_decay',
                          'retained_intron', 'non_stop_decay')
        g2t = {}
        # context manager so the GTF handle is closed deterministically
        with open(gtf_file) as gtf_in:
            for line in gtf_in:
                a = line.split('\t')
                kv = gff.gtf_kv(a[8])
                # NOTE(review): raises KeyError for lines that lack a
                # 'transcript_type' attribute (unchanged from the original).
                if kv['transcript_type'] not in excluded_types:
                    g2t.setdefault(kv['gene_id'], set()).add(kv['transcript_id'])

    # map gene_id's to max expression isoform
    gene_max_iso = {}
    min_fpkm = math.log(pseudocount, 2)
    for gid in g2t:
        max_fpkm_tid = None
        max_fpkm = min_fpkm
        for tid in g2t[gid]:
            # transcripts missing from the expression table count as the floor
            tid_fpkm = transcript_fpkm.get(tid, min_fpkm)
            if tid_fpkm > max_fpkm:
                max_fpkm_tid = tid
                max_fpkm = tid_fpkm
        gene_max_iso[gid] = max_fpkm_tid

    # choose isoforms for None
    if random_zeros:
        for gid in g2t:
            if gene_max_iso[gid] is None:
                # random.choice() needs an indexable sequence; the transcript
                # collections here are sets. Sorting also makes the pick
                # reproducible under a fixed random seed.
                gene_max_iso[gid] = random.choice(sorted(g2t[gid]))

    return gene_max_iso
def main():
    """Print one promoter GTF line per gene in a reference GTF.

    For every gene a single transcript is chosen, either the most expressed
    isoform (when -f gives a cufflinks FPKM tracking file) or the isoform
    with the most upstream TSS. The promoter spans `upstream` bp before the
    TSS to `downstream` bp after it, oriented by strand. Genes without a
    '+'/'-' strand, or whose promoter would start before position 1, are
    skipped with a warning on stderr.
    """
    usage = 'usage: %prog [options] <ref_gtf>'
    parser = OptionParser(usage)
    parser.add_option('-d', dest='downstream', type='int', default=1000,
                      help='Downstream bp for promoters [Default: %default]')
    parser.add_option('-f', dest='fpkm_tracking',
                      help='Use cufflinks FPKM estimates to choose the most expressed isoform')
    parser.add_option('-u', dest='upstream', type='int', default=1000,
                      help='Upstream bp for promoters [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        # parser.error() prints the message and exits
        parser.error('Must provide reference GTF')
    ref_gtf = args[0]

    g2t = gff.g2t(ref_gtf)
    transcripts = gff.read_genes(ref_gtf)

    # the output 'source' column is copied from the first line of the input;
    # the context manager closes the handle instead of leaking it
    with open(ref_gtf) as gtf_in:
        source = gtf_in.readline().split()[1]

    if options.fpkm_tracking:
        iso_fpkm_tracking = cufflinks.fpkm_tracking(options.fpkm_tracking)

    def isoform_expr(transcript_id):
        # geometric mean of (1 + FPKM) over the values gene_expr() returns
        return stats.geo_mean([1 + fpkm for fpkm in iso_fpkm_tracking.gene_expr(transcript_id)])

    def transcript_tss(transcript_id, strand):
        # TSS is the first exon's start on '+', the last exon's end on '-'
        exons = transcripts[transcript_id].exons
        if strand == '+':
            return exons[0].start
        return exons[-1].end

    for gene_id in g2t:
        gene_transcripts = list(g2t[gene_id])
        gene_strand = transcripts[gene_transcripts[0]].strand
        if gene_strand not in ['+', '-']:
            print('WARNING: %s discluded for lack of strand' % gene_id, file=sys.stderr)
            continue

        # choose TSS
        promoter_tid = gene_transcripts[0]
        if options.fpkm_tracking:
            # find most expressed isoform
            max_fpkm = isoform_expr(promoter_tid)
            for transcript_id in gene_transcripts[1:]:
                transcript_fpkm = isoform_expr(transcript_id)
                # a NaN running maximum is always replaced by the next isoform
                if math.isnan(max_fpkm) or transcript_fpkm > max_fpkm:
                    promoter_tid = transcript_id
                    max_fpkm = transcript_fpkm
            tss = transcript_tss(promoter_tid, gene_strand)
        else:
            # find most upstream tss: smallest start on '+', largest end on '-'
            tss = transcript_tss(promoter_tid, gene_strand)
            for transcript_id in gene_transcripts[1:]:
                transcript_pos = transcript_tss(transcript_id, gene_strand)
                if gene_strand == '+':
                    more_upstream = transcript_pos < tss
                else:
                    more_upstream = transcript_pos > tss
                if more_upstream:
                    promoter_tid = transcript_id
                    tss = transcript_pos

        # promoter interval around the tss, oriented by strand
        if gene_strand == '+':
            promoter_start = tss - options.upstream
            promoter_end = tss + options.downstream
        else:
            promoter_start = tss - options.downstream
            promoter_end = tss + options.upstream

        if promoter_start < 1:
            print('WARNING: %s discluded for nearness to chromosome end' % gene_id, file=sys.stderr)
            continue

        tx = transcripts[promoter_tid]
        cols = [tx.chrom, source, 'promoter', str(promoter_start), str(promoter_end),
                '.', tx.strand, '.', gff.kv_gtf(tx.kv)]
        print('\t'.join(cols))
def main():
    """Simulate reads from a transcriptome and write FASTQ and GFF output.

    Transcript abundances come from a cufflinks FPKM file (-f) or are drawn
    from log-normal distributions per gene and per isoform. Each transcript
    receives a Poisson-sampled number of reads in proportion to its abundance
    and its number of valid read start positions. Reads are written to
    <prefix>.fastq, their transcript coordinates to <prefix>_txome.gff, and
    tgff_cgff.py is then run to produce <prefix>_genome.gff.
    """
    usage = 'usage: %prog [options] <gtf> <fasta>'
    parser = OptionParser(usage)
    parser.add_option('-b', dest='bam_length',
                      help='Obtain read length via sampling a distribution from a BAM file [Default: %default]')
    parser.add_option('-e', dest='error_rate', type='float', default=0,
                      help='Error rate (uniform on reads) [Default: %default]')
    parser.add_option('-f', dest='fpkm_file',
                      help='Cufflinks .fpkm_tracking file to use for FPKMs [Default: %default]')
    parser.add_option('-l', dest='read_length', type='int', default=30,
                      help='Read length [Default: %default]')
    parser.add_option('-n', dest='num_reads', type='int', default=100000,
                      help='Number of reads [Default: %default]')
    parser.add_option('-o', dest='output_prefix', default='reads',
                      help='Output files prefix [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # parser.error() prints the message and exits
        parser.error('Must provide GTF file and fasta file')
    gtf_file = args[0]
    fasta_file = args[1]

    if options.bam_length:
        read_length_distribution = bam_length_distribution(options.bam_length)
    else:
        read_length_distribution = {options.read_length: 1}

    # read GTF gene_id to transcript_id's mapping
    g2t = gff.g2t(gtf_file)

    # get transcript lengths as the sum of their exon lengths
    transcript_lengths = {}
    with open(gtf_file) as gtf_in:
        for line in gtf_in:
            a = line.split('\t')
            if a[2] == 'exon':
                transcript_id = gff.gtf_kv(a[8])['transcript_id']
                exon_length = int(a[4]) - int(a[3]) + 1
                transcript_lengths[transcript_id] = transcript_lengths.get(transcript_id, 0) + exon_length

    if options.fpkm_file:
        # use the provided abundances: column 0 is the id, column 9 the value
        transcript_copies = {}
        with open(options.fpkm_file) as fpkm_in:
            fpkm_in.readline()  # skip the header line
            for line in fpkm_in:
                a = line.split('\t')
                transcript_copies[a[0]] = float(a[9])

        if sum(transcript_copies.values()) == 0:
            sys.stderr.write('FPKM file shows no expression. Exiting.\n')
            sys.exit(1)

    else:
        # sample gene copies
        gene_copies_raw = lognorm.rvs(1, size=len(g2t))
        gene_copies_raw_sum = sum(gene_copies_raw)
        gene_copies = dict(zip(g2t.keys(), [gcr/gene_copies_raw_sum for gcr in gene_copies_raw]))

        # sample transcript copies: split each gene's share among its isoforms
        transcript_copies = {}
        for gene_id in g2t:
            relative_copies = dict(zip(g2t[gene_id], lognorm.rvs(1, size=len(g2t[gene_id]))))
            relative_sum = sum(relative_copies.values())
            for transcript_id in g2t[gene_id]:
                transcript_copies[transcript_id] = gene_copies[gene_id]*relative_copies[transcript_id]/relative_sum

    # determine transcript probabilities as a function of copy and length
    min_read_length = min(read_length_distribution)
    transcript_weights = {}
    for transcript_id in transcript_copies:
        tx_length = transcript_lengths[transcript_id]
        if tx_length >= min_read_length:
            weight = 0
            for read_length in read_length_distribution:
                # a read longer than the transcript has no valid start
                # position; skipping it avoids a negative contribution
                if read_length <= tx_length:
                    weight += read_length_distribution[read_length]*transcript_copies[transcript_id]*(tx_length-read_length+1)
            if weight > 0:
                transcript_weights[transcript_id] = weight
    weights_sum = sum(transcript_weights.values())
    transcript_probs = {tid: transcript_weights[tid]/weights_sum for tid in transcript_weights}

    # open fasta file
    fasta = pysam.Fastafile(fasta_file)

    # open output files; the context manager closes them even on error
    txome_gff = '%s_txome.gff' % options.output_prefix
    with open('%s.fastq' % options.output_prefix, 'w') as fastq_out, open(txome_gff, 'w') as gff_out:
        # for each transcript
        read_index = 1
        for transcript_id in transcript_probs:
            tx_length = transcript_lengths[transcript_id]
            expected_reads = transcript_probs[transcript_id]*options.num_reads
            if expected_reads == 0:
                sampled_reads = 0
            else:
                sampled_reads = poisson.rvs(expected_reads)

            for s in range(sampled_reads):
                read_length = sample_read_length(read_length_distribution)
                # >= so a transcript exactly as long as the read (one valid
                # start) can be sampled, matching the weights above
                if tx_length >= read_length:
                    pos = random.randint(0, tx_length-read_length)
                    seq = fasta.fetch(transcript_id, pos, pos+read_length).upper()
                    if seq:
                        eseq = inject_errors(seq, options.error_rate)
                        fastq_out.write('@read%d\n%s\n+\n%s\n' % (read_index, eseq, 'I'*read_length))
                        gff_cols = [transcript_id, 'sim', 'read', str(pos+1), str(pos+read_length),
                                    '.', '+', '.', 'read%d' % read_index]
                        gff_out.write('\t'.join(gff_cols) + '\n')
                        read_index += 1
                    else:
                        sys.stderr.write('Missing fasta sequence %s:%d-%d\n' % (transcript_id, pos, (pos+read_length)))

    # map back to genome; argument list plus explicit stdout redirect so
    # paths containing spaces or shell metacharacters are passed verbatim
    with open('%s_genome.gff' % options.output_prefix, 'w') as genome_out:
        subprocess.call(['tgff_cgff.py', '-c', gtf_file, txome_gff], stdout=genome_out)
def main():
    """Call CLIP peaks per gene and write them to <out_dir>/peaks.gff.

    Steps, as performed below:
      1. Build an updated GTF (prerna_gtf() when a control BAM is given,
         span_gtf() otherwise) and, unless --cuff_done, run cufflinks on it.
      2. Load transcripts, set their junctions and FPKMs, count the CLIP
         reads overlapping <out_dir>/transcripts.gtf, and compute the
         transcriptome size used for the window tests.
      3. For each gene (or only the -g gene), weight read positions, compute
         window statistics, refine windows to peaks and write each peak as a
         GFF line.
    """
    usage = 'usage: %prog [options] <clip_bam> <ref_gtf>'
    parser = OptionParser(usage)

    # IO options
    parser.add_option('-c', dest='control_bam', help='Control BAM file')
    parser.add_option('-o', dest='out_dir', default='peaks',
                      help='Output directory [Default: %default]')

    # peak calling options
    parser.add_option('-w', dest='window_size', type='int', default=50,
                      help='Window size for scan statistic [Default: %default]')
    parser.add_option('-p', dest='p_val', type='float', default=.01,
                      help='P-value required of window scan statistic tests [Default: %default]')

    # cufflinks options
    parser.add_option('--cuff_done', dest='cuff_done', action='store_true', default=False,
                      help='A cufflinks run to estimate the model parameters is already done [Default: %default]')
    parser.add_option('-t', dest='threads', type='int', default=2,
                      help='Number of threads to use [Default: %default]')

    # debug options
    parser.add_option('-v', '--verbose', dest='verbose', action='store_true', default=False,
                      help='Verbose output [Default: %default]')
    parser.add_option('-g', '--gene', dest='gene_only',
                      help='Call peaks on the specified gene only')
    parser.add_option('--print_windows', dest='print_windows', default=False, action='store_true',
                      help='Print statistics for all windows [Default: %default]')

    (options, args) = parser.parse_args()

    if len(args) != 2:
        # parser.error() prints the message and exits
        parser.error(usage)
    clip_bam = args[0]
    ref_gtf = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    def log(message):
        # progress messages go to stderr, only in verbose mode
        if options.verbose:
            sys.stderr.write(message + '\n')

    ############################################
    # parameterize
    ############################################
    log('Estimating gene abundances...')

    if options.control_bam:
        # make a new gtf w/ unspliced RNAs; cufflinks runs on the control BAM
        update_ref_gtf = prerna_gtf(ref_gtf, options.out_dir)
        cuff_bam = options.control_bam
    else:
        # make a new gtf file of only loci-spanning RNAs; cufflinks runs on
        # the CLIP BAM
        update_ref_gtf = span_gtf(ref_gtf, options.out_dir)
        cuff_bam = clip_bam

    # run Cufflinks on new gtf file; argument list (no shell) so paths with
    # spaces or shell metacharacters are passed verbatim
    if not options.cuff_done:
        subprocess.call(['cufflinks', '-o', options.out_dir, '-p', str(options.threads),
                         '-G', update_ref_gtf, cuff_bam])

    # store transcripts
    transcripts = read_genes(update_ref_gtf, key_id='transcript_id')
    g2t = gff.g2t(update_ref_gtf)

    # set junctions
    set_transcript_junctions(transcripts)

    # set "exon" FPKMs
    set_transcript_fpkms(transcripts, options.out_dir, options.verbose)

    log('Computing global statistics...')

    # count transcriptome CLIP reads (overestimates small RNA single ended
    # reads by counting antisense)
    transcripts_bam = '%s/transcripts.bam' % options.out_dir
    with open(transcripts_bam, 'wb') as transcripts_bam_out:
        subprocess.call(['intersectBed', '-abam', clip_bam, '-b', '%s/transcripts.gtf' % options.out_dir],
                        stdout=transcripts_bam_out)
    total_reads = count_reads(transcripts_bam)

    # compute # of tests we will perform
    txome_size = transcriptome_size(transcripts, options.window_size)

    ############################################
    # process genes
    ############################################
    # TODO: Can I convert to using transcripts.bam here? Does it affect
    # performance given an indexing?

    # index
    subprocess.call(['samtools', 'index', clip_bam])

    # open clip-seq bam
    clip_in = pysam.Samfile(clip_bam, 'rb')

    # open peak output gff
    peaks_out = open('%s/peaks.gff' % options.out_dir, 'w')
    peak_id = 1

    # open window output
    windows_out = None
    if options.print_windows:
        windows_out = open('%s/window_stats.txt' % options.out_dir, 'w')

    try:
        # for each gene
        if options.gene_only:
            gene_ids = [options.gene_only]
        else:
            gene_ids = list(g2t)

        for gene_id in gene_ids:
            log('Processing %s...' % gene_id)

            # make a more focused transcript hash for this gene
            gene_transcripts = {}
            for tid in g2t[gene_id]:
                gene_transcripts[tid] = transcripts[tid]

            # obtain basic gene attributes
            (gchrom, gstrand, gstart, gend) = gene_attrs(gene_transcripts)

            log('\tFetching alignments...')

            # choose a single event position and weight the reads
            read_pos_weights = position_reads(clip_in, gchrom, gstart, gend, gstrand)

            log('\tCounting and computing in windows...')

            # count reads and compute p-values in windows
            window_stats = count_windows(clip_in, options.window_size, read_pos_weights, gene_transcripts,
                                         gstart, gend, total_reads, txome_size, windows_out)

            log('\tRefining peaks...')

            # post-process windows to peaks
            peaks = windows2peaks(read_pos_weights, gene_transcripts, gstart, window_stats,
                                  options.window_size, options.p_val, total_reads, txome_size)

            # output peaks
            for pstart, pend, pcount, ppval in peaks:
                if ppval > 0:
                    # arctan of -log_1000(p), scaled by 2000/pi
                    peak_score = int(2000/math.pi*math.atan(-math.log(ppval, 1000)))
                else:
                    # a p-value of zero cannot be logged; use 1000
                    peak_score = 1000
                cols = [gchrom, 'clip_peaks', 'peak', str(pstart), str(pend), str(peak_score), gstrand, '.',
                        'id "PEAK%d"; gene_id "%s"; count "%.1f"; p "%.2e"' % (peak_id, gene_id, pcount, ppval)]
                peaks_out.write('\t'.join(cols) + '\n')
                peak_id += 1
    finally:
        # close every handle, including the optional window stats file that
        # was previously never closed
        clip_in.close()
        peaks_out.close()
        if windows_out is not None:
            windows_out.close()
def main():
    """Rewrite gene_ids in a merged GTF to agree with a reference GTF.

    Each merged gene is handled by how many reference genes its transcripts
    belong to:
      * none: its lines are printed unchanged;
      * one: every line gets that reference gene_id (and its gene_name when
        the reference supplies one);
      * several: a transcript overlap graph is built, new transcripts that
        overlap more than one reference gene are removed, edges joining
        different reference genes are cut, and map_new_gid() assigns
        gene_ids. Transcripts it leaves unassigned are dropped.
    Output goes to stdout; removals are reported on stderr.
    """
    usage = 'usage: %prog [options] <ref gtf> <merged gtf>'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # parser.error() prints the message and exits
        parser.error(usage)
    ref_gtf = args[0]
    merged_gtf = args[1]

    # get mappings
    ref_t2g = gff.t2g(ref_gtf)
    merged_g2t = gff.g2t(merged_gtf)

    # hash gene_name's by gene_id
    ref_gid_names = {}
    with open(ref_gtf) as ref_in:
        for line in ref_in:
            a = line.split('\t')
            kv = gff.gtf_kv(a[8])
            if 'gene_name' in kv:
                ref_gid_names[kv['gene_id']] = kv['gene_name']

    # hash merged lines by tid
    merged_tid_lines = {}
    with open(merged_gtf) as merged_in:
        for line in merged_in:
            a = line.split('\t')
            tid = gff.gtf_kv(a[8])['transcript_id']
            merged_tid_lines.setdefault(tid, []).append(line)

    def print_relabeled(tid, new_gene_id):
        # print every line of transcript tid with gene_id (and gene_name,
        # when the reference has one) replaced
        for line in merged_tid_lines[tid]:
            a = line.split('\t')
            kv = gff.gtf_kv(a[8])
            kv['gene_id'] = new_gene_id
            if new_gene_id in ref_gid_names:
                kv['gene_name'] = ref_gid_names[new_gene_id]
            a[8] = gff.kv_gtf(kv)
            sys.stdout.write('\t'.join(a) + '\n')

    # initialize orphan gene_id
    orphan_num = 1

    for mgene_id in merged_g2t:
        # count reference genes
        ref_genes = set()
        for tid in merged_g2t[mgene_id]:
            if tid in ref_t2g:
                ref_genes.add(ref_t2g[tid])

        # if no known genes, leave it alone
        if len(ref_genes) == 0:
            for tid in merged_g2t[mgene_id]:
                # stored lines keep their own newlines
                sys.stdout.write(''.join(merged_tid_lines[tid]))

        # if known gene, set gene_id to it
        elif len(ref_genes) == 1:
            new_gene_id = list(ref_genes)[0]
            for tid in merged_g2t[mgene_id]:
                print_relabeled(tid, new_gene_id)

        # if two known genes were combined, fix it
        else:
            # compute transcript overlaps and build overlap graph
            # NOTE(review): presumably a networkx graph - only edges(),
            # remove_node() and remove_edge() are used here.
            tid_overlap_graph = make_overlap_graph(mgene_id, merged_g2t, merged_tid_lines)

            # map each new transcript to the ref gene_id's overlapped
            tid_ref_genes = {}
            for (tid1, tid2) in tid_overlap_graph.edges():
                if tid1 in ref_t2g and tid2 not in ref_t2g:
                    tid_ref_genes.setdefault(tid2, set()).add(ref_t2g[tid1])
                elif tid1 not in ref_t2g and tid2 in ref_t2g:
                    tid_ref_genes.setdefault(tid1, set()).add(ref_t2g[tid2])

            # remove new transcripts overlapping multiple ref gene_id's
            for tid in tid_ref_genes:
                if len(tid_ref_genes[tid]) > 1:
                    sys.stderr.write('Removing %s\n' % tid)
                    tid_overlap_graph.remove_node(tid)

            # remove edges connecting separate reference genes; iterate over
            # a snapshot because edges are removed inside the loop
            for (tid1, tid2) in list(tid_overlap_graph.edges()):
                if tid1 in ref_t2g and tid2 in ref_t2g and ref_t2g[tid1] != ref_t2g[tid2]:
                    tid_overlap_graph.remove_edge(tid1, tid2)

            # map to new gene_id's; missing means eliminate transcript
            tid_new_gid, orphan_num = map_new_gid(tid_overlap_graph, orphan_num, ref_t2g)

            for tid in merged_g2t[mgene_id]:
                if tid in tid_new_gid:
                    print_relabeled(tid, tid_new_gid[tid])
def main():
    """Print one promoter GTF line per gene in a reference GTF.

    For every gene a single transcript is chosen, either the most expressed
    isoform (when -f gives a cufflinks FPKM tracking file) or the isoform
    with the most upstream TSS. The promoter spans `upstream` bp before the
    TSS to `downstream` bp after it, oriented by strand. Genes without a
    '+'/'-' strand, or whose promoter would start before position 1, are
    skipped with a warning on stderr.
    """
    usage = 'usage: %prog [options] <ref_gtf>'
    parser = OptionParser(usage)
    parser.add_option('-d', dest='downstream', type='int', default=1000,
                      help='Downstream bp for promoters [Default: %default]')
    parser.add_option('-f', dest='fpkm_tracking',
                      help='Use cufflinks FPKM estimates to choose the most expressed isoform')
    parser.add_option('-u', dest='upstream', type='int', default=1000,
                      help='Upstream bp for promoters [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        # parser.error() prints the message and exits
        parser.error('Must provide reference GTF')
    ref_gtf = args[0]

    g2t = gff.g2t(ref_gtf)
    transcripts = gff.read_genes(ref_gtf)

    # the output 'source' column is copied from the first line of the input;
    # the context manager closes the handle instead of leaking it
    with open(ref_gtf) as gtf_in:
        source = gtf_in.readline().split()[1]

    if options.fpkm_tracking:
        iso_fpkm_tracking = cufflinks.fpkm_tracking(options.fpkm_tracking)

    def isoform_expr(transcript_id):
        # geometric mean of (1 + FPKM) over the values gene_expr() returns
        return stats.geo_mean([1 + fpkm for fpkm in iso_fpkm_tracking.gene_expr(transcript_id)])

    def transcript_tss(transcript_id, strand):
        # TSS is the first exon's start on '+', the last exon's end on '-'
        exons = transcripts[transcript_id].exons
        if strand == '+':
            return exons[0].start
        return exons[-1].end

    for gene_id in g2t:
        gene_transcripts = list(g2t[gene_id])
        gene_strand = transcripts[gene_transcripts[0]].strand
        if gene_strand not in ['+', '-']:
            print('WARNING: %s discluded for lack of strand' % gene_id, file=sys.stderr)
            continue

        # choose TSS
        promoter_tid = gene_transcripts[0]
        if options.fpkm_tracking:
            # find most expressed isoform
            max_fpkm = isoform_expr(promoter_tid)
            for transcript_id in gene_transcripts[1:]:
                transcript_fpkm = isoform_expr(transcript_id)
                # a NaN running maximum is always replaced by the next isoform
                if math.isnan(max_fpkm) or transcript_fpkm > max_fpkm:
                    promoter_tid = transcript_id
                    max_fpkm = transcript_fpkm
            tss = transcript_tss(promoter_tid, gene_strand)
        else:
            # find most upstream tss: smallest start on '+', largest end on '-'
            tss = transcript_tss(promoter_tid, gene_strand)
            for transcript_id in gene_transcripts[1:]:
                transcript_pos = transcript_tss(transcript_id, gene_strand)
                if gene_strand == '+':
                    more_upstream = transcript_pos < tss
                else:
                    more_upstream = transcript_pos > tss
                if more_upstream:
                    promoter_tid = transcript_id
                    tss = transcript_pos

        # promoter interval around the tss, oriented by strand
        if gene_strand == '+':
            promoter_start = tss - options.upstream
            promoter_end = tss + options.downstream
        else:
            promoter_start = tss - options.downstream
            promoter_end = tss + options.upstream

        if promoter_start < 1:
            print('WARNING: %s discluded for nearness to chromosome end' % gene_id, file=sys.stderr)
            continue

        tx = transcripts[promoter_tid]
        cols = [tx.chrom, source, 'promoter', str(promoter_start), str(promoter_end),
                '.', tx.strand, '.', gff.kv_gtf(tx.kv)]
        print('\t'.join(cols))