def main():
    """Print one 'gene' GTF line spanning all features of each gene_id.

    Reads the GTF file named on the command line, groups its lines by
    gene_id, and prints one line per gene that runs from the smallest start
    to the largest end. The attributes column holds the gene_id and a
    comma-joined list of the gene's transcript ids.
    """
    usage = 'usage: %prog [options] <gtf_file>'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    # fail with a usage message rather than an IndexError
    if len(args) != 1:
        parser.error('Must provide GTF file')
    gtf_file = args[0]

    # group lines by gene_id
    genes = {}
    with open(gtf_file) as gtf_in:
        for line in gtf_in:
            # blank and comment lines have no columns to parse
            if not line.strip() or line.startswith('#'):
                continue
            # NOTE(review): whitespace column 9 is taken to be the quoted
            # gene_id value followed by ';' (gene_id as the first
            # attribute) -- confirm against the input GTFs.
            gene_id = line.split()[9][1:-2]
            genes.setdefault(gene_id, []).append(line)

    for gene_id in genes:
        gene_lines = genes[gene_id]
        # split each line once instead of once per column lookup
        fields = [line.split() for line in gene_lines]

        start = min(int(f[3]) for f in fields)
        end = max(int(f[4]) for f in fields)

        # chromosome, strand and gene_id are taken from the gene's first line
        a = gene_lines[0].split('\t')
        kv = gff.gtf_kv(a[8])

        succinct_kv = {'gene_id': kv['gene_id']}
        # NOTE(review): whitespace column 11 is taken to be the quoted
        # transcript_id value -- confirm. Sorted so output is deterministic.
        succinct_kv['transcript_id'] = ','.join(
            sorted(set(f[11][1:-2] for f in fields)))

        d = [
            a[0], 'gtf', 'gene', str(start), str(end), '.', a[6], '.',
            gff.kv_gtf(succinct_kv)
        ]
        print('\t'.join(d))
def main():
    """Convert a RepeatMasker .out file into GFF lines on stdout.

    The single command-line argument is the .out file; a name ending in
    'gz' is opened with gzip. The first three lines are discarded as header
    and every following non-blank line becomes one 'repeat' feature.
    """
    usage = 'usage: %prog [options] <rm out>'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide RepeatMasker .out file')
    rm_file = args[0]

    if rm_file[-2:] == 'gz':
        rm_in = gzip.open(rm_file)
    else:
        rm_in = open(rm_file)

    try:
        # discard the three header lines
        for _ in range(3):
            rm_in.readline()

        for line in rm_in:
            a = line.split()
            # a blank line has no columns; skip it rather than crash
            if not a:
                continue

            # anything other than '+' in the strand column is reported as '-'
            if a[8] == '+':
                strand = '+'
            else:
                strand = '-'

            cols = (a[4], 'RepeatMasker', 'repeat', a[5], a[6], '.', strand,
                    '.', gff.kv_gtf({'repeat': a[9], 'family': a[10]}))
            print('\t'.join(cols))
    finally:
        # release the handle even if a malformed line raises
        rm_in.close()
def main():
    """Print one "gene" GTF line spanning all features of each gene_id.

    Reads the GTF file named on the command line, groups its lines by
    gene_id, and prints one line per gene that runs from the smallest start
    to the largest end. The attributes column holds the gene_id and a
    comma-joined list of the gene's transcript ids.
    """
    usage = "usage: %prog [options] <gtf_file>"
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    # fail with a usage message rather than an IndexError
    if len(args) != 1:
        parser.error("Must provide GTF file")
    gtf_file = args[0]

    # group lines by gene_id
    genes = {}
    with open(gtf_file) as gtf_in:
        for line in gtf_in:
            # blank and comment lines have no columns to parse
            if not line.strip() or line.startswith("#"):
                continue
            # NOTE(review): whitespace column 9 is taken to be the quoted
            # gene_id value followed by ';' (gene_id as the first
            # attribute) -- confirm against the input GTFs.
            gene_id = line.split()[9][1:-2]
            genes.setdefault(gene_id, []).append(line)

    for gene_id in genes:
        gene_lines = genes[gene_id]
        # split each line once instead of once per column lookup
        fields = [line.split() for line in gene_lines]

        start = min(int(f[3]) for f in fields)
        end = max(int(f[4]) for f in fields)

        # chromosome, strand and gene_id are taken from the gene's first line
        a = gene_lines[0].split("\t")
        kv = gff.gtf_kv(a[8])

        succinct_kv = {"gene_id": kv["gene_id"]}
        # NOTE(review): whitespace column 11 is taken to be the quoted
        # transcript_id value -- confirm. Sorted so output is deterministic.
        succinct_kv["transcript_id"] = ",".join(
            sorted(set(f[11][1:-2] for f in fields)))

        d = [a[0], "gtf", "gene", str(start), str(end), ".", a[6], ".",
             gff.kv_gtf(succinct_kv)]
        print("\t".join(d))
def gff_line(a):
    """Format one split RepeatMasker record as a tab-separated GFF line.

    Args:
        a: Sequence of column strings; indexes 4, 5, 6, 8, 9 and 10 are read.

    Returns:
        The GFF line, without a trailing newline.
    """
    # a 'C' in the strand column becomes '-'; any other value passes through
    strand = '-' if a[8] == 'C' else a[8]

    attributes = gff.kv_gtf({'repeat': a[9], 'family': a[10]})
    fields = (a[4], 'RepeatMasker', 'repeat', a[5], a[6], '.', strand, '.',
              attributes)
    return '\t'.join(fields)
def main():
    """Convert a RepeatMasker .out file into GFF lines on stdout.

    The single command-line argument is the .out file; a name ending in
    'gz' is opened with gzip. The first three lines are discarded as header
    and every following non-blank line becomes one 'repeat' feature.
    """
    usage = 'usage: %prog [options] <rm out>'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide RepeatMasker .out file')
    rm_file = args[0]

    if rm_file[-2:] == 'gz':
        rm_in = gzip.open(rm_file)
    else:
        rm_in = open(rm_file)

    try:
        # discard the three header lines
        for _ in range(3):
            rm_in.readline()

        for line in rm_in:
            a = line.split()
            # a blank line has no columns; skip it rather than crash
            if not a:
                continue

            # anything other than '+' in the strand column is reported as '-'
            if a[8] == '+':
                strand = '+'
            else:
                strand = '-'

            cols = (a[4], 'RepeatMasker', 'repeat', a[5], a[6], '.', strand,
                    '.', gff.kv_gtf({
                        'repeat': a[9],
                        'family': a[10]
                    }))
            print('\t'.join(cols))
    finally:
        # release the handle even if a malformed line raises
        rm_in.close()
def span_gtf(ref_gtf, out_dir):
    """Write '<out_dir>/span.gtf' with one exon line per gene region.

    Args:
        ref_gtf: Reference GTF path, passed to read_genes().
        out_dir: Directory in which span.gtf is created.

    Returns:
        Path of the written GTF file.
    """
    # obtain gene regions
    transcripts = read_genes(ref_gtf, key_id='transcript_id')
    gene_regions = get_gene_regions(transcripts)

    span_ref_gtf = '%s/span.gtf' % out_dir

    # 'with' closes the file even if formatting a region raises
    with open(span_ref_gtf, 'w') as span_ref_open:
        for gid in gene_regions:
            # NOTE(review): g is indexed as (chrom, start, end, strand),
            # judging by the GTF columns it fills -- confirm against
            # get_gene_regions().
            g = gene_regions[gid]
            cols = [g[0], 'clip_peaks', 'exon', str(g[1]), str(g[2]), '.',
                    g[3], '.',
                    gff.kv_gtf({'gene_id': gid, 'transcript_id': gid})]
            span_ref_open.write('\t'.join(cols) + '\n')

    return span_ref_gtf
def process_chr(chrom, seq, promoters, out_fa, out_gff, promoter_length,
                acgt_t):
    """Write FASTA and GFF records for the promoters of one chromosome.

    For each promoter, a window of promoter_length is sliced from seq
    starting at promoter.start, and reverse-complemented unless the strand
    is '+'. A promoter is written to both outputs only when acgt_pct() of
    its window exceeds acgt_t.

    Args:
        chrom: Chromosome name written to the first GFF column.
        seq: Chromosome sequence, sliced by promoter coordinates.
        promoters: Iterable of objects with start, strand and gtf_kv.
        out_fa: Writable file object receiving FASTA records.
        out_gff: Writable file object receiving GFF lines.
        promoter_length: Window length.
        acgt_t: Threshold that acgt_pct() must exceed.
    """
    for promoter in promoters:
        window_start = promoter.start
        window_end = window_start + promoter_length

        window_seq = seq[window_start:window_end]
        if promoter.strand != '+':
            window_seq = dna.rc(window_seq)

        if acgt_pct(window_seq) > acgt_t:
            fasta_record = '>%s\n%s' % (promoter.gtf_kv['transcript_id'],
                                        window_seq)
            out_fa.write(fasta_record + '\n')

            # GFF start is slice start + 1; GFF end equals the slice end
            gff_fields = [
                chrom, '.', 'promoter',
                str(window_start + 1),
                str(window_end),
                '.', promoter.strand, '.',
                gff.kv_gtf(promoter.gtf_kv)
            ]
            out_gff.write('\t'.join(gff_fields) + '\n')
def main():
    """Print one promoter GTF line per stranded gene of a reference GTF.

    The TSS of each gene is taken from its most expressed isoform when -f
    is given, otherwise from its most upstream isoform. The promoter spans
    -u bp upstream to -d bp downstream of the TSS, relative to the gene's
    strand. Genes without a '+'/'-' strand, or whose promoter would start
    before position 1, are skipped with a warning on stderr.
    """
    usage = 'usage: %prog [options] <ref_gtf>'
    parser = OptionParser(usage)
    parser.add_option('-d', dest='downstream', type='int', default=1000,
                      help='Downstream bp for promoters [Default: %default]')
    parser.add_option('-f', dest='fpkm_tracking',
                      help='Use cufflinks FPKM estimates to choose the most expressed isoform')
    parser.add_option('-u', dest='upstream', type='int', default=1000,
                      help='Upstream bp for promoters [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide reference GTF')
    ref_gtf = args[0]

    g2t = gff.g2t(ref_gtf)
    transcripts = gff.read_genes(ref_gtf)

    # the source column of the first GTF line is reused for every output line
    with open(ref_gtf) as ref_in:
        source = ref_in.readline().split()[1]

    if options.fpkm_tracking:
        iso_fpkm_tracking = cufflinks.fpkm_tracking(options.fpkm_tracking)

    def transcript_tss(transcript_id, strand):
        # first exon start on '+', last exon end otherwise
        exons = transcripts[transcript_id].exons
        if strand == '+':
            return exons[0].start
        return exons[-1].end

    for gene_id in g2t:
        gene_transcripts = list(g2t[gene_id])
        # the strand of the gene's first transcript is used for all isoforms
        gene_strand = transcripts[gene_transcripts[0]].strand
        if gene_strand not in ['+', '-']:
            print('WARNING: %s discluded for lack of strand' % gene_id,
                  file=sys.stderr)
            continue

        # choose TSS
        if options.fpkm_tracking:
            # find most expressed isoform
            promoter_tid = gene_transcripts[0]
            max_fpkm = stats.geo_mean(
                [1 + fpkm for fpkm in iso_fpkm_tracking.gene_expr(promoter_tid)])

            for transcript_id in gene_transcripts[1:]:
                transcript_fpkm = stats.geo_mean(
                    [1 + fpkm for fpkm in iso_fpkm_tracking.gene_expr(transcript_id)])
                # a NaN running maximum is replaced by any later isoform
                if math.isnan(max_fpkm) or transcript_fpkm > max_fpkm:
                    promoter_tid = transcript_id
                    max_fpkm = transcript_fpkm
        else:
            # find most upstream tss; min()/max() keep the first isoform on
            # ties, as a strict comparison loop would
            if gene_strand == '+':
                promoter_tid = min(gene_transcripts,
                                   key=lambda tid: transcript_tss(tid, '+'))
            else:
                promoter_tid = max(gene_transcripts,
                                   key=lambda tid: transcript_tss(tid, '-'))

        tss = transcript_tss(promoter_tid, gene_strand)

        # upstream lies at lower coordinates on '+', higher ones on '-'
        if gene_strand == '+':
            prom_start = tss - options.upstream
            prom_end = tss + options.downstream
        else:
            prom_start = tss - options.downstream
            prom_end = tss + options.upstream

        # print promoter from the tss
        if prom_start < 1:
            print('WARNING: %s discluded for nearness to chromosome end' % gene_id,
                  file=sys.stderr)
        else:
            tx = transcripts[promoter_tid]
            cols = [tx.chrom, source, 'promoter', str(prom_start),
                    str(prom_end), '.', tx.strand, '.', gff.kv_gtf(tx.kv)]
            print('\t'.join(cols))
def main():
    """Filter a GENCODE GTF and annotate it with cuffcompare ids.

    Steps:
      1. keep only exon lines whose transcript_type is not a small RNA;
      2. drop transcripts whose summed exon length is below -l;
      3. run cuffcompare on the result and collect tss_id / p_id by oId;
      4. add those ids to every remaining line and print the file through
         sortBed to stdout.

    Temporary files and the cuffcmp.* outputs in the working directory are
    removed on the way out, including after an error.
    """
    usage = 'usage: %prog [options] <gencode_gtf>'
    parser = OptionParser(usage)
    parser.add_option('-l', dest='min_transcript_length', default=50, type='int')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide GENCODE GTF')
    full_gtf = args[0]

    small_rnas = set(['miRNA', 'misc_RNA', 'snRNA', 'snoRNA', 'rRNA', 'Mt_rRNA'])

    # every path appended here is deleted in the finally clause
    cleanup_files = []
    try:
        ############################################################
        # remove small rna (and non-exon)
        ############################################################
        sansrna_gtf_fd, sansrna_gtf_file = tempfile.mkstemp()
        os.close(sansrna_gtf_fd)  # the file is reopened by name below
        cleanup_files.append(sansrna_gtf_file)

        with open(sansrna_gtf_file, 'w') as sansrna_gtf_out:
            with open(full_gtf) as full_gtf_in:
                for line in full_gtf_in:
                    # header/comment and blank lines have no columns
                    if line.startswith('#') or not line.strip():
                        continue
                    a = line.split('\t')
                    if a[2] == 'exon':
                        kv = gff.gtf_kv(a[8])
                        if kv['transcript_type'] not in small_rnas:
                            sansrna_gtf_out.write(line)

        ############################################################
        # remove tiny (unestimatable) transcripts
        ############################################################
        transcript_lengths = {}
        with open(sansrna_gtf_file) as sansrna_gtf_in:
            for line in sansrna_gtf_in:
                a = line.split('\t')
                if a[2] == 'exon':
                    transcript_id = gff.gtf_kv(a[8])['transcript_id']
                    # GTF coordinates are inclusive, hence the +1
                    transcript_lengths[transcript_id] = (
                        transcript_lengths.get(transcript_id, 0)
                        + int(a[4]) - int(a[3]) + 1)

        sanstiny_gtf_fd, sanstiny_gtf_file = tempfile.mkstemp()
        os.close(sanstiny_gtf_fd)
        cleanup_files.append(sanstiny_gtf_file)

        with open(sanstiny_gtf_file, 'w') as sanstiny_gtf_out:
            with open(sansrna_gtf_file) as sansrna_gtf_in:
                for line in sansrna_gtf_in:
                    a = line.split('\t')
                    kv = gff.gtf_kv(a[8])
                    if transcript_lengths[kv['transcript_id']] >= options.min_transcript_length:
                        sanstiny_gtf_out.write(line)

        ############################################################
        # run cuffcompare to get id's
        ############################################################
        # cuffcompare writes these into the working directory
        cleanup_files.extend(['cuffcmp.tracking', 'cuffcmp.loci',
                              'cuffcmp.combined.gtf', 'cuffcmp.stats'])
        # shell=True is needed so that $HG19 is expanded
        subprocess.call('cuffcompare -s $HG19/sequence/hg19.fa -CG -r %s %s'
                        % (sanstiny_gtf_file, sanstiny_gtf_file), shell=True)

        # hash id's by oId
        tss_id = {}
        p_id = {}
        with open('cuffcmp.combined.gtf') as combined_in:
            for line in combined_in:
                a = line.split('\t')
                kv = gff.gtf_kv(a[8])
                tss_id[kv['oId']] = kv['tss_id']
                if 'p_id' in kv:
                    p_id[kv['oId']] = kv['p_id']

        ############################################################
        # add id's and print
        ############################################################
        unsorted_gtf_fd, unsorted_gtf_file = tempfile.mkstemp()
        os.close(unsorted_gtf_fd)
        cleanup_files.append(unsorted_gtf_file)

        with open(unsorted_gtf_file, 'w') as unsorted_gtf_out:
            with open(sanstiny_gtf_file) as sanstiny_gtf_in:
                for line in sanstiny_gtf_in:
                    a = line.split('\t')
                    kv = gff.gtf_kv(a[8])
                    kv['tss_id'] = tss_id[kv['transcript_id']]
                    if kv['transcript_id'] in p_id:
                        kv['p_id'] = p_id[kv['transcript_id']]
                    a[8] = gff.kv_gtf(kv)
                    unsorted_gtf_out.write('\t'.join(a) + '\n')

        ############################################################
        # might as well sort it!
        ############################################################
        # sortBed writes the sorted GTF to this process's stdout
        subprocess.call(['sortBed', '-i', unsorted_gtf_file])

    finally:
        ############################################################
        # clean
        ############################################################
        for path in cleanup_files:
            try:
                os.remove(path)
            except OSError:
                # never created, or already gone
                pass
def process_chr(chrom, seq, promoters, out_fa, out_gff, promoter_length, acgt_t):
    """Write FASTA and GFF records for the promoters of one chromosome.

    For each promoter, a window of promoter_length is sliced from seq
    starting at promoter.start, and reverse-complemented unless the strand
    is '+'. A promoter is written to both outputs only when acgt_pct() of
    its window exceeds acgt_t.

    Args:
        chrom: Chromosome name written to the first GFF column.
        seq: Chromosome sequence, sliced by promoter coordinates.
        promoters: Iterable of objects with start, strand and gtf_kv.
        out_fa: Writable file object receiving FASTA records.
        out_gff: Writable file object receiving GFF lines.
        promoter_length: Window length.
        acgt_t: Threshold that acgt_pct() must exceed.
    """
    for promoter in promoters:
        window_start = promoter.start
        window_end = window_start + promoter_length

        window_seq = seq[window_start:window_end]
        if promoter.strand != '+':
            window_seq = dna.rc(window_seq)

        if acgt_pct(window_seq) > acgt_t:
            fasta_record = '>%s\n%s' % (promoter.gtf_kv['transcript_id'], window_seq)
            out_fa.write(fasta_record + '\n')

            # GFF start is slice start + 1; GFF end equals the slice end
            gff_fields = [chrom, '.', 'promoter', str(window_start + 1), str(window_end),
                          '.', promoter.strand, '.', gff.kv_gtf(promoter.gtf_kv)]
            out_gff.write('\t'.join(gff_fields) + '\n')
def main():
    """Reassign gene_ids in a GTF by merging overlapping transcripts.

    'NM_' transcript ids seen on more than one chromosome get a per-
    chromosome suffix. Transcripts are then linked wherever intersectBed
    reports a same-strand, 20% reciprocal overlap, and each connected group
    receives one gene_id ('G' + one member's transcript id). The relabelled
    GTF is printed to stdout. A scratch file 'tmp.gtf' is created in the
    working directory and removed afterwards.
    """
    usage = 'usage: %prog [options] <gtf file>'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error(usage)
    gtf_file = args[0]

    ############################################
    # fix multi-chromosome genes
    ############################################
    # find multi-chromosome genes
    tx_chrs = {}
    with open(gtf_file) as gtf_in:
        for line in gtf_in:
            a = line.split('\t')
            kv = gff.gtf_kv(a[8])
            if kv['transcript_id'].startswith('NM_'):
                tx_chrs.setdefault(kv['transcript_id'], set()).add(a[0])

    multi_genes = set([tid for tid in tx_chrs if len(tx_chrs[tid]) > 1])

    try:
        # revise gtf
        tx_gene = {}
        with open('tmp.gtf', 'w') as gtf_out:
            with open(gtf_file) as gtf_in:
                for line in gtf_in:
                    a = line.split('\t')
                    a[-1] = a[-1].rstrip()
                    kv = gff.gtf_kv(a[8])

                    # if multi-chrom gene, supplement id's with the
                    # chromosome name minus its first three characters
                    if kv['transcript_id'] in multi_genes:
                        kv['transcript_id'] += 'c%s' % a[0][3:]
                        a[8] = gff.kv_gtf(kv)

                    # map trans to gene (forget the actual gene id's; they
                    # don't consider "_dup")
                    tx_gene[kv['transcript_id']] = kv['transcript_id']

                    # print new line
                    gtf_out.write('\t'.join(a) + '\n')

        ############################################
        # merge transcripts into genes
        ############################################
        # intersect and build overlapping transcript graph
        G = networkx.Graph()
        p = subprocess.Popen(
            ['intersectBed', '-f', '0.2', '-r', '-wo', '-s',
             '-a', 'tmp.gtf', '-b', 'tmp.gtf'],
            stdout=subprocess.PIPE,
            universal_newlines=True)  # text lines on Python 2 and 3
        for line in p.stdout:
            a = line.split('\t')
            tid1 = gff.gtf_kv(a[8])['transcript_id']
            tid2 = gff.gtf_kv(a[17])['transcript_id']
            G.add_edge(tid1, tid2)
        p.communicate()

        # combine connected components as genes
        merged = set()
        for component in networkx.connected_components(G):
            # newer networkx yields sets, which cannot be indexed
            component = list(component)
            comp_gene = 'G' + tx_gene[component[0]]
            for tid in component:
                tx_gene[tid] = comp_gene
                merged.add(tid)

        # prefix the rest; tracking membership (not the first character)
        # also handles transcript ids that themselves start with 'G'
        for tid in tx_gene:
            if tid not in merged:
                tx_gene[tid] = 'G' + tx_gene[tid]

        ############################################
        # output
        ############################################
        with open('tmp.gtf') as tmp_in:
            for line in tmp_in:
                a = line.split('\t')
                a[-1] = a[-1].rstrip()
                kv = gff.gtf_kv(a[8])
                kv['gene_id'] = tx_gene[kv['transcript_id']]
                a[8] = gff.kv_gtf(kv)
                print('\t'.join(a))

    finally:
        # clean
        try:
            os.remove('tmp.gtf')
        except OSError:
            # scratch file was never created
            pass
def main():
    """Write a GTF holding the reference exons plus unspliced pre-RNAs.

    Every reference transcript's exons are written to <prerna_gtf>. A
    single-exon 'PRERNA<n>' transcript spanning first exon start to last
    exon end is added for each span not already present as a single-exon
    transcript or an earlier pre-RNA. With -m, transcripts overlapping more
    than that many other genes (per intersectBed -s) are then filtered out
    of the file in place.
    """
    usage = 'usage: %prog [options] <ref_gtf> <prerna_gtf>'
    parser = OptionParser(usage)
    parser.add_option('-m', dest='max_genes_overlapped', default=None, type='int',
                      help='Don\'t include isoforms that overlap more than this many genes [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide reference GTF and output prerna GTF')
    ref_gtf = args[0]
    prerna_gtf = args[1]

    # read transcripts for filtering/processing
    transcripts = gff.read_genes(ref_gtf, key_id='transcript_id')

    # add unspliced single exon transcripts to hash
    prerna_hash = set()
    for tid in transcripts:
        tx = transcripts[tid]
        if len(tx.exons) == 1:
            tx_key = (tx.chrom, tx.exons[0].start, tx.exons[0].end, tx.strand)
            prerna_hash.add(tx_key)

    # process transcripts
    prerna_index = 0
    with open(prerna_gtf, 'w') as prerna_out:
        for tid in transcripts:
            tx = transcripts[tid]
            pre_start = tx.exons[0].start
            pre_end = tx.exons[-1].end
            pre_key = (tx.chrom, pre_start, pre_end, tx.strand)

            # print exons
            for exon in tx.exons:
                cols = (tx.chrom, 'dk', 'exon', str(exon.start), str(exon.end),
                        '.', tx.strand, '.', gff.kv_gtf(tx.kv))
                prerna_out.write('\t'.join(cols) + '\n')

            # print prernas
            if pre_key not in prerna_hash:
                prerna_hash.add(pre_key)
                # shallow copy so the transcript's own attributes stay intact
                pre_kv = copy.copy(tx.kv)
                pre_kv['transcript_id'] = 'PRERNA%d' % prerna_index
                pre_kv['transcript_type'] = 'prerna'
                prerna_index += 1
                cols = (tx.chrom, 'dk', 'exon', str(pre_start), str(pre_end),
                        '.', tx.strand, '.', gff.kv_gtf(pre_kv))
                prerna_out.write('\t'.join(cols) + '\n')

    if options.max_genes_overlapped is not None:
        # intersect with self and compute overlap sets; an argument list
        # (no shell) keeps odd characters in the path from being interpreted
        p = subprocess.Popen(
            ['intersectBed', '-wo', '-s', '-a', prerna_gtf, '-b', prerna_gtf],
            stdout=subprocess.PIPE,
            universal_newlines=True)  # text lines on Python 2 and 3

        tx_overlaps = {}
        for line in p.stdout:
            a = line.split('\t')

            kv1 = gff.gtf_kv(a[8])
            tid1 = kv1['transcript_id']

            # only pre-RNA transcripts accumulate overlaps
            if tid1.startswith('PRERNA'):
                gid1 = kv1['gene_id']
                gid2 = gff.gtf_kv(a[17])['gene_id']

                if gid1 != gid2:
                    tx_overlaps.setdefault(tid1, set()).add(gid2)
        p.communicate()

        prerna_tmp_fd, prerna_tmp_file = tempfile.mkstemp()
        os.close(prerna_tmp_fd)  # the file is reopened by name below
        try:
            # filter into a temp gtf
            with open(prerna_tmp_file, 'w') as tmp_out:
                with open(prerna_gtf) as prerna_in:
                    for line in prerna_in:
                        a = line.split('\t')
                        kv = gff.gtf_kv(a[8])
                        tid = kv['transcript_id']
                        if len(tx_overlaps.get(tid, [])) <= options.max_genes_overlapped:
                            tmp_out.write(line)

            # rewrite temp to the final output
            with open(prerna_gtf, 'w') as prerna_out:
                with open(prerna_tmp_file) as tmp_in:
                    for line in tmp_in:
                        prerna_out.write(line)
        finally:
            # remove the temp file even if filtering fails
            os.remove(prerna_tmp_file)
def main():
    """Filter a GENCODE GTF and annotate it with cuffcompare ids.

    Steps:
      1. keep only exon lines whose transcript_type is not a small RNA;
      2. drop transcripts whose summed exon length is below -l;
      3. run cuffcompare on the result and collect tss_id / p_id by oId;
      4. add those ids to every remaining line and print the file through
         sortBed to stdout.

    Temporary files and the cuffcmp.* outputs in the working directory are
    removed on the way out, including after an error.
    """
    usage = 'usage: %prog [options] <gencode_gtf>'
    parser = OptionParser(usage)
    parser.add_option('-l',
                      dest='min_transcript_length',
                      default=50,
                      type='int')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide GENCODE GTF')
    full_gtf = args[0]

    small_rnas = set(
        ['miRNA', 'misc_RNA', 'snRNA', 'snoRNA', 'rRNA', 'Mt_rRNA'])

    # every path appended here is deleted in the finally clause
    cleanup_files = []
    try:
        ############################################################
        # remove small rna (and non-exon)
        ############################################################
        sansrna_gtf_fd, sansrna_gtf_file = tempfile.mkstemp()
        os.close(sansrna_gtf_fd)  # the file is reopened by name below
        cleanup_files.append(sansrna_gtf_file)

        with open(sansrna_gtf_file, 'w') as sansrna_gtf_out:
            with open(full_gtf) as full_gtf_in:
                for line in full_gtf_in:
                    # header/comment and blank lines have no columns
                    if line.startswith('#') or not line.strip():
                        continue
                    a = line.split('\t')
                    if a[2] == 'exon':
                        kv = gff.gtf_kv(a[8])
                        if kv['transcript_type'] not in small_rnas:
                            sansrna_gtf_out.write(line)

        ############################################################
        # remove tiny (unestimatable) transcripts
        ############################################################
        transcript_lengths = {}
        with open(sansrna_gtf_file) as sansrna_gtf_in:
            for line in sansrna_gtf_in:
                a = line.split('\t')
                if a[2] == 'exon':
                    transcript_id = gff.gtf_kv(a[8])['transcript_id']
                    # GTF coordinates are inclusive, hence the +1
                    transcript_lengths[transcript_id] = (
                        transcript_lengths.get(transcript_id, 0)
                        + int(a[4]) - int(a[3]) + 1)

        sanstiny_gtf_fd, sanstiny_gtf_file = tempfile.mkstemp()
        os.close(sanstiny_gtf_fd)
        cleanup_files.append(sanstiny_gtf_file)

        with open(sanstiny_gtf_file, 'w') as sanstiny_gtf_out:
            with open(sansrna_gtf_file) as sansrna_gtf_in:
                for line in sansrna_gtf_in:
                    a = line.split('\t')
                    kv = gff.gtf_kv(a[8])
                    if transcript_lengths[
                            kv['transcript_id']] >= options.min_transcript_length:
                        sanstiny_gtf_out.write(line)

        ############################################################
        # run cuffcompare to get id's
        ############################################################
        # cuffcompare writes these into the working directory
        cleanup_files.extend([
            'cuffcmp.tracking', 'cuffcmp.loci', 'cuffcmp.combined.gtf',
            'cuffcmp.stats'
        ])
        # shell=True is needed so that $HG19 is expanded
        subprocess.call('cuffcompare -s $HG19/sequence/hg19.fa -CG -r %s %s' %
                        (sanstiny_gtf_file, sanstiny_gtf_file),
                        shell=True)

        # hash id's by oId
        tss_id = {}
        p_id = {}
        with open('cuffcmp.combined.gtf') as combined_in:
            for line in combined_in:
                a = line.split('\t')
                kv = gff.gtf_kv(a[8])
                tss_id[kv['oId']] = kv['tss_id']
                if 'p_id' in kv:
                    p_id[kv['oId']] = kv['p_id']

        ############################################################
        # add id's and print
        ############################################################
        unsorted_gtf_fd, unsorted_gtf_file = tempfile.mkstemp()
        os.close(unsorted_gtf_fd)
        cleanup_files.append(unsorted_gtf_file)

        with open(unsorted_gtf_file, 'w') as unsorted_gtf_out:
            with open(sanstiny_gtf_file) as sanstiny_gtf_in:
                for line in sanstiny_gtf_in:
                    a = line.split('\t')
                    kv = gff.gtf_kv(a[8])
                    kv['tss_id'] = tss_id[kv['transcript_id']]
                    if kv['transcript_id'] in p_id:
                        kv['p_id'] = p_id[kv['transcript_id']]
                    a[8] = gff.kv_gtf(kv)
                    unsorted_gtf_out.write('\t'.join(a) + '\n')

        ############################################################
        # might as well sort it!
        ############################################################
        # sortBed writes the sorted GTF to this process's stdout
        subprocess.call(['sortBed', '-i', unsorted_gtf_file])

    finally:
        ############################################################
        # clean
        ############################################################
        for path in cleanup_files:
            try:
                os.remove(path)
            except OSError:
                # never created, or already gone
                pass
def main():
    """Write a GTF holding the reference exons plus unspliced pre-RNAs.

    Every reference transcript's exons are written to <prerna_gtf>. A
    single-exon 'PRERNA<n>' transcript spanning first exon start to last
    exon end is added for each span not already present as a single-exon
    transcript or an earlier pre-RNA. With -m, transcripts overlapping more
    than that many other genes (per intersectBed -s) are then filtered out
    of the file in place.
    """
    usage = 'usage: %prog [options] <ref_gtf> <prerna_gtf>'
    parser = OptionParser(usage)
    parser.add_option(
        '-m',
        dest='max_genes_overlapped',
        default=None,
        type='int',
        help=
        'Don\'t include isoforms that overlap more than this many genes [Default: %default]'
    )
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide reference GTF and output prerna GTF')
    ref_gtf = args[0]
    prerna_gtf = args[1]

    # read transcripts for filtering/processing
    transcripts = gff.read_genes(ref_gtf, key_id='transcript_id')

    # add unspliced single exon transcripts to hash
    prerna_hash = set()
    for tid in transcripts:
        tx = transcripts[tid]
        if len(tx.exons) == 1:
            tx_key = (tx.chrom, tx.exons[0].start, tx.exons[0].end, tx.strand)
            prerna_hash.add(tx_key)

    # process transcripts
    prerna_index = 0
    with open(prerna_gtf, 'w') as prerna_out:
        for tid in transcripts:
            tx = transcripts[tid]
            pre_start = tx.exons[0].start
            pre_end = tx.exons[-1].end
            pre_key = (tx.chrom, pre_start, pre_end, tx.strand)

            # print exons
            for exon in tx.exons:
                cols = (tx.chrom, 'dk', 'exon', str(exon.start),
                        str(exon.end), '.', tx.strand, '.',
                        gff.kv_gtf(tx.kv))
                prerna_out.write('\t'.join(cols) + '\n')

            # print prernas
            if pre_key not in prerna_hash:
                prerna_hash.add(pre_key)
                # shallow copy so the transcript's own attributes stay intact
                pre_kv = copy.copy(tx.kv)
                pre_kv['transcript_id'] = 'PRERNA%d' % prerna_index
                pre_kv['transcript_type'] = 'prerna'
                prerna_index += 1
                cols = (tx.chrom, 'dk', 'exon', str(pre_start), str(pre_end),
                        '.', tx.strand, '.', gff.kv_gtf(pre_kv))
                prerna_out.write('\t'.join(cols) + '\n')

    if options.max_genes_overlapped is not None:
        # intersect with self and compute overlap sets; an argument list
        # (no shell) keeps odd characters in the path from being interpreted
        p = subprocess.Popen(
            ['intersectBed', '-wo', '-s', '-a', prerna_gtf, '-b', prerna_gtf],
            stdout=subprocess.PIPE,
            universal_newlines=True)  # text lines on Python 2 and 3

        tx_overlaps = {}
        for line in p.stdout:
            a = line.split('\t')

            kv1 = gff.gtf_kv(a[8])
            tid1 = kv1['transcript_id']

            # only pre-RNA transcripts accumulate overlaps
            if tid1.startswith('PRERNA'):
                gid1 = kv1['gene_id']
                gid2 = gff.gtf_kv(a[17])['gene_id']

                if gid1 != gid2:
                    tx_overlaps.setdefault(tid1, set()).add(gid2)
        p.communicate()

        prerna_tmp_fd, prerna_tmp_file = tempfile.mkstemp()
        os.close(prerna_tmp_fd)  # the file is reopened by name below
        try:
            # filter into a temp gtf
            with open(prerna_tmp_file, 'w') as tmp_out:
                with open(prerna_gtf) as prerna_in:
                    for line in prerna_in:
                        a = line.split('\t')
                        kv = gff.gtf_kv(a[8])
                        tid = kv['transcript_id']
                        if len(tx_overlaps.get(
                                tid, [])) <= options.max_genes_overlapped:
                            tmp_out.write(line)

            # rewrite temp to the final output
            with open(prerna_gtf, 'w') as prerna_out:
                with open(prerna_tmp_file) as tmp_in:
                    for line in tmp_in:
                        prerna_out.write(line)
        finally:
            # remove the temp file even if filtering fails
            os.remove(prerna_tmp_file)
def main():
    """Reassign gene_ids in a GTF by merging overlapping transcripts.

    "NM_" transcript ids seen on more than one chromosome get a per-
    chromosome suffix. Transcripts are then linked wherever intersectBed
    reports a same-strand, 20% reciprocal overlap, and each connected group
    receives one gene_id ("G" + one member's transcript id). The relabelled
    GTF is printed to stdout. A scratch file "tmp.gtf" is created in the
    working directory and removed afterwards.
    """
    usage = "usage: %prog [options] <gtf file>"
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error(usage)
    gtf_file = args[0]

    ############################################
    # fix multi-chromosome genes
    ############################################
    # find multi-chromosome genes
    tx_chrs = {}
    with open(gtf_file) as gtf_in:
        for line in gtf_in:
            a = line.split("\t")
            kv = gff.gtf_kv(a[8])
            if kv["transcript_id"].startswith("NM_"):
                tx_chrs.setdefault(kv["transcript_id"], set()).add(a[0])

    multi_genes = set([tid for tid in tx_chrs if len(tx_chrs[tid]) > 1])

    try:
        # revise gtf
        tx_gene = {}
        with open("tmp.gtf", "w") as gtf_out:
            with open(gtf_file) as gtf_in:
                for line in gtf_in:
                    a = line.split("\t")
                    a[-1] = a[-1].rstrip()
                    kv = gff.gtf_kv(a[8])

                    # if multi-chrom gene, supplement id's with the
                    # chromosome name minus its first three characters
                    if kv["transcript_id"] in multi_genes:
                        kv["transcript_id"] += "c%s" % a[0][3:]
                        a[8] = gff.kv_gtf(kv)

                    # map trans to gene (forget the actual gene id's; they
                    # don't consider "_dup")
                    tx_gene[kv["transcript_id"]] = kv["transcript_id"]

                    # print new line
                    gtf_out.write("\t".join(a) + "\n")

        ############################################
        # merge transcripts into genes
        ############################################
        # intersect and build overlapping transcript graph
        G = networkx.Graph()
        p = subprocess.Popen(
            ["intersectBed", "-f", "0.2", "-r", "-wo", "-s",
             "-a", "tmp.gtf", "-b", "tmp.gtf"],
            stdout=subprocess.PIPE,
            universal_newlines=True)  # text lines on Python 2 and 3
        for line in p.stdout:
            a = line.split("\t")
            tid1 = gff.gtf_kv(a[8])["transcript_id"]
            tid2 = gff.gtf_kv(a[17])["transcript_id"]
            G.add_edge(tid1, tid2)
        p.communicate()

        # combine connected components as genes
        merged = set()
        for component in networkx.connected_components(G):
            # newer networkx yields sets, which cannot be indexed
            component = list(component)
            comp_gene = "G" + tx_gene[component[0]]
            for tid in component:
                tx_gene[tid] = comp_gene
                merged.add(tid)

        # prefix the rest; tracking membership (not the first character)
        # also handles transcript ids that themselves start with "G"
        for tid in tx_gene:
            if tid not in merged:
                tx_gene[tid] = "G" + tx_gene[tid]

        ############################################
        # output
        ############################################
        with open("tmp.gtf") as tmp_in:
            for line in tmp_in:
                a = line.split("\t")
                a[-1] = a[-1].rstrip()
                kv = gff.gtf_kv(a[8])
                kv["gene_id"] = tx_gene[kv["transcript_id"]]
                a[8] = gff.kv_gtf(kv)
                print("\t".join(a))

    finally:
        # clean
        try:
            os.remove("tmp.gtf")
        except OSError:
            # scratch file was never created
            pass
def prerna_gtf(ref_gtf, out_dir):
    """Write '<out_dir>/prerna.gtf' with reference exons plus unspliced spans.

    Every transcript's exons are written unchanged. A single-exon
    'UNSPLICED<n>' transcript covering first exon start to last exon end is
    added for each (chrom, start, end, strand) span that is not already a
    single-exon transcript or an earlier unspliced span.

    Args:
        ref_gtf: Reference GTF path, passed to read_genes().
        out_dir: Directory in which prerna.gtf is created.

    Returns:
        Path of the written GTF file.
    """
    unspliced_index = 0
    unspliced_hash = set()

    transcripts = read_genes(ref_gtf, key_id='transcript_id')

    # add unspliced single exon transcripts to hash
    for tid in transcripts:
        tx = transcripts[tid]
        if len(tx.exons) == 1:
            tx_key = (tx.chrom, tx.exons[0].start, tx.exons[0].end, tx.strand)
            unspliced_hash.add(tx_key)

    pre_ref_gtf = '%s/prerna.gtf' % out_dir

    # 'with' closes the file even if a transcript fails to format
    with open(pre_ref_gtf, 'w') as pre_ref_open:
        # process transcripts
        for tid in transcripts:
            tx = transcripts[tid]
            pre_start = tx.exons[0].start
            pre_end = tx.exons[-1].end
            pre_key = (tx.chrom, pre_start, pre_end, tx.strand)

            for exon in tx.exons:
                cols = (tx.chrom, 'clip_peaks', 'exon', str(exon.start),
                        str(exon.end), '.', tx.strand, '.', gff.kv_gtf(tx.kv))
                pre_ref_open.write('\t'.join(cols) + '\n')

            if pre_key not in unspliced_hash:
                unspliced_hash.add(pre_key)
                # shallow copy so the transcript's own attributes stay intact
                pre_kv = copy.copy(tx.kv)
                pre_kv['transcript_id'] = 'UNSPLICED%d' % unspliced_index
                unspliced_index += 1
                cols = (tx.chrom, 'clip_peaks', 'exon', str(pre_start),
                        str(pre_end), '.', tx.strand, '.', gff.kv_gtf(pre_kv))
                pre_ref_open.write('\t'.join(cols) + '\n')

    return pre_ref_gtf
def _print_with_gene_id(lines, gene_id, ref_gid_names):
    """Print GTF lines with gene_id (and gene_name, when known) replaced."""
    for line in lines:
        a = line.split('\t')
        kv = gff.gtf_kv(a[8])
        kv['gene_id'] = gene_id
        if gene_id in ref_gid_names:
            kv['gene_name'] = ref_gid_names[gene_id]
        a[8] = gff.kv_gtf(kv)
        print('\t'.join(a))


def main():
    """Relabel the genes of a merged GTF with reference gene ids.

    For each gene of <merged gtf>:
      * no reference transcripts: its lines are printed unchanged;
      * transcripts of one reference gene: every line gets that gene_id
        (and the reference gene_name when one is known);
      * transcripts of several reference genes: the transcript overlap
        graph is split between the reference genes, new transcripts that
        touch more than one are dropped (reported on stderr), and the
        remaining transcripts get the ids returned by map_new_gid().
    """
    usage = 'usage: %prog [options] <ref gtf> <merged gtf>'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error(usage)
    ref_gtf = args[0]
    merged_gtf = args[1]

    # get mappings
    ref_t2g = gff.t2g(ref_gtf)
    merged_g2t = gff.g2t(merged_gtf)

    # hash gene_name's by gene_id
    ref_gid_names = {}
    with open(ref_gtf) as ref_in:
        for line in ref_in:
            a = line.split('\t')
            kv = gff.gtf_kv(a[8])
            if 'gene_name' in kv:
                ref_gid_names[kv['gene_id']] = kv['gene_name']

    # hash merged lines by tid
    merged_tid_lines = {}
    with open(merged_gtf) as merged_in:
        for line in merged_in:
            a = line.split('\t')
            tid = gff.gtf_kv(a[8])['transcript_id']
            merged_tid_lines.setdefault(tid, []).append(line)

    # intialize orphan gene_id
    orphan_num = 1

    for mgene_id in merged_g2t:
        # count reference genes
        ref_genes = set()
        for tid in merged_g2t[mgene_id]:
            if tid in ref_t2g:
                ref_genes.add(ref_t2g[tid])

        # if no known genes, leave it alone
        if len(ref_genes) == 0:
            for tid in merged_g2t[mgene_id]:
                # lines keep their own newlines, so write them verbatim
                sys.stdout.write(''.join(merged_tid_lines[tid]))

        # if known gene, set gene_id to it
        elif len(ref_genes) == 1:
            new_gene_id = list(ref_genes)[0]
            for tid in merged_g2t[mgene_id]:
                _print_with_gene_id(merged_tid_lines[tid], new_gene_id,
                                    ref_gid_names)

        # if two known genes were combined, fix it
        else:
            # compute transcript overlaps and build overlap graph
            tid_overlap_graph = make_overlap_graph(mgene_id, merged_g2t,
                                                   merged_tid_lines)

            # map each new transcript to the ref gene_id's overlapped
            tid_ref_genes = {}
            for (tid1, tid2) in tid_overlap_graph.edges():
                if tid1 in ref_t2g and tid2 not in ref_t2g:
                    tid_ref_genes.setdefault(tid2, set()).add(ref_t2g[tid1])
                elif tid1 not in ref_t2g and tid2 in ref_t2g:
                    tid_ref_genes.setdefault(tid1, set()).add(ref_t2g[tid2])

            # remove new transcripts overlapping multiple ref gene_id's
            for tid in tid_ref_genes:
                if len(tid_ref_genes[tid]) > 1:
                    sys.stderr.write('Removing %s\n' % tid)
                    tid_overlap_graph.remove_node(tid)

            # remove edges connecting separate reference genes; iterate a
            # snapshot because newer networkx returns a live edge view that
            # must not change during iteration
            for (tid1, tid2) in list(tid_overlap_graph.edges()):
                if tid1 in ref_t2g and tid2 in ref_t2g and ref_t2g[tid1] != ref_t2g[tid2]:
                    tid_overlap_graph.remove_edge(tid1, tid2)

            # map to new gene_id's; missing means eliminate transcript
            tid_new_gid, orphan_num = map_new_gid(tid_overlap_graph,
                                                  orphan_num, ref_t2g)

            for tid in merged_g2t[mgene_id]:
                if tid in tid_new_gid:
                    _print_with_gene_id(merged_tid_lines[tid],
                                        tid_new_gid[tid], ref_gid_names)
def main():
    """Print one promoter GTF line per stranded gene of a reference GTF.

    The TSS of each gene is taken from its most expressed isoform when -f
    is given, otherwise from its most upstream isoform. The promoter spans
    -u bp upstream to -d bp downstream of the TSS, relative to the gene's
    strand. Genes without a '+'/'-' strand, or whose promoter would start
    before position 1, are skipped with a warning on stderr.
    """
    usage = 'usage: %prog [options] <ref_gtf>'
    parser = OptionParser(usage)
    parser.add_option('-d',
                      dest='downstream',
                      type='int',
                      default=1000,
                      help='Downstream bp for promoters [Default: %default]')
    parser.add_option(
        '-f',
        dest='fpkm_tracking',
        help='Use cufflinks FPKM estimates to choose the most expressed isoform'
    )
    parser.add_option('-u',
                      dest='upstream',
                      type='int',
                      default=1000,
                      help='Upstream bp for promoters [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide reference GTF')
    ref_gtf = args[0]

    g2t = gff.g2t(ref_gtf)
    transcripts = gff.read_genes(ref_gtf)

    # the source column of the first GTF line is reused for every output line
    with open(ref_gtf) as ref_in:
        source = ref_in.readline().split()[1]

    if options.fpkm_tracking:
        iso_fpkm_tracking = cufflinks.fpkm_tracking(options.fpkm_tracking)

    def transcript_tss(transcript_id, strand):
        # first exon start on '+', last exon end otherwise
        exons = transcripts[transcript_id].exons
        if strand == '+':
            return exons[0].start
        return exons[-1].end

    for gene_id in g2t:
        gene_transcripts = list(g2t[gene_id])
        # the strand of the gene's first transcript is used for all isoforms
        gene_strand = transcripts[gene_transcripts[0]].strand
        if gene_strand not in ['+', '-']:
            sys.stderr.write(
                'WARNING: %s discluded for lack of strand' % gene_id + '\n')
            continue

        # choose TSS
        if options.fpkm_tracking:
            # find most expressed isoform
            promoter_tid = gene_transcripts[0]
            max_fpkm = stats.geo_mean([
                1 + fpkm for fpkm in iso_fpkm_tracking.gene_expr(promoter_tid)
            ])

            for transcript_id in gene_transcripts[1:]:
                transcript_fpkm = stats.geo_mean([
                    1 + fpkm
                    for fpkm in iso_fpkm_tracking.gene_expr(transcript_id)
                ])
                # a NaN running maximum is replaced by any later isoform
                if math.isnan(max_fpkm) or transcript_fpkm > max_fpkm:
                    promoter_tid = transcript_id
                    max_fpkm = transcript_fpkm
        else:
            # find most upstream tss; min()/max() keep the first isoform on
            # ties, as a strict comparison loop would
            if gene_strand == '+':
                promoter_tid = min(gene_transcripts,
                                   key=lambda tid: transcript_tss(tid, '+'))
            else:
                promoter_tid = max(gene_transcripts,
                                   key=lambda tid: transcript_tss(tid, '-'))

        tss = transcript_tss(promoter_tid, gene_strand)

        # upstream lies at lower coordinates on '+', higher ones on '-'
        if gene_strand == '+':
            prom_start = tss - options.upstream
            prom_end = tss + options.downstream
        else:
            prom_start = tss - options.downstream
            prom_end = tss + options.upstream

        # print promoter from the tss
        if prom_start < 1:
            sys.stderr.write(
                'WARNING: %s discluded for nearness to chromosome end' %
                gene_id + '\n')
        else:
            tx = transcripts[promoter_tid]
            cols = [
                tx.chrom, source, 'promoter',
                str(prom_start),
                str(prom_end), '.', tx.strand, '.',
                gff.kv_gtf(tx.kv)
            ]
            print('\t'.join(cols))