def getGTFcontent(gtf_file):
    """
    Extract GTF features

    Reads the file returned by `_open_file(gtf_file)` line by line and groups
    exon, CDS and stop codon coordinates by chromosome, gene and transcript.
    start_codon lines and every other feature type are ignored.

    @args gtf_file: file name handed to `_open_file`
    @return: nested dict of the form
        {seqname: {(gene_id, source): {transcript_id: {
            'exon': [(start, stop), ...],
            'CDS': [(start, stop), ...],
            'sp_cod': [(start, stop), ...],
            'info': [column 7 (strand), column 6 (score),
                     gene_name, transcript_name, transcript_type]}}}}
    @raise AssertionError: when a tab separated record has fewer than 8 columns
    """
    GFH = _open_file(gtf_file)
    gtf_content, recall = dict(), None

    try:
        for rec in GFH:
            rec = rec.strip('\n\r')

            # skip empty lines, fasta identifiers and commented lines
            if not rec or rec[0] in ['#', '>']:
                continue
            # skip the genome sequence
            if '\t' not in rec:
                continue

            parts = rec.split('\t')
            assert len(parts) >= 8, rec

            if re.search(r'^(start_codon|start-codon|startcodon)$', parts[2], re.IGNORECASE):
                continue

            gid = tid = gname = tname = ttype = None
            for attb in parts[-1].split(';'):
                # quotes are dropped and only the first word after the key is kept
                attb = re.sub('"', '', attb).split()
                # blank chunks and keys without a value carry no information
                if len(attb) < 2:
                    continue

                if re.search(r'^(gene_id|geneid|name)$', attb[0], re.IGNORECASE):
                    gid = attb[1]
                elif re.search(r'^(transcript_id|transcriptId)$', attb[0], re.IGNORECASE):
                    tid = attb[1]
                elif re.search(r'^(gene_name|genename)$', attb[0], re.IGNORECASE):
                    gname = attb[1]
                elif re.search(r'^(transcript_name|transcriptname)$', attb[0], re.IGNORECASE):
                    tname = attb[1]
                elif re.search(r'^(transcript_type)$', attb[0], re.IGNORECASE):
                    ttype = attb[1]

            if gid == tid:
                # UCSC GTF files, gene & transcript have same identifier
                gid = 'Gene:' + str(gid)
                tid = 'Transcript:' + str(tid)

            if tid is None:
                # JGI GTF files don't have a transcript ID on the CDS line,
                # reuse the one from the previously stored record
                tid = recall

            if re.search(r'^exon$', parts[2], re.IGNORECASE):
                feature_key = 'exon'
            elif re.search(r'^CDS$', parts[2], re.IGNORECASE):
                feature_key = 'CDS'
            elif re.search(r'^(stop_codon|stop-codon|stopcodon)$', parts[2], re.IGNORECASE):
                feature_key = 'sp_cod'
            else:
                # other lines are not required
                continue

            # coordinates are converted only for the feature types we keep
            span = (int(parts[3]), int(parts[4]))

            # creating feature connections: chromosome -> gene -> transcript
            transcripts = gtf_content.setdefault(parts[0], dict()).setdefault((gid, parts[1]), dict())
            if tid not in transcripts:
                # every feature type gets its OWN list; sharing one list object
                # between the keys would leak CDS entries into 'sp_cod' etc.
                transcripts[tid] = dict(exon=[],
                                        CDS=[],
                                        sp_cod=[],
                                        info=[parts[6], parts[5], gname, tname, ttype])
            transcripts[tid][feature_key].append(span)

            recall = tid  # set previous id for CDS line
    finally:
        # release the handle even when a malformed record aborts parsing
        GFH.close()

    return gtf_content
def getGTFcontent(gtf_file):
    """
    Extract GTF features

    Reads the file returned by `_open_file(gtf_file)` line by line and groups
    exon, CDS and stop codon coordinates by chromosome, gene and transcript.
    start_codon lines and every other feature type are ignored.

    @args gtf_file: file name handed to `_open_file`
    @return: nested dict of the form
        {seqname: {(gene_id, source): {transcript_id: {
            'exon': [(start, stop), ...],
            'CDS': [(start, stop), ...],
            'sp_cod': [(start, stop), ...],
            'info': [column 7 (strand), column 6 (score),
                     gene_name, transcript_name, transcript_type]}}}}
    @raise AssertionError: when a tab separated record has fewer than 8 columns
    """
    GFH = _open_file(gtf_file)
    gtf_content, recall = dict(), None

    try:
        for rec in GFH:
            rec = rec.strip('\n\r')

            # skip empty lines, fasta identifiers and commented lines
            if not rec or rec[0] in ['#', '>']:
                continue
            # skip the genome sequence
            if '\t' not in rec:
                continue

            parts = rec.split('\t')
            assert len(parts) >= 8, rec

            if re.search(r'^(start_codon|start-codon|startcodon)$', parts[2], re.IGNORECASE):
                continue

            gid = tid = gname = tname = ttype = None
            for attb in parts[-1].split(';'):
                # quotes are dropped and only the first word after the key is kept
                attb = re.sub('"', '', attb).split()
                # blank chunks and keys without a value carry no information
                if len(attb) < 2:
                    continue

                if re.search(r'^(gene_id|geneid|name)$', attb[0], re.IGNORECASE):
                    gid = attb[1]
                elif re.search(r'^(transcript_id|transcriptId)$', attb[0], re.IGNORECASE):
                    tid = attb[1]
                elif re.search(r'^(gene_name|genename)$', attb[0], re.IGNORECASE):
                    gname = attb[1]
                elif re.search(r'^(transcript_name|transcriptname)$', attb[0], re.IGNORECASE):
                    tname = attb[1]
                elif re.search(r'^(transcript_type)$', attb[0], re.IGNORECASE):
                    ttype = attb[1]

            if gid == tid:
                # UCSC GTF files, gene & transcript have same identifier
                gid = 'Gene:' + str(gid)
                tid = 'Transcript:' + str(tid)

            if tid is None:
                # JGI GTF files don't have a transcript ID on the CDS line,
                # reuse the one from the previously stored record
                tid = recall

            if re.search(r'^exon$', parts[2], re.IGNORECASE):
                feature_key = 'exon'
            elif re.search(r'^CDS$', parts[2], re.IGNORECASE):
                feature_key = 'CDS'
            elif re.search(r'^(stop_codon|stop-codon|stopcodon)$', parts[2], re.IGNORECASE):
                feature_key = 'sp_cod'
            else:
                # other lines are not required
                continue

            # coordinates are converted only for the feature types we keep
            span = (int(parts[3]), int(parts[4]))

            # creating feature connections: chromosome -> gene -> transcript
            transcripts = gtf_content.setdefault(parts[0], dict()).setdefault((gid, parts[1]), dict())
            if tid not in transcripts:
                # every feature type gets its OWN list; sharing one list object
                # between the keys would leak CDS entries into 'sp_cod' etc.
                transcripts[tid] = dict(exon=[],
                                        CDS=[],
                                        sp_cod=[],
                                        info=[parts[6], parts[5], gname, tname, ttype])
            transcripts[tid][feature_key].append(span)

            recall = tid  # set previous id for CDS line
    finally:
        # release the handle even when a malformed record aborts parsing
        GFH.close()

    return gtf_content
pc_final_map = dict() for ptype, ctypes in pc_map.items(): unique_ctypes = list(set(ctypes)) unique_ctypes.sort() pc_final_map[ptype] = unique_ctypes # some cases the GFF file represents a single feature type if not pc_final_map: for fid, stypes in parent_sts.items(): pc_final_map[stypes] = dict() # generate a report on feature id mapping in the file print '------------------------------------------------------' print 'Parent feature type | Associated child feature type(s)' print '------------------------------------------------------' for key, value in pc_final_map.items(): print key[0], key[1] for child_to in value: print '\t\t|',child_to[0], child_to[1] print print '------------------------------------------------------' if __name__=='__main__': try: gff_file = sys.argv[1] except: print "Incorrect arguments supplied" print __doc__ sys.exit(-1) gff_handle = _open_file(gff_file) parent_child_id_map(gff_handle)
def gbk_parse(fname):
    """
    Extract genome annotation records from genbank format

    Every record yielded by `SeqIO.parse` is reduced to per-gene tables of
    gene, transcript, exon and CDS coordinates, which are then handed to
    `feature_table`. Nothing is returned.

    @args fname: file name handed to `_open_file`
    """
    # the file to read is the function argument, not an outer variable
    fhand = _open_file(fname)
    unk = 1  # counter threaded through successive feature_table calls

    try:
        for record in SeqIO.parse(fhand, "genbank"):
            gene_tags = dict()
            tx_tags = collections.defaultdict(list)
            exon = collections.defaultdict(list)
            cds = collections.defaultdict(list)
            mol_type, chr_id = None, None

            for rec in record.features:
                if rec.type == 'source':
                    mol_type = rec.qualifiers['mol_type'][0]
                    try:
                        chr_id = rec.qualifiers['chromosome'][0]
                    except (KeyError, IndexError):
                        # no chromosome qualifier, fall back to the record name
                        chr_id = record.name
                    continue

                strand = '+' if rec.strand > 0 else '-'

                fid = None
                try:
                    fid = rec.qualifiers['gene'][0]
                except (KeyError, IndexError):
                    pass

                transcript_id = None
                try:
                    transcript_id = rec.qualifiers['transcript_id'][0]
                except (KeyError, IndexError):
                    pass

                # start is shifted by one, the end position is used as is
                if re.search(r'gene', rec.type):
                    # NOTE(review): a gene feature without a 'note' qualifier
                    # raises KeyError here - confirm whether that is intended
                    gene_tags[fid] = (rec.location._start.position + 1,
                                      rec.location._end.position,
                                      strand,
                                      rec.type,
                                      rec.qualifiers['note'][0])
                elif rec.type == 'exon':
                    exon[fid].append((rec.location._start.position + 1,
                                      rec.location._end.position))
                elif rec.type == 'CDS':
                    cds[fid].append((rec.location._start.position + 1,
                                     rec.location._end.position))
                else:
                    # get all transcripts
                    if transcript_id:
                        tx_tags[fid].append((rec.location._start.position + 1,
                                             rec.location._end.position,
                                             transcript_id,
                                             rec.type))

            # record extracted, generate feature table
            # NOTE(review): `strand` is the value left by the last non-source
            # feature of the record - confirm feature_table expects that
            unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk)
    finally:
        # close the handle even if parsing or table generation fails
        fhand.close()
for ptype, ctypes in pc_map.items(): unique_ctypes = list(set(ctypes)) unique_ctypes.sort() pc_final_map[ptype] = unique_ctypes # some cases the GFF file represents a single feature type if not pc_final_map: for fid, stypes in parent_sts.items(): pc_final_map[stypes] = dict() # generate a report on feature id mapping in the file print '------------------------------------------------------' print 'Parent feature type | Associated child feature type(s)' print '------------------------------------------------------' for key, value in pc_final_map.items(): print key[0], key[1] for child_to in value: print '\t\t|', child_to[0], child_to[1] print print '------------------------------------------------------' if __name__ == '__main__': try: gff_file = sys.argv[1] except: print "Incorrect arguments supplied" print __doc__ sys.exit(-1) gff_handle = _open_file(gff_file) parent_child_id_map(gff_handle)
def bed_parse(qfile, source_name):
    """
    Process BED file

    Reads tab separated records with at least 12 columns from the file
    returned by `_open_file(qfile)` and prints GFF3 lines to standard output:
    a header, then for each record one gene line, one transcript line and an
    exon line per block, with an intron line between consecutive blocks.

    Records whose block start and block size lists disagree in length, or
    that declare more blocks than the lists provide, are skipped.

    @args qfile: file name handed to `_open_file`
    @args source_name: text written to the second (source) column
    @raise AssertionError: when a tab separated record has fewer than 12 columns
    """
    BEDfh = _open_file(qfile)

    try:
        print("##gff-version 3")

        for rec in BEDfh:
            rec = rec.strip("\n\r")

            # skip empty and commented lines, and lines without any tab
            if not rec or rec[0] in ["#"]:
                continue
            if "\t" not in rec:
                continue

            line = rec.split("\t")
            assert len(line) >= 12, rec

            # NOTE(review): the block columns are taken from the END of the
            # record, so they are only correct when the record has exactly
            # 12 columns - confirm inputs never carry extra columns
            rstart = line[-1].split(",")
            if rstart[-1] == "":
                rstart.pop()
            exon_len = line[-2].split(",")
            if exon_len[-1] == "":
                exon_len.pop()
            block_count = int(line[-3])

            # checking the consistency b/w start of exon and number of exons;
            # done after dropping the optional trailing comma so that
            # "10,20," and "0,30" are recognised as matching lists
            if len(rstart) != len(exon_len) or block_count > len(rstart):
                continue

            if line[5] not in ("+", "-"):
                line[5] = "."  # replace the unknown strand with '.'

            # BED start is shifted by one for the GFF start coordinate
            gene_start = str(int(line[1]) + 1)

            pline = [
                str(line[0]),
                source_name,
                "gene",
                gene_start,
                line[2],
                line[4],
                line[5],
                ".",
                "ID=Gene:" + line[3] + ";Name=Gene:" + line[3],
            ]
            print("\t".join(pline))

            pline = [
                str(line[0]),
                source_name,
                "transcript",
                gene_start,
                line[2],
                line[4],
                line[5],
                ".",
                "ID=" + line[3] + ";Name=" + line[3] + ";Parent=Gene:" + line[3],
            ]
            print("\t".join(pline))

            st = int(line[1])
            intron_start = None  # set after the first exon has been written
            for ex_cnt in range(block_count):
                start = st + int(rstart[ex_cnt]) + 1
                stop = start + int(exon_len[ex_cnt]) - 1

                if ex_cnt > 0:
                    # gap between the previous exon and this one
                    pline = [
                        str(line[0]),
                        source_name,
                        "intron",
                        str(intron_start),
                        str(start - 1),
                        line[4],
                        line[5],
                        ".",
                        "Parent=" + line[3],
                    ]
                    print("\t".join(pline))

                pline = [
                    str(line[0]),
                    source_name,
                    "exon",
                    str(start),
                    str(stop),
                    line[4],
                    line[5],
                    ".",
                    "Parent=" + line[3],
                ]
                print("\t".join(pline))
                intron_start = stop + 1
    finally:
        # close the handle even when a malformed record aborts the conversion
        BEDfh.close()
def gbk_parse(fname):
    """
    Extract genome annotation records from genbank format

    Every record yielded by `SeqIO.parse` is reduced to per-gene tables of
    gene, transcript, exon and CDS coordinates, which are then handed to
    `feature_table`. Nothing is returned.

    @args fname: file name handed to `_open_file`
    """
    # the file to read is the function argument, not an outer variable
    fhand = _open_file(fname)
    unk = 1  # counter threaded through successive feature_table calls

    try:
        for record in SeqIO.parse(fhand, "genbank"):
            gene_tags = dict()
            tx_tags = collections.defaultdict(list)
            exon = collections.defaultdict(list)
            cds = collections.defaultdict(list)
            mol_type, chr_id = None, None

            for rec in record.features:
                if rec.type == 'source':
                    mol_type = rec.qualifiers['mol_type'][0]
                    try:
                        chr_id = rec.qualifiers['chromosome'][0]
                    except (KeyError, IndexError):
                        # no chromosome qualifier, fall back to the record name
                        chr_id = record.name
                    continue

                strand = '+' if rec.strand > 0 else '-'

                fid = None
                try:
                    fid = rec.qualifiers['gene'][0]
                except (KeyError, IndexError):
                    pass

                transcript_id = None
                try:
                    transcript_id = rec.qualifiers['transcript_id'][0]
                except (KeyError, IndexError):
                    pass

                # start is shifted by one, the end position is used as is
                if re.search(r'gene', rec.type):
                    # NOTE(review): a gene feature without a 'note' qualifier
                    # raises KeyError here - confirm whether that is intended
                    gene_tags[fid] = (rec.location._start.position + 1,
                                      rec.location._end.position,
                                      strand,
                                      rec.type,
                                      rec.qualifiers['note'][0])
                elif rec.type == 'exon':
                    exon[fid].append((rec.location._start.position + 1,
                                      rec.location._end.position))
                elif rec.type == 'CDS':
                    cds[fid].append((rec.location._start.position + 1,
                                     rec.location._end.position))
                else:
                    # get all transcripts
                    if transcript_id:
                        tx_tags[fid].append((rec.location._start.position + 1,
                                             rec.location._end.position,
                                             transcript_id,
                                             rec.type))

            # record extracted, generate feature table
            # NOTE(review): `strand` is the value left by the last non-source
            # feature of the record - confirm feature_table expects that
            unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk)
    finally:
        # close the handle even if parsing or table generation fails
        fhand.close()