def gff_to_bed(gff_file, bed_fh=sys.stdout, cds=True):
    """Write GFF features as tab-separated BED-like lines.

    With cds=True emit only gene features that carry mRNA or CDS
    sub-features; with cds=False emit the remaining features.
    Chromosome and protein features are always skipped.
    """
    parser = GFFParser()
    for rec in parser.parse(gff_file, None):
        for feature in rec.features:
            children = feature.sub_features
            if feature.type in ("chromosome", "protein"):
                continue
            # A feature counts as coding when it is a gene with at least
            # one mRNA or CDS child.
            coding = (feature.type == "gene" and
                      any(child.type in ("mRNA", "CDS") for child in children))
            if cds == coding:
                fields = (rec.id, feature.location.start,
                          feature.location.end, feature.id, feature.type)
                print >>bed_fh, "\t".join(str(x) for x in fields)
def t_flat_features(self):
    """Check addition of flat non-nested features to multiple records.
    """
    seq_dict = self._get_seq_dict()
    # Restrict parsing to the three PCR_product sources.
    limit = dict(gff_source_type=[
        ('Orfeome', 'PCR_product'),
        ('GenePair_STS', 'PCR_product'),
        ('Promoterome', 'PCR_product'),
    ])
    parser = GFFParser()
    recs = SeqIO.to_dict(parser.parse(self._test_gff_file, seq_dict,
                                      limit_info=limit))
    assert len(recs['I'].features) == 4
    assert len(recs['X'].features) == 5
def t_gff3_iterator_limit(self):
    """Iterated interface using a limit query on GFF3 files.
    """
    # Only Coding_transcript gene/mRNA/CDS lines on chromosome I.
    limit = dict(
        gff_source_type=[('Coding_transcript', 'gene'),
                         ('Coding_transcript', 'mRNA'),
                         ('Coding_transcript', 'CDS')],
        gff_id=['I'])
    parser = GFFParser()
    recs = SeqIO.to_dict(parser.parse(self._test_gff_file, limit_info=limit))
    assert len(recs) == 1
    transcript = recs["I"].features[0].sub_features[0]
    for child in transcript.sub_features:
        assert child.type == "CDS", child
def t_basic_solid_parse(self):
    """Basic parsing of SOLiD GFF results files.
    """
    recs = SeqIO.to_dict(GFFParser().parse(self._test_gff_file))
    feature = recs['3_341_424_F3'].features[0]
    quals = feature.qualifiers
    assert feature.location.nofuzzy_start == 102716
    assert feature.location.nofuzzy_end == 102736
    assert len(quals) == 7
    assert quals['score'] == ['10.6']
    assert quals['source'] == ['solid']
    assert feature.strand == -1
    assert feature.type == 'read'
    assert quals['g'] == ['T2203031313223113212']
    assert len(quals['q']) == 20
def main(seq_file, gff_file): # -- To be customized # You need to update these parameters to point to your local database # XXX demo example could be swapped to use SQLite when that is integrated db_name = "orphan.db" biodb_name = 'metagenomic_database' print "Parsing FASTA sequence file..." with open(seq_file) as seq_handle: seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta")) print "Parsing GFF data file..." parser = GFFParser() recs = parser.parse(gff_file, seq_dict )#, limit_info=limit_info) for r in recs: print r.features[0]
def t_line_adjust(self):
    """Adjust lines during parsing to fix potential GFF problems.
    """
    def adjust_fn(results):
        # Promote the 'i' qualifier to record id, preserving the old
        # record id as a read_name qualifier.
        index = results['quals']['i'][0]
        results['quals']['read_name'] = [results['rec_id']]
        results['rec_id'] = index
        return results
    parser = GFFParser(line_adjust_fn=adjust_fn)
    recs = list(parser.parse(self._test_gff_file))
    assert len(recs) == 1
    rec = recs[0]
    assert rec.id == '1'
    assert len(rec.features) == 112
    assert rec.features[0].qualifiers['read_name'] == ['3_336_815_F3']
def main(seq_file, gff_file):
    """Parse FASTA sequences plus GFF annotations and load them into a
    BioSQL (MySQL) database, recreating the target sub-database.
    """
    # -- To be customized
    # You need to update these parameters to point to your local database
    # XXX demo example could be swapped to use SQLite when that is integrated
    user = "******"
    passwd = "cdev"
    host = "localhost"
    db_name = "wb199_gff"
    biodb_name = "wb199_gff_cds_pcr"
    # These need to be updated to reflect what you would like to parse
    # out of the GFF file. Set limit_info=None to parse everything, but
    # be sure the file is small or you may deal with memory issues.
    rnai_types = [('Orfeome', 'PCR_product'),
                  ('GenePair_STS', 'PCR_product'),
                  ('Promoterome', 'PCR_product')]
    gene_types = [('Non_coding_transcript', 'gene'),
                  ('Coding_transcript', 'gene'),
                  ('Coding_transcript', 'mRNA'),
                  ('Coding_transcript', 'CDS')]
    limit_info = dict(gff_source_type=rnai_types + gene_types)
    # --
    print "Parsing FASTA sequence file..."
    with open(seq_file) as seq_handle:
        seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))
    print "Parsing GFF data file..."
    parser = GFFParser()
    # Annotations are attached to the sequences from the FASTA file.
    recs = parser.parse(gff_file, seq_dict, limit_info=limit_info)
    print "Writing to BioSQL database..."
    server = BioSeqDatabase.open_database(driver="MySQLdb", user=user,
            passwd=passwd, host=host, db=db_name)
    try:
        # Start from a clean sub-database: create it if missing,
        # otherwise drop and recreate it before loading.
        if biodb_name not in server.keys():
            server.new_database(biodb_name)
        else:
            server.remove_database(biodb_name)
            server.adaptor.commit()
            server.new_database(biodb_name)
        db = server[biodb_name]
        db.load(recs)
        server.adaptor.commit()
    except:
        # Undo any partial load, then propagate the original error.
        server.adaptor.rollback()
        raise
def t_nested_features(self):
    """Check three-deep nesting of features with gene, mRNA and CDS.
    """
    seq_dict = self._get_seq_dict()
    limit = dict(
        gff_source_type=[('Coding_transcript', 'gene'),
                         ('Coding_transcript', 'mRNA'),
                         ('Coding_transcript', 'CDS')],
        gff_id=['I'])
    recs = SeqIO.to_dict(GFFParser().parse(self._test_gff_file, seq_dict,
                                           limit_info=limit))
    rec = recs['I']
    # first gene feature is plain
    assert len(rec.features) == 2  # two gene features
    assert len(rec.features[0].sub_features) == 1  # one transcript
    # 15 final CDS regions
    assert len(rec.features[0].sub_features[0].sub_features) == 15
def main(gff_file, th_fasta):
    """Write FASTA entries for condensed transcripts with no coding children.

    Features whose sub-features include a CDS, gene or protein entry are
    skipped; the remaining features are sliced out of th_fasta by their
    location and written to the output file.
    """
    parser = GFFParser()
    seqids = parser.parse(gff_file, None)
    fasta = Fasta(th_fasta, flatten_inplace=True)
    out_fasta = open('this_is_a_test', 'w')
    try:
        for seqid in seqids:
            for feat in condens_transcript(seqid.features):
                # BUG FIX: the original tested feat.type inside this loop,
                # a loop-invariant that never inspected the sub-features;
                # test each child's type as clearly intended (the sibling
                # script that filters on subf.type confirms the intent).
                has_cds = any(str(subf.type) == 'CDS' or
                              subf.type in ('gene', 'protein')
                              for subf in feat.sub_features)
                if has_cds:
                    continue
                print >>out_fasta, '>%s' % feat.id
                seq = fasta[seqid.id.lower()]
                print >>out_fasta, seq[int(feat.location.start):int(feat.location.end)]
    finally:
        # FIX: close the output handle instead of leaking it.
        out_fasta.close()
def t_nested_multiparent_features(self):
    """Verify correct nesting of features with multiple parents.
    """
    seq_dict = self._get_seq_dict()
    limit = dict(
        gff_source_type=[('Coding_transcript', 'gene'),
                         ('Coding_transcript', 'mRNA'),
                         ('Coding_transcript', 'CDS')],
        gff_id=['I'])
    parser = GFFParser()
    recs = SeqIO.to_dict(parser.parse(self._test_gff_file, seq_dict,
                                      limit_info=limit))
    rec = recs['I']
    # second gene feature is multi-parent
    assert len(rec.features) == 2  # two gene features
    transcripts = rec.features[1].sub_features
    assert len(transcripts) == 3  # three transcripts
    # the first and second transcript have the same CDSs
    assert len(transcripts[0].sub_features) == 6
    assert len(transcripts[1].sub_features) == 6
    assert transcripts[0].sub_features[0] is transcripts[1].sub_features[0]
def main(gff_file, outdir, th_fasta):
    """Extract condensed transcripts that contain non-coding RNA children.

    Features with at least one tRNA/rRNA/miRNA/snoRNA/ncRNA/snRNA
    sub-feature are sliced out of th_fasta by location and written to
    <outdir>/at_no_cds.fasta; all other features are skipped.
    """
    # Hoisted out of the loop: the set of non-coding RNA feature types.
    non_coding = set(['tRNA', 'rRNA', 'miRNA', 'snoRNA', 'ncRNA', 'snRNA'])
    parser = GFFParser()
    seqids = parser.parse(gff_file, None)
    fasta = Fasta(th_fasta, flatten_inplace=True)
    # FIX: the original leaked the output handle; close it deterministically.
    with open(outdir + "/at_no_cds.fasta", "w") as out_fasta:
        for seqid in seqids:
            for feat in conden_transcripts(seqid.features):
                # Coding unless some child is a non-coding RNA type.
                has_cds = not any(subf.type in non_coding
                                  for subf in feat.sub_features)
                if has_cds:
                    continue
                print >> out_fasta, ">%s" % feat.id
                seq = fasta[seqid.id.lower()]
                print >> out_fasta, seq[int(feat.location.start):int(feat.location.end)]