def t_gff3_to_gff3(self):
    """Read in and write out GFF3 without any loss of information.

    Parses the test GFF file, writes the records out with ``GFF3Writer``,
    parses the written text again and checks that the first record carries
    the same features (compared by string form) before and after the
    round trip.
    """
    parser = GFFParser()
    recs = SeqIO.to_dict(parser.parse(self._test_gff_file))
    out_handle = StringIO.StringIO()
    writer = GFF3Writer()
    writer.write(recs.values(), out_handle)
    wrote_handle = StringIO.StringIO(out_handle.getvalue())
    recs_two = SeqIO.to_dict(parser.parse(wrote_handle))

    orig_rec = list(recs.values())[0]
    # Fetch the counterpart from the re-parsed records; looking it up by id
    # keeps the comparison independent of dictionary ordering.
    re_rec = recs_two[orig_rec.id]
    assert len(orig_rec.features) == len(re_rec.features)
    for orig_f, re_f in zip(orig_rec.features, re_rec.features):
        assert str(orig_f) == str(re_f)
def t_unknown_seq(self):
    """Check that placeholder sequences are created with the right length.
    """
    records = SeqIO.to_dict(GFFParser().parse(self._test_gff_file))
    # (record id, expected sequence length)
    expected_lengths = (("I", 12766937), ("X", 17718531))
    for rec_id, seq_len in expected_lengths:
        assert len(records[rec_id].seq) == seq_len
def gff_to_bed(gff_file, bed_fh=sys.stdout, cds=True, species=None, rename=False):
    """Print selected top-level GFF features as tab-separated lines.

    Each printed line holds four columns: sequence name, start + 1, end
    and gene name.

    gff_file -- GFF input handed to ``GFFParser.parse``.
    bed_fh -- file-like object the lines are printed to.
    cds -- when true, keep only "gene" features that have an mRNA or CDS
        sub-feature; when false, keep every other feature. Features of
        type "chromosome" or "protein" are always skipped.
    species -- optional prefix; when given, the sequence name becomes
        ``species`` followed by the last two characters of the record id.
    rename -- when true, genes are named ``<sequence name>g<NNNNN>`` from a
        running counter that restarts at 1 whenever the record id changes;
        otherwise the feature id is used.
    """
    parser = GFFParser()
    cur_chr = None
    cur_gene_order = 0
    for seqid in parser.parse(gff_file, None):
        for feat in seqid.features:
            subf = feat.sub_features
            if feat.type in ("chromosome", "protein"):
                continue
            is_cds = (any(f.type in ("mRNA", "CDS") for f in subf)
                      and feat.type == "gene")
            if cds != is_cds:
                continue
            cur_gene_order += 1
            if species is not None:
                seqid_final = species + seqid.id[-2:]  # hard coded
            else:
                seqid_final = seqid.id
            if rename:
                if seqid.id != cur_chr:
                    cur_gene_order = 1
                    cur_chr = seqid.id
                # zero-pad the running counter to five digits
                gene_name = seqid_final + 'g' + str(cur_gene_order).zfill(5)
            else:
                gene_name = feat.id
            # +1 is hard coded to current BCBio.GFF
            columns = (seqid_final, int(str(feat.location.start)) + 1,
                       feat.location.end, gene_name)
            print >>bed_fh, "\t".join(str(x) for x in columns)
def gff_to_bed(gff_file, bed_fh=sys.stdout, cds=True, species=None, rename=False):
    """Print selected top-level GFF features as tab-separated lines.

    Each printed line holds four columns: sequence name, start + 1, end
    and gene name.

    gff_file -- GFF input handed to ``GFFParser.parse``.
    bed_fh -- file-like object the lines are printed to.
    cds -- when true, keep only "gene" features that have an mRNA or CDS
        sub-feature; when false, keep every other feature. Features of
        type "chromosome" or "protein" are always skipped.
    species -- optional prefix; when given, the sequence name becomes
        ``species`` followed by the last two characters of the record id.
    rename -- when true, genes are named ``<sequence name>g<NNNNN>`` from a
        running counter that restarts at 1 whenever the record id changes;
        otherwise the feature id is used.
    """
    parser = GFFParser()
    cur_chr = None
    cur_gene_order = 0
    for seqid in parser.parse(gff_file, None):
        for feat in seqid.features:
            subf = feat.sub_features
            if feat.type in ("chromosome", "protein"):
                continue
            is_cds = any(f.type in ("mRNA", "CDS") for f in subf) and feat.type == "gene"
            if cds != is_cds:
                continue
            cur_gene_order += 1
            if species is not None:
                seqid_final = species + seqid.id[-2:]  # hard coded
            else:
                seqid_final = seqid.id
            if rename:
                if seqid.id != cur_chr:
                    cur_gene_order = 1
                    cur_chr = seqid.id
                # zero-pad the running counter to five digits
                gene_name = seqid_final + "g" + str(cur_gene_order).zfill(5)
            else:
                gene_name = feat.id
            # +1 is hard coded to current BCBio.GFF
            columns = (seqid_final, int(str(feat.location.start)) + 1, feat.location.end, gene_name)
            print >> bed_fh, "\t".join(str(x) for x in columns)
def not_t_full_celegans(self):
    """Test the full C elegans chromosome and GFF files.

    This is used to test GFF on large files and is not run as a standard
    test. You will need to download the files and adjust the paths to run
    this.
    """
    # read the sequence information
    seq_file = os.path.join(self._full_dir, "c_elegans.WS199.dna.fa")
    gff_file = os.path.join(self._full_dir, "c_elegans.WS199.gff3")
    # the context manager closes the FASTA handle even if parsing fails
    with open(seq_file) as seq_handle:
        seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))
    #with open(gff_file) as gff_handle:
    #    possible_limits = feature_adder.available_limits(gff_handle)
    #    pprint.pprint(possible_limits)
    rnai_types = [('Orfeome', 'PCR_product'),
                  ('GenePair_STS', 'PCR_product'),
                  ('Promoterome', 'PCR_product')]
    gene_types = [('Non_coding_transcript', 'gene'),
                  ('Coding_transcript', 'gene'),
                  ('Coding_transcript', 'mRNA'),
                  ('Coding_transcript', 'CDS')]
    limit_info = dict(gff_source_type = rnai_types + gene_types)
    parser = GFFParser()
    # iterate only to drive the parser over the whole file; the records
    # themselves are discarded
    for rec in parser.parse(gff_file, seq_dict, limit_info=limit_info):
        pass
def t_ensembl_nested_features(self):
    """Check nesting of GFF2 features that are linked by transcript_id.
    """
    records = SeqIO.to_dict(GFFParser().parse(self._ensembl_file))
    assert len(records["I"].features) == 2
    first_feature = records["I"].features[0]
    assert len(first_feature.sub_features) == 32
def t_gff3_noval_attrib(self):
    """Parse an NCBI GFF3 file that has an attribute key without a value.
    """
    records = SeqIO.to_dict(GFFParser().parse(self._test_ncbi))
    assert len(records) == 1
    only_record = list(records.values())[0]
    first_feature = only_record.features[0]
    # the value-less "pseudo" key is expected to show up as ["true"]
    assert first_feature.qualifiers["pseudo"] == ["true"]
def t_fasta_directive(self):
    """Read the FASTA sequence embedded in a GFF3 file.
    """
    records = SeqIO.to_dict(GFFParser().parse(self._gff_file))
    assert len(records) == 1
    sequence_text = str(records['chr17'].seq)
    assert sequence_text == "GATTACAGATTACA"
def parse(gff_content, source=None):
    """Parse GFF text and return the model built from its first record.

    Every parsed record is converted with ``_create_record_model`` (which
    also receives ``source``); the first converted record is returned, or
    None when no record was parsed.
    """
    gff_parser = GFFParser()
    parsed_records = gff_parser.parse(io.StringIO(gff_content))
    models = [_create_record_model(rec, source) for rec in parsed_records]
    if not models:
        return None
    return models[0]
def t_gff_annotations(self):
    """Verify annotations that a GFF file attaches to a whole sequence.
    """
    records = SeqIO.to_dict(GFFParser().parse(self._test_gff_ann_file))
    annotations = records['I'].annotations
    assert len(annotations.keys()) == 2
    # (annotation key, expected value)
    expected = (('source', ['Expr_profile']),
                ('expr_profile', ['B0019.1']))
    for key, value in expected:
        assert annotations[key] == value
def gff_to_bed(gff_file, bed_fh=sys.stdout, cds=True):
    """Print every top-level GFF feature as one tab-separated line.

    Columns are: record id, start, end, feature id and feature type.

    gff_file -- GFF input handed to ``GFFParser.parse``.
    bed_fh -- file-like object the lines are printed to.
    cds -- accepted but not used by this function.
    """
    for record in GFFParser().parse(gff_file, None):
        for feat in record.features:
            columns = (record.id, feat.location.start, feat.location.end,
                       feat.id, feat.type)
            print >> bed_fh, "\t".join(str(col) for col in columns)
def get_feature_cord(gff_file,user_feature="gene"):
    """Return feature coordinates from a GFF file, keyed by gene id.

    The result maps each gene id to a list of tuples whose layout depends
    on user_feature:

    user_feature="gene" -- one (start, end, strand, record id) per gene
        a_dict['FBgn0031208'] = [(7528, 9484, 1, '2L')]

    user_feature="mRNA" -- one (start, end, strand, mRNA id, record id)
    per mRNA sub-feature of the gene
        a_dict['FBgn0031208'] = [(7528, 9484, 1, 'FBtr0300689', '2L'),
                                 (7528, 9484, 1, 'FBtr0300690', '2L'),
                                 (7528, 9484, 1, 'FBtr0330654', '2L')]

    any other value (e.g. "CDS") -- one
    ([(start, end), ...], strand, mRNA id, record id) per mRNA, where the
    list holds the coordinates of the mRNA's CDS sub-features
        a_dict['FBgn0031208'] = [
            ([(7679, 8116), (8192, 8610)], 1, 'FBtr0300689', '2L'),
            ([(7679, 8116), (8192, 8589), (8667, 9276)], 1, 'FBtr0300690', '2L'),
            ([(7679, 8116), (8228, 8610)], 1, 'FBtr0330654', '2L')]

    Parsing is limited to the feature types 'protein', 'gene', 'mRNA',
    'CDS' and 'exon'. With user_feature="gene" an AssertionError is raised
    if the same gene id is seen twice.
    """
    limit_info = dict(gff_type = ['protein','gene','mRNA','CDS','exon'])
    feature_dict = {}
    parser = GFFParser()
    # the context manager closes the input even when parsing or the
    # duplicate-gene assertion raises
    with open(gff_file) as in_handle:
        for rec in parser.parse(in_handle,limit_info=limit_info):
            rec_id = rec.id
            for feat in rec.features:
                # NOTE(review): only top-level "gene" features are
                # processed here -- confirm this is the intended nesting.
                if feat.type != "gene":
                    continue
                gene_id = feat.id
                if user_feature == "gene":
                    assert gene_id not in feature_dict
                    feature_dict[gene_id] = [(feat.location.start.position,
                                              feat.location.end.position,
                                              feat.strand, rec_id)]
                    continue
                for sub in feat.sub_features:
                    if sub.type != "mRNA":
                        continue
                    if user_feature == "mRNA":
                        info = (sub.location.start.position,
                                sub.location.end.position,
                                sub.strand, sub.id, rec_id)
                    else:
                        # collect the coordinates of this transcript's CDS parts
                        codons = [(sub_sub.location.start.position,
                                   sub_sub.location.end.position)
                                  for sub_sub in sub.sub_features
                                  if sub_sub.type == "CDS"]
                        info = (codons, sub.strand, sub.id, rec_id)
                    feature_dict.setdefault(gene_id, []).append(info)
    return feature_dict
def t_gff3_multiple_ids(self):
    """Handle GFF3 with non-unique ID attributes, using the NCBI example.
    """
    records = SeqIO.to_dict(GFFParser().parse(self._test_ncbi))
    assert len(records) == 1
    only_record = list(records.values())[0]
    # 4 feature sets, same ID, different positions, different attributes
    repeated_features = only_record.features[1:]
    assert len(repeated_features) == 4
    for feat in repeated_features:
        assert len(feat.sub_features) == 3
def t_no_dict_error(self):
    """Parsing must raise KeyError when no dictionary to map to is given.
    """
    parser = GFFParser(create_missing=False)
    try:
        for _ in parser.parse(self._test_gff_file):
            pass
    except KeyError:
        # expected outcome
        return
    # parsing finished without the expected error -- problem
    raise AssertionError('Did not complain with missing dictionary')
def t_wb_cds_nested_features(self):
    """Check nesting of GFF2 features that share a flat CDS key/value pair.
    """
    records = SeqIO.to_dict(GFFParser().parse(self._wb_alt_file))
    assert len(records) == 2
    second_record = list(records.values())[1]
    top_features = second_record.features
    assert len(top_features) == 1
    parent = top_features[0]
    assert parent.id == "cr01.sctg102.wum.2.1"
    assert len(parent.sub_features) == 7
def extract_seq(gff_file,outfile):
    """Write every record parsed from a GFF file to a new FASTA file.

    Intended for GFF files that carry their sequence data.

    gff_file -- path of the GFF file to read.
    outfile -- path of the FASTA file to create (overwritten if present).
    """
    # nested context managers close both files even if parsing or writing
    # raises part-way through
    with open(gff_file) as in_handle:
        with open(outfile, "w") as fasta_file:
            parser = GFFParser()
            for rec in parser.parse(in_handle):
                SeqIO.write(rec, fasta_file, "fasta")
def t_local_map_reduce(self):
    """Exercise the general map reduce framework without parallelization.
    """
    cds_limit_info = {
        "gff_type": ["gene", "mRNA", "CDS"],
        "gff_id": ['I'],
    }
    parsed = GFFParser().parse(self._test_gff_file, limit_info=cds_limit_info)
    records = SeqIO.to_dict(parsed)
    chrom_rec = records['I']
    assert len(chrom_rec.features) == 32
def t_jgi_gff(self):
    """Parse JGI formatted GFF2, nested using transcriptId and proteinID.
    """
    records = SeqIO.to_dict(GFFParser().parse(self._jgi_file))
    parent = records['chr_1'].features[0]
    assert parent.location.nofuzzy_start == 37060
    assert parent.location.nofuzzy_end == 38216
    assert parent.type == 'inferred_parent'
    assert len(parent.sub_features) == 6
    # second child carries the protein id and phase qualifiers
    child = parent.sub_features[1]
    assert child.qualifiers['proteinId'] == ['873']
    assert child.qualifiers['phase'] == ['0']
def t_basic_directives(self):
    """Read the top level meta-data directives of a GFF3 file.
    """
    records = SeqIO.to_dict(GFFParser().parse(self._gff_file))
    annotations = records['chr17'].annotations
    # (directive name, expected annotation value), checked in this order
    expected = [
        ('gff-version', ['3']),
        ('attribute-ontology', ['baz']),
        ('feature-ontology', ['bar']),
        ('source-ontology', ['boo']),
        ('sequence-region', [('foo', '1', '100'),
                             ('chr17', '62467934', '62469545')]),
    ]
    for directive, value in expected:
        assert annotations[directive] == value
def gff_to_bed(gff_file, bed_fh=sys.stdout, cds=True):
    """Print selected top-level GFF features as tab-separated lines.

    Columns are: record id, start, end, feature id and feature type.

    gff_file -- GFF input handed to ``GFFParser.parse``.
    bed_fh -- file-like object the lines are printed to.
    cds -- when true, keep only "gene" features that have an mRNA or CDS
        sub-feature; when false, keep every other feature. Features of
        type "chromosome" or "protein" are always skipped.
    """
    for record in GFFParser().parse(gff_file, None):
        for feat in record.features:
            children = feat.sub_features
            if feat.type in ("chromosome", "protein"):
                continue
            has_coding_child = any(
                child.type == "mRNA" or child.type == "CDS"
                for child in children)
            is_cds = has_coding_child and feat.type == "gene"
            if cds != is_cds:
                continue
            columns = (record.id, feat.location.start, feat.location.end,
                       feat.id, feat.type)
            print >>bed_fh, "\t".join(str(col) for col in columns)
def t_tricky_semicolons(self):
    """Parse WormBase GFF2 with semi-colons in tricky positions.
    """
    limit_info = {"gff_source_type": [('Genomic_canonical', 'region')]}
    parsed = GFFParser().parse(self._wormbase_file, limit_info=limit_info)
    records = SeqIO.to_dict(parsed)
    chrom_rec = records['I']
    assert len(chrom_rec.features) == 1
    region = chrom_rec.features[0]
    # the semi-colon inside the note text must survive parsing
    expected_note = ['Clone cTel33B; Genbank AC199162',
                     'Clone cTel33B; Genbank AC199162']
    assert region.qualifiers['Note'] == expected_note
def t_flat_features(self):
    """Check that flat, non-nested features are added to several records.
    """
    seq_dict = self._get_seq_dict()
    pcr_sources = ('Orfeome', 'GenePair_STS', 'Promoterome')
    pcr_limit_info = {
        "gff_source_type": [(source, 'PCR_product') for source in pcr_sources],
    }
    parsed = GFFParser().parse(self._test_gff_file, seq_dict,
                               limit_info=pcr_limit_info)
    records = SeqIO.to_dict(parsed)
    # (record id, expected number of features)
    for rec_id, n_features in (('I', 4), ('X', 5)):
        assert len(records[rec_id].features) == n_features
def t_gff3_iterator_limit(self):
    """Use the iterated interface with a limit query on GFF3 files.
    """
    coding_types = ('gene', 'mRNA', 'CDS')
    cds_limit_info = {
        "gff_source_type": [('Coding_transcript', ftype)
                            for ftype in coding_types],
        "gff_id": ['I'],
    }
    parsed = GFFParser().parse(self._test_gff_file, limit_info=cds_limit_info)
    records = SeqIO.to_dict(parsed)
    assert len(records) == 1
    # first transcript of the first gene on record "I"
    transcript = records["I"].features[0].sub_features[0]
    for child in transcript.sub_features:
        assert child.type == "CDS", child
def t_basic_solid_parse(self):
    """Parse a SOLiD GFF results file and check one read feature.
    """
    records = SeqIO.to_dict(GFFParser().parse(self._test_gff_file))
    read = records['3_341_424_F3'].features[0]
    assert read.location.nofuzzy_start == 102716
    assert read.location.nofuzzy_end == 102736
    assert len(read.qualifiers) == 7
    for key, value in (('score', ['10.6']), ('source', ['solid'])):
        assert read.qualifiers[key] == value
    assert read.strand == -1
    assert read.type == 'read'
    assert read.qualifiers['g'] == ['T2203031313223113212']
    assert len(read.qualifiers['q']) == 20
def t_wormbase_nested_features(self):
    """Check nesting of GFF2 features that use Transcript only.
    """
    records = SeqIO.to_dict(GFFParser().parse(self._wormbase_file))
    assert len(records) == 3

    def of_type(wanted):
        # features of record "I" whose type equals ``wanted``
        return [feat for feat in records["I"].features if feat.type == wanted]

    transcripts = of_type("Transcript")
    assert len(transcripts) == 1
    inferred = of_type("inferred_parent")
    assert len(inferred) == 0
    transcript = transcripts[0]
    assert transcript.qualifiers["WormPep"][0] == "WP:CE40797"
    assert len(transcript.sub_features) == 46
def main(seq_file, gff_file):
    """Parse a FASTA file and a GFF file and print each record's first feature.

    seq_file -- path of the FASTA file with the sequences.
    gff_file -- GFF input handed to ``GFFParser.parse`` together with the
        parsed sequences.
    """
    # -- To be customized
    # You need to update these parameters to point to your local database
    # XXX demo example could be swapped to use SQLite when that is integrated
    # (neither name is used further down in this function)
    db_name = "orphan.db"
    biodb_name = 'metagenomic_database'

    print("Parsing FASTA sequence file...")
    with open(seq_file) as fasta_handle:
        seq_dict = SeqIO.to_dict(SeqIO.parse(fasta_handle, "fasta"))

    print("Parsing GFF data file...")
    gff_parser = GFFParser()
    for record in gff_parser.parse(gff_file, seq_dict):
        print(record.features[0])
def t_gff3_iterator_limit(self):
    """Use the iterated interface with a limit query on GFF3 files.
    """
    coding_types = ('gene', 'mRNA', 'CDS')
    cds_limit_info = {
        "gff_source_type": [('Coding_transcript', ftype)
                            for ftype in coding_types],
        "gff_id": ['I'],
    }
    parsed = GFFParser().parse(self._test_gff_file, limit_info=cds_limit_info)
    records = SeqIO.to_dict(parsed)
    assert len(records) == 1
    # first transcript of the first gene on record "I"
    transcript = records["I"].features[0].sub_features[0]
    for child in transcript.sub_features:
        assert child.type == "CDS", child
def main(seq_file, gff_file):
    """Load the selected features of a GFF file into a BioSQL database.

    seq_file -- path of the FASTA file with the sequences.
    gff_file -- GFF input handed to ``GFFParser.parse`` together with the
        parsed sequences.

    The target sub-database is created fresh: an existing one of the same
    name is removed first. On any error the transaction is rolled back and
    the error is re-raised.
    """
    # -- To be customized
    # You need to update these parameters to point to your local database
    # XXX demo example could be swapped to use SQLite when that is integrated
    user = "******"
    passwd = "cdev"
    host = "localhost"
    db_name = "wb199_gff"
    biodb_name = "wb199_gff_cds_pcr"
    # These need to be updated to reflect what you would like to parse
    # out of the GFF file. Set limit_info=None to parse everything, but
    # be sure the file is small or you may deal with memory issues.
    rnai_types = [('Orfeome', 'PCR_product'),
                  ('GenePair_STS', 'PCR_product'),
                  ('Promoterome', 'PCR_product')]
    gene_types = [('Non_coding_transcript', 'gene'),
                  ('Coding_transcript', 'gene'),
                  ('Coding_transcript', 'mRNA'),
                  ('Coding_transcript', 'CDS')]
    limit_info = dict(gff_source_type=rnai_types + gene_types)
    # --
    print("Parsing FASTA sequence file...")
    with open(seq_file) as fasta_handle:
        seq_dict = SeqIO.to_dict(SeqIO.parse(fasta_handle, "fasta"))

    print("Parsing GFF data file...")
    gff_parser = GFFParser()
    records = gff_parser.parse(gff_file, seq_dict, limit_info=limit_info)

    print("Writing to BioSQL database...")
    server = BioSeqDatabase.open_database(driver="MySQLdb", user=user,
                                          passwd=passwd, host=host,
                                          db=db_name)
    try:
        # drop a previous sub-database of this name before recreating it
        if biodb_name in server.keys():
            server.remove_database(biodb_name)
            server.adaptor.commit()
        server.new_database(biodb_name)
        target_db = server[biodb_name]
        target_db.load(records)
        server.adaptor.commit()
    except:
        # undo the partial load, then let the caller see the error
        server.adaptor.rollback()
        raise
def t_line_adjust(self):
    """Adjust lines while parsing to fix potential GFF problems.
    """
    def adjust_fn(results):
        # move the original record id into a "read_name" qualifier and use
        # the first "i" qualifier value as the record id instead
        quals = results['quals']
        new_rec_id = quals['i'][0]
        quals['read_name'] = [results['rec_id']]
        results['rec_id'] = new_rec_id
        return results

    parser = GFFParser(line_adjust_fn=adjust_fn)
    records = list(parser.parse(self._test_gff_file))
    assert len(records) == 1
    only_record = records[0]
    assert only_record.id == '1'
    assert len(only_record.features) == 112
    first_read_name = only_record.features[0].qualifiers['read_name']
    assert first_read_name == ['3_336_815_F3']
def main(seq_file, gff_file):
    """Load the selected features of a GFF file into a BioSQL database.

    seq_file -- path of the FASTA file with the sequences.
    gff_file -- GFF input handed to ``GFFParser.parse`` together with the
        parsed sequences.

    The target sub-database is created fresh: an existing one of the same
    name is removed first. On any error the transaction is rolled back and
    the error is re-raised.
    """
    # -- To be customized
    # You need to update these parameters to point to your local database
    # XXX demo example could be swapped to use SQLite when that is integrated
    user = "******"
    passwd = "wubin"
    host = "localhost"
    db_name = "volvoxdb"
    biodb_name = "volvoxdb_cds_pcr"
    # These need to be updated to reflect what you would like to parse
    # out of the GFF file. Set limit_info=None to parse everything, but
    # be sure the file is small or you may deal with memory issues.
    rnai_types = [('Orfeome', 'PCR_product'),
                  ('GenePair_STS', 'PCR_product'),
                  ('Promoterome', 'PCR_product')]
    gene_types = [('Non_coding_transcript', 'gene'),
                  ('Coding_transcript', 'gene'),
                  ('Coding_transcript', 'mRNA'),
                  ('Coding_transcript', 'CDS')]
    limit_info = dict(gff_source_type = rnai_types + gene_types)
    # --
    print("Parsing FASTA sequence file...")
    with open(seq_file) as fasta_handle:
        seq_dict = SeqIO.to_dict(SeqIO.parse(fasta_handle, "fasta"))

    print("Parsing GFF data file...")
    gff_parser = GFFParser()
    records = gff_parser.parse(gff_file, seq_dict, limit_info=limit_info)

    print("Writing to BioSQL database...")
    server = BioSeqDatabase.open_database(driver="MySQLdb", user=user,
                                          passwd=passwd, host=host,
                                          db=db_name)
    try:
        # drop a previous sub-database of this name before recreating it
        if biodb_name in server.keys():
            server.remove_database(biodb_name)
            server.adaptor.commit()
        server.new_database(biodb_name)
        target_db = server[biodb_name]
        target_db.load(records)
        server.adaptor.commit()
    except:
        # undo the partial load, then let the caller see the error
        server.adaptor.rollback()
        raise
def locus_tag2record_info(want, infile):
    """Return record details for the gene carrying a given locus tag.

    want -- locus tag to look for; it is tested with ``in`` against the
        "locus_tag" qualifier of every top-level gene feature.
    infile -- path of the GFF file to read.

    Returns a tuple (start, end, strand, record id, record sequence as a
    string) taken from the last matching gene, or a tuple of five None
    values when no gene matches.
    """
    parser = GFFParser()
    limit_info = dict(gff_type=["gene", "CDS", "locus_tag", "product"])
    record_info = (None, None, None, None, None)
    with open(infile, "r") as gff:
        for record in parser.parse(gff, limit_info=limit_info):
            for feature in record.features:
                if feature.type != "gene":
                    continue
                # a gene without the qualifier can never match
                if want not in feature.qualifiers.get("locus_tag", ()):
                    continue
                record_info = (
                    feature.location.start.position,
                    feature.location.end.position,
                    feature.strand,
                    record.id,
                    # str() replaces the deprecated Seq.tostring()
                    str(record.seq),
                )
    return record_info
def t_basic_attributes(self):
    """Read the basic GFF2 attributes of a feature from an Ensembl GTF.
    """
    limit_info = {"gff_source_type": [('snoRNA', 'exon')]}
    parsed = GFFParser().parse(self._ensembl_file, limit_info=limit_info)
    records = SeqIO.to_dict(parsed)
    chrom_rec = records['I']
    assert len(chrom_rec.features) == 1
    exon = chrom_rec.features[0]
    qual_keys = sorted(exon.qualifiers.keys())
    assert qual_keys == ['Parent', 'exon_number', 'gene_id', 'gene_name',
                         'source', 'transcript_id', 'transcript_name']
    # (qualifier key, expected value)
    expected = (('source', ['snoRNA']),
                ('transcript_name', ['NR_001477.2']),
                ('exon_number', ['1']))
    for key, value in expected:
        assert exon.qualifiers[key] == value
def t_nested_features(self):
    """Check three levels of feature nesting: gene, mRNA and CDS.
    """
    seq_dict = self._get_seq_dict()
    coding_types = ('gene', 'mRNA', 'CDS')
    cds_limit_info = {
        "gff_source_type": [('Coding_transcript', ftype)
                            for ftype in coding_types],
        "gff_id": ['I'],
    }
    parsed = GFFParser().parse(self._test_gff_file, seq_dict,
                               limit_info=cds_limit_info)
    records = SeqIO.to_dict(parsed)
    chrom_rec = records['I']
    # first gene feature is plain
    assert len(chrom_rec.features) == 2  # two gene feature
    plain_gene = chrom_rec.features[0]
    assert len(plain_gene.sub_features) == 1  # one transcript
    # 15 final CDS regions
    assert len(plain_gene.sub_features[0].sub_features) == 15
def main(gff_file,th_fasta):
    """Write the sequences of selected condensed features as FASTA.

    For every record parsed from gff_file the features returned by
    ``condens_transcript`` are examined; each feature that is kept is
    written to the file 'this_is_a_test' as a ``>id`` header line followed
    by the slice of the matching FASTA entry between the feature's start
    and end.

    gff_file -- GFF input handed to ``GFFParser.parse``.
    th_fasta -- FASTA file opened with ``Fasta(..., flatten_inplace=True)``;
        entries are looked up by the lower-cased record id.
    """
    parser = GFFParser()
    seqids = parser.parse(gff_file, None)
    fasta = Fasta(th_fasta, flatten_inplace=True)
    # the context manager flushes and closes the output when done
    with open('this_is_a_test','w') as out_fasta:
        for seqid in seqids:
            for feat in condens_transcript(seqid.features):
                # NOTE(review): the skip test looks at the feature's own
                # type and only applies when the feature has sub-features;
                # confirm whether the sub-feature types were meant instead.
                is_skipped_type = (str(feat.type) == 'CDS'
                                   or feat.type == 'gene'
                                   or feat.type == 'protein')
                if is_skipped_type and feat.sub_features:
                    continue
                start = int(feat.location.start)
                end = int(feat.location.end)
                print >>out_fasta, '>%s' % feat.id
                print >>out_fasta, fasta[seqid.id.lower()][start:end]
def main(gff_file, outdir, th_fasta):
    """Write sequences of features without CDS sub-features as FASTA.

    For every record parsed from gff_file the features returned by
    ``conden_transcripts`` are examined; a feature is skipped when any of
    its sub-features has type 'CDS' or 'chromosome'. Every other feature is
    written to ``<outdir>/at_no_cds.fasta`` as a ``>id`` header line
    followed by the slice of the matching FASTA entry between the
    feature's start and end.

    gff_file -- GFF input handed to ``GFFParser.parse``.
    outdir -- directory that receives at_no_cds.fasta.
    th_fasta -- FASTA file opened with ``Fasta(..., flatten_inplace=True)``;
        entries are looked up by the lower-cased record id.
    """
    parser = GFFParser()
    seqids = parser.parse(gff_file,None)
    fasta = Fasta(th_fasta, flatten_inplace=True)
    # the context manager flushes and closes the output when done
    with open(outdir + "/at_no_cds.fasta", "w") as out_fasta:
        for seqid in seqids:
            for feat in conden_transcripts(seqid.features):
                if any(subf.type in ('CDS', 'chromosome')
                       for subf in feat.sub_features):
                    continue
                start = int(feat.location.start)
                end = int(feat.location.end)
                print >>out_fasta, ">%s" % feat.id
                print >>out_fasta, fasta[seqid.id.lower()][start:end]
def main(gff_file, outdir, th_fasta):
    """Write sequences of features with non-coding RNA sub-features as FASTA.

    For every record parsed from gff_file the features returned by
    ``conden_transcripts`` are examined; a feature is kept only when at
    least one of its sub-features has type tRNA, rRNA, miRNA, snoRNA, ncRNA
    or snRNA. Kept features are written to ``<outdir>/at_no_cds.fasta`` as
    a ``>id`` header line followed by the slice of the matching FASTA entry
    between the feature's start and end.

    gff_file -- GFF input handed to ``GFFParser.parse``.
    outdir -- directory that receives at_no_cds.fasta.
    th_fasta -- FASTA file opened with ``Fasta(..., flatten_inplace=True)``;
        entries are looked up by the lower-cased record id.
    """
    parser = GFFParser()
    seqids = parser.parse(gff_file, None)
    fasta = Fasta(th_fasta, flatten_inplace=True)
    # built once instead of once per sub-feature
    rna_types = set(['tRNA', 'rRNA', 'miRNA', 'snoRNA', 'ncRNA', 'snRNA'])
    # the context manager flushes and closes the output when done
    with open(outdir + "/at_no_cds.fasta", "w") as out_fasta:
        for seqid in seqids:
            for feat in conden_transcripts(seqid.features):
                if not any(subf.type in rna_types
                           for subf in feat.sub_features):
                    continue
                start = int(feat.location.start)
                end = int(feat.location.end)
                print >> out_fasta, ">%s" % feat.id
                print >> out_fasta, fasta[seqid.id.lower()][start:end]
def t_nested_multiparent_features(self):
    """Check that features with several parents are nested correctly.
    """
    seq_dict = self._get_seq_dict()
    coding_types = ('gene', 'mRNA', 'CDS')
    cds_limit_info = {
        "gff_source_type": [('Coding_transcript', ftype)
                            for ftype in coding_types],
        "gff_id": ['I'],
    }
    parsed = GFFParser().parse(self._test_gff_file, seq_dict,
                               limit_info=cds_limit_info)
    records = SeqIO.to_dict(parsed)
    chrom_rec = records['I']
    # second gene feature is multi-parent
    assert len(chrom_rec.features) == 2  # two gene feature
    transcripts = chrom_rec.features[1].sub_features
    assert len(transcripts) == 3  # three transcripts
    # the first and second transcript have the same CDSs
    first_cds, second_cds = transcripts[0].sub_features, transcripts[1].sub_features
    assert len(first_cds) == 6
    assert len(second_cds) == 6
    assert first_cds[0] is second_cds[0]