def write_format(file):
    """Write a GenBank record from a Genbank file and compare them.

    The file is opened twice and read in lock-step: one handle is parsed
    with a ``GenBank.RecordParser`` and the record is turned back into text
    with ``str()``; the other handle is iterated without a parser.  Each
    pair is handed to ``do_comparison``; a failed comparison is reported on
    stdout and does not stop the run.

    Args:
        file: path of the GenBank file; a name containing ".gz" is opened
            through ``gzip``.
    """
    record_parser = GenBank.RecordParser(debug_level=2)

    print("Testing GenBank writing for %s..." % os.path.basename(file))
    # be able to handle gzipped files; gzip.open() defaults to binary mode,
    # so text mode is requested explicitly to match the plain open() branch
    if ".gz" in file:
        opener, mode = gzip.open, "rt"
    else:
        opener, mode = open, "r"

    # the with-statement closes both handles even if something other than
    # an AssertionError escapes from the loop
    with opener(file, mode) as cur_handle, opener(file, mode) as compare_handle:
        iterator = GenBank.Iterator(cur_handle, record_parser)
        compare_iterator = GenBank.Iterator(compare_handle)

        while True:
            cur_record = next(iterator)
            compare_record = next(compare_iterator)

            # the iterators signal exhaustion with None
            if cur_record is None or compare_record is None:
                break

            # print("\tTesting for %s" % cur_record.version)

            output_record = str(cur_record) + "\n"
            try:
                do_comparison(compare_record, output_record)
            except AssertionError as msg:
                print("\tTesting for %s" % cur_record.version)
                print(msg)
def t_ensembl_locus():
    """Feed Ensembl-style LOCUS lines to the scanner and check name and size.

    Every case is fed to a fresh scanner/consumer pair; the consumer must
    end up with the expected record name and expected sequence size.
    """
    # (LOCUS line, expected record name, expected sequence length)
    cases = (
        ("LOCUS HG531_PATCH 1000000 bp DNA HTG 18-JUN-2011\n",
         "HG531_PATCH", 1000000),
        ("LOCUS HG531_PATCH 759984 bp DNA HTG 18-JUN-2011\n",
         "HG531_PATCH", 759984),
        ("LOCUS HG506_HG1000_1_PATCH 814959 bp DNA HTG 18-JUN-2011\n",
         "HG506_HG1000_1_PATCH", 814959),
        ("LOCUS HG506_HG1000_1_PATCH 1219964 bp DNA HTG 18-JUN-2011\n",
         "HG506_HG1000_1_PATCH", 1219964),
    )

    for locus_line, wanted_name, wanted_size in cases:
        scanner = GenBank.Scanner.GenBankScanner()
        consumer = GenBank._FeatureConsumer(True)
        scanner._feed_first_line(consumer, locus_line)
        assert consumer.data.name == wanted_name, consumer.data.name
        assert consumer._expected_size == wanted_size, consumer._expected_size

    print("Done")
def write_format(file):
    """Parse a GenBank file, write each record back out and compare.

    The file is opened twice and read in lock-step: one handle is parsed
    with a ``GenBank.RecordParser`` and converted back to text with
    ``str()``; the other is iterated without a parser.  Each pair is passed
    to ``do_comparison``; a failed comparison is printed and the run goes on.

    The syntax used here is valid on Python 2.6+ as well as Python 3.

    Args:
        file: path of the GenBank file; a name containing '.gz' is opened
            through ``gzip``.
    """
    record_parser = GenBank.RecordParser(debug_level=2)

    print("Testing GenBank writing for %s..." % os.path.basename(file))
    # be able to handle gzipped files
    if '.gz' in file:
        cur_handle = gzip.open(file, "r")
        compare_handle = gzip.open(file, "r")
    else:
        cur_handle = open(file, "r")
        compare_handle = open(file, "r")

    # make sure the handles are released whatever happens in the loop
    try:
        iterator = GenBank.Iterator(cur_handle, record_parser)
        compare_iterator = GenBank.Iterator(compare_handle)

        while True:
            cur_record = next(iterator)
            compare_record = next(compare_iterator)

            # the iterators signal exhaustion with None
            if cur_record is None or compare_record is None:
                break

            # print("\tTesting for %s" % cur_record.version)

            output_record = str(cur_record) + "\n"
            try:
                do_comparison(compare_record, output_record)
            except AssertionError as msg:
                print("\tTesting for %s" % cur_record.version)
                print(msg)
    finally:
        cur_handle.close()
        compare_handle.close()
def setUp(self):
    """Prepare an empty "biosql-test" sub-database and a record iterator.

    Sets ``self.db`` to the newly created sub-database and ``self.iterator``
    to a feature-parsing iterator over GenBank/cor6_6.gb.
    """
    # create TESTDB
    create_database()

    # connect to the server that holds the test database
    namespace = "biosql-test"
    connection = BioSeqDatabase.open_database(
        driver=DBDRIVER,
        user=DBUSER,
        passwd=DBPASSWD,
        host=DBHOST,
        db=TESTDB,
    )

    # remove the database if it already exists
    try:
        connection[namespace]
        connection.remove_database(namespace)
    except KeyError:
        pass

    self.db = connection.new_database(namespace)

    # get the GenBank file we are going to put into it; the handle stays
    # open because the iterator is consumed later by the tests
    gb_path = os.path.join(os.getcwd(), "GenBank", "cor6_6.gb")
    self.iterator = GenBank.Iterator(open(gb_path, "r"),
                                     GenBank.FeatureParser())
def search(self):
    """Run the stored search term against PubMed and fill ``self.items``.

    Nothing happens unless ``self.database`` is 'PubMed'.  For every id
    returned by the search, the Medline record and the GenBank record are
    looked up and combined into a ``DBItem`` stored under that id.
    """
    if self.database != 'PubMed':
        return

    from Bio import PubMed
    from Bio import GenBank

    found_ids = PubMed.search_for(self.searchTerm, max_ids=self.maxResults)

    genbank_lookup = GenBank.NCBIDictionary(
        self.type, 'genbank', parser=GenBank.FeatureParser())

    from Bio import Medline

    medline_lookup = PubMed.Dictionary(
        delay=1.0, parser=Medline.RecordParser())

    for entry_id in found_ids:
        medline_record = medline_lookup[entry_id]
        genbank_record = genbank_lookup[entry_id]
        self.items[entry_id] = DBItem(
            self.project,
            seq=genbank_record.seq,
            descript=genbank_record.description,
            id=entry_id,
            record=medline_record,
        )
def t_write_format():
    """Round-trip every file in ``write_format_files`` and compare.

    Each file under the GenBank directory is read twice in lock-step: once
    through a ``RecordParser`` (the record is then turned back into text
    with ``str()``) and once without a parser.  Each pair is passed to
    ``do_comparison``, whose failures propagate to the caller.
    """
    record_parser = GenBank.RecordParser(debug_level=0)

    for file in write_format_files:
        print("Testing GenBank writing for %s..." % os.path.basename(file))
        gb_path = os.path.join("GenBank", file)

        # the with-statement closes both handles even when do_comparison
        # raises part-way through a file
        with open(gb_path, "r") as cur_handle, \
                open(gb_path, "r") as compare_handle:
            iterator = GenBank.Iterator(cur_handle, record_parser)
            compare_iterator = GenBank.Iterator(compare_handle)

            while True:
                cur_record = next(iterator)
                compare_record = next(compare_iterator)

                # the iterators signal exhaustion with None
                if cur_record is None or compare_record is None:
                    break

                print("\tTesting for %s" % cur_record.version)

                output_record = str(cur_record) + "\n"
                do_comparison(compare_record, output_record)
def fetch_refseq(path, strain_lst, species_to_search='Mycoplasma genitalium'):
    """
    Download NCBI RefSeq GenBank files for one species.

    Steps, all driven through shell commands (wget, gunzip, mv):
      1. fetch the bacterial RefSeq assembly_summary.txt,
      2. write a download link for every row whose column 7 contains
         species_to_search and whose column 11 contains 'Complete',
      3. download and unpack those files into <path>input_GenBank/,
      4. rename each unpacked file to <first accession>.gbk.

    path              -- output prefix; concatenated directly with file names
    strain_lst        -- not used anywhere in this function
    species_to_search -- substring looked for in column 7 of the summary
    """
    import os, sys, time, glob, csv
    from Bio import GenBank
    # NOTE(review): write_pickle is imported but not used in this function
    from sf_miscellaneous import write_pickle
    #species_to_search
    ## fetch the newest refseq assembly_summary file
    # NOTE(review): the command redirects to <path>assembly_summary.txt while
    # the open() below reads 'assembly_summary.txt' from the current
    # directory -- confirm both are meant to be the same file
    os.system('wget -c ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt > %sassembly_summary.txt'%path)
    with open('assembly_summary.txt','rb') as csvfile:
        outfile='downloadlink.txt'
        with open(path+outfile,'wb') as output:
            csv_reader = csv.reader(csvfile, delimiter='\t')
            # skip the first row; .next() is the Python 2 iterator protocol
            headers = csv_reader.next()
            for icsv_line in csv_reader:
                # species name and complete
                if species_to_search in icsv_line[7] and 'Complete' in icsv_line[11]:
                    #os.system('wget -c %s/%s'%(icsv_line[19],'*_genomic.gbff.gz -P ./Refseq/Mt'))
                    # column 19 is used as the base URL of the download link
                    output.write('%s/%s\n'%(icsv_line[19],'*_genomic.gbff.gz'))
    gbk_path='%sinput_GenBank/'%path
    command_download='wget -c --input %sdownloadlink.txt -P %s'%(path,gbk_path)
    os.system(command_download)
    command_gunzip='gunzip %s*.gz'%gbk_path
    os.system(command_gunzip)
    # rename every unpacked file after the first accession of its first record
    for each_gbk_path in glob.iglob('%s*gbff*'%gbk_path):
        with open(each_gbk_path) as gbk_file:
            for record in GenBank.parse(gbk_file):
                print(each_gbk_path,record.accession[0])
                # only the first record is needed
                break
            # NOTE(review): 'record' is unbound here if the file holds no
            # records; nesting of this line was reconstructed -- confirm
            os.system('mv %s %s%s.gbk'%(each_gbk_path, gbk_path, record.accession[0]))
    # disabled legacy workflow; never executed because the condition is 0
    if 0:
        os.chdir(path)
        species=glob.glob('*txt')[0].split('_list.')[0]
        os.system('rm *txt; rm *sh')
        os.system('gunzip *')
        # wait until no .gz files are left in the directory
        while len(glob.glob('*.gz'))!=0:
            time.sleep(5)
        # rename gbk file
        for each_gbk_path in glob.iglob('*gbff*'):
            with open(each_gbk_path) as handle:
                print handle
                for record in GenBank.parse(handle):
                    print(each_gbk_path,record.accession[0])
                    break
                os.system('mv %s %s'%(each_gbk_path, record.accession[0]))
        for each_gbk_path in glob.iglob('*'):
            os.system('mv %s %s.gbk'%(each_gbk_path, each_gbk_path))
            #os.system('mv %s %s'%(each_gbk_path, each_gbk_path.split('.')[0]))
        os.system('ls *gbk > %s-RefSeq.txt; sed -i -- "s/.gbk//g" *txt'%species)

        os.system('wc -l *txt ; ls *gbk |wc -l')
        path='../../pan-genome-analysis/'
        os.system('cp %srun-TestSet-template.sh %srun-%s.sh; sed -i -- "s/TestSet/%s/g" %srun-%s.sh'%(path,path,species,species,path,species))
        os.system('mv ../%s/ %sdata/'%(species,path))
def loadData(self, data, dbtype):
    """Load records read from *data* into the database.

    Args:
        data: handle passed to ``GenBank.Iterator``.
        dbtype: format of *data*; only "GenBank" is supported.

    Returns:
        "" when the load was committed, otherwise the formatted traceback
        of the failure (the transaction is rolled back in that case).

    Raises:
        ValueError: if *dbtype* is not "GenBank".
    """
    if dbtype != "GenBank":
        # a plain string cannot be raised; use a real exception type
        raise ValueError("Unknown dbtype: %r" % (dbtype,))

    # get the GenBank file we are going to put into it
    parser = GenBank.FeatureParser()
    iterator = GenBank.Iterator(data, parser)

    # finally put it in the database
    try:
        self.getDatabase().load(iterator)
    except Exception:
        # report the failure to the caller instead of propagating it;
        # KeyboardInterrupt/SystemExit are deliberately not intercepted
        self.getBioSQLRoot().getDBServer().adaptor.conn.rollback()
        return traceback.format_exc()

    self.getBioSQLRoot().getDBServer().adaptor.conn.commit()
    return ""
def from_genbank(cls, filepath: str):
    """Insert a row built from the GenBank file at *filepath*.

    Returns the result of ``cls.insert``.  Any exception raised while
    reading the file or inserting is logged as a warning, in which case the
    function returns None.
    """
    try:
        gb = GenBank.read(file=filepath)
        source = GenBank.get_source_data(gb)
        columns = dict(
            accession=gb.accession[0],
            organism=gb.organism,
            date_released=GenBank.format_date(gb.date),
            host=source.get('host'),
            date_collected=source.get('collection_date'),
            country=source.get('country'),
        )
        return cls.insert(**columns)
    except Exception as e:
        logging.warning(
            f"Error inserting {filepath} to {cls.__name__}: {e}")
def plot_unique_genome_diagram(gbk, unique_loci):
    """Build a circular genome diagram highlighting unique CDS features.

    Args:
        gbk: path of a single-record GenBank file; also used as the
            diagram name.
        unique_loci: container of locus tags; CDS features whose first
            'locus_tag' qualifier is in it get the highlight colour.

    Returns:
        The drawn ``GenomeDiagram.Diagram``.
    """
    parser = GenBank.FeatureParser()
    # the with-statement releases the handle even if parsing fails
    with open(gbk, 'r') as fhandle:
        genbank_entry = parser.parse(fhandle)

    gdd = GenomeDiagram.Diagram(gbk)
    gd_track_for_features = gdd.new_track(1, name="CDS",
                                          scale_smalltick_interval=100000)
    gdfs = gd_track_for_features.new_set()

    for feature in genbank_entry.features:
        if feature.type != 'CDS':
            continue
        # every CDS is put on the same strand of the diagram
        feature.strand = 1
        is_unique = feature.qualifiers['locus_tag'][0] in unique_loci
        hex_colour = "#93341F" if is_unique else "#058F45"
        gdfs.add_feature(feature, color=rcolors.HexColor(hex_colour))

    gdd.draw(format='circular', orientation='landscape', tracklines=0,
             pagesize='A5', fragments=5, circular=1)
    return gdd
def pLonk(plasmids):
    """Read the sequence length of every plasmid.

    Args:
        plasmids: iterable of (pName, seq_infile, offset, order) tuples.

    Returns:
        (pLenks, pLen_MAX, pLasmids) where pLenks is the list of lengths
        sorted in descending order, pLen_MAX is the largest length (0 when
        no plasmid was read) and pLasmids is a list of
        (pName, seq_infile, length, offset) tuples with integer length and
        offset.

    Processing stops at the first file whose format is not recognized;
    plasmids read before that point are still returned.
    """
    pLenks = []
    pLasmids = []
    for (pName, seq_infile, offset, order) in plasmids:
        # load plasmid sequence file; the with-statement closes it on every
        # exit path, including the early break below
        with open(seq_infile, 'r') as fhandle:
            # evaluate file name to detect format using Quixote [ filename ]
            seq_format = Quixote(seq_infile)
            if seq_format == 'genbank':
                parser = GenBank.FeatureParser()
                gb_entry = parser.parse(fhandle)
                pLen = len(gb_entry.seq)  # read in length of plasmid sequence
                print("%s %s" % (pName, pLen))
            elif seq_format == 'fasta' or seq_format == 'seq':
                # start from 0 so that an empty file cannot reuse the
                # length of the previous plasmid
                pLen = 0
                for fa_entry in SeqIO.parse(fhandle, "fasta"):
                    pLen = len(fa_entry.seq)  # length of the last record
            else:
                print("TERMINAL ERROR : file format not recognized for "
                      + pName + " !!!")
                break
        pLenks.append(pLen)
        pLasmids.append((pName, seq_infile, int(pLen), int(offset)))

    pLenks.sort(reverse=True)
    # nothing read at all (empty input or first file rejected)
    pLen_MAX = pLenks[0] if pLenks else 0
    return pLenks, pLen_MAX, pLasmids
def main():
    """Collect part-of-speech tagged tokens from all records and summarise.

    Literals from the RDF files in ``shb_records`` (restricted to
    predicates in ``predicate_whitelist``) and from the graphs generated
    for the GenBank files in ``gbk_records`` are passed to
    ``mine_sentences``.  The tokens are grouped by tag, de-duplicated,
    printed and appended to summary.txt.
    """
    sorted_pos_tagged = {}
    db_handler = DatabaseHandler()

    def _collect(literal):
        """File every mined (token, tag) pair of *literal* under its tag."""
        for tagged in mine_sentences(literal):
            sorted_pos_tagged.setdefault(tagged[1], []).append(tagged[0])

    for entry in os.scandir(shb_records):
        graph = rdflib.Graph()
        graph.load(entry.path)
        for _subject, predicate, obj in graph:
            if predicate in predicate_whitelist and isinstance(obj, rdflib.Literal):
                _collect(obj)

    for entry in os.scandir(gbk_records):
        with open(entry) as handle:
            for record in GenBank.parse(handle):
                graph = db_handler._db_util.db_mapping_calls[
                    "genbank"].generalise_get_results(record)
                for _subject, _predicate, obj in graph:
                    if isinstance(obj, rdflib.Literal):
                        _collect(obj)

    # the with-statement makes sure the summary is flushed and closed
    with open("summary.txt", "a+") as summary:
        for tag, tokens in sorted_pos_tagged.items():
            unique_tokens = list(set(tokens))
            print(tag, unique_tokens)
            print("\n")
            summary.write(f'{tag} - {"-".join(unique_tokens)}\n')
def t_cleaning_features():
    """Test the ability to clean up feature values.

    Parses the first record of GenBank/arab1.gb with a
    ``FeatureValueCleaner`` attached and checks that the translation
    qualifier of its second feature contains neither spaces nor newlines.
    """
    gb_parser = GenBank.FeatureParser(
        feature_cleaner=utils.FeatureValueCleaner())
    # the with-statement closes the file even when an assertion fails
    with open(os.path.join("GenBank", "arab1.gb")) as handle:
        iterator = GenBank.Iterator(handle, gb_parser)

        first_record = next(iterator)

        # test for cleaning of translation
        translation_feature = first_record.features[1]
        test_trans = translation_feature.qualifiers["translation"][0]
        assert " " not in test_trans, \
            "Did not clean spaces out of the translation"
        assert "\012" not in test_trans, \
            "Did not clean newlines out of the translation"
def test_topology_genbank(self):
    """Check GenBank LOCUS line parsing."""
    # Deliberately low level: only the LOCUS line is fed to the scanner.
    # Each case is (LOCUS line, topology, molecule type, division).
    tests = [
        ("LOCUS U00096", None, None, None),
        # This example is actually fungal, accession U49845 from Saccharomyces cerevisiae:
        ("LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999", None, "DNA", "PLN"),
        ("LOCUS AB070938 6497 bp DNA linear BCT 11-OCT-2001", "linear", "DNA", "BCT"),
        ("LOCUS NC_005816 9609 bp DNA circular BCT 21-JUL-2008", "circular", "DNA", "BCT"),
        ("LOCUS SCX3_BUTOC 64 aa linear INV 16-OCT-2001", "linear", None, "INV"),
    ]
    for (line, topo, mol_type, div) in tests:
        scanner = Scanner.GenBankScanner()
        consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
        scanner._feed_first_line(consumer, line)
        annotations = consumer.data.annotations
        # (annotation key, expected value, failure message template)
        expectations = (
            ('topology', topo, "Wrong topology %r not %r from %r"),
            ('molecule_type', mol_type, "Wrong molecule_type %r not %r from %r"),
            ('data_file_division', div, "Wrong division %r not %r from %r"),
        )
        for key, wanted, template in expectations:
            found = annotations.get(key, None)
            self.assertEqual(found, wanted, template % (found, wanted, line))
def load_genbank(seqfile):
    """Load single-record GenBank file.

    Args:
        seqfile: path of the GenBank file.

    Returns:
        The record produced by ``GenBank.FeatureParser().parse``.
    """
    parser = GenBank.FeatureParser()
    # 'rU' is rejected by Python 3.11+; the with-statement also closes the
    # handle when parsing raises
    with open(seqfile, 'r') as input_handle:
        gb_record = parser.parse(input_handle)
    return gb_record
def test_topology_genbank(self):
    """Check GenBank LOCUS line parsing."""
    # Deliberately low level: only the LOCUS line is fed to the scanner.
    # Each case is (LOCUS line, topology, molecule type, division).
    tests = [
        ("LOCUS U00096", None, None, None),
        # This example is actually fungal, accession U49845 from Saccharomyces cerevisiae:
        ("LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999", None, "DNA", "PLN"),
        ("LOCUS AB070938 6497 bp DNA linear BCT 11-OCT-2001", "linear", "DNA", "BCT"),
        ("LOCUS NC_005816 9609 bp DNA circular BCT 21-JUL-2008", "circular", "DNA", "BCT"),
        ("LOCUS SCX3_BUTOC 64 aa linear INV 16-OCT-2001", "linear", None, "INV"),
    ]
    for (line, topo, mol_type, div) in tests:
        scanner = Scanner.GenBankScanner()
        consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
        scanner._feed_first_line(consumer, line)
        parsed = consumer.data.annotations

        seen_topology = parsed.get('topology', None)
        self.assertEqual(
            seen_topology, topo,
            "Wrong topology %r not %r from %r" % (seen_topology, topo, line))

        seen_molecule = parsed.get('molecule_type', None)
        self.assertEqual(
            seen_molecule, mol_type,
            "Wrong molecule_type %r not %r from %r" % (seen_molecule, mol_type, line))

        seen_division = parsed.get('data_file_division', None)
        self.assertEqual(
            seen_division, div,
            "Wrong division %r not %r from %r" % (seen_division, div, line))
def t_cleaning_features():
    """Test the ability to clean up feature values.

    Parses the first record of GenBank/arab1.gb with a
    ``FeatureValueCleaner`` attached and checks that the translation
    qualifier of its second feature contains neither spaces nor newlines.
    """
    parser = GenBank.FeatureParser(
        feature_cleaner=utils.FeatureValueCleaner())
    # the with-statement closes the file, which was previously left open
    with open(os.path.join("GenBank", "arab1.gb")) as handle:
        iterator = GenBank.Iterator(handle, parser)

        # next() works with both the Python 2 and Python 3 iterator protocol
        first_record = next(iterator)

        # test for cleaning of translation
        translation_feature = first_record.features[1]
        test_trans = translation_feature.qualifiers["translation"][0]
        assert " " not in test_trans, \
            "Did not clean spaces out of the translation"
        assert "\012" not in test_trans, \
            "Did not clean newlines out of the translation"
def test_genbank_bad_loc_wrap_parsing(self):
    """Read bad_loc_wrap.gb with parser warnings silenced and check its location."""
    gb_path = path.join("GenBank", "bad_loc_wrap.gb")
    expected_location = (
        "join(3462..3615,3698..3978,4077..4307,4408..4797,4876..5028,5141..5332)"
    )
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", BiopythonParserWarning)
        with open(gb_path) as handle:
            record = GenBank.read(handle)
        features = record.features
        self.assertEqual(1, len(features))
        self.assertEqual(features[0].location, expected_location)
def __processGffFilesNotNew(self, changed):
    """Re-process GFF files that changed but are already known.

    For every file the GFF is rewritten, loaded into a SQLite database for
    GBrowse, the GBrowse entry is updated with the organism name read from
    the GenBank file sitting next to the GFF, and the GFF is bulk-loaded
    into the default database.

    Args:
        changed: iterable of paths to the GFF files.
    """
    for gff in changed:
        default_db = settings.DATABASES['default']
        loc = os.path.dirname(gff)
        gff_stem = os.path.splitext(gff)[0]
        dbName = os.path.splitext(os.path.basename(gff))[0] + '.db'
        dbName = os.path.join(loc, dbName)

        # NOTE(review): genbank_id is not defined inside this function; it
        # must come from an enclosing scope -- confirm
        gffRewriter = GFFRewriter(filename=gff,
                                  outfile=gff + ".sorted.prepared",
                                  accession=genbank_id)

        gffRewriter.addUnknownCvTerms({
            'user': default_db['USER'],
            'password': default_db['PASSWORD'],
            'db': default_db['NAME'],
        })

        gffRewriter.addColor({
            'user': default_db['USER'],
            'password': default_db['PASSWORD'],
            'db': 'go',
        })

        # NOTE(review): the returned error was never inspected; the call is
        # kept in case it has side effects
        gffRewriter.getError()

        # run the sqlite database loader to be able to add it to GBrowse
        # since the name should be preserved, no changes need to be made
        # to the GBrowse configuration file
        args = ['-a', 'DBI::SQLite', '-c', '-f', '-d', dbName, gff]
        runProgram('bp_seqfeature_load.pl', args)

        parser = GenBank.RecordParser()
        # the GenBank file shares the GFF's name with a .gbk extension;
        # os.path.join() would have produced "<stem>/.gbk" instead
        gbk = gff_stem + '.gbk'
        with open(gbk) as gbk_handle:
            record = parser.parse(gbk_handle)

        organismName = record.organism
        organismDir = os.path.basename(loc)

        GenomeDBUtil.editGBrowseEntry(gff, dbName, organismDir, organismName)

        # now edit the record in Chado
        args = [
            '--organism', organismName,
            "--gfffile", gff,
            "--dbname", default_db['NAME'],
            "--dbuser", default_db['USER'],
            "--dbpass", default_db['PASSWORD'],
            "--random_tmp_dir",
        ]
        runProgram('gmod_bulk_load_gff3.pl', args)
def main():
    """Split the records of INFILE into one FASTA file per publication.

    Records are grouped by the PubMed id of their first reference; records
    whose id is the empty string are left out.  Each group is written with
    ``qfas`` to OUTDIR/<pubmed id>.fasta.
    """
    try:
        os.mkdir(OUTDIR)
    except OSError:
        print('Using existing directory %s' % OUTDIR)

    with open(INFILE) as handle:
        records = list(GenBank.parse(handle))

    # group in a single pass instead of rescanning all records for every
    # publication
    seqdict = {}
    for r in records:
        pubmed_id = r.references[0].pubmed_id
        if pubmed_id == '':
            continue
        seqdict.setdefault(pubmed_id, []).append(r)

    for pub, pub_records in seqdict.items():
        qfas(pub_records, OUTDIR + '/' + pub + '.fasta')
def test_000_genbank_bad_loc_wrap_warning(self):
    """Reading bad_loc_wrap.gb must raise the wrapping warning as an error."""
    expected_message = "Non-standard feature line wrapping (didn't break on comma)?"
    gb_path = path.join("GenBank", "bad_loc_wrap.gb")
    with warnings.catch_warnings():
        # escalate the parser warning so that it can be caught below
        warnings.simplefilter("error", BiopythonParserWarning)
        with open(gb_path) as handle:
            try:
                GenBank.read(handle)
            except BiopythonParserWarning as e:
                self.assertEqual(str(e), expected_message)
            else:
                self.assertTrue(False, "Expected specified BiopythonParserWarning here.")
def load_samples(sequences):
    """Load protein sequences from data/genbank_sequences.gb into *sequences*.

    For every non-partial record a sample id is built from the record's
    'source' feature: "<country prefix>-<YYYY-MM-DD>-<accession>", where the
    country and date parts are only present when the corresponding
    qualifier exists.  Records without a 'source' feature get no id and
    contribute nothing.

    For every CDS feature the gene name is taken from the product
    qualifier or, failing that, from the gene qualifier (both mapped
    through ``geneAlias``); genes not in ``validList`` are skipped.
    ORF1AB translations go through ``loadOrf1AB``; all others are appended
    to ``sequences[gene_name]`` as "<id>|<translation>".

    Args:
        sequences: mapping from gene name to a list, updated in place.
    """
    with open("data/genbank_sequences.gb") as handle:
        # Use biopython to parse the GenBank records
        for record in GenBank.parse(handle):
            # skip partial sequence records
            if 'partial' in record.definition:
                continue

            # For now id for a sample will include a truncated version of
            # the country of origin, date collected and the accession
            # number of the record.
            accession = record.accession[0]
            # reset per record so that a record without a source feature
            # cannot inherit the id of the previous one
            sample_id = None
            source = findFeature(record, 'source')
            if source is not None:
                country = findItem(source, '/country=')
                col_date = findItem(source, '/collection_date=')
                sample_id = accession
                if col_date is not None:
                    # Time formatting is not consistent
                    dt = dateutil.parser.parse(col_date)
                    norm_date = dt.strftime(r'%Y-%m-%d')
                    sample_id = norm_date + '-' + sample_id
                if country is not None:
                    country = country.replace(':', ' ')
                    sample_id = country.split()[0][:7].strip() + '-' + sample_id

            # For each CDS record
            genes = findAllFeature(record, 'CDS')
            for gene in genes:
                # First figure out the gene / protein name. Try both the
                # product tag and gene tag
                # NOTE: We are not interested in the post translation non
                # structural protein products. We just process the orf1ab
                # gene that has all of them embedded in it.

                # reset per gene so that a CDS without a product cannot
                # reuse (or fail on) the name of the previous CDS
                gene_name = None
                product = findItem(gene, '/product=')
                if product is not None:
                    gene_name = product.split()[0].upper()
                    # A few proteins have aliases, map them to the standard
                    # form.
                    if gene_name in geneAlias:
                        gene_name = geneAlias[gene_name]

                if gene_name is None or gene_name not in validList:
                    gene_name = findItem(gene, '/gene=')
                    if gene_name is not None and gene_name in geneAlias:
                        gene_name = geneAlias[gene_name]
                    if gene_name not in validList:
                        continue

                sequence = findItem(gene, '/translation=')
                if sample_id is not None and sequence is not None:
                    if gene_name == 'ORF1AB':
                        loadOrf1AB(sequences, sample_id, sequence)
                    else:
                        sequences[gene_name].append(sample_id + '|' + sequence)
def parse(path='./flat_files/'):
    """Read every file in *path* as a single GenBank record.

    Files that cannot be read are reported on stdout and skipped.

    Args:
        path: directory containing the flat files.

    Returns:
        List of the records that were read successfully.
    """
    path = Path(path)
    print("parsing records at {}".format(path.absolute()))

    records = []
    for p in path.listdir():
        try:
            # close each file as soon as it has been read
            with open(p) as handle:
                records.append(GenBank.read(handle))
        except Exception:
            # best effort: skip unreadable files but let
            # KeyboardInterrupt/SystemExit through
            print('error with file %s' % (p,))

    print("parsed %s records.." % len(records))
    return records
def fetch(id: str):
    """Fetch a record from the 'nuccore' database and store it as a .gb file.

    The response text is parsed to obtain the accession and version.  The
    output file is named "<accession>[.<config.taxon>].gb" inside
    FileSystem.dir['genbank'].  If a stored record with the same version
    exists and its file is present, nothing is written; otherwise the
    response is saved and registered through GenBank.add_file.

    When the request fails a warning is printed and None is returned.
    """
    logging.info(f"Fetching GenBank id={id}")
    r = eutils.fetch(db='nuccore', id=id, rettype='gb')
    if r.ok:
        gb = GenBank.read(string=r.text)
        accession = gb.accession[0]
        # the taxon suffix is only added when config.taxon is truthy
        filename = f"{accession}{'.' + config.taxon if config.taxon else ''}.gb"
        file_out = os.path.join(FileSystem.dir['genbank'], filename)
        # look for an already stored record with the same version
        record = GenBank.query.filter_by(version=gb.version).first()
        if record and os.path.exists(record.filepath):
            logging.info(
                f"{accession} already exists as file='{record.filepath}' and GenBank.id={record.id}"
            )
        else:
            with open(file_out, 'w') as fh:
                fh.write(r.text)
            GenBank.add_file(file_out)
            print(f"[INFO] Fetched file: {file_out}")
            # NOTE(review): the nesting of this return (and of the add_file
            # call above) was reconstructed from flattened source -- confirm
            # whether the path should also be returned when the record
            # already exists
            return file_out
    else:
        print(f"[WARN] Could not download '{id}'")
def BaseDraw(plasmids):
    """Draw the baseline, label and annotation features of every plasmid.

    Args:
        plasmids: iterable of (pName, seq_infile, pLen, offset) tuples.

    Drawing goes to the module-level ``canvas_main`` through the helper
    functions; progress is reported on stdout.  Features are only drawn for
    GenBank-format files.
    """
    ordN = 0

    for (pName, seq_infile, pLen, offset) in plasmids:
        # set Y axis once and for all for the plasmid being processed
        y0 = (pNs - ordN) * dBL  # starts from the top
        pLeni = int(pLen)
        print('offset %s' % (offset,))
        offset = int(offset)

        # draw plasmid baseline
        BaseL(ordN, pName, pLeni, y0, canvas_main)

        # label the baseline with plasmid name and size
        LabeL(ordN, pName, pLeni, y0, canvas_main)

        # evaluate file name to detect format using Quixote [filename]
        seq_format = Quixote(seq_infile)

        # mark up sequence origin if there is an offset
        if offset < -1 or offset > 1:
            Zs, _direction = Off7(1, pLeni, offset)
            xs = Zs * u
            canvas_main.setFont(bFont, NfSize)
            canvas_main.drawString(xs, y0 + da / 2, osym)

        # reset for every plasmid so that the summary line below never
        # reports a missing or stale count for files without features
        ORFcnt = 0

        # filter and draw annotation features
        if seq_format == 'genbank':
            # load GB file to filter features
            parser = GenBank.FeatureParser()
            # the with-statement closes the file even if drawing fails
            with open(seq_infile, 'r') as fhandle:
                gb_entry = parser.parse(fhandle)
                for feature in gb_entry.features:
                    if feature.type == 'CDS' or feature.type == 'cds':
                        # draw CDS using ORFeus
                        ORFcnt += 1
                        ORFeus(feature, pLeni, offset, y0)
                    elif SFX == 'on':
                        if feature.type == 'SNP':
                            # draw asterisk at feature location
                            Snippit(feature, pLeni, offset, y0)
                        if feature.type == 'IR':
                            # draw flag at feature location
                            IRFlag(feature, pLeni, offset, y0)
                        # need other functions for other features
                        # (with conditional, default switch off)
            print(" got a GenBank-style file for " + pName + " with "
                  + str(ORFcnt) + " ORFs")
        else:
            # no features so just skip this step
            print(" got a non-genbank-style file for " + pName
                  + "; no features to draw")

        # increment plasmid ordinal count
        ordN = ordN + 1
        print(" " + pName + " (" + str(pLeni) + " bp) drawn with "
              + str(ORFcnt) + " ORFs")

    print(" OK")
def load_database(gb_handle):
    """Load a GenBank file into a BioSQL database.

    This is useful for running tests against a newly created database.

    Args:
        gb_handle: handle of the GenBank file to load.

    The records are committed on success; the connection is closed in
    every case.
    """
    create_database()
    # now open a connection to load the database
    db_name = "biosql-test"
    server = BioSeqDatabase.open_database(driver=DBDRIVER,
                                          user=DBUSER, passwd=DBPASSWD,
                                          host=DBHOST, db=TESTDB)
    try:
        db = server.new_database(db_name)

        # get the GenBank file we are going to put into it
        parser = GenBank.FeatureParser()
        iterator = GenBank.Iterator(gb_handle, parser)
        # finally put it in the database
        db.load(iterator)
        server.adaptor.conn.commit()
    finally:
        # do not leave the connection open when loading fails
        server.adaptor.conn.close()
def fetch_gb(id, taxon=None):
    """Download a record from 'nuccore' and save it as a .gb file.

    The file is named "<locus>[.<taxon>].gb" inside ``config.genbank_dir``.
    An existing file is left untouched.  Outcomes are reported on stdout;
    nothing is returned.
    """
    r = eutils.fetch(db='nuccore', id=id, rettype='gb')
    if not r.ok:
        print(f"[WARN] Could not download '{id}'")
        return

    gb = GenBank.read(StringIO(r.text))
    filename = f"{gb.locus}{'.' + taxon if taxon else ''}.gb"
    file_out = os.path.join(config.genbank_dir, filename)

    if os.path.exists(file_out):
        print(f"[WARN] {file_out} already exists")
        return

    with open(file_out, 'w') as fh:
        fh.write(r.text)
    print(f"[INFO] Fetched file: {file_out}")
def _get_organella(self, gb_file):
    """
    Retrieve the organelle from the genbank file, using the specific
    GenBank object, because SeqIO does not support this field.

    Only the qualifiers of the first feature of each record are inspected.

    Returns a dict mapping record version to organelle name (double quotes
    stripped); records without an organelle qualifier, or without any
    feature, are left out.
    """
    organella = {}
    with open(gb_file, "r") as gbh:
        for record in GenBank.parse(gbh):
            if not record.features:
                # nothing to inspect for this record
                continue
            for q in record.features[0].qualifiers:
                if q.key == "/organelle=":
                    # stored only when found, so a record can never pick up
                    # the organelle of a previous record
                    organella[record.version] = q.value.replace('"', '')
    return organella
def parse_genebank_file(self):
    """Read ``self.gene_bank_file`` and record the accession in ``self.Acc``.

    For every record the features, accessions and gi are printed and
    ``self.Acc`` is set to the first accession.  ``self.GI`` and
    ``self.Acc`` are replaced by "NA" when they are None or empty.
    """
    # plain "r": the "U" flag is rejected by Python 3.11+
    with open(self.gene_bank_file, "r") as input_handle:
        for record in GenBank.parse(input_handle):
            #print("Name: %s, %i" % (record.name, len(record.features)))
            print(record.features)
            print(record.accession)
            print("----")
            print(record.gi)
            print("----")
            self.Acc = record.accession[0]
            # NOTE(review): self.GI is never filled from record.gi here --
            # confirm that it is set elsewhere
            if self.GI is None or len(self.GI) == 0:
                self.GI = "NA"
            if self.Acc is None or len(self.Acc) == 0:
                self.Acc = "NA"
def genbank_single(filename):
    """
    Parse the GenBank file *filename* and return its record.

    >>> record = genbank_single("GFF/NC_001422.gbk")
    >>> record.taxonomy
    ['Viruses', 'ssDNA viruses', 'Microviridae', 'Microvirus']
    >>> cds = record.features[-4]
    >>> cds.key
    'CDS'
    >>> location = LocationFromString(cds.location)
    >>> print location
    2931..3917
    >>> subseq = record_subseq(record, location)
    >>> subseq[0:20]
    Seq('ATGTTTGGTGCTATTGCTGG', Alphabet())
    """
    parser = GenBank.RecordParser()
    # close the file once the record has been parsed
    with open(filename) as handle:
        return parser.parse(handle)
def add_file(cls, filepath: str):
    """Insert a row describing the GenBank file at *filepath*.

    Returns the result of ``cls.insert``.  Any exception is logged at
    debug level, in which case the function returns None.
    """
    # TODO(seanbeagle): Create scraping tool for genbank data similar to
    # get_source_data()
    try:
        gb = GenBank.read(file=filepath)
        accession = gb.accession[0]
        logging.debug(f"Adding {accession} to {cls}...")
        columns = dict(
            accession=accession,
            version=gb.version,
            filepath=filepath,  # TODO: Ensure this is absolute filepath
            date_downloaded=now(),
            downloaded_by=getpass.getuser(),
            num_features=len(gb.features),
            length=len(gb),
        )
        return cls.insert(**columns)
    except Exception as e:
        logging.debug(f"Could not insert GenBank record: {e}")
def test_topology_genbank(self):
    """Check GenBank topology parsing."""
    # Deliberately low level: only the first line is fed to the scanner.
    # Each case is (LOCUS line, expected topology).
    tests = [
        ("LOCUS U00096", None),
        ("LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999", None),
        ("LOCUS AB070938 6497 bp DNA linear BCT 11-OCT-2001", "linear"),
        ("LOCUS NC_005816 9609 bp DNA circular BCT 21-JUL-2008", "circular"),
        ("LOCUS SCX3_BUTOC 64 aa linear INV 16-OCT-2001", "linear"),
    ]
    for (line, topo) in tests:
        scanner = Scanner.GenBankScanner()
        consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
        scanner._feed_first_line(consumer, line)
        found = consumer.data.annotations.get('topology', None)
        failure = "Wrong topology %r not %r from %r" % (found, topo, line)
        self.assertEqual(found, topo, failure)
def test_topology_embl(self):
    """Check EMBL topology parsing."""
    # Deliberately low level: only the ID line is fed to the scanner.
    # Each case is (ID line, expected topology).
    tests = [
        ("ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.", "linear"),
        ("ID CD789012; SV 4; linear; genomic DNA; HTG; MAM; 500 BP.", "linear"),
        ("ID BSUB9999 standard; circular DNA; PRO; 4214630 BP.", "circular"),
        ("ID SC10H5 standard; DNA; PRO; 4870 BP.", None),
        ("ID NRP_AX000635; PRT; NR1; 15 SQ", None),
        ("ID NRP0000016E; PRT; NR2; 5 SQ", None),
    ]
    for (line, topo) in tests:
        scanner = Scanner.EmblScanner()
        consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
        scanner._feed_first_line(consumer, line)
        found = consumer.data.annotations.get('topology', None)
        failure = "Wrong topology %r not %r from %r" % (found, topo, line)
        self.assertEqual(found, topo, failure)
def get_record_list():
    """
    Read every record of INPUT_PATH into a dictionary holding its locus
    and a list of its features, then hand the list of dictionaries to
    format_record_list and return the result.
    """
    with open(INPUT_PATH) as handle:
        record_list = [
            {"locus": gbk_record.locus, "features": list(gbk_record.features)}
            for gbk_record in GenBank.parse(handle)
        ]
    return format_record_list(record_list)
def test_topology_genbank(self):
    """Check GenBank LOCUS line parsing."""
    # Deliberately low level: only the LOCUS line is fed to the scanner.
    # Each case is (LOCUS line, topology, molecule type, division,
    # expected warning classes or None for "no warning").
    tests = [
        ("LOCUS U00096", None, None, None, None),
        # This example is actually fungal, accession U49845 from Saccharomyces cerevisiae:
        ("LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999", None, "DNA", "PLN", None),
        ("LOCUS AB070938 6497 bp DNA linear BCT 11-OCT-2001", "linear", "DNA", "BCT", None),
        ("LOCUS NC_005816 9609 bp DNA circular BCT 21-JUL-2008", "circular", "DNA", "BCT", None),
        ("LOCUS SCX3_BUTOC 64 aa linear INV 16-OCT-2001", "linear", None, "INV", None),
        ("LOCUS pEH010 5743 bp DNA circular", "circular", "DNA", None, [BiopythonParserWarning]),
        # This is a test of the format > 80 chars long
        ("LOCUS AZZZAA02123456789 1000000000 bp DNA linear PRI 15-OCT-2018", "linear", "DNA", "PRI", None)
    ]
    for (line, topo, mol_type, div, warning_list) in tests:
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            scanner = Scanner.GenBankScanner()
            consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
            scanner._feed_first_line(consumer, line)
            annotations = consumer.data.annotations
            # (annotation key, expected value, failure message template)
            expectations = (
                ('topology', topo, "Wrong topology %r not %r from %r"),
                ('molecule_type', mol_type, "Wrong molecule_type %r not %r from %r"),
                ('data_file_division', div, "Wrong division %r not %r from %r"),
            )
            for key, wanted, template in expectations:
                found = annotations.get(key, None)
                self.assertEqual(found, wanted, template % (found, wanted, line))

            # None means that no warning at all is expected
            expected_warnings = [] if warning_list is None else warning_list
            self.assertEqual(len(caught), len(expected_warnings))
            for raised, warning_class in zip(caught, expected_warnings):
                self.assertEqual(raised.category, warning_class)
def test_topology_embl(self):
    """Check EMBL ID line parsing."""
    # Deliberately low level: only the ID line is fed to the scanner.
    # Each case is (ID line, topology, molecule type, division).
    tests = [
        # Modern examples with sequence version
        ("ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.", "linear", "mRNA", "PLN"),
        ("ID CD789012; SV 4; linear; genomic DNA; HTG; MAM; 500 BP.", "linear", "genomic DNA", "MAM"),
        # Example to match GenBank example used above:
        ("ID U49845; SV 1; linear; genomic DNA; STD; FUN; 5028 BP.", "linear", "genomic DNA", "FUN"),
        # Old examples:
        ("ID BSUB9999 standard; circular DNA; PRO; 4214630 BP.", "circular", "DNA", "PRO"),
        ("ID SC10H5 standard; DNA; PRO; 4870 BP.", None, "DNA", "PRO"),
        # Patent example from 2016-06-10
        # ftp://ftp.ebi.ac.uk/pub/databases/embl/patent/
        ("ID A01679; SV 1; linear; unassigned DNA; PAT; MUS; 12 BP.", "linear", "unassigned DNA", "MUS"),
        # Old patent examples
        ("ID NRP_AX000635; PRT; NR1; 15 SQ", None, None, "NR1"),
        ("ID NRP0000016E; PRT; NR2; 5 SQ", None, None, "NR2"),
        # KIPO patent examples
        ("ID DI500001 STANDARD; PRT; 111 AA.", None, None, None),
        ("ID DI644510 standard; PRT; 1852 AA.", None, None, None),
    ]
    for (line, topo, mol_type, div) in tests:
        scanner = Scanner.EmblScanner()
        consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
        scanner._feed_first_line(consumer, line)
        annotations = consumer.data.annotations
        # (annotation key, expected value, failure message template)
        expectations = (
            ('topology', topo, "Wrong topology %r not %r from %r"),
            ('molecule_type', mol_type, "Wrong molecule_type %r not %r from %r"),
            ('data_file_division', div, "Wrong division %r not %r from %r"),
        )
        for key, wanted, template in expectations:
            found = annotations.get(key, None)
            self.assertEqual(found, wanted, template % (found, wanted, line))
def search(self, start, end):
    """Look up a range of accessions and write the records to stdout.

    Args:
        start: 1-based position of the first accession in ``self.accs``;
            values below 1 are treated as 1.
        end: 1-based position of the last accession; 0 or a value past the
            end of the list means "up to the last one".

    For each accession the protein database is searched and every hit is
    fetched and written as ">accession" followed by the record.  Progress
    and fetch failures go to stderr; a failure does not stop the run.
    """
    ncbi_dict = GenBank.NCBIDictionary()
    if start < 1:
        start = 1
    if end > len(self.accs) or end == 0:
        end = len(self.accs)

    for count, k in enumerate(range(start - 1, end), 1):
        accession = self.accs[k]
        sys.stderr.write("No " + repr(count) + ": " + accession + '\n')
        gi_list = GenBank.search_for(accession, database='protein')
        for gi in gi_list:
            try:
                gb_record = ncbi_dict[gi]
                sys.stdout.write('>' + accession + '\n')
                sys.stdout.write(gb_record)
            except Exception:
                # best effort per record; KeyboardInterrupt/SystemExit are
                # no longer swallowed
                sys.stderr.write(accession + " fetching error \n")
def test_first_line_imgt(self):
    """Check IMGT ID line parsing."""
    # Deliberately low level: only the ID line is fed to the scanner.
    # Each case is (ID line, topology, molecule type, division).
    tests = [
        ("ID HLA00001 standard; DNA; HUM; 3503 BP.", None, "DNA", "HUM"),
        ("ID HLA00001; SV 1; standard; DNA; HUM; 3503 BP.", None, "DNA", "HUM"),
    ]
    for (line, topo, mol_type, div) in tests:
        scanner = Scanner._ImgtScanner()
        consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
        scanner._feed_first_line(consumer, line)
        annotations = consumer.data.annotations
        # (annotation key, expected value, failure message template)
        expectations = (
            ('topology', topo, "Wrong topology %r not %r from %r"),
            ('molecule_type', mol_type, "Wrong molecule_type %r not %r from %r"),
            ('data_file_division', div, "Wrong division %r not %r from %r"),
        )
        for key, wanted, template in expectations:
            found = annotations.get(key, None)
            self.assertEqual(found, wanted, template % (found, wanted, line))
class RecordReceiver:
    """Functor that writes every received record to a handle."""

    def __init__(self, handle):
        self.handle = handle

    def __call__(self, id, rec):
        # the id is part of the callback signature but is not written out
        self.handle.write(rec)


# Functor that deals with bad records - prints an error message to HANDLE
class BadRecordReceiver:
    """Functor that remembers bad ids and reports each one on a handle."""

    def __init__(self, handle):
        self.handle = handle
        # every id reported so far, in the order received
        self.badIDs = []

    def __call__(self, badID):
        self.badIDs.append(badID)
        self.handle.write("Bad ID: %s\n" % badID)


# Form pattern for accession strings: the prefix followed by the number
# zero-padded to `digits` places
accessionPat = prefix + "%%0%dd" % digits

batchSize = 500
# range() instead of xrange() so that the loop also runs on Python 3
for curr in range(start, end + 1, batchSize):
    # Generate accession strings for this batch
    ids = [accessionPat % num
           for num in range(curr, min(curr + batchSize, end + 1))]
    GenBank.download_many(
        ids,
        RecordReceiver(sys.stdout),
        database="nucleotide",
        broken_fn=BadRecordReceiver(sys.stderr),
        faildelay=5.0
    )
body = '\n'.join(textwrap.wrap(rec.seq.data, width=80)) return head, body if __name__ == '__main__': mode = sys.argv[1] text = sys.argv[2] output_file = sys.argv[3] print 'Searching for %s <br>' % text # check if inputs are all numbers try: gi_list = text.split() tmp = map(int, gi_list) except ValueError: gi_list = GenBank.search_for(text, max_ids=10) fp = open(output_file, 'wt') record_parser = GenBank.FeatureParser() ncbi_dict = GenBank.NCBIDictionary(mode, 'genbank', parser = record_parser) for gid in gi_list: res = ncbi_dict[gid] head, body = make_fasta(res) fp.write(head+body+'\n') print head fp.close()
def test_genbank_read(self):
    """Reading NC_000932.gb must yield a record with that single accession."""
    gb_path = path.join("GenBank", "NC_000932.gb")
    with open(gb_path) as handle:
        record = GenBank.read(handle)
    expected_accessions = ['NC_000932']
    self.assertEqual(expected_accessions, record.accession)