def _retrieveSequences(self, speciesList): sequenceLists = {} if os.path.exists(self.options.database): # local database: db = Native.DB(self.options.database, self.options) for species in speciesList: for seqID in db.index[species]: homologue, retrievalStat = db.get(seqID) sequenceLists.setdefault(species, []).append(Fasta.Record(homologue.gi, homologue.sequence)) else: # genbank for species in speciesList: Entrez.email = self.options.email handle = Entrez.esearch(db="nucleotide", retmax=10, term="%s[ORGN]" % species) # handle = Entrez.esearch(db="nucleotide", retmax=10, term="%s[ORGN] AND barcode[keyword]" % species) record = Entrez.read(handle) if record["Count"] < 5: print "WARNING: only %d sequences representing %s" % (record["Count"], species) success = False for tries in range(10): try: handle = Entrez.efetch(db="nucleotide", id=','.join(record["IdList"]), rettype="fasta", retmax=10) fastaIterator = Fasta.Iterator(handle, Fasta.RecordParser()) for entry in fastaIterator: entry.sequence = re.sub('[^ATGC-]', 'N', entry.sequence) entry.title = entry.title.split('|')[1] sequenceLists.setdefault(species, []).append(entry) success = True except: time.sleep(tries * 5) continue break if not success: return None return sequenceLists
def get(self, gi):
    """
    Look up genbank records by their GI

    The taxonomy and sequence are read from the cache files
    <dbcache>/<gi>.tax and <dbcache>/<gi>.fasta when both exist and are
    non-empty (retrieval status "(c)"). Otherwise the record is fetched
    from the Entrez nucleotide database as XML (retrieval status "(d)"),
    its taxonomy is populated from NCBI and both are written to the cache.

    On failure a tuple (None, status) is returned, where status is the
    retrieval status with a marker inserted before the closing bracket:
      "!M"   a MemoryError occurred while downloading,
      "!D2"  no complete record was obtained in 10 attempts,
      "!T2"  taxon cross reference or sequence is missing,
      " !<status>"  Taxonomy.NCBIPopulationError, with its status value.
    """
    taxonomyFileName = os.path.join(self.options.dbcache, gi + ".tax")
    fastaFileName = os.path.join(self.options.dbcache, gi + ".fasta")

    if (os.path.exists(taxonomyFileName) and os.path.getsize(taxonomyFileName) != 0 and
            os.path.exists(fastaFileName) and os.path.getsize(fastaFileName) != 0):
        retrievalStatus = "(c)"
        taxonomy = utils.safeReadTaxonomyCache(taxonomyFileName)
        sequence = utils.safeReadFastaCache(fastaFileName)
    else:
        retrievalStatus = "(d)"

        # The patterns are constant, so compile them once instead of on
        # every retry.
        # Get the cross ref to the taxonomy database:
        taxonXrefRE = re.compile(r"<GBQualifier_value>taxon:(\d+)</GBQualifier_value>")
        seqLengthRE = re.compile(r"<GBSeq_length>(\d+)</GBSeq_length>")
        sequenceRE = re.compile(r"<GBSeq_sequence>([a-zA-Z]+)</GBSeq_sequence>")

        taxonXref = None
        seqLength = None
        successful = False
        for tries in range(10):
            try:
                Entrez.email = self.options.email
                Entrez.tool = 'sapwebserver'
                fp = Entrez.efetch(db="nucleotide", id=gi, retmode="xml")
                try:
                    taxonXref = None
                    seqLength = None
                    sequence = None
                    # Scan line by line and stop as soon as both the taxon
                    # cross ref and the sequence have been seen.
                    while taxonXref is None or sequence is None:
                        line = fp.readline()
                        if not line:
                            break
                        taxonMatch = taxonXrefRE.search(line)
                        lengthMatch = seqLengthRE.search(line)
                        sequenceMatch = sequenceRE.search(line)
                        # If there is more than one taxon xref the first
                        # one is kept.
                        if taxonMatch and taxonXref is None:
                            taxonXref = taxonMatch.group(1)
                        if lengthMatch:
                            seqLength = lengthMatch.group(1)
                        if sequenceMatch:
                            sequence = sequenceMatch.group(1)
                finally:
                    # Close the handle on every path; the retry below and
                    # the exception handlers would otherwise leak it.
                    fp.close()
                if not (taxonXref and sequence):
                    # Give it another try:
                    continue
            except KeyboardInterrupt:
                sys.exit()
            except MemoryError:
                # Write an empty file to cache to keep the script from
                # trying to download the sequence next time.
                # NOTE(review): the cache check at the top of this method
                # requires a non-empty file, so it will not treat this
                # empty file as a cache hit - confirm this is handled
                # elsewhere.
                utils.writeFile(fastaFileName, '')
                return None, retrievalStatus.replace(")", "!M)")
            except Exception:
                # Retrieving failed - wait a little longer each time and
                # retry.
                time.sleep(tries * 5)
                continue
            else:
                successful = True
                break

        if not successful:
            return None, retrievalStatus.replace(")", "!D2)")

        if not (taxonXref and gi and sequence):
            # The entry did not have a cross ref to the taxonomy database:
            return None, retrievalStatus.replace(")", "!T2)")

        # Make an object to hold the taxonomy:
        taxonomy = Taxonomy.Taxonomy()
        try:
            taxonomy.populateFromNCBI(dbid=taxonXref,
                                      # allow_unclassified=self.options.unclassified,
                                      minimaltaxonomy=self.options.minimaltaxonomy)
        except Taxonomy.NCBIPopulationError as X:
            return None, retrievalStatus.replace(")", " !%s)" % X.status)

        # Dump the taxonomy object to a file; the with block closes the
        # file even if pickling fails.
        with open(taxonomyFileName, 'w') as fp:
            pickle.dump(taxonomy, fp)

        # Upcase the sequence:
        sequence = sequence.upper()

        # Cache the sequence:
        fastaEntry = ">%s\n%s\n" % (gi, sequence)
        utils.writeFile(fastaFileName, fastaEntry)

    # NOTE(review): no return statement follows for the cache-hit and
    # successful-download paths in the code visible here, while every
    # failure path returns a (None, status) tuple - confirm that the final
    # return was not lost.
def get(self, gi):
    """
    Look up genbank records by their GI

    The taxonomy and sequence are read from the cache files
    <dbcache>/<gi>.tax and <dbcache>/<gi>.fasta when both exist and are
    non-empty (retrieval status "(c)"). Otherwise the record is fetched
    from the Entrez nucleotide database as XML (retrieval status "(d)"),
    its taxonomy is populated from NCBI and both are written to the cache.

    On failure a tuple (None, status) is returned, where status is the
    retrieval status with a marker inserted before the closing bracket:
      "!M"   a MemoryError occurred while downloading,
      "!D2"  no complete record was obtained in 10 attempts,
      "!T2"  taxon cross reference or sequence is missing,
      " !<status>"  Taxonomy.NCBIPopulationError, with its status value.
    """
    taxonomyFileName = os.path.join(self.options.dbcache, gi + ".tax")
    fastaFileName = os.path.join(self.options.dbcache, gi + ".fasta")

    if (os.path.exists(taxonomyFileName) and os.path.getsize(taxonomyFileName) != 0 and
            os.path.exists(fastaFileName) and os.path.getsize(fastaFileName) != 0):
        retrievalStatus = "(c)"
        taxonomy = utils.safeReadTaxonomyCache(taxonomyFileName)
        sequence = utils.safeReadFastaCache(fastaFileName)
    else:
        retrievalStatus = "(d)"

        # The patterns are constant, so compile them once instead of on
        # every retry.
        # Get the cross ref to the taxonomy database:
        taxonXrefRE = re.compile(r"<GBQualifier_value>taxon:(\d+)</GBQualifier_value>")
        seqLengthRE = re.compile(r"<GBSeq_length>(\d+)</GBSeq_length>")
        sequenceRE = re.compile(r"<GBSeq_sequence>([a-zA-Z]+)</GBSeq_sequence>")

        taxonXref = None
        seqLength = None
        successful = False
        for tries in range(10):
            try:
                Entrez.email = self.options.email
                Entrez.tool = 'sapwebserver'
                fp = Entrez.efetch(db="nucleotide", id=gi, retmode="xml")
                try:
                    taxonXref = None
                    seqLength = None
                    sequence = None
                    # Scan line by line and stop as soon as both the taxon
                    # cross ref and the sequence have been seen.
                    while taxonXref is None or sequence is None:
                        line = fp.readline()
                        if not line:
                            break
                        taxonMatch = taxonXrefRE.search(line)
                        lengthMatch = seqLengthRE.search(line)
                        sequenceMatch = sequenceRE.search(line)
                        # If there is more than one taxon xref the first
                        # one is kept.
                        if taxonMatch and taxonXref is None:
                            taxonXref = taxonMatch.group(1)
                        if lengthMatch:
                            seqLength = lengthMatch.group(1)
                        if sequenceMatch:
                            sequence = sequenceMatch.group(1)
                finally:
                    # Close the handle on every path; the retry below and
                    # the exception handlers would otherwise leak it.
                    fp.close()
                if not (taxonXref and sequence):
                    # Give it another try:
                    continue
            except KeyboardInterrupt:
                sys.exit()
            except MemoryError:
                # Write an empty file to cache to keep the script from
                # trying to download the sequence next time.
                # NOTE(review): the cache check at the top of this method
                # requires a non-empty file, so it will not treat this
                # empty file as a cache hit - confirm this is handled
                # elsewhere.
                utils.writeFile(fastaFileName, '')
                return None, retrievalStatus.replace(")", "!M)")
            except Exception:
                # Retrieving failed - wait a little longer each time and
                # retry.
                time.sleep(tries * 5)
                continue
            else:
                successful = True
                break

        if not successful:
            return None, retrievalStatus.replace(")", "!D2)")

        if not (taxonXref and gi and sequence):
            # The entry did not have a cross ref to the taxonomy database:
            return None, retrievalStatus.replace(")", "!T2)")

        # Make an object to hold the taxonomy:
        taxonomy = Taxonomy.Taxonomy()
        try:
            taxonomy.populateFromNCBI(dbid=taxonXref,
                                      # allow_unclassified=self.options.unclassified,
                                      minimaltaxonomy=self.options.minimaltaxonomy)
        except Taxonomy.NCBIPopulationError as X:
            return None, retrievalStatus.replace(")", " !%s)" % X.status)

        # Dump the taxonomy object to a file; the with block closes the
        # file even if pickling fails.
        with open(taxonomyFileName, 'w') as fp:
            pickle.dump(taxonomy, fp)

        # Upcase the sequence:
        sequence = sequence.upper()

        # Cache the sequence:
        fastaEntry = ">%s\n%s\n" % (gi, sequence)
        utils.writeFile(fastaFileName, fastaEntry)

    # NOTE(review): no return statement follows for the cache-hit and
    # successful-download paths in the code visible here, while every
    # failure path returns a (None, status) tuple - confirm that the final
    # return was not lost.