Exemplos de Entrez em Python, exemplos de SAP.Bio.Entrez em Python

Exemplo n.º 1

0

Exibir arquivo

    def _retrieveSequences(self, speciesList):
        """
        Collect sequences for every species in speciesList.

        If self.options.database exists on disk it is opened as a local
        Native.DB and every sequence indexed under each species is wrapped
        in a Fasta.Record.  Otherwise up to ten nucleotide records per
        species are searched for and downloaded through Entrez.

        Returns a dict mapping species name to a list of records.  A species
        for which nothing was retrieved has no key in the dict.  Returns None
        if the download for some species still fails after ten attempts.
        """

        sequenceLists = {}
        if os.path.exists(self.options.database):
            # local database:
            db = Native.DB(self.options.database, self.options)
            for species in speciesList:
                for seqID in db.index[species]:
                    homologue, retrievalStat = db.get(seqID)
                    sequenceLists.setdefault(species, []).append(
                        Fasta.Record(homologue.gi, homologue.sequence))
        else:
            # genbank
            # The contact address does not change between species, so it is
            # set once up front.
            Entrez.email = self.options.email

            for species in speciesList:

                handle = Entrez.esearch(db="nucleotide", retmax=10, term="%s[ORGN]" % species)
                try:
                    record = Entrez.read(handle)
                finally:
                    handle.close()

                # Entrez reports the hit count as a string; convert it so that
                # both the comparison and the %d formatting work on a number.
                count = int(record["Count"])
                if count < 5:
                    print("WARNING: only %d sequences representing %s" % (count, species))

                idString = ','.join(record["IdList"])
                success = False

                for tries in range(10):
                    # Entries are collected per attempt and committed only when
                    # the whole download parsed, so that a retry after a
                    # partial failure does not add the same records twice.
                    fetched = []
                    handle = None
                    try:
                        handle = Entrez.efetch(db="nucleotide", id=idString, rettype="fasta", retmax=10)
                        fastaIterator = Fasta.Iterator(handle, Fasta.RecordParser())
                        for entry in fastaIterator:
                            # Anything that is not A, T, G, C or a gap becomes N.
                            entry.sequence = re.sub('[^ATGC-]', 'N', entry.sequence)
                            # Keep only the second '|'-separated field of the title.
                            entry.title = entry.title.split('|')[1]
                            fetched.append(entry)
                    except Exception:
                        # Back off a little longer after every failed attempt.
                        # KeyboardInterrupt is deliberately not caught here so
                        # the user can abort the retry loop.
                        time.sleep(tries * 5)
                    else:
                        if fetched:
                            sequenceLists.setdefault(species, []).extend(fetched)
                        success = True
                    finally:
                        if handle is not None:
                            handle.close()

                    if success:
                        break

                if not success:
                    return None

        return sequenceLists

Exemplo n.º 2

0

Exibir arquivo

    def get(self, gi):
        """
        Look up genbank records by their GI

        The taxonomy and the sequence are read from the cache files
        <dbcache>/<gi>.tax and <dbcache>/<gi>.fasta when both exist and are
        non-empty (retrieval status "(c)").  Otherwise the record is
        downloaded from the Entrez nucleotide database as XML, its taxonomy is
        populated from NCBI, and both are written to the cache (status "(d)").

        On failure the method returns (None, status) where the closing
        parenthesis of the status is replaced by a marker:
          "!M)"   a MemoryError occurred while downloading (an empty fasta
                  cache file is written first),
          "!D2)"  none of the ten download attempts completed,
          "!T2)"  the record lacked a taxon cross reference or a sequence,
          " !<status>)"  Taxonomy.NCBIPopulationError was raised.
        """

        taxonomyFileName = os.path.join(self.options.dbcache, gi + ".tax")
        fastaFileName = os.path.join(self.options.dbcache, gi + ".fasta")

        if (os.path.exists(taxonomyFileName)
                and os.path.getsize(taxonomyFileName) != 0
                and os.path.exists(fastaFileName)
                and os.path.getsize(fastaFileName) != 0):
            retrievalStatus = "(c)"
            taxonomy = utils.safeReadTaxonomyCache(taxonomyFileName)
            sequence = utils.safeReadFastaCache(fastaFileName)
        else:
            retrievalStatus = "(d)"

            taxonXref = None
            seqLength = None
            sequence = None

            # Patterns for the fields picked out of the XML record.  They are
            # the same for every attempt, so they are compiled once.
            taxonXrefRE = re.compile(
                r"<GBQualifier_value>taxon:(\d+)</GBQualifier_value>")
            seqLengthRE = re.compile(
                r"<GBSeq_length>(\d+)</GBSeq_length>")
            sequenceRE = re.compile(
                r"<GBSeq_sequence>([a-zA-Z]+)</GBSeq_sequence>")

            Entrez.email = self.options.email
            Entrez.tool = 'sapwebserver'

            # True once at least one attempt was read without raising.
            downloaded = False
            for tries in range(10):
                fp = None
                try:
                    fp = Entrez.efetch(db="nucleotide", id=gi, retmode="xml")

                    # Every attempt starts from a clean slate so that values
                    # from a half-read earlier attempt are not mixed in.
                    taxonXref = None
                    seqLength = None
                    sequence = None

                    # Stop reading as soon as both the cross ref to the
                    # taxonomy database and the sequence have been seen.
                    while taxonXref is None or sequence is None:
                        line = fp.readline()
                        if not line:
                            break
                        taxonMatch = taxonXrefRE.search(line)
                        lengthMatch = seqLengthRE.search(line)
                        sequenceMatch = sequenceRE.search(line)
                        # Only the first taxon cross reference is kept.
                        if taxonMatch and taxonXref is None:
                            taxonXref = taxonMatch.group(1)
                        if lengthMatch:
                            seqLength = lengthMatch.group(1)
                        if sequenceMatch:
                            sequence = sequenceMatch.group(1)

                except KeyboardInterrupt:
                    sys.exit()
                except MemoryError:
                    # Write an empty file to cache to keep the script from
                    # trying to download the sequence next time.
                    utils.writeFile(fastaFileName, '')
                    return None, retrievalStatus.replace(")", "!M)")
                except Exception:
                    # Retrieving failed - wait a little longer each time and
                    # retry.
                    time.sleep(tries * 5)
                else:
                    downloaded = True
                    if taxonXref and sequence:
                        break
                    # Incomplete record: give it another try.
                finally:
                    # Close the handle on every path, not only on success.
                    if fp is not None:
                        fp.close()

            if not downloaded:
                # Every attempt raised, so there is nothing to inspect.  This
                # check has to sit after the loop; inside it, it could never
                # be reached.
                return None, retrievalStatus.replace(")", "!D2)")

            if not (taxonXref and gi and sequence):
                # The entry did not have a cross ref to the taxonomy database:
                return None, retrievalStatus.replace(")", "!T2)")

            # Make an object to hold the taxonomy:
            taxonomy = Taxonomy.Taxonomy()
            try:
                taxonomy.populateFromNCBI(
                    dbid=taxonXref,
                    minimaltaxonomy=self.options.minimaltaxonomy)
            except Taxonomy.NCBIPopulationError as X:
                return None, retrievalStatus.replace(")", " !%s)" % X.status)

            # Dump the taxonomy object to a file:
            fp = open(taxonomyFileName, 'w')
            try:
                pickle.dump(taxonomy, fp)
            finally:
                fp.close()

            # Upcase the sequence:
            sequence = sequence.upper()

            # Cache the sequence:
            fastaEntry = ">%s\n%s\n" % (gi, sequence)
            utils.writeFile(fastaFileName, fastaEntry)

Exemplo n.º 3

0

Exibir arquivo

Arquivo: GenBank.py Projeto: kaspermunch/sap

    def get(self, gi):
        """
        Look up genbank records by their GI

        The taxonomy and the sequence are read from the cache files
        <dbcache>/<gi>.tax and <dbcache>/<gi>.fasta when both exist and are
        non-empty (retrieval status "(c)").  Otherwise the record is
        downloaded from the Entrez nucleotide database as XML, its taxonomy is
        populated from NCBI, and both are written to the cache (status "(d)").

        On failure the method returns (None, status) where the closing
        parenthesis of the status is replaced by a marker:
          "!M)"   a MemoryError occurred while downloading (an empty fasta
                  cache file is written first),
          "!D2)"  none of the ten download attempts completed,
          "!T2)"  the record lacked a taxon cross reference or a sequence,
          " !<status>)"  Taxonomy.NCBIPopulationError was raised.
        """

        taxonomyFileName = os.path.join(self.options.dbcache, gi + ".tax")
        fastaFileName = os.path.join(self.options.dbcache, gi + ".fasta")

        if (os.path.exists(taxonomyFileName) and os.path.getsize(taxonomyFileName) != 0 and
                os.path.exists(fastaFileName) and os.path.getsize(fastaFileName) != 0):
            retrievalStatus = "(c)"
            taxonomy = utils.safeReadTaxonomyCache(taxonomyFileName)
            sequence = utils.safeReadFastaCache(fastaFileName)
        else:
            retrievalStatus = "(d)"

            taxonXref = None
            seqLength = None
            sequence = None

            # Patterns for the fields picked out of the XML record.  They are
            # the same for every attempt, so they are compiled once.
            taxonXrefRE = re.compile(r"<GBQualifier_value>taxon:(\d+)</GBQualifier_value>")
            seqLengthRE = re.compile(r"<GBSeq_length>(\d+)</GBSeq_length>")
            sequenceRE = re.compile(r"<GBSeq_sequence>([a-zA-Z]+)</GBSeq_sequence>")

            Entrez.email = self.options.email
            Entrez.tool = 'sapwebserver'

            # True once at least one attempt was read without raising.
            downloaded = False
            for tries in range(10):
                fp = None
                try:
                    fp = Entrez.efetch(db="nucleotide", id=gi, retmode="xml")

                    # Every attempt starts from a clean slate so that values
                    # from a half-read earlier attempt are not mixed in.
                    taxonXref = None
                    seqLength = None
                    sequence = None

                    # Stop reading as soon as both the cross ref to the
                    # taxonomy database and the sequence have been seen.
                    while taxonXref is None or sequence is None:
                        line = fp.readline()
                        if not line:
                            break
                        taxonMatch = taxonXrefRE.search(line)
                        lengthMatch = seqLengthRE.search(line)
                        sequenceMatch = sequenceRE.search(line)
                        # Only the first taxon cross reference is kept.
                        if taxonMatch and taxonXref is None:
                            taxonXref = taxonMatch.group(1)
                        if lengthMatch:
                            seqLength = lengthMatch.group(1)
                        if sequenceMatch:
                            sequence = sequenceMatch.group(1)

                except KeyboardInterrupt:
                    sys.exit()
                except MemoryError:
                    # Write an empty file to cache to keep the script from
                    # trying to download the sequence next time.
                    utils.writeFile(fastaFileName, '')
                    return None, retrievalStatus.replace(")", "!M)")
                except Exception:
                    # Retrieving failed - wait a little longer each time and
                    # retry.
                    time.sleep(tries * 5)
                else:
                    downloaded = True
                    if taxonXref and sequence:
                        break
                    # Incomplete record: give it another try.
                finally:
                    # Close the handle on every path, not only on success.
                    if fp is not None:
                        fp.close()

            if not downloaded:
                # Every attempt raised, so there is nothing to inspect.  This
                # check has to sit after the loop; inside it, it could never
                # be reached.
                return None, retrievalStatus.replace(")", "!D2)")

            if not (taxonXref and gi and sequence):
                # The entry did not have a cross ref to the taxonomy database:
                return None, retrievalStatus.replace(")", "!T2)")

            # Make an object to hold the taxonomy:
            taxonomy = Taxonomy.Taxonomy()
            try:
                taxonomy.populateFromNCBI(dbid=taxonXref,
                                          minimaltaxonomy=self.options.minimaltaxonomy)
            except Taxonomy.NCBIPopulationError as X:
                return None, retrievalStatus.replace(")", " !%s)" % X.status)

            # Dump the taxonomy object to a file:
            fp = open(taxonomyFileName, 'w')
            try:
                pickle.dump(taxonomy, fp)
            finally:
                fp.close()

            # Upcase the sequence:
            sequence = sequence.upper()

            # Cache the sequence:
            fastaEntry = ">%s\n%s\n" % (gi, sequence)
            utils.writeFile(fastaFileName, fastaEntry)