Пример #1
0
def download_fasta_and_genbank(identifier, tag, genbank=True, fasta=True):
    """Download the GenBank and/or FASTA record of *identifier* to disk.

    The GenBank record is fetched through ``bioservices.EUtils`` (``nuccore``
    database) and written to ``<tag>.gbk``. The FASTA record is fetched
    through ``bioservices.ENA`` and written to ``<tag>.fa``.

    :param identifier: valid identifier to retrieve from NCBI (genbank) and
        ENA (fasta)
    :param tag: name of the filename for the genbank and fasta files.
    :param genbank: if True, download the GenBank file.
    :param fasta: if True, download the FASTA file.
    :raises ValueError: if a service answers with an integer status code
        instead of the requested record.
    """
    def _as_text(payload):
        # Depending on the bioservices version the payload is either bytes
        # or str (the original code notes an API change in v1.7.8); the
        # output files are opened in text mode, so normalise to str.
        return payload.decode() if isinstance(payload, bytes) else payload

    if genbank:
        from bioservices import EUtils
        eu = EUtils()
        data = eu.EFetch(db="nuccore",
                         id=identifier,
                         rettype="gbwithparts",
                         retmode="text")
        # An integer is a status code, not a record: fail before creating
        # the output file so that no empty file is left behind.
        if isinstance(data, int):
            raise ValueError("{} not found on NCBI".format(identifier))
        with open("%s.gbk" % tag, "w") as fout:
            fout.write(_as_text(data))

    if fasta:
        from bioservices import ENA
        ena = ENA()
        data = ena.get_data(identifier, 'fasta')
        if isinstance(data, int):
            raise ValueError("{} not found on ENA".format(identifier))
        with open("%s.fa" % tag, "w") as fout:
            fout.write(_as_text(data))
Пример #2
0
def findName(accession):
    """Return the scientific name of the taxon linked to *accession*.

    The accession is searched in the Entrez ``gene`` database first and, if
    nothing is found there, in the ``protein`` database. The first matching
    ID is linked to the ``taxonomy`` database, whose record is downloaded
    and parsed for its ``Taxon/ScientificName`` element.

    :param accession: term passed to ``ESearch``.
    :return: text of the ``ScientificName`` element, or ``None`` (after
        printing an error message) when the accession is found in neither
        database.
    """
    s = EUtils()

    # Look in the gene database first, then fall back on the protein one.
    theID = None
    geneOrProtein = "gene"
    for database in ("gene", "protein"):
        res = s.ESearch(database, accession)
        if len(res["idlist"]) > 0:
            theID = res["idlist"][0]
            geneOrProtein = database
            break

    # Couldn't find in either database: stop here instead of querying
    # ELink with an empty identifier.
    if not theID:
        print("ERROR: couldn't find link for %s" % accession)
        return None

    # Get link to the corresponding ID in the Taxonomy database
    link = s.ELink(db="taxonomy",
                   dbfrom=geneOrProtein,
                   id=theID,
                   retmode="json")
    taxID = json.loads(link)["linksets"][0]["linksetdbs"][0]["links"][0]

    # Download taxonomy record and extract the scientific name
    tax = s.EFetch(db="taxonomy", id=taxID)
    tree = ElementTree.fromstring(tax)
    taxonTag = tree.find("Taxon")
    sciName = taxonTag.find("ScientificName")
    return sciName.text
Пример #3
0
def get_entrez_summary(gene_id):
    """Fetch the Entrez ``gene`` record(s) matching *gene_id* as text.

    *gene_id* is first searched in the ``gene`` database; every ID found is
    then fetched in a single ``EFetch`` call.

    :param gene_id: term passed to ``ESearch``.
    :return: the fetched payload decoded as UTF-8 (the payload is presumably
        bytes -- verify against the installed bioservices version).
    """
    client = EUtils()
    search_result = client.ESearch('gene', gene_id)
    matching_ids = search_result['idlist']
    raw_record = client.EFetch('gene', matching_ids)
    return raw_record.decode("utf-8")
Пример #4
0
class EUtilsTools(object):
    """Simple wrapper around EUtils to fetch basic information about an accession number


    ::

        >>> from sequana.databases import EUtilsTools
        >>> et = EUtilsTools()
        >>> et.accession_to_info("K01711.1")
        {'K01711.1': {'accession': '331784',
          'comment': 'Measles virus (strain Edmonston), complete genome',
          'gi': '331784',
          'identifier': 'gi|331784|gb|K01711.1|MEANPCG[331784]',
          'taxid': '11234'}}


    """
    def __init__(self):
        from bioservices import EUtils
        self.eutils = EUtils()

    @staticmethod
    def _item_text(items, name):
        """Return the ``#text`` of the first docsum item whose ``@Name`` is *name*."""
        return [x for x in items if x['@Name'] == name][0]['#text']

    def accession_to_info(self, ids):
        """Return a dictionary of records keyed by the requested accessions.

        :param ids: a single accession, several accessions as one
            comma-separated string, or a list of accessions.
        :return: dictionary mapping each requested accession to an
            ``AttrDict`` with the keys ``taxid``, ``accession``,
            ``identifier``, ``gi`` and ``comment``.
        """
        # Accept a list of accessions as well: the rest of the method (and
        # the single-string call path) works on a comma-separated string.
        if not isinstance(ids, str):
            ids = ",".join(ids)

        res = self.eutils.EFetch(db="nuccore",
                                 id=ids,
                                 rettype="docsum",
                                 retmode="dict")

        res = res['eSummaryResult']['DocSum']

        # if one id provided, it will be a dict, otherwise a list of dicts
        if not isinstance(res, list):
            res = [res]

        # now we can loop over all identifiers; entries are paired with the
        # requested accessions by position
        records = {}
        accessions = [x.strip() for x in ids.split(',')]

        for requested, entry in zip(accessions, res):
            items = entry['Item']

            identifier = self._item_text(items, "Extra")
            if "||" in identifier:
                # strip content after ||
                identifier = identifier.split("||")[0]

            record = {
                "taxid": self._item_text(items, "TaxId"),
                'accession': entry['Id'],
                "identifier": identifier,
                'gi': self._item_text(items, "Gi"),
                'comment': self._item_text(items, "Title")
            }

            records[requested] = AttrDict(**record)
        return records
Пример #5
0
class KrakenBuilderBase():
    """Helpers to download the sequence files used to build a database.

    :param dbname: name of the database; fetched accessions are written
        into ``<dbname>/library/``.
    """
    def __init__(self, dbname):
        from bioservices import EUtils
        self.dbname = dbname
        self.eutils = EUtils()
        self.enadb = ENADownload()

        # Names of the directories accepted by the download methods.
        self.category = [
            'archaea', 'bacteria', 'fungi', 'invertebrate', 'mitochondrion',
            'other', 'plant', 'plasmid', 'plastid', 'protozoa',
            'vertebrate_mammalian', 'vertebrate_other', 'viral'
        ]

    @staticmethod
    def _retrieve(ftp, remote_name, local_name):
        """Download *remote_name* into *local_name*, closing the local file."""
        with open(local_name, "wb") as fout:
            ftp.retrbinary('RETR ' + remote_name, fout.write)

    def _download_genomic_fna(self, email, category, report):
        """Fetch every ``genomic.fna`` file of a RefSeq release category.

        *report* is called with the name of each file once it is downloaded.
        """
        import ftplib

        with ftplib.FTP("ftp.ncbi.nlm.nih.gov") as ftp:
            ftp.login("anonymous", email)
            ftp.cwd("refseq/release/{}".format(category))

            for filename in ftp.nlst():
                if "genomic.fna" in filename:
                    self._retrieve(ftp, filename, filename)
                    report(filename)

    def download_ncbi_refseq(self, category):
        """Download all files of type *fna* from ncbi FTP.

        Files are written in the current working directory and their names
        are printed as they are downloaded.

        ::

            kb = KrakenBuilder()
            kb.download_ncbi_refseq("viral")

        :param category: one of the names listed in :attr:`category`.
        """
        assert category in self.category, "Please use one of {}".format(
            self.category)

        self._download_genomic_fna("*****@*****.**", category, print)

    def download_genomes_from_ncbi(self, email, category):
        """This downloads all genomes on ncbi for a given category looking at
        their ftp. This could be highly redundant.

        :param email: password sent with the anonymous FTP login.
        :param category: one of the names listed in :attr:`category`.
        """
        assert category in self.category

        self._download_genomic_fna(email, category, logger.info)

    def _download_assembly_report(self, category):
        """Download ``assembly_summary.txt`` as ``assembly_summary_<category>.txt``."""
        import ftplib

        assert category in self.category

        filename = "assembly_summary.txt"
        with ftplib.FTP("ftp.ncbi.nlm.nih.gov") as ftp:
            ftp.login("anonymous", "anonymous")
            ftp.cwd("genomes/refseq/{}".format(category))
            self._retrieve(
                ftp, filename,
                filename.replace(".txt", "_{}.txt".format(category)))
        logger.info(filename)

    def download_accession_from_ncbi(self, accession):
        """Fetch FASTA files for one or several accessions.

        Each record is written to ``<dbname>/library/<accession>.fa``.
        Accessions that cannot be fetched are reported and skipped.

        :param accession: a list of accessions, a unique accession string,
            or the name of an existing file whose whitespace-separated
            content is the accessions to retrieve.
        :raises TypeError: if *accession* is neither a list nor a string.
        """
        import os

        if isinstance(accession, list):
            accessions = accession
        elif isinstance(accession, str):
            if os.path.exists(accession):
                with open(accession, "r") as fin:
                    accessions = fin.read().split()
            else:
                accessions = [accession]
        else:
            raise TypeError(
                "accession must be a list, an accession string or a "
                "filename, got {}".format(type(accession).__name__))

        if not accessions:
            # Nothing to fetch.
            return

        from easydev import Progress
        N = len(accessions)
        pb = Progress(N)
        logger.info("Fetching {} accession fasta files from NCBI".format(N))
        for i, this_accession in enumerate(accessions):
            data = self.eutils.EFetch("nucleotide",
                                      rettype="fasta",
                                      id=this_accession,
                                      retmode="text")
            if isinstance(data, int):
                # An integer is a status code, not a FASTA record.
                logger.info(
                    "Could not fetch this accession: {}. continue".format(
                        this_accession))
                print("Could not fetch this accession: {}. continue".format(
                    this_accession))
            else:
                if isinstance(data, str):
                    # The file is opened in binary mode; the payload may be
                    # str or bytes depending on the bioservices version.
                    data = data.encode()
                outname = "{}/library/{}.fa".format(self.dbname,
                                                    this_accession)
                with open(outname, "wb") as fout:
                    fout.write(data)
            pb.animate(i + 1)