示例#1
0
def _as_text(data):
    """Return *data* as ``str``, decoding it first when it is ``bytes``."""
    if isinstance(data, bytes):
        return data.decode()
    return data


def download_fasta_and_genbank(identifier, tag, genbank=True, fasta=True):
    """Download the GenBank and/or FASTA record of an identifier.

    The GenBank record is fetched with ``EUtils.EFetch`` (db ``nuccore``) and
    written to ``<tag>.gbk``; the FASTA record is fetched with
    ``ENA.get_data`` and written to ``<tag>.fa``.

    :param identifier: valid identifier to retrieve from NCBI (genbank) and
        ENA (fasta)
    :param tag: name of the filename for the genbank and fasta files.
    :param genbank: download the GenBank file when True.
    :param fasta: download the FASTA file when True.
    :raises ValueError: if a service answers with an integer status code
        instead of the record.
    """
    if genbank:
        from bioservices import EUtils
        eu = EUtils()
        data = eu.EFetch(db="nuccore",
                         id=identifier,
                         rettype="gbwithparts",
                         retmode="text")
        if isinstance(data, int):
            if data == 400:
                raise ValueError("{} not found on NCBI".format(identifier))
            # any other integer is also a status code, not a record; fail
            # before an empty output file gets created
            raise ValueError(
                "{} could not be retrieved from NCBI (status {})".format(
                    identifier, data))
        with open("%s.gbk" % tag, "w") as fout:
            # the payload is bytes or str depending on the bioservices version
            fout.write(_as_text(data))

    if fasta:
        from bioservices import ENA
        ena = ENA()
        data = ena.get_data(identifier, 'fasta')
        if isinstance(data, int):
            if data == 400:
                raise ValueError("{} not found on ENA".format(identifier))
            raise ValueError(
                "{} could not be retrieved from ENA (status {})".format(
                    identifier, data))
        with open("%s.fa" % tag, "w") as fout:
            # change in API in v1.7.8: str instead of bytes
            fout.write(_as_text(data))
示例#2
0
    def __init__(self, dbname):
        """Store the database name and create the download helpers.

        :param dbname: name of the database, kept in :attr:`dbname`.
        """
        self.dbname = dbname
        self.eutils = EUtils()
        self.enadb = ENADownload()

        # Known category names. 'mitochondrion' used to be misspelt
        # 'mitonchondrion', which does not match the NCBI RefSeq directory
        # of that name.
        self.category = [
            'archaea', 'bacteria', 'fungi', 'invertebrate', 'mitochondrion',
            'other', 'plant', 'plasmid', 'plastid', 'protozoa',
            'vertebrate_mammalian', 'vertebrate_other', 'viral'
        ]
示例#3
0
 def find_taxon(self, taxid, mode="ncbi"):
     """Look up *taxid* and return the raw answer of the chosen service.

     With ``mode="ncbi"`` a new ``EUtils`` client is stored in
     ``self.eutils`` and its ``taxonomy_summary`` is returned; any other
     mode returns ``self.ensembl.get_taxonomy_by_id``. The identifier is
     converted to a string first.
     """
     key = str(taxid)
     if mode != "ncbi":
         return self.ensembl.get_taxonomy_by_id(key)

     from bioservices import EUtils
     self.eutils = EUtils(verbose=False)
     return self.eutils.taxonomy_summary(key)
     """if "error" in res[taxid]:
示例#4
0
    def to_genbank(self, retmax=10000):
        """Draft: from a TaxID, uses EUtils to retrieve
        the GenBank identifiers

        Searches the ``nucleotide`` database with the term
        ``txid<taxid>[Organism:exp]`` (``taxid`` is read from
        :attr:`taxid`) and returns the ``ESummary`` of the identifiers found.

        :param retmax: maximum number of identifiers requested from
            ``ESearch`` and ``ESummary``.
        :return: whatever ``EUtils.ESummary`` returns for the identifiers.

        :Inspiration: https://gist.github.com/fjossinet/5673672
        """
        from bioservices import EUtils
        e = EUtils()
        # the E-utilities offset parameter is "retstart" (it was misspelt
        # "restart", which is not an ESearch parameter)
        found = e.ESearch(db='nucleotide',
                          term='txid%s[Organism:exp]' % self.taxid,
                          retstart=0,
                          retmax=retmax)
        idlist = found['idlist']
        results = e.ESummary(db='nucleotide', id=idlist, retmax=retmax)
        return results
示例#5
0
 def _update_custom_taxonomy_bases(self, taxid):
     """Query EUtils for *taxid* and write the answer to ``self.custom_db``.

     A new ``EUtils`` client is stored in ``self.eutils``. When the summary
     entry contains an ``"error"`` key only a message is printed; otherwise
     the file is opened in ``"w"`` mode and receives the ID, RANK and
     SCIENTIFIC NAME lines.
     """
     key = str(taxid)
     self.eutils = EUtils(verbose=False)
     summary = self.eutils.taxonomy_summary(key)

     if "error" in summary[key]:
         print("not found in NCBI (EUtils)")
         return

     print("found in NCBI (EUtils) and added to local databases")
     with open(self.custom_db, "w") as fout:
         entry = summary[key]
         # PARENT ID and GC ID lines are not written
         fout.write("ID : {}\n".format(key))
         fout.write("RANK : {}\n".format(entry['rank']))
         fout.write("SCIENTIFIC NAME : {}\n".format(entry['scientificname']))
示例#6
0
def findName(accession):
    """Return the scientific name of the organism linked to an accession.

    The accession is searched in the ``gene`` database first and then in the
    ``protein`` database. The first identifier found is linked to the
    ``taxonomy`` database, the taxonomy record is fetched and the text of
    its ``Taxon/ScientificName`` element is returned.

    :param accession: accession to search for.
    :return: the scientific name, or ``None`` (after printing an error
        message) when the accession is found in neither database.
    """
    s = EUtils()
    theID = None
    geneOrProtein = "gene"
    # Check the gene database first; if that fails, the protein database
    for database in ("gene", "protein"):
        res = s.ESearch(database, accession)
        if len(res["idlist"]) > 0:
            # Keep the first ID and remember which database it came from
            theID = res["idlist"][0]
            geneOrProtein = database
            break

    # Couldn't find in either database: stop here rather than sending
    # id=None to ELink
    if not theID:
        print("ERROR: couldn't find link for %s" % accession)
        return None

    # Get link to the corresponding ID in the Taxonomy database
    link = s.ELink(db="taxonomy",
                   dbfrom=geneOrProtein,
                   id=theID,
                   retmode="json")
    taxID = json.loads(link)["linksets"][0]["linksetdbs"][0]["links"][0]

    # Download taxonomy record and read the scientific name out of the XML
    tax = s.EFetch(db="taxonomy", id=taxID)
    tree = ElementTree.fromstring(tax)
    taxonTag = tree.find("Taxon")
    sciName = taxonTag.find("ScientificName")
    return sciName.text
示例#7
0
def get_entrez_summary(gene_id):
    """Search *gene_id* in the ``gene`` database and return the fetched record.

    The ``idlist`` of the ``ESearch`` answer is passed to ``EFetch`` and the
    result is decoded as UTF-8.
    """
    client = EUtils()
    hits = client.ESearch('gene', gene_id)
    payload = client.EFetch('gene', hits['idlist'])
    return payload.decode("utf-8")
示例#8
0
 def __init__(self):
     """Create an ``EUtils`` client and keep it in ``self.eutils``."""
     # bioservices is imported when the object is created, not at module load
     from bioservices import EUtils
     self.eutils = EUtils()
示例#9
0
class EUtilsTools(object):
    """Simple wrapper around EUtils to fetch basic information about an accession number


    ::

        >>> from sequana.databases import EUtilsTools
        >>> et = EUtilsTools()
        >>> et.accession_to_info("K01711.1")
        {'K01711.1': {'accession': '331784',
          'comment': 'Measles virus (strain Edmonston), complete genome',
          'gi': '331784',
          'identifier': 'gi|331784|gb|K01711.1|MEANPCG[331784]',
          'taxid': '11234'}}


    """
    def __init__(self):
        """Create the ``EUtils`` client used by :meth:`accession_to_info`."""
        from bioservices import EUtils
        self.eutils = EUtils()

    @staticmethod
    def _item_text(items, name):
        """Return the ``#text`` of the first item whose ``@Name`` is *name*."""
        return [x for x in items if x['@Name'] == name][0]['#text']

    def accession_to_info(self, ids):
        """Return a dictionary of records keyed by accession.

        :param ids: one accession, several accessions separated by commas
            in a single string, or a list/tuple of accessions.
        :return: dictionary mapping each accession (in the order given) to
            an ``AttrDict`` with the keys ``taxid``, ``accession``,
            ``identifier``, ``gi`` and ``comment``.
        """
        res = self.eutils.EFetch(db="nuccore",
                                 id=ids,
                                 rettype="docsum",
                                 retmode="dict")

        res = res['eSummaryResult']['DocSum']

        # if one id provided, it will be a dict, otherwise a list of dicts
        if not isinstance(res, list):
            res = [res]

        # names used as keys of the returned dictionary; a list of
        # accessions is accepted as well as a comma-separated string
        if isinstance(ids, str):
            accessions = [x.strip() for x in ids.split(',')]
        else:
            accessions = [str(x).strip() for x in ids]

        # now we can loop over all identifiers
        records = {}
        for i, entry in enumerate(res):
            items = entry['Item']
            identifier = self._item_text(items, "Extra")
            if "||" in identifier:
                # strip content after ||
                identifier = identifier.split("||")[0]

            record = {
                "taxid": self._item_text(items, "TaxId"),
                # the docsum 'Id' field is stored under 'accession'
                'accession': entry['Id'],
                "identifier": identifier,
                'gi': self._item_text(items, "Gi"),
                'comment': self._item_text(items, "Title")
            }

            records[accessions[i]] = AttrDict(**record)
        return records
示例#10
0
class Taxon(object):
    """Utility to search for information related to a taxon

    Uses the EUtils and Ensembl services to fetch information about a taxon.
    ::

        >>> from bioservices.apps.taxonomy import Taxon
        >>> t = Taxon()
        >>> t.search_by_taxon("9606")
        {'Scientific Name': 'Homo sapiens', 'taxon': '9606'}

    You can also pop up the Uniprot page using::

        t.uniprot_onweb("9606")


    A full list of taxons is available here::

        http://www.uniprot.org/taxonomy/?query=*&format=*


    .. versionadded:: 1.2.0
    """

    def __init__(self):
        """Create the EUtils and Ensembl clients used by the other methods."""
        super(Taxon, self).__init__()
        # self.df = pd.DataFrame(index=[], columns=["Taxon", "Scientific Name"])
        self._eutils_service = EUtils()
        self._ensembl_service = Ensembl()  # there is a search by name, easier to use than EUtils

    def search_by_name(self, name):
        """using ensembl, tries to get the taxon identifier from the given name

        :param str name: name to search for.
        :return: the ``id`` of the first match, or the first match itself
            when it has no ``id`` entry.

        ::

            >>> s.search_by_name('mouse')
            10090

        """
        # the requested name is queried (it used to be hard-coded to "mouse")
        res = self._ensembl_service.get_taxonomy_name(name)[0]
        try:
            return res["id"]
        except (KeyError, TypeError):
            # the first match is not a mapping with an "id" entry
            return res

    def search_by_taxon(self, taxon):
        """Return the scientific name of a taxon.

        :param str taxon: a string without comma (only one entry accepted).
        :return: dictionary with the keys ``taxon`` and ``Scientific Name``,
            or ``None`` when nothing is found.
        """
        assert isinstance(taxon, str)
        assert "," not in taxon
        ret = self._eutils_service.taxonomy(taxon)
        if ret == "\n":
            # nothing found
            return None
        res = {"taxon": taxon, "Scientific Name": ret.Taxon[0].ScientificName}
        # self.df.append(res)
        return res

    def info(self, taxon, lineage=False):
        """Prints information about a Taxon

        :param str taxon: taxon identifier
        :param bool lineage: prints lineage if set to True
        """
        ret = self._eutils_service.taxonomy(taxon)
        first = ret.Taxon[0]
        print("Display Name: %s" % first.OtherNames.Name.DispName)
        print("GenBank Common name: %s" % first.OtherNames.GenbankCommonName)
        print("Taxon Id: %s " % first.TaxId)
        if lineage:
            print("Lineage:")
            # each level is indented by one more space than its parent
            for i, x in enumerate(first.Lineage.split(";")):
                print(i * " " + x)

    def uniprot_onweb(self, taxon):
        """Open Uniprot taxonomy page for a given taxon

        The page is requested first; the browser is only opened when that
        request succeeds.

        :param str taxon: taxon identifier
        """
        import webbrowser

        try:
            from urllib.request import urlopen
            from urllib.error import HTTPError, URLError
        except ImportError:
            # Python 2 fallback
            from urllib2 import urlopen, HTTPError, URLError

        url = "http://www.uniprot.org/taxonomy/%s" % taxon
        try:
            # only checks that the page exists; release the connection at once
            urlopen(url).close()
            webbrowser.open(url)
        except HTTPError:
            print("Invalid taxon")
        except URLError as err:
            print(err.args)
示例#11
0
 def __init__(self):
     """Create the EUtils and Ensembl clients and keep them as private attributes."""
     super(Taxon, self).__init__()
     # self.df = pd.DataFrame(index=[], columns=["Taxon", "Scientific Name"])
     self._eutils_service = EUtils()
     self._ensembl_service = Ensembl()  # there is a search by name, easier to use than EUtils
示例#12
0
class Taxon(object):
    """Utility to search for information related to a taxon

    Uses the EUtils and Ensembl services to fetch information about a taxon.
    ::

        >>> from bioservices.apps.taxonomy import Taxon
        >>> t = Taxon()
        >>> t.search_by_taxon("9606")
        {'Scientific Name': 'Homo sapiens', 'taxon': '9606'}

    You can also pop up the Uniprot page using::

        t.uniprot_onweb("9606")


    A full list of taxons is available here::

        http://www.uniprot.org/taxonomy/?query=*&format=*


    .. versionadded:: 1.2.0
    """
    def __init__(self):
        """Create the EUtils and Ensembl clients used by the other methods."""
        super(Taxon, self).__init__()
        # self.df = pd.DataFrame(index=[], columns=["Taxon", "Scientific Name"])
        self._eutils_service = EUtils()
        self._ensembl_service = Ensembl() # there is a search by name, easier to use than EUtils

    def search_by_name(self, name):
        """using ensembl, tries to get the taxon identifier from the given name

        :param str name: name to search for.
        :return: the ``id`` of the first match, or the first match itself
            when it has no ``id`` entry.

        ::

            >>> s.search_by_name('mouse')
            10090

        """
        # the requested name is queried (it used to be hard-coded to "mouse")
        res = self._ensembl_service.get_taxonomy_name(name)[0]
        try:
            return res['id']
        except (KeyError, TypeError):
            # the first match is not a mapping with an 'id' entry
            return res

    def search_by_taxon(self, taxon):
        """Return the scientific name of a taxon.

        :param str taxon: a string without comma (only one entry accepted).
        :return: dictionary with the keys ``taxon`` and ``Scientific Name``,
            or ``None`` when nothing is found.
        """
        assert isinstance(taxon, str)
        assert "," not in taxon
        ret = self._eutils_service.taxonomy(taxon)
        if ret == "\n":
            # nothing found
            return None
        res = {'taxon': taxon, 'Scientific Name': ret.Taxon[0].ScientificName}
        # self.df.append(res)
        return res

    def info(self, taxon, lineage=False):
        """Prints information about a Taxon

        :param str taxon: taxon identifier
        :param bool lineage: prints lineage if set to True
        """
        ret = self._eutils_service.taxonomy(taxon)
        first = ret.Taxon[0]
        print("Display Name: %s" % first.OtherNames.Name.DispName)
        print("GenBank Common name: %s" % first.OtherNames.GenbankCommonName)
        print("Taxon Id: %s " % first.TaxId)
        if lineage:
            print("Lineage:")
            # each level is indented by one more space than its parent
            for i, x in enumerate(first.Lineage.split(";")):
                print(i*" "+x)

    def uniprot_onweb(self, taxon):
        """Open Uniprot taxonomy page for a given taxon

        The page is requested first; the browser is only opened when that
        request succeeds.

        :param str taxon: taxon identifier
        """
        import webbrowser
        try:
            from urllib.request import urlopen
            from urllib.error import HTTPError, URLError
        except ImportError:
            # Python 2 fallback
            from urllib2 import urlopen, HTTPError, URLError

        url = "http://www.uniprot.org/taxonomy/%s" % taxon
        try:
            # only checks that the page exists; release the connection at once
            urlopen(url).close()
            webbrowser.open(url)
        except HTTPError:
            print("Invalid taxon")
        except URLError as err:
            print(err.args)
示例#13
0
class Taxonomy(object):
    """This class should ease the retrieval and manipulation of Taxons

    There are many resources to retrieve information about a Taxon.
    For instance, from BioServices, one can use UniProt, Ensembl, or
    EUtils. This is convenient to retrieve a Taxon (see :meth:`fetch_by_name`
    and :meth:`fetch_by_id` that rely on Ensembl). However, you can
    also download a flat file from EBI ftp server, which
    stores a set of records (2.8M as of April 2020).

    Note that the Ensembl database does not seem to be as up to date
    as the flat files but entries contain more information.

    For instance taxon 2 is in the flat file but not available through
    the :meth:`fetch_by_id`, which uses ensembl.

    So, you may access a taxon in 2 different ways, getting different
    dictionaries. However, 3 keys are common (id, parent, scientific_name)

    ::

        >>> t = taxonomy.Taxonomy()
        >>> t.fetch_by_id(9606)   # Get a dictionary from Ensembl
        >>> t.records[9606] # or just try with the get
        >>> t[9606]
        >>> t.get_lineage(9606)

    """
    def __init__(self,
                 filename=None,
                 verbose=True,
                 online=True,
                 source="ncbi"):
        """.. rubric:: constructor

        :param filename: taxonomy flat file to use. When None,
            ``taxonomy.dat`` in the sequana configuration path is used.
        :param verbose: animate a progress bar while loading the records.
        :param online: if you do not have internet, the connection to Ensembl
            may hang for a while and fail. If so, set **online** to False
        :param source: download the taxonomy database from 'ncbi' or 'ena'

        """
        assert source in ['ncbi', 'ena']
        self.source = source

        if online:
            from bioservices import Ensembl
            self.ensembl = Ensembl(verbose=False)

        self.records = {}  # empty to start with.
        self.verbose = verbose

        if filename is None:
            self._dbname = "taxonomy.dat"
            self.database = sequana_config_path + os.sep + self._dbname
        else:
            self.database = filename

        self._custom_db = sequana_config_path
        self._custom_db += "/taxonomy/taxonomy_custom.dat"

    def _update_custom_taxonomy_bases(self, taxid):
        """Query EUtils for *taxid* and write the answer to the custom file.

        When the summary entry contains an ``"error"`` key only a message is
        printed; otherwise the custom database file is opened in ``"w"``
        mode and receives the ID, RANK and SCIENTIFIC NAME lines.
        """
        # imported here, as in find_taxon, so that the name is in scope
        from bioservices import EUtils
        taxid = str(taxid)
        self.eutils = EUtils(verbose=False)
        res = self.eutils.taxonomy_summary(taxid)
        if "error" in res[taxid]:
            print("not found in NCBI (EUtils)")
        else:
            print("found in NCBI (EUtils) and added to local databases")
            # the path is stored in _custom_db by the constructor
            with open(self._custom_db, "w") as fout:
                data = res[taxid]
                fout.write("ID : {}\n".format(taxid))
                #fout.write("PARENT ID : {}\n".format(taxid))
                fout.write("RANK : {}\n".format(data['rank']))
                #fout.write("GC ID : {}\n".format(data['']))
                fout.write("SCIENTIFIC NAME : {}\n".format(
                    data['scientificname']))

    def download_taxonomic_file(self, overwrite=False):
        """Download the taxonomy flat file into :attr:`database`

        With the 'ena' source ``taxonomy.dat`` is retrieved as is from the
        EBI ftp. With the 'ncbi' source ``taxdump.tar.gz`` is retrieved in a
        temporary directory and converted with :class:`NCBITaxonomy`.

        :param overwrite: do not overwrite an existing file by default.
        """
        import ftplib
        import shutil
        import tarfile
        import tempfile
        from sequana import sequana_config_path
        if os.path.exists(self.database) and overwrite is False:
            logger.info(
                "Found taxonomy.dat file in sequana your path {}".format(
                    sequana_config_path))
            return
        else:
            logger.info(
                "Downloading and extracting the taxonomy file from the web. Please be patient."
            )

        if self.source == "ena":
            url = 'ftp.ebi.ac.uk'
        else:
            url = 'ftp.ncbi.nlm.nih.gov'

        self.ftp = ftplib.FTP(url)
        try:
            self.ftp.login()
            if self.source == "ena":
                # for the EBI ftp only: self.ftp.cwd('databases')
                self.ftp.cwd('pub')
                self.ftp.cwd('databases')
                self.ftp.cwd('taxonomy')
                logger.warning(
                    'Downloading and saving in %s. This is from ebi and may be behind the NCBI taxonomy'
                    % self.database)
                with open(self.database, 'wb') as fout:
                    self.ftp.retrbinary('RETR taxonomy.dat', fout.write)
            else:
                self.ftp.cwd('pub')
                self.ftp.cwd('taxonomy')
                logger.warning('Downloading and saving in %s from ncbi ftp' %
                               self.database)
                with tempfile.TemporaryDirectory() as tmpdir:
                    filename = os.path.join(tmpdir, "taxdump.tar.gz")
                    # the archive must be closed (flushed) before it is read
                    with open(filename, "wb") as fout:
                        self.ftp.retrbinary('RETR taxdump.tar.gz', fout.write)
                    with tarfile.open(filename) as tf:
                        names = tf.getnames()
                        assert "nodes.dmp" in names
                        assert "names.dmp" in names
                        tf.extract("nodes.dmp", tmpdir)
                        tf.extract("names.dmp", tmpdir)
                    ncbi = NCBITaxonomy(os.path.join(tmpdir, "names.dmp"),
                                        os.path.join(tmpdir, "nodes.dmp"))
                    ncbi.create_taxonomy_file(
                        os.path.join(tmpdir, "taxonomy.dat"))
                    shutil.move(os.path.join(tmpdir, "taxonomy.dat"),
                                self.database)
        finally:
            # close the connection for both sources, also on failure
            self.ftp.close()

    def load_records(self, overwrite=False):
        """Load a flat file and store records in :attr:`records`

        Since version 0.8.3 we use NCBI that is updated more often than the ebi
        ftp according to their README.

        ftp://ncbi.nlm.nih.gov/pub/taxonomy/

        :param overwrite: passed to :meth:`download_taxonomic_file`.
        :raises IOError: if the flat file is still missing after the
            download step.
        """
        self.download_taxonomic_file(overwrite=overwrite)
        self.records = {}

        if os.path.exists(self.database) is False:
            raise IOError("Taxonomy file {} not found".format(self.database))

        with open(self.database) as f:
            data = f.read().strip()

        # This is fast. tried parse package, much slower. cost of progress bar
        # is not important.
        data = data.split("//\n")  # the sep is //\n
        self._child_match = re.compile(r'ID\s+\:\s*(\d+)\s*')
        self._parent_match = re.compile(r'PARENT ID\s+\:\s*(\d+)\s*')
        self._rank_match = re.compile(r'RANK\s+\:\s*([^\n]+)\s*')
        self._name_match = re.compile(r'SCIENTIFIC NAME\s+\:\s*([^\n]+)\s*')

        from easydev import Progress
        pb = Progress(len(data))

        logger.info('Loading all taxon records.')
        for i, record in enumerate(data):
            if not record.strip():
                # blank chunk (e.g. empty file): nothing to parse
                continue
            dd = {'raw': record}
            dd['id'] = int(self._child_match.search(record).group(1))
            dd['parent'] = int(self._parent_match.search(record).group(1))
            dd['scientific_name'] = self._name_match.search(record).group(1)
            dd['rank'] = self._rank_match.search(record).group(1)
            self.records[dd["id"]] = dd
            if self.verbose:
                pb.animate(i + 1)
        if self.verbose:
            print()

    def find_taxon(self, taxid, mode="ncbi"):
        """Look up *taxid* and return the raw answer of the chosen service.

        :param taxid: taxon identifier (converted to a string).
        :param mode: "ncbi" uses ``EUtils.taxonomy_summary``; any other
            value uses ``self.ensembl.get_taxonomy_by_id``.
        """
        taxid = str(taxid)
        if mode == "ncbi":
            from bioservices import EUtils
            self.eutils = EUtils(verbose=False)
            res = self.eutils.taxonomy_summary(taxid)
        else:
            res = self.ensembl.get_taxonomy_by_id(taxid)
        return res

    @load_taxons
    def fetch_by_id(self, taxon):
        """Search for a taxon by identifier

        :return: a dictionary.

        ::

            >>> ret = s.fetch_by_id('10090')
            >>> ret['name']
            'Mus Musculus'

        """
        res = self.ensembl.get_taxonomy_by_id(taxon)
        return res

    @load_taxons
    def fetch_by_name(self, name):
        """Search a taxon by its name.

        :param str name: name of an organism. SQL cards possible e.g.,
            _ and % characters.
        :return: a list of possible matches. Each item being a dictionary.

        ::

            >>> ret = s.fetch_by_name('Mus Musculus')
            >>> ret[0]['id']
            10090

        """
        res = self.ensembl.get_taxonomy_by_name(name)
        return res

    def on_web(self, taxon):
        """Open UniProt page for a given taxon"""
        # Should work for python2 and 3
        import webbrowser
        try:
            from urllib.request import urlopen
            from urllib.error import HTTPError, URLError
        except ImportError:
            from urllib2 import urlopen, HTTPError, URLError

        url = "http://www.uniprot.org/taxonomy/%s" % taxon
        try:
            # only checks that the page exists; release the connection at once
            urlopen(url).close()
            webbrowser.open(url)
        except HTTPError:
            print("Invalid taxon")
        except URLError as err:
            print(err.args)

    @load_taxons
    def get_lineage(self, taxon):
        """Get lineage of a taxon

        :param int taxon: a known taxon
        :return: list containing the lineage

        """
        # important to reinit the second argument to []
        taxon = int(taxon)
        lineage = self._gen_lineage_and_rank(taxon, [])
        lineage = [x[0] for x in lineage]
        return lineage

    @load_taxons
    def _gen_lineage_and_rank(self, taxon, lineage_rank=None):
        """Walk from *taxon* up to taxon 1, collecting (name, rank) pairs.

        :param taxon: key of :attr:`records` to start from.
        :param lineage_rank: list the pairs are appended to; a new list is
            used when None.
        :return: the pairs ordered from taxon 1 down to *taxon*, or a single
            ``('unknown_taxon:<id>', 'no rank')`` pair as soon as a taxon is
            missing from :attr:`records`.
        """
        if lineage_rank is None:
            # a fresh list per call; a shared default list would keep the
            # pairs of previous calls
            lineage_rank = []

        while True:
            try:
                record = self.records[taxon]
            except KeyError:
                return [('unknown_taxon:{}'.format(taxon), 'no rank')]

            parent = int(record['parent'])
            lineage_rank.append((record['scientific_name'], record['rank']))
            if taxon == 1:
                # taxon 1 ends the walk; put it first
                lineage_rank.reverse()
                return lineage_rank
            taxon = parent

    @load_taxons
    def get_parent_taxon(self, taxon):
        """Return the parent identifier stored in the record of *taxon*."""
        return self.records[taxon]['parent']

    @load_taxons
    def get_parent_name(self, taxon):
        """Return the scientific name of the parent of *taxon*."""
        taxid = self.get_parent_taxon(taxon)
        return self.records[taxid]['scientific_name']

    @load_taxons
    def get_lineage_and_rank(self, taxon):
        """Get lineage and rank of a taxon

        :param int taxon:
        :return: a list of tuples. Each tuple is a pair of taxon name/rank
            The list is the lineage for the input taxon.

        """
        taxon = int(taxon)
        lineage = self._gen_lineage_and_rank(taxon, [])
        return lineage

    @load_taxons
    def get_ranks(self):
        """Return a Counter of the ranks found in :attr:`records`."""
        return Counter([x['rank'] for x in self.records.values()])

    @load_taxons
    def get_record_for_given_rank(self, rank):
        """Return the records whose rank equals *rank*."""
        return [x for x in self.records.values() if x['rank'] == rank]

    @load_taxons
    def get_names_for_given_rank(self, rank):
        """Return the scientific names of the records whose rank is *rank*."""
        data = [x for x in self.records.values() if x['rank'] == rank]
        return [x['scientific_name'] for x in data]

    @load_taxons
    def get_children(self, taxon):
        """Return the identifiers of the records whose parent is *taxon*."""
        # load_records stores 'parent' as an int, so compare with an int
        taxon = int(taxon)
        return [
            record['id'] for record in self.records.values()
            if record['parent'] == taxon
        ]

    @load_taxons
    def get_family_tree(self, taxon):
        """root is taxon and we return the corresponding tree"""
        # should limit the tree size
        # uniprot flat files has no record about children, so we would
        # need to reconstruct the tree
        tree = {}
        children = self.get_children(taxon)
        if len(children) == 0:
            return tree
        else:
            return [self.get_family_tree(child) for child in children]

    @load_taxons
    def __getitem__(self, iden):
        """Return the record stored under *iden*."""
        return self.records[iden]

    @load_taxons
    def __len__(self):
        """Return the number of records.

        This used to be a second ``__getitem__`` that replaced the one above.
        """
        return len(self.records)

    def append_existing_database(self, filename):
        """Append the records of *filename* that are missing from the database

        Taxonomy DB looks like::

            ID                        : 2731450
            PARENT ID                 : 1914233
            RANK                      : genus
            SCIENTIFIC NAME           : Limnoglobus
            //


            a = NCBITaxonomy("names.dmp", "nodes.dmp")
            a.create_taxonomy_file("taxonomy.dat")
            tax = Taxonomy()
            tax.append_existing_database("taxonomy.dat")
        """
        tax = Taxonomy(filename)
        tax.load_records()
        self.load_records()
        toadd = [record for record in tax.records if record not in self.records]

        with open(self.database, "a") as fout:
            for record in toadd:
                fout.write(tax.records[record]['raw'] + "//\n")
示例#14
0
class KrakenBuilderBase():
    """Helpers to download NCBI files for a database named :attr:`dbname`."""

    # Category names accepted by the download methods. 'mitochondrion' used
    # to be misspelt 'mitonchondrion', which is not an NCBI directory name.
    _CATEGORIES = (
        'archaea', 'bacteria', 'fungi', 'invertebrate', 'mitochondrion',
        'other', 'plant', 'plasmid', 'plastid', 'protozoa',
        'vertebrate_mammalian', 'vertebrate_other', 'viral'
    )

    def __init__(self, dbname):
        """Store the database name and create the download helpers.

        :param dbname: name of the database; used as the directory prefix
            by :meth:`download_accession_from_ncbi`.
        """
        from bioservices import EUtils
        self.dbname = dbname
        self.eutils = EUtils()
        self.enadb = ENADownload()

        self.category = list(self._CATEGORIES)

    def _retrieve_genomic_fna(self, category, password, report):
        """Download each ``genomic.fna`` file of ``refseq/release/<category>``.

        Files are written in the current directory; *report* is called with
        each file name once the file is complete.
        """
        import ftplib
        ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov")
        try:
            ftp.login("anonymous", password)
            ftp.cwd("refseq/release/{}".format(category))
            for filename in ftp.nlst():
                if "genomic.fna" in filename:
                    with open(filename, "wb") as fout:
                        ftp.retrbinary('RETR ' + filename, fout.write)
                    report(filename)
        finally:
            ftp.close()

    def download_ncbi_refseq(self, category):
        """Download all files of type *fna* from ncbi FTP.

        The name of each downloaded file is printed.

        ::

            kb = KrakenBuilder()
            kb.download_ncbi_refseq("viral")

        """
        assert category in self.category, "Please use one of {}".format(
            self.category)
        self._retrieve_genomic_fna(category, "*****@*****.**", print)

    def download_genomes_from_ncbi(self, email, category):
        """This downloads all genomes on ncbi for a given category looking at
        their ftp. This could be highly redundant.

        :param email: password sent with the anonymous FTP login.
        :param category: one of :attr:`category`.
        """
        assert category in self.category
        self._retrieve_genomic_fna(category, email, logger.info)

    def _download_assembly_report(self, category):
        """Save ``assembly_summary.txt`` of *category* in the current directory.

        The local file is named ``assembly_summary_<category>.txt``.
        """
        import ftplib
        assert category in self.category

        filename = "assembly_summary.txt"
        outname = filename.replace(".txt", "_{}.txt".format(category))

        ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov")
        try:
            ftp.login("anonymous", "anonymous")
            ftp.cwd("genomes/refseq/{}".format(category))
            with open(outname, "wb") as fout:
                ftp.retrbinary('RETR ' + filename, fout.write)
        finally:
            ftp.close()
        logger.info(filename)

    @staticmethod
    def _resolve_accessions(accession):
        """Return the list of accessions described by *accession*.

        *accession* can be a list, a unique string, or the name of an
        existing file whose whitespace-separated content lists the
        accessions.
        """
        import os
        if isinstance(accession, list):
            return list(accession)
        if isinstance(accession, str):
            if os.path.exists(accession):
                with open(accession, "r") as fin:
                    return fin.read().split()
            return [accession]
        raise TypeError(
            "accession must be a list, an accession or a filename; "
            "got {!r}".format(accession))

    def download_accession_from_ncbi(self, accession):
        """Fetch the FASTA record of each accession into ``<dbname>/library``.

        :param accession: a list of accessions, a unique accession, or a
            filename with 1 column of accessions to retrieve.
        :raises TypeError: if *accession* is neither a list nor a string.

        Accessions for which EFetch returns an integer are reported and
        skipped.
        """
        accessions = self._resolve_accessions(accession)
        if not accessions:
            # nothing to fetch
            return

        from easydev import Progress
        N = len(accessions)
        pb = Progress(N)
        logger.info("Fetching {} accession fasta files from NCBI".format(N))
        for i, acc in enumerate(accessions):
            data = self.eutils.EFetch("nucleotide",
                                      rettype="fasta",
                                      id=acc,
                                      retmode="text")
            if isinstance(data, int):
                logger.info(
                    "Could not fetch this accession: {}. continue".format(
                        acc))
                print("Could not fetch this accession: {}. continue".format(
                    acc))
            else:
                outname = "{}/library/{}.fa".format(self.dbname, acc)
                # EFetch may hand back bytes or str; pick the matching mode
                mode = "wb" if isinstance(data, bytes) else "w"
                with open(outname, mode) as fout:
                    fout.write(data)
            pb.animate(i + 1)
示例#15
0
# Accessions whose Swiss-Prot record is read below (fetched through
# ExPASy.get_sprot_raw); the value extracted from each record is collected
# in `records`.
accessions = ["O23729", "O23730", "O23731"]
records = []

for accession in accessions:
    handle = ExPASy.get_sprot_raw(accession)
    record = SwissProt.read(handle)
    # second ";"-separated field of the description, text after its "="
    # NOTE(review): presumably the EC number, given the variable name -- confirm
    EC = record.description.split(";")[1].split("=")[1]
    records.append(EC)
    
"""
"""
import mygene

# fetch the record of gene id 100759423 through mygene
mg = mygene.MyGeneInfo()
g = mg.getgene(100759423)
"""

#https://www.biostars.org/p/104733/
"""
# works, but very slowly...
kegg.find("genes", '100759423')
#Out[386]: 'cge:100759423\tK18311 N-acetylaspartylglutamate/N-acetylaspartylglutamylglutamate synthase [EC:6.3.2.41 6.3.2.42] | (RefSeq) Rimkla; ribosomal modification protein rimK like family member A\n'
"""

#http://nbviewer.jupyter.org/url/pythonhosted.org//bioservices/_downloads/Entrez_EUtils.ipynb
from bioservices import EUtils
e = EUtils()
# database name and identifier passed to EPost below
db = 'gene'
id_list = '100759423'
# post the identifier to the 'gene' database; the answer is kept in `results`
results = e.EPost(db, id_list)
示例#16
0
        # we found enough lines, get out
        # Removed this line because it was redundant the while will catch
        # it, I left it for history
        # if len(lines_found) > lines:
        #    break

        # decrement the block counter to get the
        # next X bytes
        block_counter -= 1

    return lines_found[-lines:]


# Link to NCBI EServices

s = EUtils()
# N equals the number of labels in my_xticks and the length of theta (21)
N = 21
bottom = 1
max_height = 4
theta = range(0, 21)  #np.linspace(0.0, 2 * np.pi, N, endpoint=False)
# 21 tick labels: 'BB' followed by 20 one-letter codes
# NOTE(review): presumably the standard amino-acid codes -- confirm
my_xticks = [
    'BB', 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
    'R', 'S', 'T', 'V', 'W', 'Y'
]

for f in os.listdir("."):
    fname = f.split(".")

    # plot SASA from freesasa calculation

    if ".sasa" in f:
示例#17
0
 def __init__(self):
     """Create the EUtils and Ensembl clients and keep them as private attributes."""
     super(Taxon, self).__init__()
     # self.df = pd.DataFrame(index=[], columns=["Taxon", "Scientific Name"])
     self._eutils_service = EUtils()
     self._ensembl_service = Ensembl() # there is a search by name, easier to use than EUtils