Пример #1
0
    def __init__(self, verbose=True):
        """.. rubric:: Constructor

        Creates the REST client stored in :attr:`services`, sets the registry
        placeholder to None, tries to attach a non-verbose UniProt helper and
        creates an empty :attr:`buffer` dictionary.

        :param bool verbose: print informative messages

        If the UniProt helper cannot be created, a warning is logged and
        :attr:`uniprot` is set to None instead of being left undefined.

        .. doctest::

            >>> from bioservices import PSICQUIC
            >>> s = PSICQUIC()

        """
        self.services = REST(
            "PSICQUIC",
            verbose=verbose,
            url="https://www.ebi.ac.uk/Tools/webservices/psicquic",
            url_defined_later=True)  # this prevents an annoying warning

        self._registry = None

        try:
            self.uniprot = UniProt(verbose=False)
        except Exception:
            # Best effort: PSICQUIC stays usable without UniProt. Catching
            # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
            # propagate, and the attribute is always defined afterwards.
            self.uniprot = None
            self.services.logging.warning(
                "UniProt service could not be initialised")
        self.buffer = {}
Пример #2
0
    def __init__(self, verbose=False, cache=False):
        """Create the HGNC REST client and record the fields reported by the service.

        :param bool verbose: forwarded to the REST client
        :param bool cache: forwarded to the REST client
        """
        self.services = REST("HGNC", url="http://rest.genenames.org/",
                             verbose=verbose, cache=cache)

        # get_info() is called once; its result is stored and the two field
        # lists are read from it.
        info = self.get_info()
        self._info = info
        self.searchable_fields = info['searchableFields']
        self.stored_fields = info['storedFields']
Пример #3
0
 def _einfo_rest(self, db=None, **kargs):
     """Call ``einfo.fcgi`` through a REST client and wrap the answer with ``easyXML``.

     :param db: value interpolated into the ``db=`` query argument
     :param kargs: optional ``tool`` and ``email`` entries; when absent the
         instance attributes ``tool`` and ``email`` are sent instead
     :return: the result of ``self.easyXML`` applied to the HTTP response
     """
     client = REST("test", "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")
     query = "einfo.fcgi?db=%s" % db
     identification = {
         'tool': kargs.get('tool', self.tool),
         'email': kargs.get('email', self.email),
     }
     response = client.http_get(query, frmt="xml", params=identification)
     return self.easyXML(response)
Пример #4
0
 def _einfo_rest(self, db=None, **kargs):
     """Fetch ``einfo.fcgi`` for one database over REST and return it as ``easyXML``.

     :param db: value interpolated into the ``db=`` query argument
     :param kargs: may carry ``tool`` and/or ``email``; the instance
         attributes of the same name are used as fallbacks
     :return: the result of ``self.easyXML`` applied to the HTTP response
     """
     rest_client = REST("test",
                        "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")
     request = "einfo.fcgi?db=%s" % db
     tool = kargs.get('tool', self.tool)
     email = kargs.get('email', self.email)
     raw = rest_client.http_get(request, frmt="xml",
                                params={'tool': tool, 'email': email})
     return self.easyXML(raw)
Пример #5
0
    def __init__(self, verbose=False, email="unknown"):
        """Initialise the WSDL service and the extra REST client used for EFetch.

        :param bool verbose: forwarded to the parent constructor
        :param str email: stored in :attr:`email`. When left to ``"unknown"``,
            the ``user.email`` entry of the settings is tried; if that entry
            is ``"unknown"`` as well, a warning is logged.
        """
        # Endpoint to use according to
        # http://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.chapter2_table1
        # (disabled alternative:
        #  http://www.ncbi.nlm.nih.gov/entrez/eutils/soap/v2.0/eutils.wsdl?)
        super(EUtils, self).__init__(
            name="EUtils",
            verbose=verbose,
            url="http://eutils.ncbi.nlm.nih.gov/entrez/eutils/soap/v2.0/eutils.wsdl?")

        email_notice = """

        NCBI recommends that users post no more than three URL requests per second.
        Failure to comply with this policy may result in an IP address being blocked
        from accessing NCBI. If NCBI blocks an IP address, service will not be
        restored unless the developers of the software accessing the E-utilities
        register values of the tool and email parameters with NCBI. The value of
        email will be used only to contact developers if NCBI observes requests
        that violate our policies, and we will attempt such contact prior to blocking
        access.  For more details see http://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.chapter2_table1

        BioServices does not check if you send more than 3 requests per seconds.
        This is considered to be the user responsability. Within BioServices, we
        fill the parameter **tool** and **email**, however, to fill the later
        you should provide your email either globablly when instanciating EUtils,
        or locally when calling a method.

        This message will not appear if you set the email as a parameter::

            e = EUtils(email="name@adress")

        or in you bioservices configuration file (.config/bioservices/bioservices.cfg)
        under linux with a user section::

            [user]
            email = yourname@somewhere


        """
        # A REST client complements the WSDL protocol for the EFetch method:
        # the WSDL class for EFetch is (i) limited because its documentation
        # could not be found and (ii) needs one instantiation per database,
        # whereas a single REST client is enough.
        self._efetch = REST(
            "Efetch", "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")

        self._databases = None
        self.tool = "bioservices"
        self.email = email
        if self.email != "unknown":
            return

        # No explicit email: fall back on the bioservices configuration file.
        configured = self.settings.params['user.email'][0]
        if configured != "unknown":
            self.email = configured
        else:
            self.logging.warning(email_notice)
Пример #6
0
    def __init__(self, verbose=False, email="unknown"):
        """Set up the EUtils WSDL service together with a REST client for EFetch.

        :param bool verbose: passed on to the parent constructor
        :param str email: kept in :attr:`email`. With the default
            ``"unknown"``, the ``user.email`` settings entry is consulted and
            a warning is logged when that entry is ``"unknown"`` too.
        """
        # URL that should be used according to
        # http://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.chapter2_table1
        # (disabled alternative:
        #  http://www.ncbi.nlm.nih.gov/entrez/eutils/soap/v2.0/eutils.wsdl?)
        wsdl_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/soap/v2.0/eutils.wsdl?"
        super(EUtils, self).__init__(name="EUtils", verbose=verbose,
                                     url=wsdl_url)

        policy_notice = """

        NCBI recommends that users post no more than three URL requests per second.
        Failure to comply with this policy may result in an IP address being blocked
        from accessing NCBI. If NCBI blocks an IP address, service will not be
        restored unless the developers of the software accessing the E-utilities
        register values of the tool and email parameters with NCBI. The value of
        email will be used only to contact developers if NCBI observes requests
        that violate our policies, and we will attempt such contact prior to blocking
        access.  For more details see http://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.chapter2_table1

        BioServices does not check if you send more than 3 requests per seconds.
        This is considered to be the user responsability. Within BioServices, we
        fill the parameter **tool** and **email**, however, to fill the later
        you should provide your email either globablly when instanciating EUtils,
        or locally when calling a method.

        This message will not appear if you set the email as a parameter::

            e = EUtils(email="name@adress")

        or in you bioservices configuration file (.config/bioservices/bioservices.cfg)
        under linux with a user section::

            [user]
            email = yourname@somewhere


        """
        # EFetch goes through REST in addition to WSDL: the WSDL class for
        # EFetch is limited (no documentation could be found) and requires an
        # instantiation for each database, while REST needs only one client.
        rest_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        self._efetch = REST("Efetch", rest_url)

        self._databases = None
        self.tool = "bioservices"
        self.email = email

        if self.email == "unknown":
            # Try the bioservices configuration file before warning the user.
            from_config = self.settings.params['user.email'][0]
            if from_config == "unknown":
                self.logging.warning(policy_notice)
            else:
                self.email = from_config
Пример #7
0
class EUtils(WSDLService):
    """Interface to `NCBI Entrez Utilities <http://www.ncbi.nlm.nih.gov/entrez/query/static/esoap_help.html>`_ service

    The EUtils class has a method called EFetch so this is actually covering
    all Entrez functionalities.

    Note that we use the WSDL protocol for all EUtils but we had to use the REST
    service in a few cases.

    .. warning:: Read the `guidelines
        <http://www.ncbi.nlm.nih.gov/books/NBK25497/>`_ before sending requests.
        No more than 3 requests per second otherwise your IP may be banned.
        You should provide your email by filling the :attr:`email` so that
        before being banned, you may be contacted.

    Here is an example on how to use the :meth:`EFetch` method to retrieve the
    FASTA sequence of a given identifier (34577063)::

        >>> from bioservices import EUtils
        >>> s = EUtils()
        >>> print(s.EFetch("sequences", "34577063", rettype="fasta"))
        >gi|34577063|ref|NP_001117.2| adenylosuccinate synthetase isozyme 2 [Homo sapiens]
        MAFAETYPAASSLPNGDCGRPRARPGGNRVTVVLGAQWGDEGKGKVVDLLAQDADIVCRCQGGNNAGHTV
        VVDSVEYDFHLLPSGIINPNVTAFIGNGVVIHLPGLFEEAEKNVQKGKGLEGWEKRLIISDRAHIVFDFH
        QAADGIQEQQRQEQAGKNLGTTKKGIGPVYSSKAARSGLRMCDLVSDFDGFSERFKVLANQYKSIYPTLE
        IDIEGELQKLKGYMEKIKPMVRDGVYFLYEALHGPPKKILVEGANAALLDIDFGTYPFVTSSNCTVGGVC
        TGLGMPPQNVGEVYGVVKAYTTRVGIGAFPTEQDNEIGELLQTRGREFGVTTGRKRRCGWLDLVLLKYAH
        MINGFTALALTKLDILDMFTEIKVGVAYKLDGEIIPHIPANQEVLNKVEVQYKTLPGWNTDISNARAFKE
        LPVNAQNYVRFIEDELQIPVKWIGVGKSRESMIQLF

    Most of the methods take a database name as input. You can obtain the
    valid list by checking the :attr:`databases` attribute.

    A few functions take identifier(s) as input. It could be a list of strings,
    a list of numbers, or a string where identifiers are separated either by
    commas or spaces.

    A few functions take an argument called **term**. You can use the **AND**
    keyword with spaces or + signs as separators::

        Correct:   term=biomol mrna[properties] AND mouse[organism]
        Correct:   term=biomol+mrna[properties]+AND+mouse[organism]

    Other special characters, such as quotation marks (") or the # symbol used
    in referring to a query key on the History server, could be represented by
    their URL encodings (%22 for "; %23 for #) or verbatim::

        Correct: term=#2+AND+"gene in genomic"[properties]
        Correct: term=%232+AND+%22gene+in+genomic%22[properties]

    .. note:: most of the parameter names are identical to the expected names
        except for **id**, which has been replaced by **sid**.

    """
    def __init__(self, verbose=False, email="unknown"):
        """Initialise the WSDL service and the extra REST client used by EFetch.

        :param bool verbose: forwarded to the parent constructor
        :param str email: stored in :attr:`email`. When left to ``"unknown"``,
            the ``user.email`` entry of the settings is tried; if that entry
            is ``"unknown"`` as well, a warning is logged.
        """
        # according to http://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.chapter2_table1
        # this url should be used (disabled alternative:
        # http://www.ncbi.nlm.nih.gov/entrez/eutils/soap/v2.0/eutils.wsdl?)
        url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/soap/v2.0/eutils.wsdl?"
        super(EUtils, self).__init__(name="EUtils", verbose=verbose, url=url)

        warning = """

        NCBI recommends that users post no more than three URL requests per second.
        Failure to comply with this policy may result in an IP address being blocked
        from accessing NCBI. If NCBI blocks an IP address, service will not be
        restored unless the developers of the software accessing the E-utilities
        register values of the tool and email parameters with NCBI. The value of
        email will be used only to contact developers if NCBI observes requests
        that violate our policies, and we will attempt such contact prior to blocking
        access.  For more details see http://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.chapter2_table1

        BioServices does not check if you send more than 3 requests per seconds.
        This is considered to be the user responsability. Within BioServices, we
        fill the parameter **tool** and **email**, however, to fill the later
        you should provide your email either globablly when instanciating EUtils,
        or locally when calling a method.

        This message will not appear if you set the email as a parameter::

            e = EUtils(email="name@adress")

        or in you bioservices configuration file (.config/bioservices/bioservices.cfg)
        under linux with a user section::

            [user]
            email = yourname@somewhere


        """
        # On top of the WSDL protocol we also need a REST client for the
        # EFetch method. Although there is a WSDL class for EFetch, it is
        # (i) limited because its documentation could not be found and
        # (ii) requires an instantiation for each database, whereas with
        # REST we can do it just once.
        self._efetch = REST("Efetch", "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")

        self._databases = None
        self.tool = "bioservices"
        self.email = email
        if self.email == "unknown":
            # trying the bioservices config file
            config_email = self.settings.params['user.email'][0]
            if config_email != "unknown":
                self.email = config_email
            else:
                self.logging.warning(warning)

    def _get_databases(self):
        """Return the sorted database names, fetched once through run_eInfo."""
        if self._databases is None:
            # DbData changed into DbList in rev 1.3.0
            self._databases = sorted(self.serv.run_eInfo().DbList.DbName)
        return self._databases
    databases = property(_get_databases, doc="Returns list of valid databases")

    def _check_db(self, db):
        """Raise ValueError if *db* is not one of :attr:`databases`."""
        if db not in self.databases:
            # A single formatted message: passing the list as a second
            # exception argument made str(error) render as a tuple.
            raise ValueError(
                "You must provide a valid database from: %s" % self.databases)

    def _check_retmode(self, retmode):
        """Raise ValueError unless *retmode* is ``"xml"`` or ``"text"``."""
        if retmode not in ["xml", "text"]:
            raise ValueError("You must provide a retmode in 'xml', 'text'")

    def get_einfo_params(self, **kargs):
        """Return the ``nsei:eInfoRequest`` structure built by the WSDL factory."""
        return self.wsdl_create_factory("nsei:eInfoRequest", **kargs)

    def get_esummary_params(self, **kargs):
        """Return the ``nsesu:eSummaryRequest`` structure built by the WSDL factory."""
        return self.wsdl_create_factory("nsesu:eSummaryRequest", **kargs)

    def get_esearch_params(self, **kargs):
        """Return the ``nsese:eSearchRequest`` structure built by the WSDL factory."""
        return self.wsdl_create_factory("nsese:eSearchRequest", **kargs)

    def get_egquery_params(self, **kargs):
        """Return the ``nseg:eGqueryRequest`` structure built by the WSDL factory."""
        return self.wsdl_create_factory("nseg:eGqueryRequest", **kargs)

    def get_espell_params(self, **kargs):
        """Return the ``nsesp:eSpellRequest`` structure built by the WSDL factory."""
        return self.wsdl_create_factory("nsesp:eSpellRequest", **kargs)

    def get_elink_params(self, **kargs):
        """Return the ``nsel:eLinkRequest`` structure built by the WSDL factory."""
        return self.wsdl_create_factory("nsel:eLinkRequest", **kargs)

    def get_epost_params(self, **kargs):
        """Return the ``nseps:ePostRequest`` structure built by the WSDL factory."""
        return self.wsdl_create_factory("nseps:ePostRequest", **kargs)

    def _check_ids(self, sid):
        """Normalise identifiers into a comma-separated string.

        :param sid: a single integer, a list or tuple of integers/strings, or
            a string whose identifiers are separated by commas and/or spaces
        :return: the identifiers joined by commas, without surrounding spaces
            and without empty items
        :raises ValueError: if more than 200 identifiers are provided
        """
        if isinstance(sid, int):
            sid = [sid]
        if isinstance(sid, (list, tuple)):
            sid = ",".join([str(x) for x in sid])

        # Identifiers may be separated by commas and/or white spaces (as
        # stated in the class documentation); empty items are dropped.
        ids = sid.replace(",", " ").split()

        if len(ids) > 200:
            raise ValueError("Number of comma separated IDs must not exceed 200")
        return ",".join(ids)

    def taxonomy(self, sid, raw=False):
        """Alias to EFetch for the taxonomy database using WSDL

        :param sid: identifier(s), normalised with :meth:`_check_ids`
        :param bool raw: if True, return the full answer instead of its
            ``TaxaSet`` attribute

        ::

            >>> s = EUtils()
            >>> ret = s.taxonomy("9606")
            >>> ret.Taxon.TaxId
            '9606'
            >>> ret.Taxon.ScientificName
            'Homo sapiens'
            >>> ret = s.taxonomy("9606,9605,111111111,9604")
            >>> ret.Taxon[2].TaxId
            '9604'


        """
        sid = self._check_ids(sid)
        # EFetch here is the module-level name, not the EFetch method.
        serv = EFetch("taxon")
        ret = serv.efetch(sid)
        if raw:
            return ret
        return ret.TaxaSet

    def snp(self, sid):
        """Alias to EFetch for the SNP database using WSDL

        :param sid: identifier(s), normalised with :meth:`_check_ids` in the
            same way as :meth:`taxonomy` does

        ::

            >>> s.snp("123")

        """
        sid = self._check_ids(sid)
        # EFetch here is the module-level name, not the EFetch method.
        serv = EFetch("snp")
        ret = serv.efetch(sid)
        return ret

    def EFetch(self, db, sid=None, retmode="text", **kargs):
        """Access to the EFetch E-Utilities

        :param str db: Database from which to retrieve UIDs. The value must be a valid Entrez database
            name. This is the destination database for the link operation.
        :param str sid: UID list. Either a single UID or a comma-delimited list of UIDs may be provided.
            All of the UIDs must be from the database specified by db. Limited
            to 200 sid

        :param retmode: default to text (could be xml but not recommended).
        :param rettype: could be fasta, summary
        :param strand: optional, must be 1 or 2
        :param complexity: optional, must be one of 0, 1, 2, 3, 4
        :param kargs: **retmax**, **seq_start**, **seq_stop**, **query_key**
            and **WebEnv** are forwarded when they have a non-empty value.
        :raises ValueError: if retmode or complexity is invalid

        ::

            >>> ret = s.EFetch("omim", "269840")  --> ZAP70
            >>> ret = s.EFetch("taxonomy", "9606", retmode="xml")
            >>> [x.text for x in ret.getchildren()[0].getchildren() if x.tag=="ScientificName"]
            ['Homo sapiens']

            >>> s = eutils.EUtils()
            >>> s.EFetch("sequences", "34577063", retmode="text", rettype="fasta")
            >gi|34577063|ref|NP_001117.2| adenylosuccinate synthetase isozyme 2 [Homo sapiens]
            MAFAETYPAASSLPNGDCGRPRARPGGNRVTVVLGAQWGDEGKGKVVDLLAQDADIVCRCQGGNNAGHTV
            VVDSVEYDFHLLPSGIINPNVTAFIGNGVVIHLPGLFEEAEKNVQKGKGLEGWEKRLIISDRAHIVFDFH
            QAADGIQEQQRQEQAGKNLGTTKKGIGPVYSSKAARSGLRMCDLVSDFDGFSERFKVLANQYKSIYPTLE
            IDIEGELQKLKGYMEKIKPMVRDGVYFLYEALHGPPKKILVEGANAALLDIDFGTYPFVTSSNCTVGGVC
            TGLGMPPQNVGEVYGVVKAYTTRVGIGAFPTEQDNEIGELLQTRGREFGVTTGRKRRCGWLDLVLLKYAH
            MINGFTALALTKLDILDMFTEIKVGVAYKLDGEIIPHIPANQEVLNKVEVQYKTLPGWNTDISNARAFKE
            LPVNAQNYVRFIEDELQIPVKWIGVGKSRESMIQLF


        Identifiers could be provided as a single string with comma-separated
        values, or a list of strings, a list of integers, or just one
        string or one integer but no mixing of types in the list::

            >>> e.EFetch("sequences", "352, 234", retmode="text", rettype="fasta")
            >>> e.EFetch("sequences", 352, retmode="text", rettype="fasta")
            >>> e.EFetch("sequences", [352], retmode="text", rettype="fasta")
            >>> e.EFetch("sequences", [352, 234], retmode="text", rettype="fasta")


        **retmode** should be xml or text depending on the database. For instance, xml for
        pubmed::

            >>> e.EFetch("pubmed", "20210808", retmode="xml")
            >>> e.EFetch('nucleotide', sid=15, retmode='xml')
            >>> e.EFetch('nucleotide', sid=15, retmode='xml', rettype='fasta')
            >>> e.EFetch('nucleotide', 'NT_019265', rettype='gb')

        The XML answer can be wrapped in a parser::

            >>> eutils.EUtilsParser(e.EFetch("taxonomy", "9685", retmode="xml"))

        .. todo:: more documentation and optional arguments

        Other special characters, such as quotation marks (") or the # symbol
        used in referring to a query key on the History server, should be
        represented by their URL encodings (%22 for "; %23 for #).
        """
        #self._check_db(db)
        self._check_retmode(retmode)
        if sid is not None:
            sid = self._check_ids(sid)

        params = {'db': db, 'id': sid, 'retmode': retmode, 'tool': self.tool,
                  'email': self.email}
        if kargs.get("strand"):
            strand = kargs.get("strand")
            self.devtools.check_param_in_list(strand, [1, 2])
            params['strand'] = strand

        # Compare with None rather than relying on truthiness: 0 is a valid
        # complexity and used to be silently dropped.
        complexity = kargs.get("complexity")
        if complexity is not None:
            if complexity in [0, 1, 2, 3, 4]:
                params['complexity'] = complexity
            else:
                raise ValueError("invalid complexity. must be a number in 0,1,2,3,4")

        # Optional pass-through parameters; missing or empty values are skipped.
        for param in ['retmax', 'seq_start', "seq_stop", "rettype", "query_key", "WebEnv"]:
            if kargs.get(param):
                params[param] = kargs.get(param)

        if retmode == "xml":
            ret = self._efetch.http_get("efetch.fcgi", 'xml', params=params)
            ret = self.easyXML(ret)
        else:
            ret = self._efetch.http_get("efetch.fcgi", 'txt', params=params)

        return ret

    def EInfo(self, db=None, **kargs):
        """Provides the number of records indexed in each field of a given
        database, the date of the last update of the database, and the available links
        from the database to other Entrez databases.

        :param str db: target database about which to gather statistics. Value must be a
            valid Entrez database name. See :attr:`databases` or don't provide
            any value to obtain the entire list
        :return: either a list of databases, or an ``EUtilsParser`` object
            wrapping the information about the requested database
        :raises ValueError: if db is provided but is not a valid database

        ::

            >>> all_database_names = s.EInfo()
            >>> # specific info about one database:
            >>> ret = s.EInfo("taxonomy")
            >>> ret.Count
            >>> ret.Name
            >>> ret = s.EInfo('pubmed')
            >>> ret.FieldList[2].FullName
            'Filter'

        """
        if db is None:
            return self.databases
        self._check_db(db)

        # WSDL does not work, let us use rest instead.
        ret = self._einfo_rest(db, **kargs)
        ret = EUtilsParser(ret)
        return ret

    def _einfo_rest(self, db=None, **kargs):
        """Call ``einfo.fcgi`` over REST and return the answer wrapped by ``easyXML``.

        :param db: value interpolated into the ``db=`` query argument
        :param kargs: optional ``tool`` and ``email`` entries overriding the
            instance attributes of the same name
        """
        s = REST("test", "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")
        ret = s.http_get("einfo.fcgi?db=%s" % db, frmt="xml",
                         params={'tool': kargs.get('tool', self.tool),
                                 'email': kargs.get('email', self.email)
                                 })
        ret = self.easyXML(ret)
        return ret

    # The WSDL variant below does not work (issue with DbBuil), hence the
    # REST version above. Kept for reference only:
    #
    # ret = self._einfo_wsdl(db, **kargs)
    # def _einfo_wsdl(self, db=None, **kargs):
    #     params = self.suds.factory.create("nsei:eInfoRequest", **kargs)
    #     params.db = db
    #     params.tool = self.tool[:]
    #     params.email = self.email[:]
    #     return self.serv.run_eInfo(db, params)

    def ESummary(self, db, sid=None,  **kargs):
        """Returns document summaries for a list of input UIDs

        :param str db: a valid database name. If None, the list of valid
            databases is returned instead of a summary.
        :param str sid: list of identifiers (or string comma separated).
            all of the UIDs must be from the database specified by db. Limited
            to 200 sid
        :raises ValueError: if db is not a valid database or if there are too
            many identifiers

        ::

            >>> from bioservices import *
            >>> s = EUtils()
            >>> ret = s.ESummary("snp","7535")
            >>> ret = s.ESummary("snp","7535,7530")
            >>> ret = s.ESummary("taxonomy", "9606,9913")

        ::

            >>> proteins = e.ESearch("protein", "bacteriorhodopsin", RetMax=20,)
            >>> ret = e.ESummary("protein", proteins.IdList.Id[0])
            >>> ret.DocSum[0].Item[2]
            (ItemType){
               _Type = "String"
               _Name = "Extra"
               ItemContent = "gi|6320236|ref|NP_010316.1|[6320236]"
            }


        """
        if sid is not None:
            sid = self._check_ids(sid)

        if db is None:
            return self.databases
        self._check_db(db)

        params = self.get_esummary_params(**kargs)
        params.db = db
        params.id = sid
        ret = self.serv.run_eSummary(**dict(params))
        return ret

    def _esummary_rest(self, db, sid):
        """Call ``esummary.fcgi`` over REST and return the answer wrapped by ``easyXML``.

        :param db: value interpolated into the ``db=`` query argument
        :param sid: value interpolated into the ``id=`` query argument
        """
        # Hint kept from the original author (presumably a way to list the
        # name/text pairs of the answer):
        # [(x.attrib['Name'], x.text) for x in ret.getchildren()[0].getchildren()[1:]]
        s = REST("test", "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")
        ret = s.http_get("esummary.fcgi?db=%s&id=%s" % (db, sid), None)
        ret = self.easyXML(ret)
        return ret

    def EGQuery(self, term, **kargs):
        """Provides the number of records retrieved in all Entrez databases by a text query.

        :param str term: Entrez text query. All special characters must be URL
            encoded. Spaces may be replaced by '+' signs. For very long queries (more than
            several hundred characters long), consider using an HTTP POST call. See the
            PubMed or Entrez help for information about search field descriptions and tags.
            Search fields and tags are database specific.

        ::

            >>> ret = s.EGQuery("asthma")
            >>> [(x.DbName, x.Count) for x in ret.eGQueryResult.ResultItem if x.Count!='0']

            >>> ret = s.EGQuery("asthma")
            >>> ret.eGQueryResult.ResultItem[0]
            >>> ret.Term

        """
        params = self.get_egquery_params(**kargs)
        ret = self.serv.run_eGquery(term, params)
        return ret

    def ESearch(self, db, term, **kargs):
        """Responds to a query in a given database


        The response can be used later in ESummary, EFetch or ELink, along with
        the term translations of the query.

        :param db: database to search
        :param term: Entrez text query

        .. note:: see :meth:`get_esearch_params` for the list of valid parameters.

        ::

            >>> ret = e.ESearch('protein', 'human', RetMax=5)
            >>> ret = e.ESearch('taxonomy', 'Staphylococcus aureus[all names]')
            >>> ret = e.ESearch('pubmed', "cokelaer AND BioServices")
            >>> # There is one identifier in the IdList (therefore the first element)
            >>> identifiers = e.pubmed(ret.IdList.Id)


        More complex requests can be used. We will not cover all the possibilities (see the
        NCBI website). Here is an example to tune the search term to look into
        PubMed for the journal PNAS Volume 16::

            >>> e.ESearch("pubmed", "PNAS[ta] AND 16[vi]")


        You can then look more closely at a specific identifier using EFetch::

            >>> e = EFetch("pubmed")
            >>> e.efetch(identifiers)


        .. note:: valid parameters can be found by calling :meth:`get_esearch_params`
        """
        params = self.get_esearch_params(**kargs)
        params['db'] = db
        params['term'] = term
        # The API requires the db and term parameters to be provided
        # as positional arguments. The db and term attributes in the
        # params structure are just ignored. Note, however, that
        # the db and term parameters must also be provided in the params
        # dict so that the other arguments are also used... weird
        ret = self.serv.run_eSearch(db, term, params)
        return ret

    #def _egquery_rest(self, term, retmode="xml"):
    #    self._check_retmode(retmode)
    #    s = REST("test","http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")
    #    ret = s.request("egquery.fcgi?term=%s&retmode=%s" % (term, retmode))
    #    return ret

    def ESpell(self, db, term, **kargs):
        """Retrieve spelling suggestions for a text query in a given database.

        :param str db: database to search. Value must be a valid Entrez
            database name (default = pubmed).
        :param str term: Entrez text query. All special characters must be
            URL encoded.
        :raises ValueError: if db is not a valid database

        ::

            >>> ret = e.ESpell(db="omim", term="aasthma+OR+alergy")
            >>> ret.Query
            'asthmaa OR alergies'
            >>> ret.CorrectedQuery
            'asthma or allergy'
            >>> ret = e.ESpell(db="pubmed", term="biosservices")
            >>> ret.CorrectedQuery
            bioservices


        .. note:: only WSDL protocol available
        """
        params = self.get_espell_params(**kargs)
        self._check_db(db)
        ret = self.serv.run_eSpell(db, term, params)
        return ret

    def ELink(self, dbfrom, sid=None, **kargs):
        """The Entrez links utility

        Responds to a list of UIDs in a given database with either a list of
        related UIDs (and relevancy scores) in the same database or a list of linked
        UIDs in another Entrez database; checks for the existence of a specified link
        from a list of one or more UIDs; creates a hyperlink to the primary LinkOut
        provider for a specific UID and database, or lists LinkOut URLs and attributes
        for multiple UIDs.

        :param str db: (keyword argument) Database from which to retrieve UIDs. The value must be a valid Entrez database
            name. This is the destination database for the link operation.
        :param str dbfrom: Database containing the input UIDs. The value must be a
            valid Entrez database name (default = pubmed). This is the origin database of
            the link operation. If db and dbfrom are set to the same database value, then
            ELink will return computational neighbors within that database. Please see the
            full list of Entrez links for available computational neighbors. Computational
            neighbors have linknames that begin with dbname_dbname (examples:
            protein_protein, pcassay_pcassay_activityneighbor).
        :param str sid: UID list. Either a single UID or a comma-delimited list of UIDs may be provided.
            All of the UIDs must be from the database specified by db. Limited to 200 Ids
        :param str cmd: (keyword argument) ELink command mode. The command mode specified which
            function ELink will perform. Some optional parameters only function for certain
            values of cmd (see http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ELink).
            Examples are neighbor, prlinks.
        :raises ValueError: if dbfrom is not a valid database, if cmd is not a
            known command mode or if there are too many identifiers

        ::

            >>> # Example: Find related articles to PMID 20210808
            >>> ret = s.ELink("pubmed", sid="20210808", cmd="neighbor_score")
            >>> ret.LinkSet[0].LinkSetDb[0].Link[0].Id


            # FIXME: change example values
            >>> s.ELink(dbfrom="nucleotide", db="protein",
                              sid="48819,7140345")

            LinkSetDb, DbFrom , IdList

        .. todo:: remove LinkSet : there is only 1 set ?
        """
        if sid is not None:
            sid = self._check_ids(sid)
        self._check_db(dbfrom)
        if 'cmd' in kargs:
            valid_cmds = ["neighbor", "neighbor_score",
                          "neighbor_history", "acheck", "llinks", "lcheck",
                          "ncheck", "llinkslib", "prlinks"]
            # Explicit exception instead of an assert, which would be
            # stripped when Python runs with -O.
            if kargs['cmd'] not in valid_cmds:
                raise ValueError(
                    "invalid cmd. must be one of %s" % valid_cmds)

        #s = REST("test","http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")
        #request = "elink.fcgi?db=%s&dbfrom=%s" % (db, dbfrom)
        #request += "&id=%s" % sid
        #request += "&cmd=%s" % cmd
        #ret = s.request(request)
        #return ret
        params = self.get_elink_params(**kargs)
        params.dbfrom = dbfrom
        params.id = sid

        ret = self.serv.run_eLink(**dict(params))
        return ret

    def EPost(self, db, sid, **kargs):
        """Accepts a list of UIDs from a given database,

        stores the set on the History Server, and responds with a query
        key and web environment for the uploaded dataset.

        :param str db: a valid database
        :param sid: list of identifiers; it is forwarded as provided


        """
        params = self.get_epost_params(**kargs)
        params.id = sid
        params.db = db
        ret = self.serv.run_ePost(**dict(params))
        return ret
Пример #8
0
 def _esummary_rest(self, db, sid):
     """Call ``esummary.fcgi`` through a REST client and wrap the answer with ``easyXML``.

     :param db: value interpolated into the ``db=`` query argument
     :param sid: value interpolated into the ``id=`` query argument
     :return: the result of ``self.easyXML`` applied to the HTTP response
     """
     # Hint kept from the original author (presumably a way to list the
     # name/text pairs of the answer):
     # [(x.attrib['Name'], x.text) for x in ret.getchildren()[0].getchildren()[1:]]
     client = REST("test", "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")
     query = "esummary.fcgi?db=%s&id=%s" % (db, sid)
     response = client.http_get(query, None)
     return self.easyXML(response)
Пример #9
0
class PSICQUIC():
    """Interface to the `PSICQUIC <http://code.google.com/p/psicquic/>`_ service

    There are 2 interfaces to the PSICQUIC service (REST and WSDL) but we used
    the REST only.


    This service provides a common interface to more than 25 other services
    related to proteins, so we do not detail all the possibilities of this
    service.
    Here is an example that consists of looking for interactors of the
    protein ZAP70 within the IntAct database::

        >>> from bioservices import *
        >>> s = PSICQUIC()
        >>> res = s.query("intact", "zap70")
        >>> len(res) # there are 11 interactions found
        11
        >>> for x in res[1]:
        ...     print(x)
        uniprotkb:O95169
        uniprotkb:P43403
        intact:EBI-716238
        intact:EBI-1211276
        psi-mi:ndub8_human(display_long)|uniprotkb:NADH-ubiquinone oxidoreductase ASHI
        .
        .

    Here we have a list of entries. There are 15 of them (depending on
    the *output* parameter). The meaning of the entries is described on PSICQUIC
    website: https://code.google.com/p/psicquic/wiki/MITAB25Format . In short:


    #. Unique identifier for interactor A
    #. Unique identifier for interactor B.
    #. Alternative identifier for interactor A, for example the official gene
    #. Alternative identifier for interactor B.
    #. Aliases for A, separated by "|"
    #. Aliases for B.
    #. Interaction detection methods, taken from the corresponding PSI-MI
    #. First author surname(s) of the publication(s)
    #. Identifier of the publication
    #. NCBI Taxonomy identifier for interactor A.
    #. NCBI Taxonomy identifier for interactor B.
    #. Interaction types,
    #. Source databases and identifiers,
    #. Interaction identifier(s) i
    #. Confidence score. Denoted as scoreType:value.

    Another example with reactome database::

        res = s.query("reactome", "Q9Y266")

    .. warning:: PSICQUIC gives access to 25 other services. We cannot create
        a dedicated parser for all of them, so the :meth:`query` method
        returns the raw data. Additional classes may provide dedicated
        parsing in the future.

    .. seealso:: :class:`bioservices.biogrid.BioGRID`
    """

    _formats = [
        "tab25", "tab26", "tab27", "xml25", "count", "biopax", "xgmml",
        "rdf-xml", "rdf-xml-abbrev", "rdf-n3", "rdf-turtle"
    ]

    # note the typo in "genbank indentifier from bind DB
    _mapping_uniprot = {
        "genbank indentifier": "P_GI",
        'entrezgene/locuslink': "P_ENTREZGENEID",
        'uniprotkb': "ACC+ID",
        'rcsb pdb': "PDB_ID",
        'ensembl': "ENSEMBL_ID",
        'refseq': "P_REFSEQ_AC",
        'hgnc': 'HGNC_ID',
        "kegg": "KEGG_ID",
        "entrez gene/locuslink": "P_ENTREZGENEID",
        "chembl": "CHEMBL_ID",
        "ddbj/embl/genbank": "EMBL_ID",
        "dip": "DIP_ID",
        "ensemblgenomes": "ENSEMBLGENOME_ID",
        "omim": "MIM_ID",
        "chebi": None,
        "chembl": None,
        #        "intact": None
    }

    # unknown: hprd, omim, bind, bind complexid, mdl,

    def __init__(self, verbose=True):
        """.. rubric:: Constructor

        :param bool verbose: print informative messages

        The UniProt client is only required by the identifier-mapping
        methods. If it cannot be created, a warning is logged and
        :attr:`uniprot` is set to None so that the attribute always exists.

        .. doctest::

            >>> from bioservices import PSICQUIC
            >>> s = PSICQUIC()

        """
        self.services = REST(
            "PSICQUIC",
            verbose=verbose,
            url="https://www.ebi.ac.uk/Tools/webservices/psicquic",
            url_defined_later=True)  # this prevents an annoying warning

        # Parsed registry; filled on first access of the registry property.
        self._registry = None

        try:
            self.uniprot = UniProt(verbose=False)
        except Exception:
            # Best effort: plain queries do not need the UniProt client.
            self.uniprot = None
            self.services.logging.warning(
                "UniProt service could not be initialised")
        self.buffer = {}

    def _get_formats(self):
        return PSICQUIC._formats

    formats = property(_get_formats, doc="Returns the possible output formats")

    def _get_active_db(self):
        names = self.registry_names[:]
        actives = self.registry_actives[:]
        names = [x.lower() for x, y in zip(names, actives) if y == "true"]
        return names

    activeDBs = property(_get_active_db, doc="returns the active DBs only")

    def read_registry(self):
        """Reads and returns the active registry

        """
        url = 'registry/registry?action=ACTIVE&format=txt'
        res = self.services.http_get(url, frmt='txt')
        return res.split()

    def print_status(self):
        """Prints the services that are available

        :return: Nothing

        The output is tabulated. The columns are:

        * names
        * active
        * count
        * version
        * rest URL
        * soap URL
        * rest example
        * restricted

        .. seealso:: If you want the data into lists, see all attributes
            starting with registry such as :meth:`registry_names`
        """
        url = 'registry/registry?action=STATUS&format=xml'
        res = self.services.http_get(url, frmt="txt")

        names = self.registry_names
        counts = self.registry_counts
        versions = self.registry_versions
        actives = self.registry_actives
        resturls = self.registry_resturls
        soapurls = self.registry_soapurls
        restexs = self.registry_restexamples
        restricted = self.registry_restricted
        N = len(names)

        indices = sorted(range(0, N), key=lambda k: names[k])

        for i in range(0, N):
            print("%s\t %s\t %s\t %s\t %s %s %s %s\n" %
                  (names[i], actives[i], counts[i], versions[i], resturls[i],
                   soapurls[i], restexs[i], restricted[i]))

    # todo a property for the version of PISCQUIC

    def _get_registry(self):
        if self._registry is None:
            url = 'registry/registry?action=STATUS&format=xml'
            res = self.services.http_get(url, frmt="xml")
            res = self.services.easyXML(res)
            self._registry = res
        return self._registry

    registry = property(_get_registry, doc="returns the registry of psicquic")

    def _get_registry_names(self):
        res = self.registry
        return [x.findAll('name')[0].text for x in res.findAll("service")]

    registry_names = property(_get_registry_names,
                              doc="returns all services available (names)")

    def _get_registry_restricted(self):
        res = self.registry
        return [
            x.findAll('restricted')[0].text for x in res.findAll("service")
        ]

    registry_restricted = property(_get_registry_restricted,
                                   doc="returns restricted status of services")

    def _get_registry_resturl(self):
        res = self.registry
        data = [x.findAll('resturl')[0].text for x in res.findAll("service")]
        return data

    registry_resturls = property(_get_registry_resturl,
                                 doc="returns URL of REST services")

    def _get_registry_restex(self):
        res = self.registry
        data = [
            x.findAll('restexample')[0].text for x in res.findAll("service")
        ]
        return data

    registry_restexamples = property(
        _get_registry_restex, doc="retuns REST example for each service")

    def _get_registry_soapurl(self):
        res = self.registry
        return [x.findAll('soapurl')[0].text for x in res.findAll("service")]

    registry_soapurls = property(_get_registry_soapurl,
                                 doc="returns URL of WSDL service")

    def _get_registry_active(self):
        res = self.registry
        return [x.findAll('active')[0].text for x in res.findAll("service")]

    registry_actives = property(_get_registry_active,
                                doc="returns active state of each service")

    def _get_registry_count(self):
        res = self.registry
        return [x.findAll('count')[0].text for x in res.findAll("service")]

    registry_counts = property(_get_registry_count,
                               doc="returns number of entries in each service")

    def _get_registry_version(self):
        res = self.registry
        names = [x.findAll('name')[0].text for x in res.findAll("service")]
        N = len(names)
        version = [0] * N
        for i in range(0, N):
            x = res.findAll("service")[i]
            if x.findAll("version"):
                version[i] = x.findAll("version")[0].text
            else:
                version[i] = None
        return version

    registry_versions = property(_get_registry_version,
                                 doc="returns version of each service")

    def query(self,
              service,
              query,
              output="tab25",
              version="current",
              firstResult=None,
              maxResults=None):
        """Send a query to a specific database

        :param str service: a registered service. See :attr:`registry_names`.
        :param str query: a valid query. Can be `*` or a protein name.
        :param str output: a valid format. See s._formats

        ::

            s.query("intact", "brca2", "tab27")
            s.query("intact", "zap70", "xml25")
            s.query("matrixdb", "*", "xml25")

        This is the programmatic approach to this website:

        http://www.ebi.ac.uk/Tools/webservices/psicquic/view/main.xhtml


        Another example consist in accessing the *string* database for fetching
        protein-protein interaction data of a particular model organism. Here we
        restrict the query to 100 results::

            s.query("string", "species:10090", firstResult=0, maxResults=100, output="tab25")

        # spaces are automatically converted

            s.query("biogrid", "ZAP70 AND species:9606")

        .. warning:: AND must be in big caps. Some database are ore permissive
            than other (e.g., intact accepts "and"). species must be a valid ID number. Again, some DB are more
            permissive and may accept the name (e.g., human)

        To obtain the number of interactions in intact for the human specy::

            >>> len(p.query("intact", "species:9606"))


        """
        if service not in self.activeDBs:
            raise ValueError("database %s not in active databases" % service)

        params = {}
        if output is not None:
            self.services.devtools.check_param_in_list(output, self.formats)
            params['format'] = output
        else:
            output = "none"

        names = [x.lower() for x in self.registry_names]
        try:
            index = names.index(service)
        except ValueError:
            self.logging.error(
                "The service you gave (%s) is not registered. See self.registery_names"
                % service)
            raise ValueError

        # get the base url according to the service requested
        resturl = self.registry_resturls[index]

        if firstResult is not None:
            params['firstResult'] = firstResult
        if maxResults is not None:
            params['maxResults'] = maxResults

        url = resturl + 'query/' + query

        if "xml" in output:
            res = self.services.http_get(url, frmt="xml", params=params)
        else:
            res = self.services.http_get(url, frmt="txt", params=params)
            res = res.strip().split("\n")

        if output.startswith("tab"):
            res = self._convert_tab2dict(res)

        return res

    def _convert_tab2dict(self, data):
        """

        https://code.google.com/p/psicquic/wiki/MITAB26Format
        """
        results = []
        for line in data:
            results.append(line.split("\t"))

        return results

    def queryAll(self,
                 query,
                 databases=None,
                 output="tab25",
                 version="current",
                 firstResult=None,
                 maxResults=None):
        """Same as query but runs on all active database

        :param list databases: database to query. Queries all active DB if not provided
        :return: dictionary where keys correspond to databases and values to the output of the query.

        ::

            res = s.queryAll("ZAP70 AND species:9606")
        """

        results = {}
        if databases is None:
            databases = [x.lower() for x in self.activeDBs]

        for x in databases:
            if x not in self.activeDBs:
                raise ValueError("database %s not in active databases" % x)

        for name in databases:
            self.logging.warning("Querying %s" % name),
            res = self.query(name,
                             query,
                             output=output,
                             version=version,
                             firstResult=firstResult,
                             maxResults=maxResults)
            if output.startswith("tab25"):
                results[name] = [x for x in res if x != [""]]
            else:
                import copy
                results[name] = copy.copy(res)
        for name in databases:
            self.logging.info("Found %s in %s" % (len(results[name]), name))
        return results

    def getInteractionCounter(self, query):
        """Returns a dictionary with database as key and results as values

        :param str query: a valid query
        :return: a dictionary which key as database and value as number of entries

        Consider only the active database.

        """
        # get the active names only
        activeDBs = self.activeDBs[:]
        res = [(str(name), int(self.query(name, query, output="count")[0]))
               for name in activeDBs]
        return dict(res)

    def getName(self, data):
        idsA = [x[0] for x in data]
        idsB = [x[1] for x in data]
        return idsA, idsB

    def knownName(self, data):
        """Scan all entries (MITAB) and returns simplified version


        Each item in the input list of mitab entry
        The output is made of 2 lists corresponding to
        interactor A and B found in the mitab entries.

        elements in the input list takes the following forms::

            DB1:ID1|DB2:ID2
            DB3:ID3

        The | sign separates equivalent IDs from different databases.

        We want to keep only one. The first known databae is kept. If in the list of DB:ID pairs no known
        database is found, then we keep the first one whatsover.

        known databases are those available in the uniprot mapping tools.

        chembl and chebi IDs are kept unchanged.


        """
        self.logging.info("converting data into known names")
        idsA = [x[0].replace("\"", "") for x in data]
        idsB = [x[1].replace("\"", "") for x in data]
        # extract the first and second ID but let us check if it is part of a
        # known uniprot mapping.Otherwise no conversion will be possible.
        # If so, we set the ID to "unknown"
        # remove the " character that can be found in a few cases (e.g,
        # chebi:"CHEBI:29036")
        #idsA = [x.replace("chebi:CHEBI:","chebi:") for x in idsA]
        #idsB = [x.replace("chebi:CHEBI:", "chebi:") for x in idsB]

        # special case:
        # in mint, there is an entry that ends with a | uniprotkb:P17844|
        idsA = [x.strip("|") for x in idsA]
        idsB = [x.strip("|") for x in idsB]

        # the first ID
        for i, entry in enumerate(idsA):
            try:
                dbs = [x.split(":")[0] for x in entry.split("|")]
                IDs = [x.split(":")[1] for x in entry.split("|")]
                valid_dbs = [(db, ID) for db, ID in zip(dbs, IDs)
                             if db in self._mapping_uniprot.keys()]
                # search for an existing DB
                if len(valid_dbs) >= 1:
                    idsA[i] = valid_dbs[0][0] + ":" + valid_dbs[0][1]
                else:
                    self.logging.debug(
                        "none of the DB for this entry (%s) are available" %
                        (entry))
                    idsA[i] = "?" + dbs[0] + ":" + IDs[0]
            except:
                self.logging.info("Could not extract name from %s" % entry)
                idsA[
                    i] = "??:" + entry  # we add a : so that we are sure that a split(":") will work
        # the second ID
        for i, entry in enumerate(idsB):
            try:
                dbs = [x.split(":")[0] for x in entry.split("|")]
                IDs = [x.split(":")[1] for x in entry.split("|")]
                valid_dbs = [(db, ID) for db, ID in zip(dbs, IDs)
                             if db in self._mapping_uniprot.keys()]
                # search for an existing DB
                if len(valid_dbs) >= 1:
                    idsB[i] = valid_dbs[0][0] + ":" + valid_dbs[0][1]
                else:
                    self.logging.debug(
                        "none of the DB (%s) for this entry are available" %
                        (entry))
                    idsB[i] = "?" + dbs[0] + ":" + IDs[0]
            except:
                self.logging.info("Could not extract name from %s" % entry)
                idsB[i] = "??:" + entry

        countA = len([x for x in idsA if x.startswith("?")])
        countB = len([x for x in idsB if x.startswith("?")])
        if countA + countB > 0:
            self.logging.warning("%s ids out of %s were not identified" %
                                 (countA + countB, len(idsA) * 2))
            print(set([x.split(":")[0] for x in idsA if x.startswith("?")]))
            print(set([x.split(":")[0] for x in idsB if x.startswith("?")]))
        self.logging.info("knownName done")
        return idsA, idsB

    def preCleaning(self, data):
        """remove entries ehre IdA or IdB is set to "-"

        """
        ret = [x for x in data if x[0] != "-" and x[1] != "-"]
        return ret

    def postCleaningAll(self,
                        data,
                        keep_only="HUMAN",
                        flatten=True,
                        verbose=True):
        """

        even more cleaing by ignoring score, db and interaction
        len(set([(x[0],x[1]) for x in retnew]))
        """
        results = {}
        for k in data.keys():
            self.logging.info("Post cleaning %s" % k)
            ret = self.postCleaning(data[k],
                                    keep_only="HUMAN",
                                    verbose=verbose)
            if len(ret):
                results[k] = ret
        if flatten:
            results = [x for k in results.keys() for x in results[k]]
        return results

    def postCleaning(self,
                     data,
                     keep_only="HUMAN",
                     remove_db=["chebi", "chembl"],
                     keep_self_loop=False,
                     verbose=True):
        """Remove entries with a None and keep only those with the keep pattern

        """
        if verbose: print("Before removing anything: ", len(data))

        data = [x for x in data if x[0] is not None and x[1] is not None]
        if verbose: print("After removing the None: ", len(data))

        data = [
            x for x in data
            if x[0].startswith("!") is False and x[1].startswith("!") is False
        ]
        if verbose: print("After removing the !: ", len(data))

        for db in remove_db:
            data = [x for x in data if x[0].startswith(db) is False]
            data = [x for x in data if x[1].startswith(db) is False]
            if verbose:
                print("After removing entries that match %s : " % db,
                      len(data))

        data = [x for x in data if keep_only in x[0] and keep_only in x[1]]
        if verbose:
            print("After removing entries that don't match %s : " % keep_only,
                  len(data))

        if keep_self_loop is False:
            data = [x for x in data if x[0] != x[1]]
            if verbose: print("After removing self loop : ", len(data))

        data = list(set(data))
        if verbose: print("After removing identical entries", len(data))

        return data

    def convertAll(self, data):
        results = {}
        for k in data.keys():
            self.logging.info("Analysing %s" % k)
            results[k] = self.convert(data[k], db=k)
        return results

    def convert(self, data, db=None):
        self.logging.debug("converting the database %s" % db)
        idsA, idsB = self.knownName(data)
        mapping = self.mappingOneDB(data)
        results = []
        for i, entry in enumerate(data):
            x = idsA[i].split(":", 1)[1]
            y = idsB[i].split(":", 1)[1]
            xp = mapping[x]
            yp = mapping[y]
            try:
                ref = entry[8]
            except:
                ref = "?"
            try:
                score = entry[14]
            except:
                score = "?"
            try:
                interaction = entry[11]
            except:
                interaction = "?"
            results.append((xp, yp, score, interaction, ref, db))
        return results

    def mappingOneDB(self, data):
        query = {}
        self.logging.debug(
            "converting IDs with proper DB name (knownName function)")
        entriesA, entriesB = self.knownName(
            data
        )  # idsA and B contains list of a single identifier of the form db:id
        # the db is known from _mapping.uniprot otherwise it is called "unknown"

        # get unique DBs to build the query dictionary
        dbsA = [x.split(":")[0] for x in entriesA]
        dbsB = [x.split(":")[0] for x in entriesB]
        for x in set(dbsA):
            query[x] = set()
        for x in set(dbsB):
            query[x] = set()
        for k in query.keys():
            if k.startswith("?"):
                del query[k]

        # the data to store
        mapping = {}
        N = len(data)

        # scan all entries
        counter = 0
        for entryA, entryB in zip(entriesA, entriesB):
            counter += 1
            dbA, idA = entryA.split(":")
            try:
                dbB, idB = entryB.split(":")
            except:
                print(entryB)
            if idA not in mapping.keys():
                if dbA.startswith("?"):
                    mapping[idA] = entryA
                else:
                    query[dbA].add(idA)
            if idB not in mapping.keys():
                if dbB.startswith("?"):
                    mapping[idB] = entryB
                else:
                    query[dbB].add(idB)

            for k in query.keys():
                if len(query[k]) > 2000 or counter == N:
                    this_query = list(query[k])
                    DBname = self._mapping_uniprot[k]

                    if DBname is not None:
                        self.logging.warning(
                            "Request sent to uniprot for %s database (%s/%s)" %
                            (DBname, counter, N))
                        res = self.uniprot.mapping(fr=DBname,
                                                   to="ID",
                                                   query=" ".join(this_query))
                        for x in this_query:
                            if x not in res:  #was not found
                                mapping[x] = "!" + k + ":" + x
                            else:
                                # we should be here since the queries are populated
                                # if not already in the mapping dictionary
                                if x not in res.keys():
                                    raise ValueError(x)
                                if len(res[x]) == 1:
                                    mapping[x] = res[x][0]
                                else:
                                    self.logging.warning(
                                        "psicquic mapping found more than 1 id. keep first one"
                                    )
                                    mapping[x] = res[x][0]
                    else:
                        for x in this_query:
                            mapping[x] = k + ":" + x
                    query[k] = set()

        for k in query.keys():
            assert len(query[k]) == 0
        return mapping
Пример #10
0
class HGNC():
    """Wrapper to the genenames web service


    See details at http://www.genenames.org/help/rest-web-service-help

    """
    def __init__(self, verbose=False, cache=False):
        """Create the REST client and read the description of the service

        :param verbose: forwarded to the REST client
        :param cache: forwarded to the REST client

        The answer of :meth:`get_info` is stored in ``_info``; its
        ``searchableFields`` and ``storedFields`` items are exposed as
        :attr:`searchable_fields` and :attr:`stored_fields`.
        """
        self.services = REST("HGNC",
                             url="http://rest.genenames.org/",
                             verbose=verbose,
                             cache=cache)

        info = self.get_info()
        self._info = info
        self.searchable_fields = info['searchableFields']
        self.stored_fields = info['storedFields']

    def _request(self, url, frmt):
        """Send a GET request with headers matching the requested format."""
        headers = self.services.get_headers(content=frmt)
        return self.services.http_get(url, frmt=frmt, headers=headers)

    def get_info(self, frmt='json'):
        """Request information about the service

        Fields are when the server was last updated (lastModified),
        the number of documents (numDoc), which fields can be queried
        using search and fetch (searchableFields) and which fields may
        be returned by fetch (storedFields).

        :param str frmt: format of the answer
        :return: the answer of the ``info`` request
        """
        return self._request("info", frmt)

    def fetch(self, database, query, frmt='json'):
        """Retrieve particular records from a searchable field

        The returned object is a json object with fields as in
        :attr:`stored_fields`, which is obtained with :meth:`get_info`.

        Only one query at a time. No wild cards are accepted.

        :param str database: one of :attr:`searchable_fields`
        :param str query: the value to look for
        :param str frmt: format of the answer

        ::

            >>> h = HGNC()
            >>> h.fetch('symbol', 'ZNF3')
            >>> h.fetch('alias_name', 'A-kinase anchor protein, 350kDa')
        """
        easydev.check_param_in_list(database, self.searchable_fields)
        return self._request('fetch/{0}/{1}'.format(database, query), frmt)

    def search(self, database_or_query=None, query=None, frmt='json'):
        """Search a searchable field (database) for a pattern

        The search request is more powerful than fetch for querying the
        database, but search only returns the fields hgnc_id, symbol and
        score. This is because this tool is mainly intended to query the
        server to find possible entries of interest or to check data (such
        as your own symbols) rather than to fetch information about the
        genes. To retrieve all the data for a set of genes from the search
        result, use the hgnc_id returned by search to send a fetch request
        by hgnc_id.

        :param database_or_query: the searchable field to search in. If
            *query* is not provided, this argument is used as the query
            and all searchable fields are searched.
        :param query: the pattern to search for
        :param str frmt: format of the answer
        :raises ValueError: if neither argument is provided

        ::

            # Search all searchable fields for the term BRAF
            h.search('BRAF')

            # Return all records that have symbols that start with ZNF
            h.search('symbol', 'ZNF*')

            # Return all records that have symbols that start with ZNF
            # followed by one and only one character (e.g. ZNF3)
            # Nov 2015: does not work, neither here nor in the
            # official documentation
            h.search('symbol', 'ZNF?')

            # search for symbols starting with ZNF that have been approved
            # by HGNC
            h.search('symbol', 'ZNF*+AND+status:Approved')

            # return ZNF3 and ZNF12
            h.search('symbol', 'ZNF3+OR+ZNF12')

            # Return all records that have symbols that start with ZNF which
            # are not approved (ie entry withdrawn)
            h.search('symbol', 'ZNF*+NOT+status:Approved')

        """
        if query is None:
            if database_or_query is None:
                raise ValueError('you must provide at least one parameter')
            # a single argument: search all the databases
            url = 'search/{0}'.format(database_or_query)
        else:
            easydev.check_param_in_list(database_or_query,
                                        self.searchable_fields)
            url = 'search/{0}/{1}'.format(database_or_query, query)

        return self._request(url, frmt)
Пример #11
0
class EUtils(WSDLService):
    """Interface to `NCBI Entrez Utilities <http://www.ncbi.nlm.nih.gov/entrez/query/static/esoap_help.html>`_ service

    The EUtils class has a method called EFetch so this is actually covering
    all Entrez functionalities.

    Note that we use the WSDL protocol for all EUtils but we had to use the REST
    service in a few cases.

    .. warning:: Read the `guidelines
        <http://www.ncbi.nlm.nih.gov/books/NBK25497/>`_ before sending requests.
        No more than 3 requests per seconds otherwise your IP may be banned.
        You should provide your email by filling the :attr:`email` so that
        before being banned, you may be contacted.

    Here is an example of how to use the :meth:`EFetch` method to retrieve the
    FASTA sequence of a given identifier (34577063)::

        >>> from bioservices import EUtils
        >>> s = EUtils()
        >>> print(s.EFetch("sequences", "34577063", rettype="fasta"))
        >gi|34577063|ref|NP_001117.2| adenylosuccinate synthetase isozyme 2 [Homo sapiens]
        MAFAETYPAASSLPNGDCGRPRARPGGNRVTVVLGAQWGDEGKGKVVDLLAQDADIVCRCQGGNNAGHTV
        VVDSVEYDFHLLPSGIINPNVTAFIGNGVVIHLPGLFEEAEKNVQKGKGLEGWEKRLIISDRAHIVFDFH
        QAADGIQEQQRQEQAGKNLGTTKKGIGPVYSSKAARSGLRMCDLVSDFDGFSERFKVLANQYKSIYPTLE
        IDIEGELQKLKGYMEKIKPMVRDGVYFLYEALHGPPKKILVEGANAALLDIDFGTYPFVTSSNCTVGGVC
        TGLGMPPQNVGEVYGVVKAYTTRVGIGAFPTEQDNEIGELLQTRGREFGVTTGRKRRCGWLDLVLLKYAH
        MINGFTALALTKLDILDMFTEIKVGVAYKLDGEIIPHIPANQEVLNKVEVQYKTLPGWNTDISNARAFKE
        LPVNAQNYVRFIEDELQIPVKWIGVGKSRESMIQLF

    Most of the methods take a database name as input. You can obtain the
    valid list by checking the :attr:`databases` attribute.

    A few functions take identifier(s) as input. It could be a list of strings,
    a list of numbers, or a string where identifiers are separated by
    commas.

    A few functions takes an argument called **term**. You can use the **AND**
    keyword with spaces or + signs as separators::

        Correct:   term=biomol mrna[properties] AND mouse[organism]
        Correct:   term=biomol+mrna[properties]+AND+mouse[organism]

    Other special characters, such as quotation marks (") or the # symbol used
    in referring to a query key on the History server, could be represented by
    their URL encodings (%22 for "; %23 for #) or verbatim .::

        Correct: term=#2+AND+"gene in genomic"[properties]
        Correct: term=%232+AND+%22gene+in+genomic%22[properties]

    .. note:: most of the parameter names are identical to the expected names
        except for **id**, which has been replaced by **sid**.

    """
    def __init__(self, verbose=False, email="unknown"):
        """Set up the WSDL service, a REST helper and the contact email.

        :param bool verbose: forwarded to the parent constructor.
        :param str email: stored in :attr:`email`. When left to "unknown",
            the ``user.email`` entry of the BioServices settings is used
            instead; if that entry is "unknown" as well, a warning about
            the NCBI usage policy is logged.
        """
        # End point taken from
        # http://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.chapter2_table1
        # (a www.ncbi.nlm.nih.gov variant of this address was used before).
        wsdl_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/soap/v2.0/eutils.wsdl?"
        super(EUtils, self).__init__(name="EUtils", verbose=verbose, url=wsdl_url)

        policy_notice = """

        NCBI recommends that users post no more than three URL requests per second.
        Failure to comply with this policy may result in an IP address being blocked
        from accessing NCBI. If NCBI blocks an IP address, service will not be
        restored unless the developers of the software accessing the E-utilities
        register values of the tool and email parameters with NCBI. The value of
        email will be used only to contact developers if NCBI observes requests
        that violate our policies, and we will attempt such contact prior to blocking
        access.  For more details see http://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.chapter2_table1

        BioServices does not check if you send more than 3 requests per seconds.
        This is considered to be the user responsability. Within BioServices, we
        fill the parameter **tool** and **email**, however, to fill the later
        you should provide your email either globablly when instanciating EUtils,
        or locally when calling a method.

        This message will not appear if you set the email as a parameter::

            e = EUtils(email="name@adress")

        or in you bioservices configuration file (.config/bioservices/bioservices.cfg)
        under linux with a user section::

            [user]
            email = yourname@somewhere


        """
        # The list of databases is only fetched on first access.
        self._databases = None
        self.tool = "bioservices"

        # EFetch goes through REST in addition to WSDL: the WSDL class for
        # EFetch is limited (its documentation could not be found) and needs
        # one instance per database, whereas a single REST instance is enough.
        self._efetch = REST("Efetch",
                            "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")

        self.email = email
        if self.email == "unknown":
            # fall back on the bioservices config file
            configured = self.settings.params['user.email'][0]
            if configured != "unknown":
                self.email = configured
            else:
                self.logging.warning(policy_notice)

    def _get_databases(self):
        """alias to run_eInfo"""
        if self._databases is None:
            # DbData changed into DbList in rev 1.3.0
            self._databases = sorted(self.serv.run_eInfo().DbList.DbName)
        return self._databases

    # Read-only attribute backed by _get_databases (no setter is registered).
    databases = property(_get_databases, doc="Returns list of valid databases")

    def _check_db(self, db):
        if db not in self.databases:
            raise ValueError("You must provide a valid databases from : ",
                             self.databases)

    def _check_retmode(self, retmode):
        if retmode not in ["xml", "text"]:
            raise ValueError("You must provide a retmode in 'xml', 'text'")

    def get_einfo_params(self, **kargs):
        return self.wsdl_create_factory("nsei:eInfoRequest", **kargs)

    def get_esummary_params(self, **kargs):
        return self.wsdl_create_factory("nsesu:eSummaryRequest", **kargs)

    def get_esearch_params(self, **kargs):
        return self.wsdl_create_factory("nsese:eSearchRequest", **kargs)

    def get_egquery_params(self, **kargs):
        return self.wsdl_create_factory("nseg:eGqueryRequest", **kargs)

    def get_espell_params(self, **kargs):
        return self.wsdl_create_factory("nsesp:eSpellRequest", **kargs)

    def get_elink_params(self, **kargs):
        return self.wsdl_create_factory("nsel:eLinkRequest", **kargs)

    def get_epost_params(self, **kargs):
        return self.wsdl_create_factory("nseps:ePostRequest", **kargs)

    def _check_ids(self, sid):
        if isinstance(sid, int):
            sid = [sid]
        if isinstance(sid, list):
            sid = ",".join([str(x) for x in sid])

        # If there are commas, let us split, strip spaces and join back the ids
        sid = ",".join([x.strip() for x in sid.split(',') if x.strip() != ""])

        if len(sid.split(",")) > 200:
            raise ValueError(
                "Number of comma separated IDs must be less than 200")
        return sid

    def taxonomy(self, sid, raw=False):
        """Alias to EFetch for the taxonomy database using WSDL

        :param sid: identifier(s), normalised with :meth:`_check_ids`.
        :param bool raw: if True, return the answer of ``efetch`` as is;
            otherwise return its ``TaxaSet`` attribute.

        ::

            >>> s = EUtils()
            >>> ret = s.taxonomy("9606")
            >>> ret.Taxon.TaxId
            '9606'
            >>> ret.Taxon.ScientificName
            'Homo sapiens'
            >>> ret = s.taxonomy("9606,9605,111111111,9604")
            >>> ret.Taxon[2].TaxId
            '9604'


        """
        identifiers = self._check_ids(sid)
        answer = EFetch("taxon").efetch(identifiers)
        return answer if raw else answer.TaxaSet

    def snp(self, sid):
        """Alias to Efetch for the SNP database using WSDL

        :param sid: identifier(s), handed to ``efetch`` without any
            normalisation.

        ::

            >>> s.snp("123")

        """
        return EFetch("snp").efetch(sid)

    def EFetch(self, db, sid=None, retmode="text", **kargs):
        """Access to the EFetch E-Utilities

        :param str db: Database from which to retrieve UIDs. The value must be a valid Entrez database
            name . This is the destination database for the link operation.
        :param str sid: UID list. Either a single UID or a comma-delimited list of UIDs may be provided.
            All of the UIDs must be from the database specified by db. Limited
            to 200 sid

        :param retmode: default to text (could be xml but not recommended).
        :param rettype: could be fasta, summar      :param rettype: could be
        fasta, summaryy

        ::

            >>> ret = s.EFetch("omim", "269840")  --> ZAP70
            >>> ret = s.EFetch("taxonomy", "9606", retmode="xml")
            >>> [x.text for x in ret.getchildren()[0].getchildren() if x.tag=="ScientificName"]
            ['H**o sapiens']

            >>> s = eutils.EUtils()
            >>> s.EFetch("sequences", "34577063", retmode="text", rettype="fasta")
            >gi|34577063|ref|NP_001117.2| adenylosuccinate synthetase isozyme 2 [H**o sapiens]
            MAFAETYPAASSLPNGDCGRPRARPGGNRVTVVLGAQWGDEGKGKVVDLLAQDADIVCRCQGGNNAGHTV
            VVDSVEYDFHLLPSGIINPNVTAFIGNGVVIHLPGLFEEAEKNVQKGKGLEGWEKRLIISDRAHIVFDFH
            QAADGIQEQQRQEQAGKNLGTTKKGIGPVYSSKAARSGLRMCDLVSDFDGFSERFKVLANQYKSIYPTLE
            IDIEGELQKLKGYMEKIKPMVRDGVYFLYEALHGPPKKILVEGANAALLDIDFGTYPFVTSSNCTVGGVC
            TGLGMPPQNVGEVYGVVKAYTTRVGIGAFPTEQDNEIGELLQTRGREFGVTTGRKRRCGWLDLVLLKYAH
            MINGFTALALTKLDILDMFTEIKVGVAYKLDGEIIPHIPANQEVLNKVEVQYKTLPGWNTDISNARAFKE
            LPVNAQNYVRFIEDELQIPVKWIGVGKSRESMIQLF


        Identifiers could be provided as a single string with comma-separated
        values, or a list of strings, a list of integers, or just one
        string or one integer but no mixing of types in the list::

            >>> e.EFetch("sequences", "352, 234", retmode="text", rettype="fasta")
            >>> e.EFetch("sequences", 352, retmode="text", rettype="fasta")
            >>> e.EFetch("sequences", [352], retmode="text", rettype="fasta")
            >>> e.EFetch("sequences", [352, 234], retmode="text", rettype="fasta")


        **retmode** should be xml or text depending on the database. For instance, xml fo
        pubmed::

            >>> e.EFetch("pubmed", "20210808", retmode="xml")
            >>> e.EFetch('nucleotide', id=15, retmode='xml')
            >>> e.EFetch('nucleotide', id=15, retmode='xml', rettype='fasta')
            >>> e.EFetch('nucleotide', 'NT_019265', rettype='gb')

        eutils.EUtilsParser(e.EFetch("taxonomy", "9685", retmode="xml")
        .. todo:: more documentation and optional arguments

        Other special characters, such as quotation marks (") or the # symbol
        used in referring to a query key on the History server, should be
        represented by their URL encodings (%22 for "; %23 for #).
        """
        #self._check_db(db)
        self._check_retmode(retmode)
        if sid is not None:
            sid = self._check_ids(sid)

        params = {
            'db': db,
            'id': sid,
            'retmode': retmode,
            'tool': self.tool,
            'email': self.email
        }
        if kargs.get("strand"):
            strand = kargs.get("strand")
            self.devtools.check_param_in_list(strand, [1, 2])
            params['strand'] = strand

        if kargs.get("complexity"):
            complexity = kargs.get("complexity")
            if complexity in [0, 1, 2, 3, 4]:
                params['complexity'] = complexity
            else:
                raise ValueError(
                    "invalid complexity. must be a number in 0,1,2,3,4")

        for param in [
                'retmax', 'seq_start', "seq_stop", "rettype", "query_key",
                "WebEnv"
        ]:
            if kargs.get(param):
                params[param] = kargs.get(param)

        #print(params)
        if retmode == "xml":
            ret = self._efetch.http_get("efetch.fcgi", 'xml', params=params)
            ret = self.easyXML(ret)
        else:
            ret = self._efetch.http_get("efetch.fcgi", 'txt', params=params)

        return ret

    def EInfo(self, db=None, **kargs):
        """Provides the number of records indexed in each field of a given
        database, the date of the last update of the database, and the available links
        from the database to other Entrez databases.

        :param str db: target database about which to gather statistics. Value must be a
            valid Entrez database name. See :attr:`databases` or don't provide
            any value to obtain the entire list
        :return: either a list of databases, or a dictionary with relevant information
            about the requested database

        ::

            >>> all_database_names = s.EInfo()
            >>> # specific info about one database:
            >>> ret = s.EInfo("taxonomy")
            >>> ret.Count
            >>> ret.Name
            >>> ret = s.EInfo('pubmed')
            >>> res.FieldList[2].FullName
            'Filter'

        """
        if db is None:
            return self.databases
        else:
            self._check_db(db)

        # WSDL does not work, let us use rest instead.
        ret = self._einfo_rest(db, **kargs)
        ret = EUtilsParser(ret)
        return ret

    def _einfo_rest(self, db=None, **kargs):
        """Query ``einfo.fcgi`` over REST and return the parsed XML.

        :param db: database name inserted in the query string.
        :param kargs: ``tool`` and ``email`` override :attr:`tool` and
            :attr:`email` for this request; other keywords are ignored.
        :return: the answer passed through :meth:`easyXML`.
        """
        service = REST("test", "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")
        query = "einfo.fcgi?db=%s" % db
        identity = {
            'tool': kargs.get('tool', self.tool),
            'email': kargs.get('email', self.email)
        }
        answer = service.http_get(query, frmt="xml", params=identity)
        return self.easyXML(answer)

    """Does not work...issue with DbBuil
    # ret = self._einfo_wsdl(db, **kargs)
    def _einfo_wsdl(self, db=None, **kargs):
        params = self.suds.factory.create("nsei:eInfoRequest", **kargs)
        params.db = db
        params.tool = self.tool[:]
        params.email = self.email[:]
        return self.serv.run_eInfo(db, params)
    """

    def ESummary(self, db, sid=None, **kargs):
        """Returns document summaries for a list of input UIDs


        :param str sid: list of identifiers (or string comma separated).
            all of the UIDs must be from the database specified by db. Limited
            to 200 sid

        ::

            >>> from bioservices import *
            >>> s = EUtils()
            >>> ret = s.ESummary("snp","7535")
            >>> ret = s.ESummary("snp","7535,7530")
            >>> ret = s.ESummary("taxonomy", "9606,9913")

        ::

            >>> proteins = e.ESearch("protein", "bacteriorhodopsin", RetMax=20,)
            >>> ret = e.ESummary("protein", proteins.IdList.Id[0])
            >>> ret.DocSum[0].Item[2]
            (ItemType){
               _Type = "String"
               _Name = "Extra"
               ItemContent = "gi|6320236|ref|NP_010316.1|[6320236]"
            }


        """
        if sid is not None:
            sid = self._check_ids(sid)

        if db is None:
            return self.databases
        else:
            self._check_db(db)

        params = self.get_esummary_params(**kargs)
        params.db = db
        params.id = sid
        ret = self.serv.run_eSummary(**dict(params))
        return ret

    def _esummary_rest(self, db, sid):
        """Query ``esummary.fcgi`` over REST and return the parsed XML.

        :param db: database name inserted in the query string.
        :param sid: identifier(s) inserted in the query string as is.
        :return: the answer passed through :meth:`easyXML`.
        """
        # [(x.attrib['Name'], x.text) for x in ret.getchildren()[0].getchildren()[1:]]
        service = REST("test", "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")
        query = "esummary.fcgi?db=%s&id=%s" % (db, sid)
        return self.easyXML(service.http_get(query, None))

    def EGQuery(self, term, **kargs):
        """Provides the number of records retrieved in all Entrez databases by a text query.

        :param str term: Entrez text query. All special characters must be URL
            encoded. Spaces may be replaced by '+' signs. For very long queries (more than
            several hundred characters long), consider using an HTTP POST call. See the
            PubMed or Entrez help for information about search field descriptions and tags.
            Search fields and tags are database specific.

        ::

            >>> ret = s.EGQuery("asthma")
            >>> [(x.DbName, x.Count) for x in ret.eGQueryResult.ResultItem if x.Count!='0']

            >>> ret = s.EGQuery("asthma")
            >>> ret.eGQueryResult.ResultItem[0]
            >>> ret.Term

        """
        params = self.get_egquery_params(**kargs)
        ret = self.serv.run_eGquery(term, params)
        return ret

    def ESearch(self, db, term, **kargs):
        """Responds to a query in a given  database


        The response can be used later in ESummary, EFetch or ELink, along with
        the term translations of the query.

        :param db:
        :param term:

        .. note:: see :meth:`get_esearch_params` for the list of valid parameters.

        ::

            >>> ret = e.ESearch('protein', 'human', RetMax=5)
            >>> ret = e.ESearch('taxonomy', 'Staphylococcus aureus[all names]')
            >>> ret = e.ESearch('pubmed', "cokelaer AND BioServices")
            >>> # There is on identifier in the IdList (therefore the first element)
            >>> identifiers = e.pubmed(ret.IdList.Id)


        More complex requests can be used. We will not cover all the possiblities (see the
        NCBI website). Here is an example to tune the search term to look into
        PubMed for the journal PNAS Volume 16, and retrieve.::

            >>> e.ESearch("pubmed", "PNAS[ta] AND 16[vi]")


        You can then look more closely at a specific identifier using EFetch::

            >>> e = EFetch("pubmed")
            >>> e.efetch(identifiers)


        .. note:: valid parameters can be found by calling :meth:`get_esearch_params`
        """
        params = self.get_esearch_params(**kargs)
        params['db'] = db
        params['term'] = term
        # the API requires the db and term paramters to be provided
        # as positional arguments. The db and term attribute in the
        # params structure are just ignored. Note, however, that
        # the db and term parameter must also be provided in the params
        # dict so that other argument are also used... wierd
        ret = self.serv.run_eSearch(db, term, params)
        return ret

    #def _egquery_rest(self, term, retmode="xml"):
    #    self._check_retmode(retmode)
    #    s = REST("test","http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")
    #    ret = s.request("egquery.fcgi?term=%s&retmode=%s" % (term, retmode))
    #    return ret

    def ESpell(self, db, term, **kargs):
        """Retrieve spelling suggestions for a text query in a given database.

        :param str db: database to search. Value must be a valid Entrez
            database name (default = pubmed).
        :param str term: Entrez text query. All special characters must be
            URL encoded.

        ::

            >>> ret = e.ESpell(db="omim", term="aasthma+OR+alergy")
            >>> ret.Query
            'asthmaa OR alergies'
            >>> ret.CorrectedQuery
            'asthma or allergy'
            >>> ret = e.ESpell(db="pubmed", term="biosservices")
            >>> ret.CorrectedQuery
            bioservices


        .. note:: only WSDL protocol available
        """
        params = self.get_espell_params(**kargs)
        self._check_db(db)
        ret = self.serv.run_eSpell(db, term, params)
        return ret

    def ELink(self, dbfrom, sid=None, **kargs):
        """The Entrez links utility

        Responds to a list of UIDs in a given database with either a list of
        related UIDs (and relevancy scores) in the same database or a list of linked
        UIDs in another Entrez database; checks for the existence of a specified link
        from a list of one or more UIDs; creates a hyperlink to the primary LinkOut
        provider for a specific UID and database, or lists LinkOut URLs and attributes
        for multiple UIDs.

        :param str db: Database from which to retrieve UIDs. The value must be a valid Entrez database
            name. This is the destination database for the link operation.
        :param str dbfrom: Database containing the input UIDs. The value must be a
            valid Entrez database name (default = pubmed). This is the origin database of
            the link operation. If db and dbfrom are set to the same database value, then
            ELink will return computational neighbors within that database. Please see the
            full list of Entrez links for available computational neighbors. Computational
            neighbors have linknames that begin with dbname_dbname (examples:
            protein_protein, pcassay_pcassay_activityneighbor).
        :param str sid: UID list. Either a single UID or a comma-delimited list of UIDs may be provided.
            All of the UIDs must be from the database specified by db. Limited  to 200 Ids
        :param str cmd: ELink command mode. The command mode specified which
            function ELink will perform. Some optional parameters only function for certain
            values of cmd (see http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ELink).
            Examples are neighbor, prlinks.

        ::

            >>> # Example: Find related articles to PMID 20210808
            >>> ret = s.ELink("pubmed", sid="20210808", cmd="neighbor_score")
            >>> ret.LinkSet[0].LinkSetDb[0].Link[0].Id


            # FIXME: change example values
            >>> s.Elink(dbfrom="nucleotide", db="protein",
                              id="48819,7140345")
            >>> s.Elink(dbfrom="nucleotide", db="protein",
                              id="48819,7140345")

            LinkSetDb, DbFrom , IdList

        .. todo:: remove LinkSet : there is only 1 set ?
        """
        if sid is not None:
            sid = self._check_ids(sid)
        self._check_db(dbfrom)
        if 'cmd' in kargs.keys():
            assert kargs['cmd'] in [
                "neighbor", "neighbor_score", "neighbor_history", "acheck",
                "llinks", "lcheck", "ncheck", "llinkslib", "prlinks"
            ]

        #s = REST("test","http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")
        #request = "elink.fcgi?db=%s&dbfrom=%s" % (db, dbfrom)
        #request += "&id=%s" % sid
        #request += "&cmd=%s" % cmd
        #ret = s.request(request)
        #return ret
        params = self.get_elink_params(**kargs)
        params.dbfrom = dbfrom
        params.id = sid

        ret = self.serv.run_eLink(**dict(params))
        return ret

    def EPost(self, db, sid, **kargs):
        """Accepts a list of UIDs from a given database,

        stores the set on the History Server, and responds with a query
        key and web environment for the uploaded dataset.

        :param str db: a valid database
        :param id: list of strings of strings


        """
        params = self.get_epost_params(**kargs)
        params.id = sid
        params.db = db
        ret = self.serv.run_ePost(**dict(params))
        return ret
Пример #12
0
 def _esummary_rest(self, db, sid):
     """Query ``esummary.fcgi`` over REST and return the parsed XML.

     :param db: database name inserted in the query string.
     :param sid: identifier(s) inserted in the query string as is.
     :return: the answer passed through ``self.easyXML``.
     """
     # [(x.attrib['Name'], x.text) for x in ret.getchildren()[0].getchildren()[1:]]
     service = REST("test", "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")
     query = "esummary.fcgi?db=%s&id=%s" % (db, sid)
     return self.easyXML(service.http_get(query, None))