def _einfo_rest(self, db=None, **kargs):
    """Call the EInfo E-utility through REST and return the parsed XML.

    :param str db: name of the database to describe. When it is None, no
        ``db`` parameter is sent at all (the previous implementation
        formatted it into the URL and therefore sent the literal string
        ``None``).
    :param kargs: only ``tool`` and ``email`` are read; they override
        :attr:`tool` and :attr:`email` for this request.
    :return: the response of ``einfo.fcgi`` passed through :meth:`easyXML`.
    """
    s = REST("test", "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")
    params = {
        'tool': kargs.get('tool', self.tool),
        'email': kargs.get('email', self.email),
    }
    if db is not None:
        # Handed over with the other request parameters instead of being
        # spliced into the URL by hand.
        params['db'] = db
    ret = s.http_get("einfo.fcgi", frmt="xml", params=params)
    ret = self.easyXML(ret)
    return ret
def _einfo_rest(self, db=None, **kargs):
    """Call the EInfo E-utility through REST and return the parsed XML.

    :param str db: name of the database to describe. When it is None, no
        ``db`` parameter is sent at all (the previous implementation
        formatted it into the URL and therefore sent the literal string
        ``None``).
    :param kargs: only ``tool`` and ``email`` are read; they override
        :attr:`tool` and :attr:`email` for this request.
    :return: the response of ``einfo.fcgi`` passed through :meth:`easyXML`.
    """
    s = REST("test", "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")
    params = {
        'tool': kargs.get('tool', self.tool),
        'email': kargs.get('email', self.email),
    }
    if db is not None:
        # Handed over with the other request parameters instead of being
        # spliced into the URL by hand.
        params['db'] = db
    ret = s.http_get("einfo.fcgi", frmt="xml", params=params)
    ret = self.easyXML(ret)
    return ret
class EUtils(WSDLService):
    """Interface to `NCBI Entrez Utilities <http://www.ncbi.nlm.nih.gov/entrez/query/static/esoap_help.html>`_ service

    The EUtils class has a method called EFetch so this is actually covering
    all Entrez functionalities. Note that we use the WSDL protocol for all
    EUtils but we had to use the REST service in a few cases.

    .. warning:: Read the `guidelines
        <http://www.ncbi.nlm.nih.gov/books/NBK25497/>`_ before sending
        requests. No more than 3 requests per seconds otherwise your IP may
        be banned. You should provide your email by filling the
        :attr:`email` so that before being banned, you may be contacted.

    Here is an example on how to use :meth:`EFetch` method to retrieve the
    FASTA sequence of a given identifier (34577063)::

        >>> from bioservices import EUtils
        >>> s = EUtils()
        >>> print(s.EFetch("sequences", "34577063", rettype="fasta"))
        >gi|34577063|ref|NP_001117.2| adenylosuccinate synthetase isozyme 2 [Homo sapiens]
        MAFAETYPAASSLPNGDCGRPRARPGGNRVTVVLGAQWGDEGKGKVVDLLAQDADIVCRCQGGNNAGHTV
        VVDSVEYDFHLLPSGIINPNVTAFIGNGVVIHLPGLFEEAEKNVQKGKGLEGWEKRLIISDRAHIVFDFH
        QAADGIQEQQRQEQAGKNLGTTKKGIGPVYSSKAARSGLRMCDLVSDFDGFSERFKVLANQYKSIYPTLE
        IDIEGELQKLKGYMEKIKPMVRDGVYFLYEALHGPPKKILVEGANAALLDIDFGTYPFVTSSNCTVGGVC
        TGLGMPPQNVGEVYGVVKAYTTRVGIGAFPTEQDNEIGELLQTRGREFGVTTGRKRRCGWLDLVLLKYAH
        MINGFTALALTKLDILDMFTEIKVGVAYKLDGEIIPHIPANQEVLNKVEVQYKTLPGWNTDISNARAFKE
        LPVNAQNYVRFIEDELQIPVKWIGVGKSRESMIQLF

    Most of the methods take a database name as input. You can obtain the
    valid list by checking the :attr:`databases` attribute.

    A few functions takes Identifier(s) as input. It could be a list of
    strings, list of numbers, or a string where identifiers are separated
    either by comma or spaces.

    A few functions takes an argument called **term**. You can use the
    **AND** keyword with spaces or + signs as separators::

        Correct: term=biomol mrna[properties] AND mouse[organism]
        Correct: term=biomol+mrna[properties]+AND+mouse[organism]

    Other special characters, such as quotation marks (") or the # symbol
    used in referring to a query key on the History server, could be
    represented by their URL encodings (%22 for "; %23 for #) or verbatim .::

        Correct: term=#2+AND+"gene in genomic"[properties]
        Correct: term=%232+AND+%22gene+in+genomic%22[properties]

    .. note:: most of the parameter names are identical to the expected
        names except for **id**, which has been replaced by **sid**.

    """

    def __init__(self, verbose=False, email="unknown"):
        """.. rubric:: Constructor

        :param bool verbose: forwarded to the parent service constructor.
        :param str email: email sent with the REST requests. When left to
            "unknown", the ``user.email`` entry of the bioservices settings
            is used instead; if that entry is "unknown" as well, a warning
            recalling the NCBI usage policy is logged.
        """
        #url = "http://www.ncbi.nlm.nih.gov/entrez/eutils/soap/v2.0/eutils.wsdl?"
        # according to http://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.chapter2_table1
        # this url should be used
        url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/soap/v2.0/eutils.wsdl?"
        super(EUtils, self).__init__(name="EUtils", verbose=verbose, url=url)

        warning = """

        NCBI recommends that users post no more than three URL requests per second.
        Failure to comply with this policy may result in an IP address being blocked
        from accessing NCBI. If NCBI blocks an IP address, service will not be
        restored unless the developers of the software accessing the E-utilities
        register values of the tool and email parameters with NCBI. The value of
        email will be used only to contact developers if NCBI observes requests
        that violate our policies, and we will attempt such contact prior to blocking
        access. For more details see http://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.chapter2_table1

        BioServices does not check if you send more than 3 requests per seconds.
        This is considered to be the user responsability. Within BioServices, we
        fill the parameter **tool** and **email**, however, to fill the later
        you should provide your email either globablly when instanciating EUtils,
        or locally when calling a method.

        This message will not appear if you set the email as a parameter::

            e = EUtils(email="name@adress")

        or in you bioservices configuration file (.config/bioservices/bioservices.cfg)
        under linux with a user section::

            [user]
            email = yourname@somewhere

        """

        # On top of the WSDL protocol we also need a REST service for the
        # EFetch method. Indeed, although we have a WSDL class for EFetch, it
        # is (i) limited because its documentation could not be found and
        # (ii) requires one instantiation per database whereas with REST we
        # can do it just once.
        self._efetch = REST("Efetch", "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")

        self._databases = None
        self.tool = "bioservices"
        self.email = email
        if self.email == "unknown":
            # trying the bioservices config file
            if self.settings.params['user.email'][0] != "unknown":
                self.email = self.settings.params['user.email'][0]
            else:
                self.logging.warning(warning)

    def _get_databases(self):
        """Return the sorted database names given by run_eInfo (cached)."""
        if self._databases is None:
            # DbData changed into DbList in rev 1.3.0
            self._databases = sorted(self.serv.run_eInfo().DbList.DbName)
        return self._databases
    databases = property(_get_databases, doc="Returns list of valid databases")

    def _check_db(self, db):
        """Raise a ValueError if *db* is not listed in :attr:`databases`."""
        if db not in self.databases:
            # A single formatted message: passing the list as a second
            # argument made the exception text a tuple representation.
            raise ValueError("You must provide a valid databases from : %s"
                             % self.databases)

    def _check_retmode(self, retmode):
        """Raise a ValueError if *retmode* is neither 'xml' nor 'text'."""
        if retmode not in ["xml", "text"]:
            raise ValueError("You must provide a retmode in 'xml', 'text'")

    def get_einfo_params(self, **kargs):
        """Return the WSDL factory object for an eInfo request."""
        return self.wsdl_create_factory("nsei:eInfoRequest", **kargs)

    def get_esummary_params(self, **kargs):
        """Return the WSDL factory object for an eSummary request."""
        return self.wsdl_create_factory("nsesu:eSummaryRequest", **kargs)

    def get_esearch_params(self, **kargs):
        """Return the WSDL factory object for an eSearch request."""
        return self.wsdl_create_factory("nsese:eSearchRequest", **kargs)

    def get_egquery_params(self, **kargs):
        """Return the WSDL factory object for an eGquery request."""
        return self.wsdl_create_factory("nseg:eGqueryRequest", **kargs)

    def get_espell_params(self, **kargs):
        """Return the WSDL factory object for an eSpell request."""
        return self.wsdl_create_factory("nsesp:eSpellRequest", **kargs)

    def get_elink_params(self, **kargs):
        """Return the WSDL factory object for an eLink request."""
        return self.wsdl_create_factory("nsel:eLinkRequest", **kargs)

    def get_epost_params(self, **kargs):
        """Return the WSDL factory object for an ePost request."""
        return self.wsdl_create_factory("nseps:ePostRequest", **kargs)

    def _check_ids(self, sid):
        """Normalise identifiers into a single comma-separated string.

        :param sid: an integer, a list/tuple of integers or strings, or a
            string in which identifiers are separated by commas and/or
            spaces.
        :return: the identifiers joined by commas, without surrounding
            spaces and without empty items.
        :raises ValueError: if more than 200 identifiers are provided.
        """
        if isinstance(sid, int):
            sid = [sid]
        if isinstance(sid, (list, tuple)):
            sid = ",".join([str(x) for x in sid])

        # Commas and whitespace are both accepted as separators (as stated
        # in the class documentation); empty items are dropped.
        ids = sid.replace(",", " ").split()

        if len(ids) > 200:
            raise ValueError("Number of comma separated IDs must not exceed 200")
        return ",".join(ids)

    def taxonomy(self, sid, raw=False):
        """Alias to EFetch for the taxonomy database using WSDL

        :param sid: identifier(s), see :meth:`_check_ids` for valid forms.
        :param bool raw: if True, the complete answer is returned instead
            of its ``TaxaSet`` attribute.

        ::

            >>> s = EUtils()
            >>> ret = s.taxonomy("9606")
            >>> ret.Taxon.TaxId
            '9606'
            >>> ret.Taxon.ScientificName
            'Homo sapiens'
            >>> ret = s.taxonomy("9606,9605,111111111,9604")
            >>> ret.Taxon[2].TaxId
            '9604'

        """
        sid = self._check_ids(sid)
        # NOTE: inside a method, EFetch is the module-level name, not the
        # EFetch method of this class.
        serv = EFetch("taxon")
        ret = serv.efetch(sid)
        if raw:
            return ret
        else:
            return ret.TaxaSet

    def snp(self, sid):
        """Alias to Efetch for the SNP database using WSDL

        :param sid: identifier(s), normalised with :meth:`_check_ids` like
            in :meth:`taxonomy`.

        ::

            >>> s.snp("123")

        """
        sid = self._check_ids(sid)
        serv = EFetch("snp")
        ret = serv.efetch(sid)
        return ret

    def EFetch(self, db, sid=None, retmode="text", **kargs):
        """Access to the EFetch E-Utilities

        :param str db: Database from which to retrieve UIDs. The value must
            be a valid Entrez database name . This is the destination
            database for the link operation.
        :param str sid: UID list. Either a single UID or a comma-delimited
            list of UIDs may be provided. All of the UIDs must be from the
            database specified by db. Limited to 200 sid
        :param retmode: default to text (could be xml but not recommended).
        :param rettype: could be fasta, summary
        :param strand: 1 or 2.
        :param complexity: a number in 0, 1, 2, 3, 4 (0 is forwarded too).
        :param kargs: besides **rettype**, **strand** and **complexity**,
            the optional arguments **retmax**, **seq_start**, **seq_stop**,
            **query_key** and **WebEnv** are forwarded when set. **tool**
            and **email** override :attr:`tool` and :attr:`email` for this
            request only.
        :return: the text of the answer or, if *retmode* is "xml", the
            answer passed through :meth:`easyXML`.
        :raises ValueError: if *retmode* or *complexity* is invalid.

        ::

            >>> ret = s.EFetch("omim", "269840")  --> ZAP70
            >>> ret = s.EFetch("taxonomy", "9606", retmode="xml")
            >>> [x.text for x in ret.getchildren()[0].getchildren() if x.tag=="ScientificName"]
            ['Homo sapiens']

            >>> s = eutils.EUtils()
            >>> s.EFetch("sequences", "34577063", retmode="text", rettype="fasta")
            >gi|34577063|ref|NP_001117.2| adenylosuccinate synthetase isozyme 2 [Homo sapiens]
            MAFAETYPAASSLPNGDCGRPRARPGGNRVTVVLGAQWGDEGKGKVVDLLAQDADIVCRCQGGNNAGHTV
            VVDSVEYDFHLLPSGIINPNVTAFIGNGVVIHLPGLFEEAEKNVQKGKGLEGWEKRLIISDRAHIVFDFH
            QAADGIQEQQRQEQAGKNLGTTKKGIGPVYSSKAARSGLRMCDLVSDFDGFSERFKVLANQYKSIYPTLE
            IDIEGELQKLKGYMEKIKPMVRDGVYFLYEALHGPPKKILVEGANAALLDIDFGTYPFVTSSNCTVGGVC
            TGLGMPPQNVGEVYGVVKAYTTRVGIGAFPTEQDNEIGELLQTRGREFGVTTGRKRRCGWLDLVLLKYAH
            MINGFTALALTKLDILDMFTEIKVGVAYKLDGEIIPHIPANQEVLNKVEVQYKTLPGWNTDISNARAFKE
            LPVNAQNYVRFIEDELQIPVKWIGVGKSRESMIQLF

        Identifiers could be provided as a single string with
        comma-separated values, or a list of strings, a list of integers,
        or just one string or one integer but no mixing of types in the
        list::

            >>> e.EFetch("sequences", "352, 234", retmode="text", rettype="fasta")
            >>> e.EFetch("sequences", 352, retmode="text", rettype="fasta")
            >>> e.EFetch("sequences", [352], retmode="text", rettype="fasta")
            >>> e.EFetch("sequences", [352, 234], retmode="text", rettype="fasta")

        **retmode** should be xml or text depending on the database.
        For instance, xml for pubmed::

            >>> e.EFetch("pubmed", "20210808", retmode="xml")
            >>> e.EFetch('nucleotide', sid=15, retmode='xml')
            >>> e.EFetch('nucleotide', sid=15, retmode='xml', rettype='fasta')
            >>> e.EFetch('nucleotide', 'NT_019265', rettype='gb')

        eutils.EUtilsParser(e.EFetch("taxonomy", "9685", retmode="xml"))

        .. todo:: more documentation and optional arguments

        Other special characters, such as quotation marks (") or the #
        symbol used in referring to a query key on the History server,
        should be represented by their URL encodings (%22 for "; %23 for #).
        """
        #self._check_db(db)
        self._check_retmode(retmode)
        if sid is not None:
            sid = self._check_ids(sid)

        params = {'db': db, 'id': sid, 'retmode': retmode,
                  # per-call values take precedence, as in _einfo_rest
                  'tool': kargs.get('tool', self.tool),
                  'email': kargs.get('email', self.email)}

        if kargs.get("strand"):
            strand = kargs.get("strand")
            self.devtools.check_param_in_list(strand, [1, 2])
            params['strand'] = strand

        # 0 is a valid complexity: test against None, not truthiness,
        # otherwise complexity=0 is silently dropped.
        complexity = kargs.get("complexity")
        if complexity is not None:
            if complexity in [0, 1, 2, 3, 4]:
                params['complexity'] = complexity
            else:
                raise ValueError("invalid complexity. must be a number in 0,1,2,3,4")

        for param in ['retmax', 'seq_start', "seq_stop", "rettype",
                      "query_key", "WebEnv"]:
            if kargs.get(param):
                params[param] = kargs.get(param)

        if retmode == "xml":
            ret = self._efetch.http_get("efetch.fcgi", 'xml', params=params)
            ret = self.easyXML(ret)
        else:
            ret = self._efetch.http_get("efetch.fcgi", 'txt', params=params)

        return ret

    def EInfo(self, db=None, **kargs):
        """Provides the number of records indexed in each field of a given
        database, the date of the last update of the database, and the
        available links from the database to other Entrez databases.

        :param str db: target database about which to gather statistics.
            Value must be a valid Entrez database name. See
            :attr:`databases` or don't provide any value to obtain the
            entire list
        :param kargs: forwarded to :meth:`_einfo_rest` (**tool**, **email**).
        :return: either a list of databases, or the answer about the
            requested database wrapped in an ``EUtilsParser`` instance
        :raises ValueError: if *db* is not a valid database.

        ::

            >>> all_database_names = s.EInfo()
            >>> # specific info about one database:
            >>> ret = s.EInfo("taxonomy")
            >>> ret.Count
            >>> ret.Name
            >>> ret = s.EInfo('pubmed')
            >>> ret.FieldList[2].FullName
            'Filter'

        """
        if db is None:
            return self.databases
        else:
            self._check_db(db)

        # WSDL does not work, let us use rest instead.
        ret = self._einfo_rest(db, **kargs)
        ret = EUtilsParser(ret)
        return ret

    def _einfo_rest(self, db=None, **kargs):
        """Call the EInfo E-utility through REST and return the parsed XML.

        :param str db: database name; not sent at all when None (it used to
            be formatted into the URL, hence sent as the string "None").
        :param kargs: only ``tool`` and ``email`` are read; they override
            :attr:`tool` and :attr:`email` for this request.
        :return: the answer passed through :meth:`easyXML`.
        """
        s = REST("test", "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")
        params = {'tool': kargs.get('tool', self.tool),
                  'email': kargs.get('email', self.email)}
        if db is not None:
            params['db'] = db
        ret = s.http_get("einfo.fcgi", frmt="xml", params=params)
        ret = self.easyXML(ret)
        return ret

    # The WSDL version below does not work (issue with DbBuil); it was kept
    # in the source as a bare string and is preserved here for reference:
    #
    # ret = self._einfo_wsdl(db, **kargs)
    # def _einfo_wsdl(self, db=None, **kargs):
    #     params = self.suds.factory.create("nsei:eInfoRequest", **kargs)
    #     params.db = db
    #     params.tool = self.tool[:]
    #     params.email = self.email[:]
    #     return self.serv.run_eInfo(db, params)

    def ESummary(self, db, sid=None, **kargs):
        """Returns document summaries for a list of input UIDs

        :param str db: a valid database (see :attr:`databases`). If None,
            the list of valid databases is returned.
        :param str sid: list of identifiers (or string comma separated).
            all of the UIDs must be from the database specified by db.
            Limited to 200 sid
        :param kargs: forwarded to :meth:`get_esummary_params`.

        ::

            >>> from bioservices import *
            >>> s = EUtils()
            >>> ret = s.ESummary("snp","7535")
            >>> ret = s.ESummary("snp","7535,7530")
            >>> ret = s.ESummary("taxonomy", "9606,9913")

        ::

            >>> proteins = e.ESearch("protein", "bacteriorhodopsin", RetMax=20,)
            >>> ret = e.ESummary("protein", proteins.IdList.Id[0])
            >>> ret.DocSum[0].Item[2]
            (ItemType){
               _Type = "String"
               _Name = "Extra"
               ItemContent = "gi|6320236|ref|NP_010316.1|[6320236]"
            }

        """
        if sid is not None:
            sid = self._check_ids(sid)

        if db is None:
            return self.databases
        else:
            self._check_db(db)

        params = self.get_esummary_params(**kargs)
        params.db = db
        params.id = sid
        ret = self.serv.run_eSummary(**dict(params))
        return ret

    def _esummary_rest(self, db, sid):
        """Call ``esummary.fcgi`` through REST and return the parsed XML.

        *db* and *sid* are formatted as-is into the query string.
        """
        # [(x.attrib['Name'], x.text) for x in ret.getchildren()[0].getchildren()[1:]]
        s = REST("test", "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")
        ret = s.http_get("esummary.fcgi?db=%s&id=%s" % (db, sid), None)
        ret = self.easyXML(ret)
        return ret

    def EGQuery(self, term, **kargs):
        """Provides the number of records retrieved in all Entrez databases
        by a text query.

        :param str term: Entrez text query. All special characters must be
            URL encoded. Spaces may be replaced by '+' signs. For very long
            queries (more than several hundred characters long), consider
            using an HTTP POST call. See the PubMed or Entrez help for
            information about search field descriptions and tags. Search
            fields and tags are database specific.
        :param kargs: forwarded to :meth:`get_egquery_params`.

        ::

            >>> ret = s.EGQuery("asthma")
            >>> [(x.DbName, x.Count) for x in ret.eGQueryResult.ResultItem if x.Count!='0']

            >>> ret = s.EGQuery("asthma")
            >>> ret.eGQueryResult.ResultItem[0]
            >>> ret.Term

        """
        params = self.get_egquery_params(**kargs)
        ret = self.serv.run_eGquery(term, params)
        return ret

    def ESearch(self, db, term, **kargs):
        """Responds to a query in a given database

        The response can be used later in ESummary, EFetch or ELink, along
        with the term translations of the query.

        :param db: database in which the query is run.
        :param term: text of the query.

        .. note:: see :meth:`get_esearch_params` for the list of valid
            parameters.

        ::

            >>> ret = e.ESearch('protein', 'human', RetMax=5)
            >>> ret = e.ESearch('taxonomy', 'Staphylococcus aureus[all names]')
            >>> ret = e.ESearch('pubmed', "cokelaer AND BioServices")
            >>> # There is one identifier in the IdList (therefore the first element)
            >>> identifiers = e.pubmed(ret.IdList.Id)

        More complex requests can be used. We will not cover all the
        possiblities (see the NCBI website). Here is an example to tune the
        search term to look into PubMed for the journal PNAS Volume 16, and
        retrieve.::

            >>> e.ESearch("pubmed", "PNAS[ta] AND 16[vi]")

        You can then look more closely at a specific identifier using
        EFetch::

            >>> e = EFetch("pubmed")
            >>> e.efetch(identifiers)

        .. note:: valid parameters can be found by calling
            :meth:`get_esearch_params`
        """
        params = self.get_esearch_params(**kargs)
        params['db'] = db
        params['term'] = term
        # the API requires the db and term parameters to be provided
        # as positional arguments. The db and term attribute in the
        # params structure are just ignored. Note, however, that
        # the db and term parameter must also be provided in the params
        # dict so that other arguments are also used... weird
        ret = self.serv.run_eSearch(db, term, params)
        return ret

    #def _egquery_rest(self, term, retmode="xml"):
    #    self._check_retmode(retmode)
    #    s = REST("test","http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")
    #    ret = s.request("egquery.fcgi?term=%s&retmode=%s" % (term, retmode))
    #    return ret

    def ESpell(self, db, term, **kargs):
        """Retrieve spelling suggestions for a text query in a given database.

        :param str db: database to search. Value must be a valid Entrez
            database name (default = pubmed).
        :param str term: Entrez text query. All special characters must be
            URL encoded.
        :param kargs: forwarded to :meth:`get_espell_params`.
        :raises ValueError: if *db* is not a valid database.

        ::

            >>> ret = e.ESpell(db="omim", term="aasthma+OR+alergy")
            >>> ret.Query
            'asthmaa OR alergies'
            >>> ret.CorrectedQuery
            'asthma or allergy'
            >>> ret = e.ESpell(db="pubmed", term="biosservices")
            >>> ret.CorrectedQuery
            bioservices

        .. note:: only WSDL protocol available
        """
        params = self.get_espell_params(**kargs)
        self._check_db(db)
        ret = self.serv.run_eSpell(db, term, params)
        return ret

    def ELink(self, dbfrom, sid=None, **kargs):
        """The Entrez links utility

        Responds to a list of UIDs in a given database with either a list
        of related UIDs (and relevancy scores) in the same database or a
        list of linked UIDs in another Entrez database; checks for the
        existence of a specified link from a list of one or more UIDs;
        creates a hyperlink to the primary LinkOut provider for a specific
        UID and database, or lists LinkOut URLs and attributes for multiple
        UIDs.

        :param str db: Database from which to retrieve UIDs. The value must
            be a valid Entrez database name. This is the destination
            database for the link operation.
        :param str dbfrom: Database containing the input UIDs. The value
            must be a valid Entrez database name (default = pubmed). This
            is the origin database of the link operation. If db and dbfrom
            are set to the same database value, then ELink will return
            computational neighbors within that database. Please see the
            full list of Entrez links for available computational
            neighbors. Computational neighbors have linknames that begin
            with dbname_dbname (examples: protein_protein,
            pcassay_pcassay_activityneighbor).
        :param str sid: UID list. Either a single UID or a comma-delimited
            list of UIDs may be provided. All of the UIDs must be from the
            database specified by db. Limited to 200 Ids
        :param str cmd: ELink command mode. The command mode specified
            which function ELink will perform. Some optional parameters
            only function for certain values of cmd (see
            http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ELink).
            Examples are neighbor, prlinks.
        :raises ValueError: if *dbfrom* is not a valid database or if
            *cmd* is not a known command mode.

        ::

            >>> # Example: Find related articles to PMID 20210808
            >>> ret = s.ELink("pubmed", sid="20210808", cmd="neighbor_score")
            >>> ret.LinkSet[0].LinkSetDb[0].Link[0].Id

            # FIXME: change example values
            >>> s.ELink(dbfrom="nucleotide", db="protein", sid="48819,7140345")
            >>> s.ELink(dbfrom="nucleotide", db="protein", sid="48819,7140345")

        LinkSetDb, DbFrom , IdList

        .. todo:: remove LinkSet : there is only 1 set ?
        """
        if sid is not None:
            sid = self._check_ids(sid)
        self._check_db(dbfrom)

        valid_cmds = ["neighbor", "neighbor_score", "neighbor_history",
                      "acheck", "llinks", "lcheck", "ncheck", "llinkslib",
                      "prlinks"]
        # An explicit exception rather than an assert: assertions are
        # removed when Python runs with -O, and the other checks of this
        # class raise ValueError as well.
        if 'cmd' in kargs and kargs['cmd'] not in valid_cmds:
            raise ValueError("cmd must be one of %s" % valid_cmds)

        #s = REST("test","http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")
        #request = "elink.fcgi?db=%s&dbfrom=%s" % (db, dbfrom)
        #request += "&id=%s" % sid
        #request += "&cmd=%s" % cmd
        #ret = s.request(request)
        #return ret

        params = self.get_elink_params(**kargs)
        params.dbfrom = dbfrom
        params.id = sid
        ret = self.serv.run_eLink(**dict(params))
        return ret

    def EPost(self, db, sid, **kargs):
        """Accepts a list of UIDs from a given database, stores the set on
        the History Server, and responds with a query key and web
        environment for the uploaded dataset.

        :param str db: a valid database
        :param sid: list of strings
        :param kargs: forwarded to :meth:`get_epost_params`.
        """
        params = self.get_epost_params(**kargs)
        params.id = sid
        params.db = db
        ret = self.serv.run_ePost(**dict(params))
        return ret
def _esummary_rest(self, db, sid):
    """Call ``esummary.fcgi`` through REST and return the parsed XML.

    :param db: value formatted as-is into the ``db`` field of the query.
    :param sid: value formatted as-is into the ``id`` field of the query.
    :return: the answer passed through :meth:`easyXML`.
    """
    # Kept from the original source, apparently a snippet to read the result:
    # [(x.attrib['Name'], x.text) for x in ret.getchildren()[0].getchildren()[1:]]
    service = REST("test", "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")
    query = "esummary.fcgi?db=%s&id=%s" % (db, sid)
    answer = service.http_get(query, None)
    return self.easyXML(answer)
class PSICQUIC(): """Interface to the `PSICQUIC <http://code.google.com/p/psicquic/>`_ service There are 2 interfaces to the PSICQUIC service (REST and WSDL) but we used the REST only. This service provides a common interface to more than 25 other services related to protein. So, we won't detail all the possiblity of this service. Here is an example that consists of looking for interactors of the protein ZAP70 within the IntAct database:: >>> from bioservices import * >>> s = PSICQUIC() >>> res = s.query("intact", "zap70") >>> len(res) # there are 11 interactions found 11 >>> for x in res[1]: ... print(x) uniprotkb:O95169 uniprotkb:P43403 intact:EBI-716238 intact:EBI-1211276 psi-mi:ndub8_human(display_long)|uniprotkb:NADH-ubiquinone oxidoreductase ASHI . . Here we have a list of entries. There are 15 of them (depending on the *output* parameter). The meaning of the entries is described on PSICQUIC website: https://code.google.com/p/psicquic/wiki/MITAB25Format . In short: #. Unique identifier for interactor A #. Unique identifier for interactor B. #. Alternative identifier for interactor A, for example the official gene #. Alternative identifier for interactor B. #. Aliases for A, separated by "| #. Aliases for B. #. Interaction detection methods, taken from the corresponding PSI-MI #. First author surname(s) of the publication(s) #. Identifier of the publication #. NCBI Taxonomy identifier for interactor A. #. NCBI Taxonomy identifier for interactor B. #. Interaction types, #. Source databases and identifiers, #. Interaction identifier(s) i #. Confidence score. Denoted as scoreType:value. Another example with reactome database:: res = s.query("reactome", "Q9Y266") .. warning:: PSICQUIC gives access to 25 other services. We cannot create a dedicated parsing for all of them. So, the ::`query` method returns the raw data. Addition class may provide dedicated parsing in the future. .. 
seealso:: :class:`bioservices.biogrid.BioGRID` """ _formats = [ "tab25", "tab26", "tab27", "xml25", "count", "biopax", "xgmml", "rdf-xml", "rdf-xml-abbrev", "rdf-n3", "rdf-turtle" ] # note the typo in "genbank indentifier from bind DB _mapping_uniprot = { "genbank indentifier": "P_GI", 'entrezgene/locuslink': "P_ENTREZGENEID", 'uniprotkb': "ACC+ID", 'rcsb pdb': "PDB_ID", 'ensembl': "ENSEMBL_ID", 'refseq': "P_REFSEQ_AC", 'hgnc': 'HGNC_ID', "kegg": "KEGG_ID", "entrez gene/locuslink": "P_ENTREZGENEID", "chembl": "CHEMBL_ID", "ddbj/embl/genbank": "EMBL_ID", "dip": "DIP_ID", "ensemblgenomes": "ENSEMBLGENOME_ID", "omim": "MIM_ID", "chebi": None, "chembl": None, # "intact": None } # unknown: hprd, omim, bind, bind complexid, mdl, def __init__(self, verbose=True): """.. rubric:: Constructor :param bool verbose: print informative messages .. doctest:: >>> from bioservices import PSICQUIC >>> s = PSICQUIC() """ self.services = REST( "PSICQUIC", verbose=verbose, url="https://www.ebi.ac.uk/Tools/webservices/psicquic", url_defined_later=True) # this prevent annoying warning self._registry = None try: self.uniprot = UniProt(verbose=False) except: self.services.logging.warning( "UniProt service could be be initialised") self.buffer = {} def _get_formats(self): return PSICQUIC._formats formats = property(_get_formats, doc="Returns the possible output formats") def _get_active_db(self): names = self.registry_names[:] actives = self.registry_actives[:] names = [x.lower() for x, y in zip(names, actives) if y == "true"] return names activeDBs = property(_get_active_db, doc="returns the active DBs only") def read_registry(self): """Reads and returns the active registry """ url = 'registry/registry?action=ACTIVE&format=txt' res = self.services.http_get(url, frmt='txt') return res.split() def print_status(self): """Prints the services that are available :return: Nothing The output is tabulated. 
The columns are: * names * active * count * version * rest URL * soap URL * rest example * restricted .. seealso:: If you want the data into lists, see all attributes starting with registry such as :meth:`registry_names` """ url = 'registry/registry?action=STATUS&format=xml' res = self.services.http_get(url, frmt="txt") names = self.registry_names counts = self.registry_counts versions = self.registry_versions actives = self.registry_actives resturls = self.registry_resturls soapurls = self.registry_soapurls restexs = self.registry_restexamples restricted = self.registry_restricted N = len(names) indices = sorted(range(0, N), key=lambda k: names[k]) for i in range(0, N): print("%s\t %s\t %s\t %s\t %s %s %s %s\n" % (names[i], actives[i], counts[i], versions[i], resturls[i], soapurls[i], restexs[i], restricted[i])) # todo a property for the version of PISCQUIC def _get_registry(self): if self._registry is None: url = 'registry/registry?action=STATUS&format=xml' res = self.services.http_get(url, frmt="xml") res = self.services.easyXML(res) self._registry = res return self._registry registry = property(_get_registry, doc="returns the registry of psicquic") def _get_registry_names(self): res = self.registry return [x.findAll('name')[0].text for x in res.findAll("service")] registry_names = property(_get_registry_names, doc="returns all services available (names)") def _get_registry_restricted(self): res = self.registry return [ x.findAll('restricted')[0].text for x in res.findAll("service") ] registry_restricted = property(_get_registry_restricted, doc="returns restricted status of services") def _get_registry_resturl(self): res = self.registry data = [x.findAll('resturl')[0].text for x in res.findAll("service")] return data registry_resturls = property(_get_registry_resturl, doc="returns URL of REST services") def _get_registry_restex(self): res = self.registry data = [ x.findAll('restexample')[0].text for x in res.findAll("service") ] return data registry_restexamples 
= property( _get_registry_restex, doc="retuns REST example for each service") def _get_registry_soapurl(self): res = self.registry return [x.findAll('soapurl')[0].text for x in res.findAll("service")] registry_soapurls = property(_get_registry_soapurl, doc="returns URL of WSDL service") def _get_registry_active(self): res = self.registry return [x.findAll('active')[0].text for x in res.findAll("service")] registry_actives = property(_get_registry_active, doc="returns active state of each service") def _get_registry_count(self): res = self.registry return [x.findAll('count')[0].text for x in res.findAll("service")] registry_counts = property(_get_registry_count, doc="returns number of entries in each service") def _get_registry_version(self): res = self.registry names = [x.findAll('name')[0].text for x in res.findAll("service")] N = len(names) version = [0] * N for i in range(0, N): x = res.findAll("service")[i] if x.findAll("version"): version[i] = x.findAll("version")[0].text else: version[i] = None return version registry_versions = property(_get_registry_version, doc="returns version of each service") def query(self, service, query, output="tab25", version="current", firstResult=None, maxResults=None): """Send a query to a specific database :param str service: a registered service. See :attr:`registry_names`. :param str query: a valid query. Can be `*` or a protein name. :param str output: a valid format. See s._formats :: s.query("intact", "brca2", "tab27") s.query("intact", "zap70", "xml25") s.query("matrixdb", "*", "xml25") This is the programmatic approach to this website: http://www.ebi.ac.uk/Tools/webservices/psicquic/view/main.xhtml Another example consist in accessing the *string* database for fetching protein-protein interaction data of a particular model organism. 
Here we restrict the query to 100 results:: s.query("string", "species:10090", firstResult=0, maxResults=100, output="tab25") # spaces are automatically converted s.query("biogrid", "ZAP70 AND species:9606") .. warning:: AND must be in big caps. Some database are ore permissive than other (e.g., intact accepts "and"). species must be a valid ID number. Again, some DB are more permissive and may accept the name (e.g., human) To obtain the number of interactions in intact for the human specy:: >>> len(p.query("intact", "species:9606")) """ if service not in self.activeDBs: raise ValueError("database %s not in active databases" % service) params = {} if output is not None: self.services.devtools.check_param_in_list(output, self.formats) params['format'] = output else: output = "none" names = [x.lower() for x in self.registry_names] try: index = names.index(service) except ValueError: self.logging.error( "The service you gave (%s) is not registered. See self.registery_names" % service) raise ValueError # get the base url according to the service requested resturl = self.registry_resturls[index] if firstResult is not None: params['firstResult'] = firstResult if maxResults is not None: params['maxResults'] = maxResults url = resturl + 'query/' + query if "xml" in output: res = self.services.http_get(url, frmt="xml", params=params) else: res = self.services.http_get(url, frmt="txt", params=params) res = res.strip().split("\n") if output.startswith("tab"): res = self._convert_tab2dict(res) return res def _convert_tab2dict(self, data): """ https://code.google.com/p/psicquic/wiki/MITAB26Format """ results = [] for line in data: results.append(line.split("\t")) return results def queryAll(self, query, databases=None, output="tab25", version="current", firstResult=None, maxResults=None): """Same as query but runs on all active database :param list databases: database to query. 
Queries all active DB if not provided :return: dictionary where keys correspond to databases and values to the output of the query. :: res = s.queryAll("ZAP70 AND species:9606") """ results = {} if databases is None: databases = [x.lower() for x in self.activeDBs] for x in databases: if x not in self.activeDBs: raise ValueError("database %s not in active databases" % x) for name in databases: self.logging.warning("Querying %s" % name), res = self.query(name, query, output=output, version=version, firstResult=firstResult, maxResults=maxResults) if output.startswith("tab25"): results[name] = [x for x in res if x != [""]] else: import copy results[name] = copy.copy(res) for name in databases: self.logging.info("Found %s in %s" % (len(results[name]), name)) return results def getInteractionCounter(self, query): """Returns a dictionary with database as key and results as values :param str query: a valid query :return: a dictionary which key as database and value as number of entries Consider only the active database. """ # get the active names only activeDBs = self.activeDBs[:] res = [(str(name), int(self.query(name, query, output="count")[0])) for name in activeDBs] return dict(res) def getName(self, data): idsA = [x[0] for x in data] idsB = [x[1] for x in data] return idsA, idsB def knownName(self, data): """Scan all entries (MITAB) and returns simplified version Each item in the input list of mitab entry The output is made of 2 lists corresponding to interactor A and B found in the mitab entries. elements in the input list takes the following forms:: DB1:ID1|DB2:ID2 DB3:ID3 The | sign separates equivalent IDs from different databases. We want to keep only one. The first known databae is kept. If in the list of DB:ID pairs no known database is found, then we keep the first one whatsover. known databases are those available in the uniprot mapping tools. chembl and chebi IDs are kept unchanged. 
""" self.logging.info("converting data into known names") idsA = [x[0].replace("\"", "") for x in data] idsB = [x[1].replace("\"", "") for x in data] # extract the first and second ID but let us check if it is part of a # known uniprot mapping.Otherwise no conversion will be possible. # If so, we set the ID to "unknown" # remove the " character that can be found in a few cases (e.g, # chebi:"CHEBI:29036") #idsA = [x.replace("chebi:CHEBI:","chebi:") for x in idsA] #idsB = [x.replace("chebi:CHEBI:", "chebi:") for x in idsB] # special case: # in mint, there is an entry that ends with a | uniprotkb:P17844| idsA = [x.strip("|") for x in idsA] idsB = [x.strip("|") for x in idsB] # the first ID for i, entry in enumerate(idsA): try: dbs = [x.split(":")[0] for x in entry.split("|")] IDs = [x.split(":")[1] for x in entry.split("|")] valid_dbs = [(db, ID) for db, ID in zip(dbs, IDs) if db in self._mapping_uniprot.keys()] # search for an existing DB if len(valid_dbs) >= 1: idsA[i] = valid_dbs[0][0] + ":" + valid_dbs[0][1] else: self.logging.debug( "none of the DB for this entry (%s) are available" % (entry)) idsA[i] = "?" + dbs[0] + ":" + IDs[0] except: self.logging.info("Could not extract name from %s" % entry) idsA[ i] = "??:" + entry # we add a : so that we are sure that a split(":") will work # the second ID for i, entry in enumerate(idsB): try: dbs = [x.split(":")[0] for x in entry.split("|")] IDs = [x.split(":")[1] for x in entry.split("|")] valid_dbs = [(db, ID) for db, ID in zip(dbs, IDs) if db in self._mapping_uniprot.keys()] # search for an existing DB if len(valid_dbs) >= 1: idsB[i] = valid_dbs[0][0] + ":" + valid_dbs[0][1] else: self.logging.debug( "none of the DB (%s) for this entry are available" % (entry)) idsB[i] = "?" 
+ dbs[0] + ":" + IDs[0] except: self.logging.info("Could not extract name from %s" % entry) idsB[i] = "??:" + entry countA = len([x for x in idsA if x.startswith("?")]) countB = len([x for x in idsB if x.startswith("?")]) if countA + countB > 0: self.logging.warning("%s ids out of %s were not identified" % (countA + countB, len(idsA) * 2)) print(set([x.split(":")[0] for x in idsA if x.startswith("?")])) print(set([x.split(":")[0] for x in idsB if x.startswith("?")])) self.logging.info("knownName done") return idsA, idsB def preCleaning(self, data): """remove entries ehre IdA or IdB is set to "-" """ ret = [x for x in data if x[0] != "-" and x[1] != "-"] return ret def postCleaningAll(self, data, keep_only="HUMAN", flatten=True, verbose=True): """ even more cleaing by ignoring score, db and interaction len(set([(x[0],x[1]) for x in retnew])) """ results = {} for k in data.keys(): self.logging.info("Post cleaning %s" % k) ret = self.postCleaning(data[k], keep_only="HUMAN", verbose=verbose) if len(ret): results[k] = ret if flatten: results = [x for k in results.keys() for x in results[k]] return results def postCleaning(self, data, keep_only="HUMAN", remove_db=["chebi", "chembl"], keep_self_loop=False, verbose=True): """Remove entries with a None and keep only those with the keep pattern """ if verbose: print("Before removing anything: ", len(data)) data = [x for x in data if x[0] is not None and x[1] is not None] if verbose: print("After removing the None: ", len(data)) data = [ x for x in data if x[0].startswith("!") is False and x[1].startswith("!") is False ] if verbose: print("After removing the !: ", len(data)) for db in remove_db: data = [x for x in data if x[0].startswith(db) is False] data = [x for x in data if x[1].startswith(db) is False] if verbose: print("After removing entries that match %s : " % db, len(data)) data = [x for x in data if keep_only in x[0] and keep_only in x[1]] if verbose: print("After removing entries that don't match %s : " % 
keep_only, len(data)) if keep_self_loop is False: data = [x for x in data if x[0] != x[1]] if verbose: print("After removing self loop : ", len(data)) data = list(set(data)) if verbose: print("After removing identical entries", len(data)) return data def convertAll(self, data): results = {} for k in data.keys(): self.logging.info("Analysing %s" % k) results[k] = self.convert(data[k], db=k) return results def convert(self, data, db=None): self.logging.debug("converting the database %s" % db) idsA, idsB = self.knownName(data) mapping = self.mappingOneDB(data) results = [] for i, entry in enumerate(data): x = idsA[i].split(":", 1)[1] y = idsB[i].split(":", 1)[1] xp = mapping[x] yp = mapping[y] try: ref = entry[8] except: ref = "?" try: score = entry[14] except: score = "?" try: interaction = entry[11] except: interaction = "?" results.append((xp, yp, score, interaction, ref, db)) return results def mappingOneDB(self, data): query = {} self.logging.debug( "converting IDs with proper DB name (knownName function)") entriesA, entriesB = self.knownName( data ) # idsA and B contains list of a single identifier of the form db:id # the db is known from _mapping.uniprot otherwise it is called "unknown" # get unique DBs to build the query dictionary dbsA = [x.split(":")[0] for x in entriesA] dbsB = [x.split(":")[0] for x in entriesB] for x in set(dbsA): query[x] = set() for x in set(dbsB): query[x] = set() for k in query.keys(): if k.startswith("?"): del query[k] # the data to store mapping = {} N = len(data) # scan all entries counter = 0 for entryA, entryB in zip(entriesA, entriesB): counter += 1 dbA, idA = entryA.split(":") try: dbB, idB = entryB.split(":") except: print(entryB) if idA not in mapping.keys(): if dbA.startswith("?"): mapping[idA] = entryA else: query[dbA].add(idA) if idB not in mapping.keys(): if dbB.startswith("?"): mapping[idB] = entryB else: query[dbB].add(idB) for k in query.keys(): if len(query[k]) > 2000 or counter == N: this_query = list(query[k]) 
DBname = self._mapping_uniprot[k] if DBname is not None: self.logging.warning( "Request sent to uniprot for %s database (%s/%s)" % (DBname, counter, N)) res = self.uniprot.mapping(fr=DBname, to="ID", query=" ".join(this_query)) for x in this_query: if x not in res: #was not found mapping[x] = "!" + k + ":" + x else: # we should be here since the queries are populated # if not already in the mapping dictionary if x not in res.keys(): raise ValueError(x) if len(res[x]) == 1: mapping[x] = res[x][0] else: self.logging.warning( "psicquic mapping found more than 1 id. keep first one" ) mapping[x] = res[x][0] else: for x in this_query: mapping[x] = k + ":" + x query[k] = set() for k in query.keys(): assert len(query[k]) == 0 return mapping
class HGNC():
    """Wrapper to the genenames web service

    See details at http://www.genenames.org/help/rest-web-service-help

    On construction the service is queried once (see :meth:`get_info`) and
    two attributes are filled from the answer:

    * :attr:`searchable_fields`: value of the ``searchableFields`` entry,
      used to validate the field given to :meth:`fetch` and :meth:`search`
    * :attr:`stored_fields`: value of the ``storedFields`` entry

    """

    def __init__(self, verbose=False, cache=False):
        """Create the REST client and download the service description

        :param bool verbose: forwarded to the underlying REST client
        :param bool cache: forwarded to the underlying REST client
        """
        url = "http://rest.genenames.org/"
        self.services = REST("HGNC", url=url, verbose=verbose, cache=cache)

        # One request at construction time; the answer drives the parameter
        # validation performed in fetch() and search().
        self._info = self.get_info()
        self.searchable_fields = self._info['searchableFields']
        self.stored_fields = self._info['storedFields']

    def get_info(self, frmt='json'):
        """Request information about the service

        Fields are when the server was last updated (lastModified), the
        number of documents (numDoc), which fields can be queried using
        search and fetch (searchableFields) and which fields may be returned
        by fetch (storedFields).

        :param str frmt: format requested from the service (default json)
        :return: the answer of the ``info`` request
        """
        headers = self.services.get_headers(content=frmt)
        res = self.services.http_get("info", frmt=frmt, headers=headers)
        return res

    def fetch(self, database, query, frmt='json'):
        """Retrieve particular records from a searchable fields

        Returned object is a json object with fields as in
        :attr:`stored_fields`, which is returned from :meth:`get_info`
        method.

        Only one query at a time. No wild cards are accepted.

        :param str database: a field name found in :attr:`searchable_fields`
        :param str query: the value to look for in that field
        :param str frmt: format requested from the service (default json)
        :return: the answer of the ``fetch/<database>/<query>`` request

        ::

            >>> h = HGNC()
            >>> h.fetch('symbol', 'ZNF3')
            >>> h.fetch('alias_name', 'A-kinase anchor protein, 350kDa')

        """
        easydev.check_param_in_list(database, self.searchable_fields)
        url = 'fetch/{0}/{1}'.format(database, query)
        headers = self.services.get_headers(content=frmt)
        res = self.services.http_get(url, frmt=frmt, headers=headers)
        return res

    def search(self, database_or_query=None, query=None, frmt='json'):
        """Search a searchable field (database) for a pattern

        The search request is more powerful than fetch for querying the
        database, but search will only returns the fields hgnc_id, symbol and
        score. This is because this tool is mainly intended to query the
        server to find possible entries of interest or to check data (such as
        your own symbols) rather than to fetch information about the genes.
        If you want to retrieve all the data for a set of genes from the
        search result, the user could use the hgnc_id returned by search to
        then fire off a fetch request by hgnc_id.

        :param str database_or_query: when **query** is also provided, this
            is the searchable field (database) to look into and it must be
            in :attr:`searchable_fields`. When it is the only value provided,
            it is the pattern itself and all fields are searched.
        :param str query: the pattern to search for. If it is the only value
            provided (``search(query='BRAF')``), all fields are searched.
        :param str frmt: format requested from the service (default json)
        :return: the answer of the ``search`` request
        :raises ValueError: if neither **database_or_query** nor **query**
            is provided

        ::

            # Search all searchable fields for the term BRAF
            h.search('BRAF')

            # Return all records that have symbols that start with ZNF
            h.search('symbol', 'ZNF*')

            # Return all records that have symbols that start with ZNF
            # followed by one and only one character (e.g. ZNF3)
            # Nov 2015: does not work, neither here nor in the
            # official documentation
            h.search('symbol', 'ZNF?')

            # search for symbols starting with ZNF that have been approved
            # by HGNC
            h.search('symbol', 'ZNF*+AND+status:Approved')

            # return ZNF3 and ZNF12
            h.search('symbol', 'ZNF3+OR+ZNF12')

            # Return all records that have symbols that start with ZNF which
            # are not approved (ie entry withdrawn)
            h.search('symbol', 'ZNF*+NOT+status:Approved')

        """
        if database_or_query is None and query is None:
            raise ValueError('you must provide at least one parameter')
        elif database_or_query is None or query is None:
            # Only one value was given, whichever of the two slots it came
            # through: it is the pattern and all fields are searched.
            # (A keyword-only ``query=`` used to be routed to the field
            # validation below with database=None.)
            if query is None:
                query = database_or_query
            url = 'search/{0}'.format(query)
        else:
            database = database_or_query
            easydev.check_param_in_list(database, self.searchable_fields)
            url = 'search/{0}/{1}'.format(database, query)

        headers = self.services.get_headers(content=frmt)
        res = self.services.http_get(url, frmt=frmt, headers=headers)
        return res
class EUtils(WSDLService):
    """Interface to `NCBI Entrez Utilities <http://www.ncbi.nlm.nih.gov/entrez/query/static/esoap_help.html>`_ service

    The EUtils class has a method called EFetch so this is actually covering
    all Entrez functionalities. Note that we use the WSDL protocol for all
    EUtils but we had to use the REST service in a few cases.

    .. warning:: Read the `guidelines
        <http://www.ncbi.nlm.nih.gov/books/NBK25497/>`_ before sending
        requests. No more than 3 requests per seconds otherwise your IP may
        be banned. You should provide your email by filling the
        :attr:`email` so that before being banned, you may be contacted.

    Here is an example on how to use :meth:`EFetch` method to retrieve the
    FASTA sequence of a given identifier (34577063)::

        >>> from bioservices import EUtils
        >>> s = EUtils()
        >>> print(s.EFetch("sequences", "34577063", rettype="fasta"))
        >gi|34577063|ref|NP_001117.2| adenylosuccinate synthetase isozyme 2 [Homo sapiens]
        MAFAETYPAASSLPNGDCGRPRARPGGNRVTVVLGAQWGDEGKGKVVDLLAQDADIVCRCQGGNNAGHTV
        VVDSVEYDFHLLPSGIINPNVTAFIGNGVVIHLPGLFEEAEKNVQKGKGLEGWEKRLIISDRAHIVFDFH
        QAADGIQEQQRQEQAGKNLGTTKKGIGPVYSSKAARSGLRMCDLVSDFDGFSERFKVLANQYKSIYPTLE
        IDIEGELQKLKGYMEKIKPMVRDGVYFLYEALHGPPKKILVEGANAALLDIDFGTYPFVTSSNCTVGGVC
        TGLGMPPQNVGEVYGVVKAYTTRVGIGAFPTEQDNEIGELLQTRGREFGVTTGRKRRCGWLDLVLLKYAH
        MINGFTALALTKLDILDMFTEIKVGVAYKLDGEIIPHIPANQEVLNKVEVQYKTLPGWNTDISNARAFKE
        LPVNAQNYVRFIEDELQIPVKWIGVGKSRESMIQLF

    Most of the methods take a database name as input. You can obtain the
    valid list by checking the :attr:`databases` attribute.

    A few functions takes Identifier(s) as input. It could be a list of
    strings, list of numbers, or a string where identifiers are separated
    either by comma or spaces.

    A few functions takes an argument called **term**. You can use the
    **AND** keyword with spaces or + signs as separators::

        Correct: term=biomol mrna[properties] AND mouse[organism]
        Correct: term=biomol+mrna[properties]+AND+mouse[organism]

    Other special characters, such as quotation marks (") or the # symbol
    used in referring to a query key on the History server, could be
    represented by their URL encodings (%22 for "; %23 for #) or verbatim::

        Correct: term=#2+AND+"gene in genomic"[properties]
        Correct: term=%232+AND+%22gene+in+genomic%22[properties]

    .. note:: most of the parameter names are identical to the expected names
        except for **id**, which has been replaced by **sid**.

    """

    def __init__(self, verbose=False, email="unknown"):
        """Create the WSDL service and the REST client used by :meth:`EFetch`

        :param bool verbose: forwarded to the WSDL service constructor
        :param str email: email sent to NCBI with the requests. If left to
            "unknown", the ``user.email`` entry of the bioservices settings
            is used when it is set; otherwise a warning is logged.
        """
        #url = "http://www.ncbi.nlm.nih.gov/entrez/eutils/soap/v2.0/eutils.wsdl?"
        # according to http://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.chapter2_table1
        # this url should be use
        url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/soap/v2.0/eutils.wsdl?"
        super(EUtils, self).__init__(name="EUtils", verbose=verbose, url=url)

        warning = """

        NCBI recommends that users post no more than three URL requests per
        second. Failure to comply with this policy may result in an IP address
        being blocked from accessing NCBI. If NCBI blocks an IP address,
        service will not be restored unless the developers of the software
        accessing the E-utilities register values of the tool and email
        parameters with NCBI. The value of email will be used only to contact
        developers if NCBI observes requests that violate our policies, and we
        will attempt such contact prior to blocking access.  For more details
        see http://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.chapter2_table1

        BioServices does not check if you send more than 3 requests per
        seconds. This is considered to be the user responsability. Within
        BioServices, we fill the parameter **tool** and **email**, however, to
        fill the later you should provide your email either globablly when
        instanciating EUtils, or locally when calling a method.

        This message will not appear if you set the email as a parameter::

            e = EUtils(email="name@adress")

        or in you bioservices configuration file (.config/bioservices/bioservices.cfg)
        under linux with a user section::

            [user]
            email = yourname@somewhere

        """

        # on top of the WSDL protocol we also need a REST for the EFetch method
        # Indeed, although we have a WSDL class for EFetch, it is (i) limited
        # because doc could not be found (ii) required an instanciation for
        # each database whereas with REST, we can do it just once
        self._efetch = REST("Efetch",
                            "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")

        self._databases = None
        self.tool = "bioservices"
        self.email = email
        if self.email == "unknown":
            # trying the bioservices config file
            if self.settings.params['user.email'][0] != "unknown":
                self.email = self.settings.params['user.email'][0]
            else:
                self.logging.warning(warning)

    def _get_databases(self):
        """alias to run_eInfo

        The sorted list of database names is requested once and then kept in
        ``self._databases``.
        """
        if self._databases is None:
            # DbData changed into DbList in rev 1.3.0
            self._databases = sorted(self.serv.run_eInfo().DbList.DbName)
        return self._databases
    databases = property(_get_databases, doc="Returns list of valid databases")

    def _check_db(self, db):
        """Raise ValueError if **db** is not in :attr:`databases`"""
        if db not in self.databases:
            # Build a single message; two positional arguments would make
            # the exception render as a tuple.
            raise ValueError("You must provide a valid databases from : %s"
                             % (self.databases,))

    def _check_retmode(self, retmode):
        """Raise ValueError if **retmode** is neither 'xml' nor 'text'"""
        if retmode not in ["xml", "text"]:
            raise ValueError("You must provide a retmode in 'xml', 'text'")

    def get_einfo_params(self, **kargs):
        """Return a WSDL ``nsei:eInfoRequest`` structure filled with kargs"""
        return self.wsdl_create_factory("nsei:eInfoRequest", **kargs)

    def get_esummary_params(self, **kargs):
        """Return a WSDL ``nsesu:eSummaryRequest`` structure filled with kargs"""
        return self.wsdl_create_factory("nsesu:eSummaryRequest", **kargs)

    def get_esearch_params(self, **kargs):
        """Return a WSDL ``nsese:eSearchRequest`` structure filled with kargs"""
        return self.wsdl_create_factory("nsese:eSearchRequest", **kargs)

    def get_egquery_params(self, **kargs):
        """Return a WSDL ``nseg:eGqueryRequest`` structure filled with kargs"""
        return self.wsdl_create_factory("nseg:eGqueryRequest", **kargs)

    def get_espell_params(self, **kargs):
        """Return a WSDL ``nsesp:eSpellRequest`` structure filled with kargs"""
        return self.wsdl_create_factory("nsesp:eSpellRequest", **kargs)

    def get_elink_params(self, **kargs):
        """Return a WSDL ``nsel:eLinkRequest`` structure filled with kargs"""
        return self.wsdl_create_factory("nsel:eLinkRequest", **kargs)

    def get_epost_params(self, **kargs):
        """Return a WSDL ``nseps:ePostRequest`` structure filled with kargs"""
        return self.wsdl_create_factory("nseps:ePostRequest", **kargs)

    def _check_ids(self, sid):
        """Normalise identifiers into a comma-separated string

        :param sid: an integer, a list or tuple of integers/strings, or a
            string of comma-separated identifiers
        :return: a string where identifiers are separated by commas, without
            surrounding spaces and without empty items
        :raises ValueError: if there are more than 200 identifiers
        """
        if isinstance(sid, int):
            sid = [sid]
        if isinstance(sid, (list, tuple)):
            sid = ",".join([str(x) for x in sid])

        # If there are commas, let us split, strip spaces and join back the ids
        sid = ",".join([x.strip() for x in sid.split(',') if x.strip() != ""])

        if len(sid.split(",")) > 200:
            raise ValueError(
                "Number of comma separated IDs must not exceed 200")
        return sid

    def taxonomy(self, sid, raw=False):
        """Alias to EFetch for the taxonomy database using WSDL

        :param sid: identifier(s); see :meth:`_check_ids` for valid forms
        :param bool raw: if True, return the answer of the ``efetch`` call
            as is; otherwise return its ``TaxaSet`` attribute

        ::

            >>> s = EUtils()
            >>> ret = s.taxonomy("9606")
            >>> ret.Taxon.TaxId
            '9606'
            >>> ret.Taxon.ScientificName
            'Homo sapiens'

            >>> ret = s.taxonomy("9606,9605,111111111,9604")
            >>> ret.Taxon[2].TaxId
            '9604'

        """
        sid = self._check_ids(sid)
        # EFetch here is the module-level name, not the EFetch method
        serv = EFetch("taxon")
        ret = serv.efetch(sid)
        if raw:
            return ret
        else:
            return ret.TaxaSet

    def snp(self, sid):
        """Alias to Efetch for the SNP database using WSDL

        :param sid: identifier(s); normalised like in :meth:`taxonomy`

        ::

            >>> s.snp("123")

        """
        # same identifier normalisation as in taxonomy()
        sid = self._check_ids(sid)
        # EFetch here is the module-level name, not the EFetch method
        serv = EFetch("snp")
        ret = serv.efetch(sid)
        return ret

    def EFetch(self, db, sid=None, retmode="text", **kargs):
        """Access to the EFetch E-Utilities

        :param str db: Database from which to retrieve UIDs. The value must
            be a valid Entrez database name. This is the destination
            database for the link operation.
        :param str sid: UID list. Either a single UID or a comma-delimited
            list of UIDs may be provided. All of the UIDs must be from the
            database specified by db. Limited to 200 sid
        :param retmode: default to text (could be xml but not recommended).
        :param rettype: could be fasta, summary
        :param strand: optional; must be 1 or 2
        :param complexity: optional; must be one of 0, 1, 2, 3, 4
        :param kargs: **retmax**, **seq_start**, **seq_stop**, **rettype**,
            **query_key** and **WebEnv** are forwarded to the service when
            they are provided with a non-empty value
        :return: the parsed XML if retmode is "xml", otherwise the answer of
            the request made with the "txt" format
        :raises ValueError: if retmode or complexity is invalid

        ::

            >>> ret = s.EFetch("omim", "269840")  --> ZAP70
            >>> ret = s.EFetch("taxonomy", "9606", retmode="xml")
            >>> [x.text for x in ret.getchildren()[0].getchildren() if x.tag=="ScientificName"]
            ['Homo sapiens']

            >>> s = eutils.EUtils()
            >>> s.EFetch("sequences", "34577063", retmode="text", rettype="fasta")
            >gi|34577063|ref|NP_001117.2| adenylosuccinate synthetase isozyme 2 [Homo sapiens]
            MAFAETYPAASSLPNGDCGRPRARPGGNRVTVVLGAQWGDEGKGKVVDLLAQDADIVCRCQGGNNAGHTV
            VVDSVEYDFHLLPSGIINPNVTAFIGNGVVIHLPGLFEEAEKNVQKGKGLEGWEKRLIISDRAHIVFDFH
            QAADGIQEQQRQEQAGKNLGTTKKGIGPVYSSKAARSGLRMCDLVSDFDGFSERFKVLANQYKSIYPTLE
            IDIEGELQKLKGYMEKIKPMVRDGVYFLYEALHGPPKKILVEGANAALLDIDFGTYPFVTSSNCTVGGVC
            TGLGMPPQNVGEVYGVVKAYTTRVGIGAFPTEQDNEIGELLQTRGREFGVTTGRKRRCGWLDLVLLKYAH
            MINGFTALALTKLDILDMFTEIKVGVAYKLDGEIIPHIPANQEVLNKVEVQYKTLPGWNTDISNARAFKE
            LPVNAQNYVRFIEDELQIPVKWIGVGKSRESMIQLF

        Identifiers could be provided as a single string with comma-separated
        values, or a list of strings, a list of integers, or just one string
        or one integer but no mixing of types in the list::

            >>> e.EFetch("sequences", "352, 234", retmode="text", rettype="fasta")
            >>> e.EFetch("sequences", 352, retmode="text", rettype="fasta")
            >>> e.EFetch("sequences", [352], retmode="text", rettype="fasta")
            >>> e.EFetch("sequences", [352, 234], retmode="text", rettype="fasta")

        **retmode** should be xml or text depending on the database.
        For instance, xml for pubmed::

            >>> e.EFetch("pubmed", "20210808", retmode="xml")
            >>> e.EFetch('nucleotide', sid=15, retmode='xml')
            >>> e.EFetch('nucleotide', sid=15, retmode='xml', rettype='fasta')
            >>> e.EFetch('nucleotide', 'NT_019265', rettype='gb')

            eutils.EUtilsParser(e.EFetch("taxonomy", "9685", retmode="xml"))

        .. todo:: more documentation and optional arguments

        Other special characters, such as quotation marks (") or the # symbol
        used in referring to a query key on the History server, should be
        represented by their URL encodings (%22 for "; %23 for #).

        """
        #self._check_db(db)
        self._check_retmode(retmode)
        if sid is not None:
            sid = self._check_ids(sid)

        params = {
            'db': db,
            'id': sid,
            'retmode': retmode,
            'tool': self.tool,
            'email': self.email
        }

        if kargs.get("strand"):
            strand = kargs.get("strand")
            self.devtools.check_param_in_list(strand, [1, 2])
            params['strand'] = strand

        # Compare against None rather than relying on truthiness: 0 is a
        # valid complexity and used to be dropped silently.
        complexity = kargs.get("complexity")
        if complexity is not None:
            if complexity in [0, 1, 2, 3, 4]:
                params['complexity'] = complexity
            else:
                raise ValueError(
                    "invalid complexity. must be a number in 0,1,2,3,4")

        for param in ['retmax', 'seq_start', "seq_stop", "rettype",
                      "query_key", "WebEnv"]:
            if kargs.get(param):
                params[param] = kargs.get(param)

        #print(params)
        if retmode == "xml":
            ret = self._efetch.http_get("efetch.fcgi", 'xml', params=params)
            ret = self.easyXML(ret)
        else:
            ret = self._efetch.http_get("efetch.fcgi", 'txt', params=params)

        return ret

    def EInfo(self, db=None, **kargs):
        """Provides the number of records indexed in each field of a given
        database, the date of the last update of the database, and the
        available links from the database to other Entrez databases.

        :param str db: target database about which to gather statistics.
            Value must be a valid Entrez database name. See :attr:`databases`
            or don't provide any value to obtain the entire list
        :param kargs: **tool** and **email** may be provided to overwrite
            the instance values for this request
        :return: either a list of databases, or the parsed information
            about the requested database
        :raises ValueError: if **db** is provided and is not a valid database

        ::

            >>> all_database_names = s.EInfo()
            >>> # specific info about one database:
            >>> ret = s.EInfo("taxonomy")
            >>> ret.Count
            >>> ret.Name
            >>> ret = s.EInfo('pubmed')
            >>> ret.FieldList[2].FullName
            'Filter'

        """
        if db is None:
            return self.databases
        else:
            self._check_db(db)

        # WSDL does not work, let us use rest instead.
        ret = self._einfo_rest(db, **kargs)
        ret = EUtilsParser(ret)
        return ret

    def _einfo_rest(self, db=None, **kargs):
        """Send the einfo request for **db** using REST; return parsed XML"""
        s = REST("test", "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")
        ret = s.http_get("einfo.fcgi?db=%s" % db, frmt="xml",
                         params={
                             'tool': kargs.get('tool', self.tool),
                             'email': kargs.get('email', self.email)
                         })
        ret = self.easyXML(ret)
        return ret

    # The WSDL version below does not work (issue with DbBuil). It is kept
    # as a comment for reference only.
    #
    # ret = self._einfo_wsdl(db, **kargs)
    #
    # def _einfo_wsdl(self, db=None, **kargs):
    #     params = self.suds.factory.create("nsei:eInfoRequest", **kargs)
    #     params.db = db
    #     params.tool = self.tool[:]
    #     params.email = self.email[:]
    #     return self.serv.run_eInfo(db, params)

    def ESummary(self, db, sid=None, **kargs):
        """Returns document summaries for a list of input UIDs

        :param str db: a valid database. If None, the list of valid
            databases is returned instead.
        :param str sid: list of identifiers (or string comma separated).
            all of the UIDs must be from the database specified by db.
            Limited to 200 sid
        :param kargs: used to fill the request structure (see
            :meth:`get_esummary_params`)
        :raises ValueError: if **db** is not a valid database or if there
            are too many identifiers

        ::

            >>> from bioservices import *
            >>> s = EUtils()
            >>> ret = s.ESummary("snp","7535")
            >>> ret = s.ESummary("snp","7535,7530")
            >>> ret = s.ESummary("taxonomy", "9606,9913")

        ::

            >>> proteins = e.ESearch("protein", "bacteriorhodopsin", RetMax=20,)
            >>> ret = e.ESummary("protein", proteins.IdList.Id[0])
            >>> ret.DocSum[0].Item[2]
            (ItemType){
               _Type = "String"
               _Name = "Extra"
               ItemContent = "gi|6320236|ref|NP_010316.1|[6320236]"
            }

        """
        if sid is not None:
            sid = self._check_ids(sid)

        if db is None:
            return self.databases
        else:
            self._check_db(db)

        params = self.get_esummary_params(**kargs)
        params.db = db
        params.id = sid
        ret = self.serv.run_eSummary(**dict(params))
        return ret

    def _esummary_rest(self, db, sid):
        """Send the esummary request for **sid** in **db** using REST"""
        # [(x.attrib['Name'], x.text) for x in ret.getchildren()[0].getchildren()[1:]]
        s = REST("test", "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")
        ret = s.http_get("esummary.fcgi?db=%s&id=%s" % (db, sid), None)
        ret = self.easyXML(ret)
        return ret

    def EGQuery(self, term, **kargs):
        """Provides the number of records retrieved in all Entrez databases
        by a text query.

        :param str term: Entrez text query. All special characters must be
            URL encoded. Spaces may be replaced by '+' signs. For very long
            queries (more than several hundred characters long), consider
            using an HTTP POST call. See the PubMed or Entrez help for
            information about search field descriptions and tags. Search
            fields and tags are database specific.
        :param kargs: used to fill the request structure (see
            :meth:`get_egquery_params`)

        ::

            >>> ret = s.EGQuery("asthma")
            >>> [(x.DbName, x.Count) for x in ret.eGQueryResult.ResultItem if x.Count!='0']

            >>> ret = s.EGQuery("asthma")
            >>> ret.eGQueryResult.ResultItem[0]
            >>> ret.Term

        """
        params = self.get_egquery_params(**kargs)
        ret = self.serv.run_eGquery(term, params)
        return ret

    def ESearch(self, db, term, **kargs):
        """Responds to a query in a given database

        The response can be used later in ESummary, EFetch or ELink, along
        with the term translations of the query.

        :param db: the database to search
        :param term: the Entrez text query

        .. note:: see :meth:`get_esearch_params` for the list of valid
            parameters.

        ::

            >>> ret = e.ESearch('protein', 'human', RetMax=5)
            >>> ret = e.ESearch('taxonomy', 'Staphylococcus aureus[all names]')
            >>> ret = e.ESearch('pubmed', "cokelaer AND BioServices")
            >>> # There is one identifier in the IdList (therefore the first element)
            >>> identifiers = e.pubmed(ret.IdList.Id)

        More complex requests can be used. We will not cover all the
        possiblities (see the NCBI website). Here is an example to tune the
        search term to look into PubMed for the journal PNAS Volume 16, and
        retrieve.::

            >>> e.ESearch("pubmed", "PNAS[ta] AND 16[vi]")

        You can then look more closely at a specific identifier using EFetch::

            >>> e = EFetch("pubmed")
            >>> e.efetch(identifiers)

        .. note:: valid parameters can be found by calling
            :meth:`get_esearch_params`

        """
        params = self.get_esearch_params(**kargs)
        params['db'] = db
        params['term'] = term
        # the API requires the db and term paramters to be provided
        # as positional arguments. The db and term attribute in the
        # params structure are just ignored. Note, however, that
        # the db and term parameter must also be provided in the params
        # dict so that other argument are also used... weird
        ret = self.serv.run_eSearch(db, term, params)
        return ret

    #def _egquery_rest(self, term, retmode="xml"):
    #    self._check_retmode(retmode)
    #    s = REST("test","http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")
    #    ret = s.request("egquery.fcgi?term=%s&retmode=%s" % (term, retmode))
    #    return ret

    def ESpell(self, db, term, **kargs):
        """Retrieve spelling suggestions for a text query in a given database.

        :param str db: database to search. Value must be a valid Entrez
            database name (default = pubmed).
        :param str term: Entrez text query. All special characters must be
            URL encoded.
        :param kargs: used to fill the request structure (see
            :meth:`get_espell_params`)
        :raises ValueError: if **db** is not a valid database

        ::

            >>> ret = e.ESpell(db="omim", term="aasthma+OR+alergy")
            >>> ret.Query
            'asthmaa OR alergies'
            >>> ret.CorrectedQuery
            'asthma or allergy'
            >>> ret = e.ESpell(db="pubmed", term="biosservices")
            >>> ret.CorrectedQuery
            bioservices

        .. note:: only WSDL protocol available

        """
        params = self.get_espell_params(**kargs)
        self._check_db(db)
        ret = self.serv.run_eSpell(db, term, params)
        return ret

    def ELink(self, dbfrom, sid=None, **kargs):
        """The Entrez links utility

        Responds to a list of UIDs in a given database with either a list of
        related UIDs (and relevancy scores) in the same database or a list of
        linked UIDs in another Entrez database; checks for the existence of a
        specified link from a list of one or more UIDs; creates a hyperlink
        to the primary LinkOut provider for a specific UID and database, or
        lists LinkOut URLs and attributes for multiple UIDs.

        :param str db: Database from which to retrieve UIDs. The value must
            be a valid Entrez database name. This is the destination database
            for the link operation. To be provided as a keyword argument.
        :param str dbfrom: Database containing the input UIDs. The value must
            be a valid Entrez database name (default = pubmed). This is the
            origin database of the link operation. If db and dbfrom are set
            to the same database value, then ELink will return computational
            neighbors within that database. Please see the full list of
            Entrez links for available computational neighbors. Computational
            neighbors have linknames that begin with dbname_dbname (examples:
            protein_protein, pcassay_pcassay_activityneighbor).
        :param str sid: UID list. Either a single UID or a comma-delimited
            list of UIDs may be provided. All of the UIDs must be from the
            database specified by dbfrom. Limited to 200 Ids
        :param str cmd: ELink command mode. The command mode specified which
            function ELink will perform. Some optional parameters only
            function for certain values of cmd (see
            http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ELink).
            Examples are neighbor, prlinks.
        :raises ValueError: if **dbfrom** is not a valid database, if there
            are too many identifiers, or if **cmd** is not a known command

        ::

            >>> # Example: Find related articles to PMID 20210808
            >>> ret = s.ELink("pubmed", sid="20210808", cmd="neighbor_score")
            >>> ret.LinkSet[0].LinkSetDb[0].Link[0].Id

            # FIXME: change example values
            >>> s.ELink(dbfrom="nucleotide", sid="48819,7140345", db="protein")
            LinkSetDb, DbFrom, IdList

        .. todo:: remove LinkSet : there is only 1 set ?

        """
        if sid is not None:
            sid = self._check_ids(sid)
        self._check_db(dbfrom)

        valid_cmds = ["neighbor", "neighbor_score", "neighbor_history",
                      "acheck", "llinks", "lcheck", "ncheck", "llinkslib",
                      "prlinks"]
        # An explicit exception (instead of an assert) so that the check is
        # still performed when Python runs with -O, and so that it matches
        # the other parameter checks of this class.
        if 'cmd' in kargs and kargs['cmd'] not in valid_cmds:
            raise ValueError("invalid cmd (%s). It must be one of %s"
                             % (kargs['cmd'], valid_cmds))

        #s = REST("test","http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")
        #request = "elink.fcgi?db=%s&dbfrom=%s" % (db, dbfrom)
        #request += "&id=%s" % sid
        #request += "&cmd=%s" % cmd
        #ret = s.request(request)
        #return ret

        params = self.get_elink_params(**kargs)
        params.dbfrom = dbfrom
        params.id = sid
        ret = self.serv.run_eLink(**dict(params))
        return ret

    def EPost(self, db, sid, **kargs):
        """Accepts a list of UIDs from a given database, stores the set on
        the History Server, and responds with a query key and web environment
        for the uploaded dataset.

        :param str db: a valid database
        :param sid: list of strings; it is passed to the service as is
        :param kargs: used to fill the request structure (see
            :meth:`get_epost_params`)

        """
        params = self.get_epost_params(**kargs)
        params.id = sid
        params.db = db
        ret = self.serv.run_ePost(**dict(params))
        return ret
def _esummary_rest(self, db, sid):
    """Send an esummary request for **sid** in **db** using REST

    A REST client is created for the NCBI eutils URL, the
    ``esummary.fcgi`` request is sent and the answer is given to
    ``self.easyXML``.

    :param db: value used for the ``db`` field of the request
    :param sid: value used for the ``id`` field of the request
    :return: the output of ``self.easyXML`` applied to the answer
    """
    # Items of the answer can then be extracted as follows:
    # [(x.attrib['Name'], x.text) for x in ret.getchildren()[0].getchildren()[1:]]
    client = REST("test", "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/")
    request = "esummary.fcgi?db=%s&id=%s" % (db, sid)
    return self.easyXML(client.http_get(request, None))