def Term(self, goid, frmt="oboxml"):
    """Fetch the description of a single GO term from the ``GTerm`` resource.

    :param goid: GO identifier to look up; it must begin with ``GO:``.
    :param str frmt: format requested from the service, one of:

        * ``mini``: Mini HTML, suitable for dynamically embedding in
          popup boxes.
        * ``obo``: OBO format snippet.
        * ``oboxml``: OBO XML format snippet.

    :return: whatever ``self.http_get`` returns for the request.
    :raises ValueError: if *goid* does not start with ``GO:``.

    Example::

        s = QuickGO()
        s.Term("GO:0003824")
    """
    check_param_in_list(frmt, ["mini", "obo", "oboxml"])
    if not goid.startswith("GO:"):
        raise ValueError("GO id must start with 'GO:'")

    # The response is always fetched with frmt="xml", whatever snippet
    # format was requested from the server through the 'format' parameter.
    return self.http_get("GTerm", frmt="xml",
                         params={'id': goid, 'format': frmt})
def smiles_query(self, query, search_type='exact', similarity=None):
    """Run a SMILES-based search against the ``smilesQuery`` resource.

    :param query: value sent as the ``smiles`` request parameter.
    :param search_type: one of ``exact``, ``substructure``,
        ``superstructure`` or ``similarity``.
    :param similarity: optional value; when given it is validated with
        ``check_range(similarity, 0, 1)`` and added to the request.
    :return: the XML response passed through ``self.easyXML``.
    """
    allowed_types = ['exact', 'substructure', 'superstructure', 'similarity']
    check_param_in_list(search_type, allowed_types)

    # NOTE(review): the key 'seach_type' looks like a typo for
    # 'search_type'; kept unchanged until confirmed against the remote API.
    params = {'smiles': query, 'seach_type': search_type}
    if similarity is not None:
        check_range(similarity, 0, 1)
        params['similarity'] = similarity

    response = self.http_get(query='smilesQuery', frmt='xml', params=params)
    return self.easyXML(response)
def retrieve(self, uniprot_id, frmt="xml"):
    """Fetch one or several entries by identifier under ``uniprot/``.

    :param uniprot_id: a single identifier or a collection of identifiers
        (normalised with ``tolist``).
    :param frmt: one of ``txt``, ``xml``, ``rdf``, ``gff``, ``fasta``.
    :return: the ``http_get`` result; with ``frmt="xml"`` every item is
        passed through ``self.easyXML``. A list holding exactly one item
        is unwrapped and that item is returned on its own.

    Example::

        u = UniProt()
        res = u.retrieve("P09958", frmt="xml")
        fasta = u.retrieve(['P29317', 'Q5BKX8', 'Q8TCD6'], frmt='fasta')
        print(fasta[0])
    """
    check_param_in_list(frmt, ['txt', 'xml', 'rdf', 'gff', 'fasta'])

    # One relative URL per requested entry: uniprot/<id>.<frmt>
    urls = []
    for entry in tolist(uniprot_id):
        urls.append("uniprot/" + entry + '.' + frmt)

    res = self.http_get(urls, frmt="txt", headers=self.__headers)
    if frmt == "xml":
        res = [self.easyXML(item) for item in res]

    if isinstance(res, list) and len(res) == 1:
        return res[0]
    return res
def EFetch(self, db, id, retmode='text', **kwargs):
    """Return formatted data records for a list of input ids.

    :param db: database from which to retrieve UIDs; must be a valid
        Entrez database.
    :param id: UID list, limited to 200.
    :param retmode: retrieval mode; also used as the ``frmt`` argument of
        ``self.http_get`` (text; xml not recommended).
    :param kwargs: optional parameters. Only ``query_key``, ``WebEnv``,
        ``rettype``, ``retstart``, ``retmax``, ``strand``, ``seq_start``,
        ``seq_stop`` and ``complexity`` are forwarded; any other keyword
        is silently ignored. ``strand`` must be 1 or 2 and ``complexity``
        must be one of 0, 1, 2, 3, 4 (both are checked with
        ``check_param_in_list``).
    :return: whatever ``self.http_get`` returns for ``efetch.fcgi``.
    """
    _valid_opt_param = [
        'query_key', 'WebEnv', 'retmode', 'rettype', 'retstart', 'retmax',
        'strand', 'seq_start', 'seq_stop', 'complexity'
    ]
    # Options that only accept an enumerated set of values.
    _restricted_values = {
        'strand': [1, 2],
        'complexity': [0, 1, 2, 3, 4],
    }
    params = {
        'db': db,
        'id': id,
        'retmode': retmode,
        'tool': self.tool,
        'email': self.email
    }
    for key, value in kwargs.items():
        if key not in _valid_opt_param:
            # Unknown options were never forwarded; keep ignoring them.
            continue
        # Compare by equality (not identity) and validate only the options
        # that have a restricted value set; everything else passes through.
        if key in _restricted_values:
            check_param_in_list(value, _restricted_values[key])
        params[key] = value

    return self.http_get('efetch.fcgi', frmt=retmode, params=params)
def Annotation(self, goid=None, protein=None, frmt="tsv", limit=10000,
               gz=False, col=None, db=None, aspect=None, termUse=None,
               evidence=None, source=None, ref=None, tax=9606,
               qualifier=None):
    """Query the ``GAnnotation`` resource and return the response.

    A single identifier is written into the URL, chosen in this order of
    precedence: *protein*, then *goid*, then *tax*. All other filters are
    sent as request parameters.

    :param goid: GO identifiers either directly or indirectly (descendant
        GO identifiers) applied in annotations.
    :param protein: one or more sequence identifiers or accessions from
        the available database(s).
    :param frmt: one of "gaf", "gpa", "gene2go", "proteinList", "fasta"
        or "tsv".
    :param limit: download limit (number of lines); must be an ``int``.
        A limit of -1 is documented as returning the entire data set.
    :param gz: when ``True``, ``&gz`` is appended to the URL so that the
        downloaded file is gzipped.
    :param col: comma-separated list of column names; only usable with
        ``frmt="tsv"``. User-supplied names are stripped and checked
        against ``self._valid_col``. When omitted, a default column set
        is sent.
    :param db: protein database (identifier type). Can be UniProtKB,
        UniGene, Ensembl.
    :param aspect: restricts the annotations to one ontology: F, P or C.
    :param termUse: only ``"slim"`` is accepted.
    :param evidence: annotation evidence codes, either a comma-separated
        string (e.g. ``"IEA,IDA"``) or a list of strings.
    :param source: annotation provider(s), a string or a list of strings
        (e.g. 'InterPro', 'UniProtKB').
    :param ref: PubMed or GO reference supporting the annotation, a
        string or a list of strings (e.g. 'PUBMED:*', 'GO_REF:0000002').
    :param tax: NCBI taxonomic identifier of the annotated protein.
    :param qualifier: tags that modify the interpretation of an
        annotation (e.g. NOT, colocalizes_with, contributes_to); a list
        is joined with commas, any other value is sent unchanged.
    :return: whatever ``self.http_get`` returns (requested as text).
    :raises TypeError: if *limit* is not an integer.
    :raises ValueError: if none of *protein*, *goid*, *tax* is given, if
        *evidence*, *source* or *ref* is neither a list nor a string, or
        if *col* is given with a format other than tsv.

    Examples::

        s.Annotation(protein='P12345', frmt='tsv', col="ref,evidence",
                     ref='PMID:*')
        s.Annotation(protein='P12345,Q4VCS5', frmt='tsv',
                     col="ref,evidence", ref='PMID:,Reactome:')
    """
    check_param_in_list(
        frmt, ["gaf", "gpa", "gene2go", "proteinList", "fasta", "tsv"])

    # Only the type is checked here, not the sign of the value.
    if not isinstance(limit, int):
        raise TypeError("limit parameter must be an integer greater than zero")

    # Parameters that always have a value.
    params = {'format': frmt, 'limit': limit}

    # The identifier goes straight into the URL; the first one that is
    # provided wins (protein, then goid, then tax).
    url = "GAnnotation?"
    if protein is not None:
        url += "protein=" + protein
    elif goid is not None:
        url += "goid=" + goid
    elif tax is not None:
        url += "tax=" + str(tax)
    else:
        raise ValueError("you must provide at least one of the following parameter: goid, protein")

    if aspect is not None:
        check_param_in_list(aspect, ['P', 'F', 'C'])
        params['aspect'] = aspect

    if db is not None:
        check_param_in_list(db, ['UniProtKB', 'UniGene', 'Ensembl'])
        params['db'] = db

    if termUse is not None:
        check_param_in_list(termUse, ["slim"])
        params['termUse'] = termUse

    # evidence / source / ref share the same "list or string" contract.
    # NOTE(review): the message used for 'ref' talks about "source
    # parameters"; it looks copy-pasted but is kept as in the original.
    multi_valued = [
        ('evidence', evidence,
         "Invalid parameter: evidence parameters must be a list of strings ['IDA','IEA'] or a "
         "string (e.g., 'IDA', 'IDA,IEA')"),
        ('source', source,
         "Invalid parameter: source parameters must be a list of strings ['UniProtKB'] or a "
         "string (e.g., 'UniProtKB')"),
        ('ref', ref,
         "Invalid parameter: source parameters must be a list of strings ['PUBMED'] or a string "
         "(e.g., 'PUBMED:*') "),
    ]
    for key, value, message in multi_valued:
        if not value:
            continue
        if isinstance(value, list):
            value = ",".join([item.strip() for item in value])
        elif not isinstance(value, str):
            raise ValueError(message)
        params[key] = value

    # qualifier: lists are joined, anything else is forwarded untouched.
    if qualifier:
        if isinstance(qualifier, list):
            qualifier = ",".join([item.strip() for item in qualifier])
        params['qualifier'] = qualifier

    # Column selection only applies to the tsv format.
    if frmt == "tsv":
        if col is None:
            col = ('proteinDB,proteinID,proteinSymbol,qualifier,'
                   'goID,goName,aspect,evidence,ref,with,proteinTaxon,'
                   'date,from,splice,proteinName,proteinSynonym,proteinType,'
                   'proteinTaxonName,originalTermID,originalGOName')
        else:
            col = ",".join([name.strip() for name in col.split(",")])
            for name in col.split(','):
                check_param_in_list(name, self._valid_col)
        params["col"] = col

    # col was provided but the format cannot honour it.
    if frmt not in ["tsv", "dict"] and col is not None:
        raise ValueError("You provided the 'col' parameter but the format is not correct. You should use the "
                         "frmt='tsv' or frmt='dict' ")

    # gz takes no value, so it is appended to the URL rather than to params.
    if gz is True:
        url += '&gz'

    return self.http_get(url, frmt="txt", params=params)
def retrieve(self, service, query, methods='query', output="tab25",
             firstresult=None, maxresults=None, compressed=True):
    """Send a query to a specific registered database.

    :param str service: a registered service; it must be listed in
        ``self.activeDBs``.
    :param str query: a valid query. Can be ``*`` or a protein name.
    :param methods: REST method placed in the URL before the query; it
        must be listed in ``self._retrieve_methods`` (interaction,
        interactor or query).
    :param str output: a valid format from ``self.formats``, or ``None``
        to send no ``format`` parameter.
    :param firstresult: position of the first result (sent as
        ``firstResult``).
    :param maxresults: maximum number of results (sent as ``maxResults``).
    :param compressed: when true, ``compressed=y`` is added to the request.
    :return: for an output containing ``xml``, the ``http_get`` result
        requested as XML; otherwise the text response split into lines,
        additionally converted with ``self._convert_tab2dict`` when the
        output name starts with ``tab``. ``None`` responses are returned
        unchanged.
    :raises ValueError: if *methods* or *service* is not recognised.

    Examples::

        s.retrieve("intact", "brca2", output="tab27")
        s.retrieve("intact", "zap70", output="xml25")
        s.retrieve("matrixdb", "*", output="xml25")

        # restrict the query to 100 results
        s.retrieve("string", "species:10090", firstresult=0,
                   maxresults=100, output="tab25")

        s.retrieve("biogrid", "ZAP70 AND species:9606")

    .. warning:: AND must be in big caps. Some databases are more
        permissive than others (e.g., intact accepts "and"). species must
        be a valid ID number. Again, some DBs are more permissive and may
        accept the name (e.g., human).
    """
    if methods not in self._retrieve_methods:
        raise ValueError(
            "Retrieve methods {} don't exists".format(methods))
    if service not in self.activeDBs:
        raise ValueError("database %s not in active databases" % service)

    params = {}
    if output is not None:
        check_param_in_list(output, self.formats)
        params['format'] = output
    else:
        # Placeholder so the string tests on `output` below keep working.
        output = "none"

    resturl = self.__get_rest_url(service)

    if firstresult is not None:
        params['firstResult'] = firstresult
    if maxresults is not None:
        params['maxResults'] = maxresults
    if compressed:
        params['compressed'] = 'y'

    # Honour the validated method instead of always hitting 'query/';
    # with the default methods='query' the URL is the same as before.
    url = resturl + methods + '/' + query

    if "xml" in output:
        res = self.http_get(url, frmt="xml", params=params)
    else:
        res = self.http_get(url, frmt="txt", params=params)
        if res is not None:
            res = res.strip().split("\n")

    if output.startswith("tab") and res is not None:
        res = self._convert_tab2dict(res)
    return res
def _search(self, mode, **kargs): """ common function to search for files or experiments """ assert mode in ["experiments", "files"] url = "{0}/{1}/{2}".format(self.format, self.version, mode) defaults = { "accession": None, # ex: E-MEXP-31 "keywords": None, "species": None, "wholewords": "on", "expdesign": None, "exptype": None, "gxa": "true", "pmid": None, "sa": None, "ef": None, # e.g., CellType "efv": None, # e.g., HeLa "array": None, # ex: A-AFFY-33 "expandfo": "on", "directsub": "true", "sortby": [ "accession", "name", "assays", "species", "releasedate", "fgem", "raw", "atlas" ], "sortorder": ["ascending", "descending"], } for k in kargs.keys(): check_param_in_list(k, list(defaults.keys())) # if len(kargs.keys()): # url += "?" params = {} for k, v in kargs.items(): if k in ["expandfo", "wholewords"]: if v in ["on", True, "true", "TRUE", "True"]: # params.append(k + "=on") params[k] = "on" elif k in ["gxa", "directsub"]: if v in ["on", True, "true", "TRUE", "True"]: # params.append(k + "=true") params[k] = "true" elif v in [False, "false", "False"]: # params.append(k + "=false") params[k] = "false" else: raise ValueError("directsub must be true or false") else: if k in ["sortby", "sortorder"]: check_param_in_list(v, defaults[k]) # params.append(k + "=" + v) params[k] = v # NOTE: + is a special character that is replaced by %2B # The + character is the proper encoding for a space when quoting # GET or POST data. Thus, a literal + character needs to be escaped # as well, lest it be decoded to a space on the other end for k, v in params.items(): params[k] = v.replace("+", " ") res = self.http_get(url, frmt=self.format, params=params) if self.format == "xml": res = self.easyXML(res) return res
def _set_format(self, f):
    """Store *f* in ``self._format`` after checking it is "json" or "xml".

    :param f: requested format; validated with ``check_param_in_list``
        before it is stored.
    """
    supported_formats = ["json", "xml"]
    check_param_in_list(f, supported_formats)
    self._format = f
def search(self, query, frmt="tab", columns=None, include=False,
           sort="score", compress=False, limit=None, offset=None):
    """Provide an interface to the UniProt search service.

    :param str query: a valid UniProt query. ``+`` characters are
        replaced by spaces before the request is sent. See
        http://www.uniprot.org/help/text-search and
        http://www.uniprot.org/help/query-fields
    :param str frmt: one of html, tab, xls, fasta, gff, txt, xml, rdf,
        list, rss (default is tab). ``None`` sends no format at all.
    :param str columns: comma-separated list of column names. Only
        accepted when *frmt* is tab or xls. Each name is stripped and must
        either start with ``database(`` (e.g. ``"database(PDB)"``) or
        belong to ``self._valid_columns``.
    :param bool include: when ``True`` and *frmt* is fasta or rdf,
        ``include=yes`` is sent (isoform sequences for fasta, description
        for rdf).
    :param str sort: ``"score"`` by default. Set to ``None`` to send no
        sort parameter.
    :param bool compress: when ``True``, ``compress=yes`` is sent.
    :param int limit: maximum number of results to retrieve; ignored
        unless it is an ``int``.
    :param int offset: offset of the first result, typically used with
        *limit*; ignored unless it is an ``int``.
    :return: whatever ``self.http_get`` returns (requested as text).

    Examples::

        u.search('zap70+AND+organism:9606', frmt='list')
        u.search("zap70+and+taxonomy:9606", frmt="tab", limit=3,
                 columns="entry name,length,id, genes")
        u.search("ZAP70+AND+organism:9606", limit=3,
                 columns="id,database(PDB)")

        # several keywords, e.g. a list of known entry names
        u.search("ZAP70_HUMAN+or+CBL_HUMAN", frmt="tab", limit=3,
                 columns="entry name,length,id, genes")

    .. warning:: some columns, although valid, may not return anything,
        not even in the header: 'score', 'taxonomy', 'tools'.
    """
    params = {}

    if frmt is not None:
        # 'rdf' was missing (and 'rss' duplicated) although rdf is a
        # documented format and is handled by the `include` branch below.
        _valid_formats = [
            'tab', 'xls', 'fasta', 'gff', 'txt', 'xml', 'rdf', 'rss',
            'list', 'html'
        ]
        check_param_in_list(frmt, _valid_formats)
        params['format'] = frmt

    if columns is not None:
        check_param_in_list(frmt, ["tab", "xls"])
        # Splitting a comma-free string yields a one-item list, so single
        # and multiple columns go through the same stripping/validation.
        columns = [name.strip() for name in columns.split(",")]
        for name in columns:
            # database(...) columns are parameterised and cannot be
            # enumerated, so they are accepted as-is.
            if not name.startswith("database("):
                check_param_in_list(name, self._valid_columns)
        params['columns'] = ",".join(columns)

    if include is True and frmt in ["fasta", "rdf"]:
        params['include'] = 'yes'

    if compress is True:
        params['compress'] = 'yes'

    if sort:
        check_param_in_list(sort, ["score"])
        params['sort'] = sort

    # Non-integer paging values are dropped rather than rejected.
    if offset is not None and isinstance(offset, int):
        params['offset'] = offset
    if limit is not None and isinstance(limit, int):
        params['limit'] = limit

    params['query'] = query.replace("+", " ")

    return self.http_get("uniprot/", frmt="txt", params=params,
                         headers=self.__headers)
def search(self, query, frmt='json', facets=None, number=None,
           filters=None, first=None):
    """Search for a complex inside Intact complex.

    :param query: the query (e.g., ndc80); appended to ``search/``.
    :param frmt: ``'json'`` (default) returns the decoded response;
        ``'pandas'`` returns a pandas ``DataFrame`` built from the
        ``'elements'`` entry of the response (pandas must be installed).
    :param facets: list of facets as a string (separated by commas).
    :param number: how many results are wanted (pagination).
    :param filters: list of filters.
    :param first: offset of the first result (pagination). Defaults to
        ``None``, which is what was always sent previously.
    :return: the ``http_get`` result, or a ``DataFrame`` for
        ``frmt='pandas'``.

    Examples::

        s = Intact()
        # search for ndc80
        s.search('ndc80')

        # Search for ndc80 and facet with the species field:
        s.search('ndc80', facets='species_f')

        # Search for ndc80 and facet with the species and biological
        # role fields:
        s.search('ndc80', facets='species_f,pbiorole_f')

        # Search for ndc80, facet with the species and biological role
        # fields and filter the species using human:
        s.search('Ndc80', first=0, number=10,
                 filters='species_f:("Homo sapiens")',
                 facets='species_f,ptype_f,pbiorole_f')

        # Same, filtering the species using human or mouse:
        s.search('Ndc80', first=0, number=10,
                 filters='species_f:("Homo sapiens" "Mus musculus")',
                 facets='species_f,ptype_f,pbiorole_f')

        # Search with a wildcard to retrieve all the information:
        s.search('*')

        # Wildcard search, facet with the species, biological role and
        # interactor type fields:
        s.search('*', facets='species_f,pbiorole_f,ptype_f')

        # ... and filter the interactor type using small molecule:
        s.search('*', facets='species_f,pbiorole_f,ptype_f',
                 filters='ptype_f:("small molecule")')

        # ... and additionally filter the species using human:
        s.search('*', facets='species_f,pbiorole_f,ptype_f',
                 filters='ptype_f:("small molecule"),species_f:("Homo sapiens")')

        # Search for GO:0016491 and paginate (first is the offset and
        # number is how many results are wanted):
        s.search('GO:0016491', first=10, number=10)
    """
    check_param_in_list(frmt, ['pandas', 'json'])

    # The service is always asked for json; 'pandas' is only a local
    # post-processing of that json answer.
    params = {
        'format': 'json',
        'facets': facets,
        'first': first,
        'number': number,
        'filters': filters
    }
    result = self.http_get('search/' + query, frmt="json", params=params)

    if frmt == 'pandas':
        import pandas as pd
        return pd.DataFrame(result['elements'])
    return result