def get(self, service, **kwargs):
    """Send an HTTP request to a BOLD web service and parse the reply.

    Args:
        service: The BOLD API alias to interact with. The aliases handled
            here are 'call_id', 'call_taxon_search', 'call_taxon_data',
            'call_specimen_data', 'call_sequence_data', 'call_full_data'
            and 'call_trace_files'; any other value sends an empty query.
        kwargs: Parameters sent by the user. ``url`` is always required;
            the other keys read depend on ``service`` (for example ``seq``
            and ``db`` for 'call_id').

    Returns:
        A Response instance after its ``_parse_data`` method has been fed
        the downloaded payload (raw bytes for 'call_trace_files', text for
        every other service). According to the original documentation the
        parsed data is exposed as the attribute `items`.
    """
    params = ''
    if service == 'call_id':
        sequence = utils._prepare_sequence(kwargs['seq'])
        params = _urlencode({'db': kwargs['db'], 'sequence': sequence})
    elif service == 'call_taxon_search':
        # Only the literal True enables fuzzy matching.
        fuzzy = 'true' if kwargs['fuzzy'] is True else 'false'
        params = _urlencode({
            'taxName': kwargs['taxonomic_identification'],
            'fuzzy': fuzzy,
        })
    elif service == 'call_taxon_data':
        payload = {
            'taxId': kwargs['tax_id'],
            'dataTypes': kwargs['data_type'],
        }
        # Anything other than the literal False requests the tree.
        if kwargs['include_tree'] is not False:
            payload['includeTree'] = 'true'
        params = _urlencode(payload)
    elif service in ('call_specimen_data', 'call_sequence_data',
                     'call_full_data', 'call_trace_files'):
        # Forward every user-supplied filter except the endpoint itself.
        payload = {k: v for k, v in kwargs.items()
                   if v is not None and k != 'url'}
        params = _urlencode(payload)

    url = kwargs['url'] + "?" + params
    req = _Request(url, headers={'User-Agent': 'BiopythonClient'})
    handle = _urlopen(req)
    response = Response()
    # Always release the connection, even if reading fails part-way.
    try:
        raw = handle.read()
    finally:
        handle.close()

    if service == 'call_trace_files':
        # Trace files are binary; hand the bytes over untouched.
        response._parse_data(service, raw)
    else:
        response._parse_data(service, _as_string(raw))
    return response
def _open(cgi, params=None, post=False):
    """Build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.

    Arguments:
     - cgi - the URL for the cgi script to access.
     - params - a dictionary with the options to pass to it. Entries whose
       value is None are removed from it, and "tool" / "email" entries are
       added when missing. If omitted, a fresh dictionary is used for
       each call.
     - post - if true the options are sent with HTTP POST, otherwise they
       are appended to the URL and sent with HTTP GET.

    Any error raised by the underlying URL opener (for example an HTTP
    error) propagates to the caller unchanged.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.

    Returns the value of ``_binary_to_string_handle`` applied to the
    opened handle.
    """
    # A shared ``{}`` default would accumulate tool/email between calls.
    if params is None:
        params = {}

    # NCBI requirement: At most three queries per second.
    # Equivalently, at least a third of second between queries
    delay = 0.333333334
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current

    # Remove None values from the parameters. Iterate over a snapshot:
    # deleting from a dict while iterating it raises RuntimeError.
    for key, value in list(params.items()):
        if value is None:
            del params[key]

    # Tell Entrez that we are using Biopython (or whatever the user has
    # specified explicitly in the parameters or by changing the default)
    if "tool" not in params:
        params["tool"] = tool

    # Tell Entrez who we are
    if "email" not in params:
        if email is not None:
            params["email"] = email
        else:
            warnings.warn("""
Email address is not specified.

To make use of NCBI's E-utilities, NCBI strongly recommends you to specify
your email address with each request. From June 1, 2010, this will be
mandatory. As an example, if your email address is [email protected], you can
specify it as follows:
   from Bio import Entrez
   Entrez.email = '*****@*****.**'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.""", UserWarning)

    # Open a handle to Entrez.
    options = _urlencode(params, doseq=True)
    if post:
        # HTTP POST
        handle = _urlopen(cgi, data=_as_bytes(options))
    else:
        # HTTP GET
        cgi += "?" + options
        handle = _urlopen(cgi)

    return _binary_to_string_handle(handle)
def search(text, output_format="tab", sort="score", oragnism="", columns=(),
           isoform=False, compress=False, offset=0, limit=0):
    """Run a query against the UniProt API and return a text handle.

    The mandatory query fields (query, format, sort, offset) are always
    sent; organism, columns, isoform, compress and limit are only added
    when the corresponding argument is truthy.

    More at: https://www.uniprot.org/help/api_queries
    """
    variables = {
        "query": text,
        "format": output_format,
        "sort": sort,
        "offset": str(offset),
    }

    if oragnism:
        variables["organism"] = oragnism
    if columns:
        variables["columns"] = ",".join(columns)
    # The two boolean switches are both transmitted as the string "Yes".
    for field, enabled in (("isoform", isoform), ("compress", compress)):
        if enabled:
            variables[field] = "Yes"
    if limit:
        variables["limit"] = str(limit)

    endpoint = "https://www.uniprot.org/uniprot/?"
    raw_handle = _urlopen(endpoint + _urlencode(variables))
    return _binary_to_string_handle(raw_handle)
def _open(url, post=None):
    """Open a rate-limited handle to a TogoWS URL (PRIVATE).

    Errors raised while opening the URL propagate to the caller.

    In the absence of clear guidelines, requests are throttled to
    "up to three queries per second" to avoid abusing the TogoWS servers.
    When ``post`` is truthy it is URL-encoded and sent as the request body;
    otherwise the URL is opened without a body.
    """
    min_gap = 0.333333333  # one third of a second
    now = time.time()
    pause = _open.previous + min_gap - now
    if pause > 0:
        time.sleep(pause)
    # Record when this request is (or would have been) released.
    _open.previous = now + pause if pause > 0 else now

    if post:
        raw_handle = _urlopen(url, _as_bytes(_urlencode(post)))
    else:
        raw_handle = _urlopen(url)

    # TogoWS is trusted to set an HTTP error code on failure, so the
    # start of the returned data is no longer inspected.
    return _binary_to_string_handle(raw_handle)
def _encode_options(ecitmatch, params):
    """URL-encode ``params`` (sequences expanded into repeated keys).

    When ``ecitmatch`` is true, encoded pipes (``%7C``) are turned back
    into literal ``|`` characters, which NCBI expects for ECitMatch.
    """
    encoded = _urlencode(params, doseq=True)
    if not ecitmatch:
        return encoded
    # _urlencode escapes pipes; ECitMatch needs them verbatim.
    return encoded.replace('%7C', '|')
def _open(cgi, params=None, post=False):
    """Build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.

    Arguments:
     - cgi - the URL for the cgi script to access.
     - params - a dictionary with the options to pass to it. Entries whose
       value is None are removed from it, and "tool" / "email" entries are
       added when missing. If omitted, a fresh dictionary is used for
       each call.
     - post - if true the options are sent with HTTP POST, otherwise they
       are appended to the URL and sent with HTTP GET.

    Any error raised by the underlying URL opener (for example an HTTP
    error) propagates to the caller unchanged.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.

    Returns the value of ``_binary_to_string_handle`` applied to the
    opened handle.
    """
    # A shared ``{}`` default would accumulate tool/email between calls.
    if params is None:
        params = {}

    # NCBI requirement: At most three queries per second.
    # Equivalently, at least a third of second between queries
    delay = 0.333333334
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current

    # Remove None values from the parameters (iterate over a snapshot so
    # the dictionary can be modified safely).
    for key, value in list(params.items()):
        if value is None:
            del params[key]

    # Tell Entrez that we are using Biopython (or whatever the user has
    # specified explicitly in the parameters or by changing the default)
    if "tool" not in params:
        params["tool"] = tool

    # Tell Entrez who we are
    if "email" not in params:
        if email is not None:
            params["email"] = email
        else:
            warnings.warn(
                """
Email address is not specified.

To make use of NCBI's E-utilities, NCBI requires you to specify your
email address with each request. As an example, if your email address
is [email protected], you can specify it as follows:
   from Bio import Entrez
   Entrez.email = '*****@*****.**'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.""", UserWarning)

    # Open a handle to Entrez.
    options = _urlencode(params, doseq=True)
    if post:
        # HTTP POST
        handle = _urlopen(cgi, data=_as_bytes(options))
    else:
        # HTTP GET
        cgi += "?" + options
        handle = _urlopen(cgi)

    return _binary_to_string_handle(handle)
def scan(seq="", mirror='http://www.expasy.org', output='xml', **keywords):
    """Execute a ScanProsite search and return the open result handle.

    Arguments:
     - mirror: The ScanProsite mirror to be used
       (default: http://www.expasy.org).
     - seq: The query sequence, or UniProtKB (Swiss-Prot, TrEMBL) accession
     - output: Format of the search results (default: xml)

    Further search parameters can be passed as keywords (those set to None
    are left out of the request); see the documentation for programmatic
    access to ScanProsite at
    http://www.expasy.org/tools/scanprosite/ScanPrositeREST.html
    for a description of such parameters.

    Search results in the XML format can be parsed into a Python object
    with the Bio.ExPASy.ScanProsite.read function.
    """
    parameters = {'seq': seq, 'output': output}
    parameters.update(
        (name, setting) for name, setting in keywords.items()
        if setting is not None
    )
    query_string = _urlencode(parameters)
    return _urlopen("%s/cgi-bin/prosite/PSScan.cgi?%s" % (mirror, query_string))
def scan(seq="", mirror='https://www.expasy.org', output='xml', **keywords):
    """Execute a ScanProsite search and return the open result handle.

    Arguments:
     - mirror: The ScanProsite mirror to be used
       (default: https://www.expasy.org).
     - seq: The query sequence, or UniProtKB (Swiss-Prot, TrEMBL) accession
     - output: Format of the search results (default: xml)

    Further search parameters can be passed as keywords (those set to None
    are left out of the request); see the documentation for programmatic
    access to ScanProsite at
    https://www.expasy.org/tools/scanprosite/ScanPrositeREST.html
    for a description of such parameters.

    Search results in the XML format can be parsed into a Python object
    with the Bio.ExPASy.ScanProsite.read function.
    """
    # Start from the two fixed fields, then append every keyword that
    # was actually given a value.
    extras = {name: setting for name, setting in keywords.items()
              if setting is not None}
    parameters = {'seq': seq, 'output': output}
    parameters.update(extras)

    url = "%s/cgi-bin/prosite/PSScan.cgi?%s" % (mirror, _urlencode(parameters))
    return _urlopen(url)
def sprot_search_de(text, swissprot=1, trembl=None,
                    cgi='http://www.expasy.ch/cgi-bin/sprot-search-de'):
    """Search SwissProt (BROKEN).

    Search by name, description, gene name, species, or organelle.
    The Swiss-Prot and TrEMBL sections are switched on with the truthy
    ``swissprot`` and ``trembl`` arguments; a text handle is returned.
    """
    variables = {'SEARCH': text}
    # Each selected section is sent as a checkbox-style "on" field.
    for field, wanted in (('S', swissprot), ('T', trembl)):
        if wanted:
            variables[field] = 'on'
    fullcgi = "%s?%s" % (cgi, _urlencode(variables))
    return _binary_to_string_handle(_urlopen(fullcgi))
def sprot_search_ful(text, make_wild=None, swissprot=1, trembl=None,
                     cgi='http://www.expasy.ch/cgi-bin/sprot-search-ful'):
    """Search SwissProt by full text (BROKEN).

    ``make_wild``, ``swissprot`` and ``trembl`` each switch on the
    matching form field when truthy; a text handle is returned.
    """
    variables = {'SEARCH': text}
    # Each enabled option is sent as a checkbox-style "on" field.
    switches = (('makeWild', make_wild), ('S', swissprot), ('T', trembl))
    for field, wanted in switches:
        if wanted:
            variables[field] = 'on'
    fullcgi = "%s?%s" % (cgi, _urlencode(variables))
    return _binary_to_string_handle(_urlopen(fullcgi))
def sprot_search_de(text, swissprot=1, trembl=None,
                    cgi='http://www.expasy.ch/cgi-bin/sprot-search-de'):
    """Search SwissProt and return the open handle.

    Search by name, description, gene name, species, or organelle.
    The Swiss-Prot and TrEMBL sections are switched on with the truthy
    ``swissprot`` and ``trembl`` arguments.
    """
    variables = {'SEARCH': text}
    # Each selected section is sent as a checkbox-style "on" field.
    for field, wanted in (('S', swissprot), ('T', trembl)):
        if wanted:
            variables[field] = 'on'
    return _urlopen("%s?%s" % (cgi, _urlencode(variables)))
def sprot_search_ful(text, make_wild=None, swissprot=1, trembl=None,
                     cgi='http://www.expasy.ch/cgi-bin/sprot-search-ful'):
    """Search SwissProt by full text and return the open handle.

    ``make_wild``, ``swissprot`` and ``trembl`` each switch on the
    matching form field when truthy.
    """
    variables = {'SEARCH': text}
    # Each enabled option is sent as a checkbox-style "on" field.
    switches = (('makeWild', make_wild), ('S', swissprot), ('T', trembl))
    for field, wanted in switches:
        if wanted:
            variables[field] = 'on'
    return _urlopen("%s?%s" % (cgi, _urlencode(variables)))
def fill_hot_cache(self):
    """Download the query result and store the raw payload in ``hot_cache``.

    The request URL is ``self.url`` followed by the URL-encoded
    ``self.query``. The handle is closed even when reading fails, so a
    failed download does not leak the connection.
    """
    url = self.url + _urlencode(self.query)
    fh = _urlopen(url)
    try:
        self.hot_cache = fh.read()
    finally:
        fh.close()
def qblast(
    program, database, sequence, url_base=NCBI_BLAST_URL,
    auto_format=None, composition_based_statistics=None,
    db_genetic_code=None, endpoints=None, entrez_query="(none)",
    expect=10.0, filter=None, gapcosts=None, genetic_code=None,
    hitlist_size=50, i_thresh=None, layout=None, lcase_mask=None,
    matrix_name=None, nucl_penalty=None, nucl_reward=None,
    other_advanced=None, perc_ident=None, phi_pattern=None,
    query_file=None, query_believe_defline=None, query_from=None,
    query_to=None, searchsp_eff=None, service=None, threshold=None,
    ungapped_alignment=None, word_size=None, short_query=None,
    alignments=500, alignment_view=None, descriptions=500,
    entrez_links_new_window=None, expect_low=None, expect_high=None,
    format_entrez_query=None, format_object=None, format_type="XML",
    ncbi_gi=None, results_file=None, show_overview=None, megablast=None,
    template_type=None, template_length=None,
):
    """BLAST search using NCBI's QBLAST server or a cloud service provider.

    Supports all parameters of the old qblast API for Put and Get.

    Please note that NCBI uses the new Common URL API for BLAST searches
    on the internet (http://ncbi.github.io/blast-cloud/dev/api.html). Thus,
    some of the parameters used by this function are not (or are no longer)
    officially supported by NCBI. Although they are still functioning, this
    may change in the future.

    The Common URL API (http://ncbi.github.io/blast-cloud/dev/api.html) allows
    doing BLAST searches on cloud servers. To use this feature, please set
    ``url_base='http://host.my.cloud.service.provider.com/cgi-bin/blast.cgi'``
    and ``format_object='Alignment'``. For more details, please see
    https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=CloudBlast

    Some useful parameters:

     - program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
     - database       Which database to search against (e.g. "nr").
     - sequence       The sequence to search.
     - ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
     - descriptions   Number of descriptions to show.  Def 500.
     - alignments     Number of alignments to show.  Def 500.
     - expect         An expect value cutoff.  Def 10.0.
     - matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
     - filter         "none" turns off filtering.  Default no filtering
     - format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
     - entrez_query   Entrez query to limit Blast search
     - hitlist_size   Number of hits to return. Default 50
     - megablast      TRUE/FALSE whether to use MEga BLAST algorithm (blastn only)
     - short_query    TRUE/FALSE whether to adjust the search parameters for a
                      short query sequence. Note that this will override
                      manually set parameters like word size and e value. Turns
                      off when sequence length is > 30 residues. Default: None.
     - service        plain, psi, phi, rpsblast, megablast (lower case)

    Apart from checking ``program`` against the supported list (a
    ValueError is raised otherwise), this function does no checking of the
    validity of the parameters and passes the values to the server as is.
    More help is available at:
    https://ncbi.github.io/blast-cloud/dev/api.html

    The function blocks, polling the server, until the results page no
    longer reports a pending status, and returns that page wrapped in a
    StringIO handle.
    """
    import time

    programs = ["blastn", "blastp", "blastx", "tblastn", "tblastx"]
    if program not in programs:
        raise ValueError("Program specified is %s. Expected one of %s" % (program, ", ".join(programs)))

    # SHORT_QUERY_ADJUST throws an error when using blastn (wrong parameter
    # assignment from NCBIs side).
    # Thus we set the (known) parameters directly:
    if short_query and program == "blastn":
        # Never forward the flag itself for blastn.
        short_query = None
        # We only use the 'short-query' parameters for short sequences:
        if len(sequence) < 31:
            expect = 1000
            word_size = 7
            nucl_reward = 1
            filter = None
            lcase_mask = None
            warnings.warn(
                '"SHORT_QUERY_ADJUST" is incorrectly implemented '
                "(by NCBI) for blastn. We bypass the problem by "
                "manually adjusting the search parameters. Thus, "
                "results may slightly differ from web page "
                "searches.", BiopythonWarning)

    # Format the "Put" command, which sends search requests to qblast.
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
    # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
    # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
    # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
    parameters = [
        ("AUTO_FORMAT", auto_format),
        ("COMPOSITION_BASED_STATISTICS", composition_based_statistics),
        ("DATABASE", database),
        ("DB_GENETIC_CODE", db_genetic_code),
        ("ENDPOINTS", endpoints),
        ("ENTREZ_QUERY", entrez_query),
        ("EXPECT", expect),
        ("FILTER", filter),
        ("GAPCOSTS", gapcosts),
        ("GENETIC_CODE", genetic_code),
        ("HITLIST_SIZE", hitlist_size),
        ("I_THRESH", i_thresh),
        ("LAYOUT", layout),
        ("LCASE_MASK", lcase_mask),
        ("MEGABLAST", megablast),
        ("MATRIX_NAME", matrix_name),
        ("NUCL_PENALTY", nucl_penalty),
        ("NUCL_REWARD", nucl_reward),
        ("OTHER_ADVANCED", other_advanced),
        ("PERC_IDENT", perc_ident),
        ("PHI_PATTERN", phi_pattern),
        ("PROGRAM", program),
        # ('PSSM',pssm), - It is possible to use PSI-BLAST via this API?
        ("QUERY", sequence),
        ("QUERY_FILE", query_file),
        ("QUERY_BELIEVE_DEFLINE", query_believe_defline),
        ("QUERY_FROM", query_from),
        ("QUERY_TO", query_to),
        # ('RESULTS_FILE',...), - Can we use this parameter?
        ("SEARCHSP_EFF", searchsp_eff),
        ("SERVICE", service),
        ("SHORT_QUERY_ADJUST", short_query),
        ("TEMPLATE_TYPE", template_type),
        ("TEMPLATE_LENGTH", template_length),
        ("THRESHOLD", threshold),
        ("UNGAPPED_ALIGNMENT", ungapped_alignment),
        ("WORD_SIZE", word_size),
        ("CMD", "Put"),
    ]
    # Only parameters that were given a value are transmitted.
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Send off the initial query to qblast.
    # Note the NCBI do not currently impose a rate limit here, other
    # than the request not to make say 50 queries at once using multiple
    # threads.
    request = _Request(url_base, message, {"User-Agent": "BiopythonClient"})
    handle = _urlopen(request)

    # Format the "Get" command, which gets the formatted results from qblast
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007
    # NOTE: only rid is used below; rtoe is not referenced again here.
    rid, rtoe = _parse_qblast_ref_page(handle)
    parameters = [
        ("ALIGNMENTS", alignments),
        ("ALIGNMENT_VIEW", alignment_view),
        ("DESCRIPTIONS", descriptions),
        ("ENTREZ_LINKS_NEW_WINDOW", entrez_links_new_window),
        ("EXPECT_LOW", expect_low),
        ("EXPECT_HIGH", expect_high),
        ("FORMAT_ENTREZ_QUERY", format_entrez_query),
        ("FORMAT_OBJECT", format_object),
        ("FORMAT_TYPE", format_type),
        ("NCBI_GI", ncbi_gi),
        ("RID", rid),
        ("RESULTS_FILE", results_file),
        ("SERVICE", service),
        ("SHOW_OVERVIEW", show_overview),
        ("CMD", "Get"),
    ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Poll NCBI until the results are ready.
    # https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
    # 1. Do not contact the server more often than once every 10 seconds.
    # 2. Do not poll for any single RID more often than once a minute.
    # 3. Use the URL parameter email and tool, so that the NCBI
    #    can contact you if there is a problem.
    # 4. Run scripts weekends or between 9 pm and 5 am Eastern time
    #    on weekdays if more than 50 searches will be submitted.
    # --
    # Could start with a 10s delay, but expect most short queries
    # will take longer thus at least 70s with delay. Therefore,
    # start with 20s delay, thereafter once a minute.
    delay = 20  # seconds
    while True:
        current = time.time()
        # qblast._previous is a function attribute that is not set inside
        # this block - presumably initialised next to the definition;
        # verify before moving this function.
        wait = qblast._previous + delay - current
        if wait > 0:
            time.sleep(wait)
            qblast._previous = current + wait
        else:
            qblast._previous = current
        # delay by at least 60 seconds only if running the request against the public NCBI API
        if delay < 60 and url_base == NCBI_BLAST_URL:
            # Wasn't a quick return, must wait at least a minute
            delay = 60

        request = _Request(url_base, message, {"User-Agent": "BiopythonClient"})
        handle = _urlopen(request)
        results = _as_string(handle.read())

        # Can see an "\n\n" page while results are in progress,
        # if so just wait a bit longer...
        if results == "\n\n":
            continue
        # XML results don't have the Status tag when finished
        if "Status=" not in results:
            break
        # Extract the text between "Status=" and the end of that line.
        i = results.index("Status=")
        j = results.index("\n", i)
        status = results[i + len("Status="):j].strip()
        if status.upper() == "READY":
            break

    return StringIO(results)
def qblast(
    program, database, sequence, url_base=NCBI_BLAST_URL,
    auto_format=None, composition_based_statistics=None,
    db_genetic_code=None, endpoints=None, entrez_query='(none)',
    expect=10.0, filter=None, gapcosts=None, genetic_code=None,
    hitlist_size=50, i_thresh=None, layout=None, lcase_mask=None,
    matrix_name=None, nucl_penalty=None, nucl_reward=None,
    other_advanced=None, perc_ident=None, phi_pattern=None,
    query_file=None, query_believe_defline=None, query_from=None,
    query_to=None, searchsp_eff=None, service=None, threshold=None,
    ungapped_alignment=None, word_size=None,
    alignments=500, alignment_view=None, descriptions=500,
    entrez_links_new_window=None, expect_low=None, expect_high=None,
    format_entrez_query=None, format_object=None, format_type='XML',
    ncbi_gi=None, results_file=None, show_overview=None, megablast=None,
    template_type=None, template_length=None,
):
    """BLAST search using NCBI's QBLAST server or a cloud service provider.

    Supports all parameters of the qblast API for Put and Get.

    Please note that BLAST on the cloud supports the NCBI-BLAST Common
    URL API (http://ncbi.github.io/blast-cloud/dev/api.html). To
    use this feature, please set url_base to
    'http://host.my.cloud.service.provider.com/cgi-bin/blast.cgi' and
    format_object='Alignment'. For more details, please see
    https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=CloudBlast

    Some useful parameters:

     - program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
     - database       Which database to search against (e.g. "nr").
     - sequence       The sequence to search.
     - ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
     - descriptions   Number of descriptions to show.  Def 500.
     - alignments     Number of alignments to show.  Def 500.
     - expect         An expect value cutoff.  Def 10.0.
     - matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
     - filter         "none" turns off filtering.  Default no filtering
     - format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
     - entrez_query   Entrez query to limit Blast search
     - hitlist_size   Number of hits to return. Default 50
     - megablast      TRUE/FALSE whether to use MEga BLAST algorithm (blastn only)
     - service        plain, psi, phi, rpsblast, megablast (lower case)

    Apart from checking ``program`` against the supported list (a
    ValueError is raised otherwise), this function does no checking of the
    validity of the parameters and passes the values to the server as is.
    More help is available at:
    https://ncbi.github.io/blast-cloud/dev/api.html

    The function blocks, polling the server, until the results page no
    longer reports a pending status, and returns that page wrapped in a
    StringIO handle.
    """
    import time

    programs = ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx']
    if program not in programs:
        raise ValueError("Program specified is %s. Expected one of %s" % (program, ", ".join(programs)))

    # Format the "Put" command, which sends search requests to qblast.
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
    # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
    # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
    # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
    parameters = [
        ('AUTO_FORMAT', auto_format),
        ('COMPOSITION_BASED_STATISTICS', composition_based_statistics),
        ('DATABASE', database),
        ('DB_GENETIC_CODE', db_genetic_code),
        ('ENDPOINTS', endpoints),
        ('ENTREZ_QUERY', entrez_query),
        ('EXPECT', expect),
        ('FILTER', filter),
        ('GAPCOSTS', gapcosts),
        ('GENETIC_CODE', genetic_code),
        ('HITLIST_SIZE', hitlist_size),
        ('I_THRESH', i_thresh),
        ('LAYOUT', layout),
        ('LCASE_MASK', lcase_mask),
        ('MEGABLAST', megablast),
        ('MATRIX_NAME', matrix_name),
        ('NUCL_PENALTY', nucl_penalty),
        ('NUCL_REWARD', nucl_reward),
        ('OTHER_ADVANCED', other_advanced),
        ('PERC_IDENT', perc_ident),
        ('PHI_PATTERN', phi_pattern),
        ('PROGRAM', program),
        # ('PSSM',pssm), - It is possible to use PSI-BLAST via this API?
        ('QUERY', sequence),
        ('QUERY_FILE', query_file),
        ('QUERY_BELIEVE_DEFLINE', query_believe_defline),
        ('QUERY_FROM', query_from),
        ('QUERY_TO', query_to),
        # ('RESULTS_FILE',...), - Can we use this parameter?
        ('SEARCHSP_EFF', searchsp_eff),
        ('SERVICE', service),
        ('TEMPLATE_TYPE', template_type),
        ('TEMPLATE_LENGTH', template_length),
        ('THRESHOLD', threshold),
        ('UNGAPPED_ALIGNMENT', ungapped_alignment),
        ('WORD_SIZE', word_size),
        ('CMD', 'Put'),
    ]
    # Only parameters that were given a value are transmitted.
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Send off the initial query to qblast.
    # Note the NCBI do not currently impose a rate limit here, other
    # than the request not to make say 50 queries at once using multiple
    # threads.
    request = _Request(url_base, message, {"User-Agent": "BiopythonClient"})
    handle = _urlopen(request)

    # Format the "Get" command, which gets the formatted results from qblast
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007
    # NOTE: only rid is used below; rtoe is not referenced again here.
    rid, rtoe = _parse_qblast_ref_page(handle)
    parameters = [
        ('ALIGNMENTS', alignments),
        ('ALIGNMENT_VIEW', alignment_view),
        ('DESCRIPTIONS', descriptions),
        ('ENTREZ_LINKS_NEW_WINDOW', entrez_links_new_window),
        ('EXPECT_LOW', expect_low),
        ('EXPECT_HIGH', expect_high),
        ('FORMAT_ENTREZ_QUERY', format_entrez_query),
        ('FORMAT_OBJECT', format_object),
        ('FORMAT_TYPE', format_type),
        ('NCBI_GI', ncbi_gi),
        ('RID', rid),
        ('RESULTS_FILE', results_file),
        ('SERVICE', service),
        ('SHOW_OVERVIEW', show_overview),
        ('CMD', 'Get'),
    ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Poll NCBI until the results are ready.
    # https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
    # 1. Do not contact the server more often than once every 10 seconds.
    # 2. Do not poll for any single RID more often than once a minute.
    # 3. Use the URL parameter email and tool, so that the NCBI
    #    can contact you if there is a problem.
    # 4. Run scripts weekends or between 9 pm and 5 am Eastern time
    #    on weekdays if more than 50 searches will be submitted.
    # --
    # Could start with a 10s delay, but expect most short queries
    # will take longer thus at least 70s with delay. Therefore,
    # start with 20s delay, thereafter once a minute.
    delay = 20  # seconds
    while True:
        current = time.time()
        # qblast._previous is a function attribute that is not set inside
        # this block - presumably initialised next to the definition;
        # verify before moving this function.
        wait = qblast._previous + delay - current
        if wait > 0:
            time.sleep(wait)
            qblast._previous = current + wait
        else:
            qblast._previous = current
        # delay by at least 60 seconds only if running the request against the public NCBI API
        if delay < 60 and url_base == NCBI_BLAST_URL:
            # Wasn't a quick return, must wait at least a minute
            delay = 60

        request = _Request(url_base, message, {"User-Agent": "BiopythonClient"})
        handle = _urlopen(request)
        results = _as_string(handle.read())

        # Can see an "\n\n" page while results are in progress,
        # if so just wait a bit longer...
        if results == "\n\n":
            continue
        # XML results don't have the Status tag when finished
        if "Status=" not in results:
            break
        # Extract the text between "Status=" and the end of that line.
        i = results.index("Status=")
        j = results.index("\n", i)
        status = results[i + len("Status="):j].strip()
        if status.upper() == "READY":
            break

    return StringIO(results)
def qblast(program, database, sequence, url_base=NCBI_BLAST_URL,
           auto_format=None, composition_based_statistics=None,
           db_genetic_code=None, endpoints=None, entrez_query='(none)',
           expect=10.0, filter=None, gapcosts=None, genetic_code=None,
           hitlist_size=50, i_thresh=None, layout=None, lcase_mask=None,
           matrix_name=None, nucl_penalty=None, nucl_reward=None,
           other_advanced=None, perc_ident=None, phi_pattern=None,
           query_file=None, query_believe_defline=None, query_from=None,
           query_to=None, searchsp_eff=None, service=None, threshold=None,
           ungapped_alignment=None, word_size=None,
           alignments=500, alignment_view=None, descriptions=500,
           entrez_links_new_window=None, expect_low=None, expect_high=None,
           format_entrez_query=None, format_object=None, format_type='XML',
           ncbi_gi=None, results_file=None, show_overview=None, megablast=None,
           ):
    """Do a BLAST search using the QBLAST server at NCBI or a cloud service provider.

    Supports all parameters of the qblast API for Put and Get.

    Please note that BLAST on the cloud supports the NCBI-BLAST Common
    URL API (http://ncbi.github.io/blast-cloud/dev/api.html). To
    use this feature, please set url_base to
    'http://host.my.cloud.service.provider.com/cgi-bin/blast.cgi' and
    format_object='Alignment'. For more details, please see
    https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=CloudBlast

    Some useful parameters:

     - program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
     - database       Which database to search against (e.g. "nr").
     - sequence       The sequence to search.
     - ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
     - descriptions   Number of descriptions to show.  Def 500.
     - alignments     Number of alignments to show.  Def 500.
     - expect         An expect value cutoff.  Def 10.0.
     - matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
     - filter         "none" turns off filtering.  Default no filtering
     - format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
     - entrez_query   Entrez query to limit Blast search
     - hitlist_size   Number of hits to return. Default 50
     - megablast      TRUE/FALSE whether to use MEga BLAST algorithm (blastn only)
     - service        plain, psi, phi, rpsblast, megablast (lower case)

    Apart from asserting that ``program`` is one of the supported names,
    this function does no checking of the validity of the parameters and
    passes the values to the server as is.  More help is available at:
    http://www.ncbi.nlm.nih.gov/BLAST/Doc/urlapi.html

    The function blocks, polling the server with an increasing delay,
    until the results page no longer reports a pending status, and returns
    that page wrapped in a StringIO handle.
    """
    import time

    # NOTE(review): assert statements are skipped when Python runs with -O,
    # so this check is not guaranteed to reject an unsupported program.
    assert program in ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx']

    # Format the "Put" command, which sends search requests to qblast.
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
    # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
    # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
    # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
    parameters = [
        ('AUTO_FORMAT', auto_format),
        ('COMPOSITION_BASED_STATISTICS', composition_based_statistics),
        ('DATABASE', database),
        ('DB_GENETIC_CODE', db_genetic_code),
        ('ENDPOINTS', endpoints),
        ('ENTREZ_QUERY', entrez_query),
        ('EXPECT', expect),
        ('FILTER', filter),
        ('GAPCOSTS', gapcosts),
        ('GENETIC_CODE', genetic_code),
        ('HITLIST_SIZE', hitlist_size),
        ('I_THRESH', i_thresh),
        ('LAYOUT', layout),
        ('LCASE_MASK', lcase_mask),
        ('MEGABLAST', megablast),
        ('MATRIX_NAME', matrix_name),
        ('NUCL_PENALTY', nucl_penalty),
        ('NUCL_REWARD', nucl_reward),
        ('OTHER_ADVANCED', other_advanced),
        ('PERC_IDENT', perc_ident),
        ('PHI_PATTERN', phi_pattern),
        ('PROGRAM', program),
        # ('PSSM',pssm), - It is possible to use PSI-BLAST via this API?
        ('QUERY', sequence),
        ('QUERY_FILE', query_file),
        ('QUERY_BELIEVE_DEFLINE', query_believe_defline),
        ('QUERY_FROM', query_from),
        ('QUERY_TO', query_to),
        # ('RESULTS_FILE',...), - Can we use this parameter?
        ('SEARCHSP_EFF', searchsp_eff),
        ('SERVICE', service),
        ('THRESHOLD', threshold),
        ('UNGAPPED_ALIGNMENT', ungapped_alignment),
        ('WORD_SIZE', word_size),
        ('CMD', 'Put'),
    ]
    # Only parameters that were given a value are transmitted.
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Send off the initial query to qblast.
    # Note the NCBI do not currently impose a rate limit here, other
    # than the request not to make say 50 queries at once using multiple
    # threads.
    request = _Request(url_base, message, {"User-Agent": "BiopythonClient"})
    handle = _urlopen(request)

    # Format the "Get" command, which gets the formatted results from qblast
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007
    # NOTE: only rid is used below; rtoe is not referenced again here.
    rid, rtoe = _parse_qblast_ref_page(handle)
    parameters = [
        ('ALIGNMENTS', alignments),
        ('ALIGNMENT_VIEW', alignment_view),
        ('DESCRIPTIONS', descriptions),
        ('ENTREZ_LINKS_NEW_WINDOW', entrez_links_new_window),
        ('EXPECT_LOW', expect_low),
        ('EXPECT_HIGH', expect_high),
        ('FORMAT_ENTREZ_QUERY', format_entrez_query),
        ('FORMAT_OBJECT', format_object),
        ('FORMAT_TYPE', format_type),
        ('NCBI_GI', ncbi_gi),
        ('RID', rid),
        ('RESULTS_FILE', results_file),
        ('SERVICE', service),
        ('SHOW_OVERVIEW', show_overview),
        ('CMD', 'Get'),
    ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Poll NCBI until the results are ready.  Use a backoff delay from 2 - 120 second wait
    delay = 2.0
    previous = time.time()
    while True:
        current = time.time()
        wait = previous + delay - current
        if wait > 0:
            time.sleep(wait)
            previous = current + wait
        else:
            previous = current
        # Grow the delay by half each round, capped at 120 seconds.
        if delay + .5 * delay <= 120:
            delay += .5 * delay
        else:
            delay = 120

        request = _Request(url_base, message, {"User-Agent": "BiopythonClient"})
        handle = _urlopen(request)
        results = _as_string(handle.read())

        # Can see an "\n\n" page while results are in progress,
        # if so just wait a bit longer...
        if results == "\n\n":
            continue
        # XML results don't have the Status tag when finished
        if "Status=" not in results:
            break
        # Extract the text between "Status=" and the end of that line.
        i = results.index("Status=")
        j = results.index("\n", i)
        status = results[i + len("Status="):j].strip()
        if status.upper() == "READY":
            break

    return StringIO(results)
def qblast(
        program, database, sequence,
        auto_format=None, composition_based_statistics=None,
        db_genetic_code=None, endpoints=None, entrez_query='(none)',
        expect=10.0, filter=None, gapcosts=None, genetic_code=None,
        hitlist_size=50, i_thresh=None, layout=None, lcase_mask=None,
        matrix_name=None, nucl_penalty=None, nucl_reward=None,
        other_advanced=None, perc_ident=None, phi_pattern=None,
        query_file=None, query_believe_defline=None, query_from=None,
        query_to=None, searchsp_eff=None, service=None, threshold=None,
        ungapped_alignment=None, word_size=None,
        alignments=500, alignment_view=None, descriptions=500,
        entrez_links_new_window=None, expect_low=None, expect_high=None,
        format_entrez_query=None, format_object=None, format_type='XML',
        ncbi_gi=None, results_file=None, show_overview=None, megablast=None,
        ):
    """Do a BLAST search using the QBLAST server at NCBI.

    Supports all parameters of the qblast API for Put and Get.
    Some useful parameters:

     - program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
     - database       Which database to search against (e.g. "nr").
     - sequence       The sequence to search.
     - ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
     - descriptions   Number of descriptions to show.  Def 500.
     - alignments     Number of alignments to show.  Def 500.
     - expect         An expect value cutoff.  Def 10.0.
     - matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
     - filter         "none" turns off filtering.  Default no filtering
     - format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
     - entrez_query   Entrez query to limit Blast search
     - hitlist_size   Number of hits to return. Default 50
     - megablast      TRUE/FALSE whether to use Mega BLAST algorithm (blastn only)
     - service        plain, psi, phi, rpsblast, megablast (lower case)

    Apart from checking ``program``, this function does no checking of the
    validity of the parameters and passes the values to the server as is.
    Arguments left as None are omitted from the request.  More help is
    available at: http://www.ncbi.nlm.nih.gov/BLAST/Doc/urlapi.html

    The search is submitted with a "Put" command, then the server is polled
    with a "Get" command (waiting 2 seconds at first and backing off by a
    factor of 1.5 up to 120 seconds) until the reply no longer reports a
    non-READY status.

    Returns a StringIO handle wrapping the text of the final reply.

    Raises ValueError if ``program`` is not one of the supported programs.
    """
    import time

    # Explicit check rather than ``assert`` so that the validation is still
    # performed when Python runs with -O (which strips assert statements).
    valid_programs = ('blastn', 'blastp', 'blastx', 'tblastn', 'tblastx')
    if program not in valid_programs:
        raise ValueError("Program specified is %s. Expected one of %s"
                         % (program, ", ".join(valid_programs)))

    # Single definition of the endpoint used by both the Put and Get requests.
    url_base = "http://blast.ncbi.nlm.nih.gov/Blast.cgi"
    headers = {"User-Agent": "BiopythonClient"}

    # Format the "Put" command, which sends search requests to qblast.
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
    # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
    # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
    # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
    parameters = [
        ('AUTO_FORMAT', auto_format),
        ('COMPOSITION_BASED_STATISTICS', composition_based_statistics),
        ('DATABASE', database),
        ('DB_GENETIC_CODE', db_genetic_code),
        ('ENDPOINTS', endpoints),
        ('ENTREZ_QUERY', entrez_query),
        ('EXPECT', expect),
        ('FILTER', filter),
        ('GAPCOSTS', gapcosts),
        ('GENETIC_CODE', genetic_code),
        ('HITLIST_SIZE', hitlist_size),
        ('I_THRESH', i_thresh),
        ('LAYOUT', layout),
        ('LCASE_MASK', lcase_mask),
        ('MEGABLAST', megablast),
        ('MATRIX_NAME', matrix_name),
        ('NUCL_PENALTY', nucl_penalty),
        ('NUCL_REWARD', nucl_reward),
        ('OTHER_ADVANCED', other_advanced),
        ('PERC_IDENT', perc_ident),
        ('PHI_PATTERN', phi_pattern),
        ('PROGRAM', program),
        # NOTE: 'PSSM' is not sent - is it possible to use PSI-BLAST via this API?
        ('QUERY', sequence),
        ('QUERY_FILE', query_file),
        ('QUERY_BELIEVE_DEFLINE', query_believe_defline),
        ('QUERY_FROM', query_from),
        ('QUERY_TO', query_to),
        # NOTE: 'RESULTS_FILE' is not sent with Put - can we use this parameter?
        ('SEARCHSP_EFF', searchsp_eff),
        ('SERVICE', service),
        ('THRESHOLD', threshold),
        ('UNGAPPED_ALIGNMENT', ungapped_alignment),
        ('WORD_SIZE', word_size),
        ('CMD', 'Put'),
    ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Send off the initial query to qblast.
    # Note the NCBI do not currently impose a rate limit here, other
    # than the request not to make say 50 queries at once using multiple
    # threads.
    request = _Request(url_base, message, headers)
    handle = _urlopen(request)
    try:
        # Only the request ID is needed below; the second value returned by
        # the parser is not used by this function.
        rid, _rtoe = _parse_qblast_ref_page(handle)
    finally:
        # Release the connection even if the reference page cannot be parsed.
        handle.close()

    # Format the "Get" command, which gets the formatted results from qblast
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007
    parameters = [
        ('ALIGNMENTS', alignments),
        ('ALIGNMENT_VIEW', alignment_view),
        ('DESCRIPTIONS', descriptions),
        ('ENTREZ_LINKS_NEW_WINDOW', entrez_links_new_window),
        ('EXPECT_LOW', expect_low),
        ('EXPECT_HIGH', expect_high),
        ('FORMAT_ENTREZ_QUERY', format_entrez_query),
        ('FORMAT_OBJECT', format_object),
        ('FORMAT_TYPE', format_type),
        ('NCBI_GI', ncbi_gi),
        ('RID', rid),
        ('RESULTS_FILE', results_file),
        ('SERVICE', service),
        ('SHOW_OVERVIEW', show_overview),
        ('CMD', 'Get'),
    ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Poll NCBI until the results are ready.  Use a backoff delay from 2 - 120 second wait
    delay = 2.0
    previous = time.time()
    while True:
        current = time.time()
        wait = previous + delay - current
        if wait > 0:
            time.sleep(wait)
            previous = current + wait
        else:
            previous = current

        # Grow the delay by half each round, capped at two minutes.
        delay = min(delay * 1.5, 120.0)

        request = _Request(url_base, message, headers)
        handle = _urlopen(request)
        try:
            results = _as_string(handle.read())
        finally:
            # One handle is opened per poll; close each one so that long
            # running searches do not accumulate open connections.
            handle.close()

        # Can see an "\n\n" page while results are in progress,
        # if so just wait a bit longer...
        if results == "\n\n":
            continue
        # XML results don't have the Status tag when finished
        if "Status=" not in results:
            break
        # Take the text between "Status=" and the end of that line; a missing
        # trailing newline simply means the status runs to the end of the text.
        status = results.partition("Status=")[2].partition("\n")[0].strip()
        if status.upper() == "READY":
            break

    return StringIO(results)
def _open(cgi, params=None, ecitmatch=False):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL for the cgi script to access.
    params is a dictionary with the options to pass to it; entries whose
    value is None are dropped.  The dictionary passed in is not modified.
    If ecitmatch is true, URL-encoded pipes (%7C) in the query are turned
    back into literal '|' characters.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers, and makes the request through POST
    rather than GET if more than 200 UIDs are given under "id" or if the
    number of characters in the resulting query is greater than 1000.

    The "id" option may be a comma separated string, a list of UIDs (each
    is converted with str), or a single value such as an integer.

    Returns the result of passing the opened handle through
    _binary_to_string_handle.  No error handling is done here; any
    _HTTPError raised while opening the URL propagates to the caller.
    """
    # Work on a private copy (minus the None values) so that the caller's
    # dictionary is left untouched by the additions made below.
    if params is None:
        params = {}
    else:
        params = dict((key, value) for key, value in params.items()
                      if value is not None)

    # NCBI requirement: At most three queries per second.
    # Equivalently, at least a third of second between queries
    delay = 0.333333334
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current

    # Tell Entrez that we are using Biopython (or whatever the user has
    # specified explicitly in the parameters or by changing the default)
    if "tool" not in params:
        params["tool"] = tool

    # Tell Entrez who we are
    if "email" not in params:
        if email is not None:
            params["email"] = email
        else:
            warnings.warn("""
Email address is not specified.
To make use of NCBI's E-utilities, NCBI requires you to specify your
email address with each request. As an example, if your email address
is [email protected], you can specify it as follows:
   from Bio import Entrez
   Entrez.email = '*****@*****.**'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.""", UserWarning)

    # By default, we do not force a POST request
    force_post = False

    # Make sure the UIDs are in the format UID,UID,...
    ids = params.get("id", None)
    if ids is not None:
        if isinstance(ids, list):
            # Convert the list [UID, UID, ...] into the string "UID,UID,...";
            # str() lets integer UIDs be joined as well.
            id_count = len(ids)
            params["id"] = ",".join(str(uid) for uid in ids)
        elif isinstance(ids, str):
            id_count = len(ids.split(","))
        else:
            try:
                id_count = len(ids)
            except TypeError:
                # A single UID given as a bare value (e.g. an integer).
                id_count = 1
        # If 200+ UIDs are given, force the POST request
        force_post = id_count > 200

    # Open a handle to Entrez.
    options = _urlencode(params, doseq=True)
    # _urlencode encodes pipes, which NCBI expects in ECitMatch
    if ecitmatch:
        options = options.replace('%7C', '|')

    if force_post or len(options) > 1000:
        # HTTP POST
        handle = _urlopen(cgi, data=_as_bytes(options))
    else:
        # HTTP GET
        handle = _urlopen(cgi + "?" + options)

    return _binary_to_string_handle(handle)
def _open(cgi, params=None, ecitmatch=False):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.

    Arguments:
     - cgi        The URL for the cgi script to access.
     - params     Dictionary with the options to pass to it.  Options set
                  to None are left out, and the dictionary itself is never
                  modified by this function.
     - ecitmatch  If true, pipes that were URL-encoded as %7C are restored
                  to '|' in the query string.

    The "id" option can be a "UID,UID,..." string, a list of UIDs (joined
    with commas after converting each with str), or one bare UID such as
    an integer.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.  The request is made through POST
    rather than GET when more than 200 UIDs are supplied, or when the
    number of characters in the resulting query is greater than 1000.

    The opened handle is passed through _binary_to_string_handle and the
    result returned.  Errors such as _HTTPError from opening the URL are
    not caught here and reach the caller unchanged.
    """
    # Build our own filtered dictionary: None values are removed and the
    # defaults added below never leak back into the caller's dictionary.
    query = {}
    if params is not None:
        for name, setting in params.items():
            if setting is not None:
                query[name] = setting

    # NCBI requirement: At most three queries per second.
    # Equivalently, at least a third of second between queries
    delay = 0.333333334
    now = time.time()
    pause = _open.previous + delay - now
    if pause > 0:
        time.sleep(pause)
        _open.previous = now + pause
    else:
        _open.previous = now

    # Tell Entrez that we are using Biopython (or whatever the user has
    # specified explicitly in the parameters or by changing the default)
    if "tool" not in query:
        query["tool"] = tool

    # Tell Entrez who we are
    if "email" not in query:
        if email is not None:
            query["email"] = email
        else:
            warnings.warn("""
Email address is not specified.
To make use of NCBI's E-utilities, NCBI requires you to specify your
email address with each request. As an example, if your email address
is [email protected], you can specify it as follows:
   from Bio import Entrez
   Entrez.email = '*****@*****.**'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.""", UserWarning)

    # Count the UIDs and make sure a list of them is sent as UID,UID,...
    uid_total = 0
    uids = query.get("id", None)
    if uids is not None:
        if isinstance(uids, str):
            uid_total = len(uids.split(","))
        elif isinstance(uids, list):
            uid_total = len(uids)
            # str() so that lists of integer UIDs can be joined too.
            query["id"] = ",".join([str(uid) for uid in uids])
        else:
            try:
                uid_total = len(uids)
            except TypeError:
                # Not a container: treat it as one single UID.
                uid_total = 1

    # Open a handle to Entrez.
    options = _urlencode(query, doseq=True)
    # _urlencode encodes pipes, which NCBI expects in ECitMatch
    if ecitmatch:
        options = options.replace('%7C', '|')

    # If 200+ UIDs are given, or the query is long, use HTTP POST
    if uid_total > 200 or len(options) > 1000:
        handle = _urlopen(cgi, data=_as_bytes(options))
    else:
        # HTTP GET
        handle = _urlopen(cgi + "?" + options)

    return _binary_to_string_handle(handle)
def _open(cgi, params=None, post=None, ecitmatch=False):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL for the cgi script to access.
    params is a dictionary with the options to pass to it; options whose
    value is None are omitted, and the dictionary passed in is not modified.

    The argument post should be a boolean to explicitly control if an HTTP
    POST should be used rather than an HTTP GET.  By default (post=None),
    the choice is based on the query length: POST is used if the encoded
    query would be over 1000 characters long, otherwise GET.

    If ecitmatch is true, pipes that were URL-encoded as %7C are restored
    to literal '|' characters in the query.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.

    Returns the result of passing the opened handle through
    _binary_to_string_handle.  No error handling is done here; any
    _HTTPError raised while opening the URL propagates to the caller.
    """
    # Take a filtered copy: drops the None values and keeps the additions
    # made below out of the caller's dictionary.
    if params is None:
        params = {}
    else:
        params = dict((key, value) for key, value in params.items()
                      if value is not None)

    # NCBI requirement: At most three queries per second.
    # Equivalently, at least a third of second between queries
    delay = 0.333333334
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current

    # Tell Entrez that we are using Biopython (or whatever the user has
    # specified explicitly in the parameters or by changing the default)
    if "tool" not in params:
        params["tool"] = tool

    # Tell Entrez who we are
    if "email" not in params:
        if email is not None:
            params["email"] = email
        else:
            warnings.warn("""
Email address is not specified.
To make use of NCBI's E-utilities, NCBI requires you to specify your
email address with each request. As an example, if your email address
is [email protected], you can specify it as follows:
   from Bio import Entrez
   Entrez.email = '*****@*****.**'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.""", UserWarning)

    # Open a handle to Entrez.
    options = _urlencode(params, doseq=True)
    # _urlencode encodes pipes, which NCBI expects in ECitMatch
    if ecitmatch:
        options = options.replace('%7C', '|')

    # By default, post is None. Set to a boolean to over-ride length choice:
    if post is None and len(options) > 1000:
        post = True

    if post:
        # HTTP POST
        handle = _urlopen(cgi, data=_as_bytes(options))
    else:
        # HTTP GET
        handle = _urlopen(cgi + "?" + options)

    return _binary_to_string_handle(handle)