def _load_bgzf_block(handle, text_mode=False):
    """Read and decompress the next BGZF block from handle (PRIVATE).

    Returns a tuple of the block size (the value stored in the BC
    subfield plus one) and the decompressed data. The data is passed
    through _as_string when text_mode is true, otherwise it is returned
    exactly as produced by zlib.

    Raises StopIteration when there is nothing left to read, ValueError
    when the block does not start with the BGZF magic bytes, and
    AssertionError when the extra field, size or CRC checks fail.
    """
    signature = handle.read(4)
    if not signature:
        # Nothing more to read, i.e. end of file
        raise StopIteration
    if signature != _bgzf_magic:
        raise ValueError(r"A BGZF (e.g. a BAM file) block should start with "
                         r"%r, not %r; handle.tell() now says %r"
                         % (_bgzf_magic, signature, handle.tell()))
    # Of the remaining fixed header (modification time, extra flags, OS,
    # extra field length) only the extra field length is needed here.
    extra_len = struct.unpack("<LBBH", handle.read(8))[3]

    # Walk the extra subfields looking for the single BC entry which
    # records the block size.
    block_size = None
    consumed = 0
    while consumed < extra_len:
        tag = handle.read(2)
        payload_len = struct.unpack("<H", handle.read(2))[0]  # uint16_t
        payload = handle.read(payload_len)
        consumed += 4 + payload_len
        if tag != _bytes_BC:
            continue
        assert payload_len == 2, "Wrong BC payload length"
        assert block_size is None, "Two BC subfields?"
        block_size = 1 + struct.unpack("<H", payload)[0]  # uint16_t
    assert consumed == extra_len, (consumed, extra_len)
    assert block_size is not None, "Missing BC, this isn't a BGZF file!"

    # Next comes the compressed data, then the CRC, then the length of
    # the uncompressed data.
    compressed = handle.read(block_size - 1 - extra_len - 19)
    inflater = zlib.decompressobj(-15)  # Negative window size means no headers
    data = inflater.decompress(compressed) + inflater.flush()
    expected_crc = handle.read(4)
    expected_size = struct.unpack("<I", handle.read(4))[0]
    assert expected_size == len(data), \
        "Decompressed to %i, not %i" % (len(data), expected_size)

    # zlib.crc32 may give a signed or an unsigned value depending on the
    # Python platform, so pick the matching struct format.
    crc = zlib.crc32(data)
    crc = struct.pack("<i" if crc < 0 else "<I", crc)
    assert expected_crc == crc, \
        "CRC is %s, not %s" % (crc, expected_crc)

    if text_mode:
        return block_size, _as_string(data)
    return block_size, data
def _read(handle):
    """Parse one record from handle, dispatching on two-letter line codes.

    Each line is split into a two-character key (line[:2]) and a value
    (line[5:] with trailing whitespace removed). Lines are consumed until
    the '//' terminator, at which point the multi-line fields collected
    so far are joined and the populated Record is returned.

    Returns None if the handle is exhausted before any 'ID' line is seen.

    Raises ValueError for an unrecognised line code, or if the handle ends
    after a record was started but before its '//' line. Raises
    AssertionError if a reference detail line (RP, RC, RX, RL, RA, RG, RT)
    appears before any RN line, or if the SQ line does not have seven
    whitespace separated columns.
    """
    record = None
    unread = ""
    for line in handle:
        # This is for Python 3 to cope with a binary handle (byte strings),
        # or a text handle (unicode strings):
        line = _as_string(line)
        key, value = line[:2], line[5:].rstrip()
        if unread:
            # Text handed back by the previous RC line is prepended here
            value = unread + " " + value
            unread = ""
        if key == '**':
            # See Bug 2353, some files from the EBI have extra lines
            # starting "**" (two asterisks/stars).  They appear
            # to be unofficial automated annotations. e.g.
            # **
            # **   #################    INTERNAL SECTION    ##################
            # **HA SAM; Annotated by PicoHamap 1.88; MF_01138.1; 09-NOV-2003.
            pass
        elif key == 'ID':
            record = Record()
            _read_id(record, line)
            sequence_lines = []
        elif key == 'AC':
            record.accessions.extend(value.rstrip(";").split("; "))
        elif key == 'DT':
            _read_dt(record, line)
        elif key == 'DE':
            record.description.append(value.strip())
        elif key == 'GN':
            if record.gene_name:
                record.gene_name += " "
            record.gene_name += value
        elif key == 'OS':
            record.organism.append(value)
        elif key == 'OG':
            # Uses the unstripped text; trailing whitespace is removed
            # once the record is complete.
            record.organelle += line[5:]
        elif key == 'OC':
            record.organism_classification.extend(
                value.rstrip(";.").split("; "))
        elif key == 'OX':
            _read_ox(record, line)
        elif key == 'OH':
            _read_oh(record, line)
        elif key == 'RN':
            reference = Reference()
            _read_rn(reference, value)
            record.references.append(reference)
        elif key in ('RP', 'RC', 'RX', 'RL', 'RA', 'RG', 'RT'):
            # All of these add detail to the most recent RN reference.
            assert record.references, "%s: missing RN" % key
            reference = record.references[-1]
            if key == 'RP':
                reference.positions.append(value)
            elif key == 'RC':
                unread = _read_rc(reference, value)
            elif key == 'RX':
                _read_rx(reference, value)
            elif key == 'RL':
                reference.location.append(value)
            elif key == 'RT':
                reference.title.append(value)
            else:
                # In UniProt release 1.12 of 6/21/04, there is a new RG
                # (Reference Group) line, which references a group instead
                # of an author.  Each block must have at least 1 RA or RG
                # line, and both are stored as authors.
                reference.authors.append(value)
        elif key == 'CC':
            _read_cc(record, line)
        elif key == 'DR':
            _read_dr(record, value)
        elif key == 'PE':
            # TODO - Record this information?
            pass
        elif key == 'KW':
            record.keywords.extend(value.rstrip(";.").split('; '))
        elif key == 'FT':
            _read_ft(record, line)
        elif key == 'SQ':
            cols = value.split()
            assert len(cols) == 7, "I don't understand SQ line %s" % line
            # Do more checking here?
            record.seqinfo = int(cols[1]), int(cols[3]), cols[5]
        elif key == '  ':
            # Sequence data lines start with blanks.  The key is always a
            # two character slice, so it must be compared with two spaces
            # (a single space could never match a normal line).
            sequence_lines.append(value.replace(" ", "").rstrip())
        elif key == '//':
            # Join multiline data into one string
            record.description = " ".join(record.description)
            record.organism = " ".join(record.organism)
            record.organelle = record.organelle.rstrip()
            for reference in record.references:
                reference.authors = " ".join(reference.authors).rstrip(";")
                reference.title = " ".join(reference.title).rstrip(";")
                if reference.title.startswith('"') and \
                        reference.title.endswith('"'):
                    reference.title = reference.title[1:-1]  # remove quotes
                reference.location = " ".join(reference.location)
            record.sequence = "".join(sequence_lines)
            return record
        else:
            raise ValueError("Unknown keyword '%s' found" % key)
    if record:
        raise ValueError("Unexpected end of stream.")
def _read(handle):
    """Parse one record from handle, dispatching on two-letter line codes.

    Each line is split into a two-character key (line[:2]) and a value
    (line[5:] with trailing whitespace removed). Lines are consumed until
    the '//' terminator, at which point the multi-line fields collected
    so far are joined and the populated Record is returned.

    Returns None if the handle is exhausted before any 'ID' line is seen.

    Raises ValueError for an unrecognised line code, or if the handle ends
    after a record was started but before its '//' line. Raises
    AssertionError if a reference detail line (RP, RC, RX, RL, RA, RG, RT)
    appears before any RN line, or if the SQ line does not have seven
    whitespace separated columns.
    """
    record = None
    unread = ""
    for line in handle:
        # This is for Python 3 to cope with a binary handle (byte strings),
        # or a text handle (unicode strings):
        line = _as_string(line)
        key, value = line[:2], line[5:].rstrip()
        if unread:
            # Text handed back by the previous RC line is prepended here
            value = unread + " " + value
            unread = ""
        if key == '**':
            # See Bug 2353, some files from the EBI have extra lines
            # starting "**" (two asterisks/stars).  They appear
            # to be unofficial automated annotations. e.g.
            # **
            # **   #################    INTERNAL SECTION    ##################
            # **HA SAM; Annotated by PicoHamap 1.88; MF_01138.1; 09-NOV-2003.
            pass
        elif key == 'ID':
            record = Record()
            _read_id(record, line)
            sequence_lines = []
        elif key == 'AC':
            record.accessions.extend(value.rstrip(";").split("; "))
        elif key == 'DT':
            _read_dt(record, line)
        elif key == 'DE':
            record.description.append(value.strip())
        elif key == 'GN':
            if record.gene_name:
                record.gene_name += " "
            record.gene_name += value
        elif key == 'OS':
            record.organism.append(value)
        elif key == 'OG':
            # Uses the unstripped text; trailing whitespace is removed
            # once the record is complete.
            record.organelle += line[5:]
        elif key == 'OC':
            record.organism_classification.extend(
                value.rstrip(";.").split("; "))
        elif key == 'OX':
            _read_ox(record, line)
        elif key == 'OH':
            _read_oh(record, line)
        elif key == 'RN':
            reference = Reference()
            _read_rn(reference, value)
            record.references.append(reference)
        elif key in ('RP', 'RC', 'RX', 'RL', 'RA', 'RG', 'RT'):
            # All of these add detail to the most recent RN reference.
            assert record.references, "%s: missing RN" % key
            reference = record.references[-1]
            if key == 'RP':
                reference.positions.append(value)
            elif key == 'RC':
                unread = _read_rc(reference, value)
            elif key == 'RX':
                _read_rx(reference, value)
            elif key == 'RL':
                reference.location.append(value)
            elif key == 'RT':
                reference.title.append(value)
            else:
                # In UniProt release 1.12 of 6/21/04, there is a new RG
                # (Reference Group) line, which references a group instead
                # of an author.  Each block must have at least 1 RA or RG
                # line, and both are stored as authors.
                reference.authors.append(value)
        elif key == 'CC':
            _read_cc(record, line)
        elif key == 'DR':
            _read_dr(record, value)
        elif key == 'PE':
            # TODO - Record this information?
            pass
        elif key == 'KW':
            record.keywords.extend(value.rstrip(";.").split('; '))
        elif key == 'FT':
            _read_ft(record, line)
        elif key == 'SQ':
            cols = value.split()
            assert len(cols) == 7, "I don't understand SQ line %s" % line
            # Do more checking here?
            record.seqinfo = int(cols[1]), int(cols[3]), cols[5]
        elif key == '  ':
            # Sequence data lines start with blanks.  The key is always a
            # two character slice, so it must be compared with two spaces
            # (a single space could never match a normal line).
            sequence_lines.append(value.replace(" ", "").rstrip())
        elif key == '//':
            # Join multiline data into one string
            record.description = " ".join(record.description)
            record.organism = " ".join(record.organism)
            record.organelle = record.organelle.rstrip()
            for reference in record.references:
                reference.authors = " ".join(reference.authors).rstrip(";")
                reference.title = " ".join(reference.title).rstrip(";")
                if reference.title.startswith('"') and \
                        reference.title.endswith('"'):
                    reference.title = reference.title[1:-1]  # remove quotes
                reference.location = " ".join(reference.location)
            record.sequence = "".join(sequence_lines)
            return record
        else:
            raise ValueError("Unknown keyword '%s' found" % key)
    if record:
        raise ValueError("Unexpected end of stream.")
def qblast(program, database, sequence,
           auto_format=None, composition_based_statistics=None,
           db_genetic_code=None, endpoints=None, entrez_query='(none)',
           expect=10.0, filter=None, gapcosts=None, genetic_code=None,
           hitlist_size=50, i_thresh=None, layout=None, lcase_mask=None,
           matrix_name=None, nucl_penalty=None, nucl_reward=None,
           other_advanced=None, perc_ident=None, phi_pattern=None,
           query_file=None, query_believe_defline=None, query_from=None,
           query_to=None, searchsp_eff=None, service=None, threshold=None,
           ungapped_alignment=None, word_size=None,
           alignments=500, alignment_view=None, descriptions=500,
           entrez_links_new_window=None, expect_low=None, expect_high=None,
           format_entrez_query=None, format_object=None, format_type='XML',
           ncbi_gi=None, results_file=None, show_overview=None, megablast=None,
           ):
    """Do a BLAST search using the QBLAST server at NCBI.

    Supports all parameters of the qblast API for Put and Get.
    Some useful parameters:

     - program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
     - database       Which database to search against (e.g. "nr").
     - sequence       The sequence to search.
     - ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
     - descriptions   Number of descriptions to show.  Def 500.
     - alignments     Number of alignments to show.  Def 500.
     - expect         An expect value cutoff.  Def 10.0.
     - matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
     - filter         "none" turns off filtering.  Default no filtering
     - format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
     - entrez_query   Entrez query to limit Blast search
     - hitlist_size   Number of hits to return. Default 50
     - megablast      TRUE/FALSE whether to use Mega BLAST algorithm (blastn only)
     - service        plain, psi, phi, rpsblast, megablast (lower case)

    Apart from asserting that program is one of the five names above,
    this function does no checking of the validity of the parameters
    and passes the values to the server as is.  Arguments left as None
    are not sent.  More help is available at:
    http://www.ncbi.nlm.nih.gov/BLAST/Doc/urlapi.html

    Returns a StringIO handle holding the text of the final results page.
    Errors reported on the initial 'please wait' page are raised as
    ValueError by _parse_qblast_ref_page.
    """
    import time

    assert program in ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx']

    url = "http://blast.ncbi.nlm.nih.gov/Blast.cgi"
    headers = {"User-Agent":"BiopythonClient"}

    # Format the "Put" command, which sends search requests to qblast.
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
    # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
    # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
    # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
    parameters = [
        ('AUTO_FORMAT', auto_format),
        ('COMPOSITION_BASED_STATISTICS', composition_based_statistics),
        ('DATABASE', database),
        ('DB_GENETIC_CODE', db_genetic_code),
        ('ENDPOINTS', endpoints),
        ('ENTREZ_QUERY', entrez_query),
        ('EXPECT', expect),
        ('FILTER', filter),
        ('GAPCOSTS', gapcosts),
        ('GENETIC_CODE', genetic_code),
        ('HITLIST_SIZE', hitlist_size),
        ('I_THRESH', i_thresh),
        ('LAYOUT', layout),
        ('LCASE_MASK', lcase_mask),
        ('MEGABLAST', megablast),
        ('MATRIX_NAME', matrix_name),
        ('NUCL_PENALTY', nucl_penalty),
        ('NUCL_REWARD', nucl_reward),
        ('OTHER_ADVANCED', other_advanced),
        ('PERC_IDENT', perc_ident),
        ('PHI_PATTERN', phi_pattern),
        ('PROGRAM', program),
        #('PSSM',pssm), - It is possible to use PSI-BLAST via this API?
        ('QUERY', sequence),
        ('QUERY_FILE', query_file),
        ('QUERY_BELIEVE_DEFLINE', query_believe_defline),
        ('QUERY_FROM', query_from),
        ('QUERY_TO', query_to),
        #('RESULTS_FILE',...), - Can we use this parameter?
        ('SEARCHSP_EFF', searchsp_eff),
        ('SERVICE', service),
        ('THRESHOLD', threshold),
        ('UNGAPPED_ALIGNMENT', ungapped_alignment),
        ('WORD_SIZE', word_size),
        ('CMD', 'Put'),
        ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Send off the initial query to qblast.
    # Note the NCBI do not currently impose a rate limit here, other
    # than the request not to make say 50 queries at once using multiple
    # threads.
    handle = _urlopen(_Request(url, message, headers))
    try:
        # Only the request identifier is needed below; the estimated
        # time of execution is parsed (and validated) but not used.
        rid = _parse_qblast_ref_page(handle)[0]
    finally:
        # Always release the connection, even if the page reports an error
        handle.close()

    # Format the "Get" command, which gets the formatted results from qblast
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007
    parameters = [
        ('ALIGNMENTS', alignments),
        ('ALIGNMENT_VIEW', alignment_view),
        ('DESCRIPTIONS', descriptions),
        ('ENTREZ_LINKS_NEW_WINDOW', entrez_links_new_window),
        ('EXPECT_LOW', expect_low),
        ('EXPECT_HIGH', expect_high),
        ('FORMAT_ENTREZ_QUERY', format_entrez_query),
        ('FORMAT_OBJECT', format_object),
        ('FORMAT_TYPE', format_type),
        ('NCBI_GI', ncbi_gi),
        ('RID', rid),
        ('RESULTS_FILE', results_file),
        ('SERVICE', service),
        ('SHOW_OVERVIEW', show_overview),
        ('CMD', 'Get'),
        ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Poll NCBI until the results are ready.  Use a backoff delay from 2 - 120 second wait
    delay = 2.0
    previous = time.time()
    while True:
        current = time.time()
        wait = previous + delay - current
        if wait > 0:
            time.sleep(wait)
            previous = current + wait
        else:
            previous = current
        # Grow the delay by half each round, capped at two minutes
        delay = min(delay + .5*delay, 120)

        handle = _urlopen(_Request(url, message, headers))
        try:
            results = _as_string(handle.read())
        finally:
            handle.close()

        # Can see an "\n\n" page while results are in progress,
        # if so just wait a bit longer...
        if results == "\n\n":
            continue
        # XML results don't have the Status tag when finished
        if "Status=" not in results:
            break
        i = results.index("Status=")
        j = results.index("\n", i)
        status = results[i+len("Status="):j].strip()
        if status.upper() == "READY":
            break

    return StringIO(results)
def _qblast_ref_value(page, marker):
    """Return the stripped text after marker up to the end of its line (PRIVATE).

    Returns None if marker does not occur in page.  If no newline follows
    the marker, the value runs to the end of page.
    """
    start = page.find(marker)
    if start == -1:
        return None
    start += len(marker)
    end = page.find("\n", start)
    if end == -1:
        # No trailing newline: slicing with -1 would silently drop the
        # last character of the value, so take everything that is left.
        end = len(page)
    return page[start:end].strip()


def _parse_qblast_ref_page(handle):
    """Extract a tuple of RID, RTOE from the 'please wait' page (PRIVATE).

    The NCBI FAQ pages use TOE for 'Time of Execution', so RTOE is probably
    'Request Time of Execution' and RID would be 'Request Identifier'.

    The whole of handle is read.  Returns (rid, rtoe) with rid as a string
    and rtoe converted to an integer.

    Raises ValueError if either value is missing (including, where one can
    be found, the error message from the page) or if RTOE is not an integer.
    """
    s = _as_string(handle.read())
    rid = _qblast_ref_value(s, "RID =")
    rtoe = _qblast_ref_value(s, "RTOE =")

    if not rid and not rtoe:
        # Can we reliably extract the error message from the HTML page?
        # e.g.  "Message ID#24 Error: Failed to read the Blast query:
        #        Nucleotide FASTA provided for protein sequence"
        # or    "Message ID#32 Error: Query contains no data: Query
        #        contains no sequence data"
        #
        # This used to occur inside a <div class="error msInf"> entry,
        # and in spring 2010 the markup was a <p class="error"> entry:
        for open_tag, close_tag in (('<div class="error msInf">', "</div>"),
                                    ('<p class="error">', "</p>")):
            i = s.find(open_tag)
            if i == -1:
                continue
            msg = s[i+len(open_tag):].strip()
            msg = msg.split(close_tag, 1)[0].split("\n", 1)[0].strip()
            if msg:
                raise ValueError("Error message from NCBI: %s" % msg)
        # Generic search based on the way the error messages start:
        i = s.find('Message ID#')
        if i != -1:
            # Break the message at the first HTML tag
            msg = s[i:].split("<", 1)[0].split("\n", 1)[0].strip()
            raise ValueError("Error message from NCBI: %s" % msg)
        # We didn't recognise the error layout :(
        raise ValueError("No RID and no RTOE found in the 'please wait' page, "
                         "there was probably an error in your request but we "
                         "could not extract a helpful error message.")
    elif not rid:
        # Can this happen?
        raise ValueError("No RID found in the 'please wait' page."
                         " (although RTOE = %s)" % repr(rtoe))
    elif not rtoe:
        # Can this happen?
        raise ValueError("No RTOE found in the 'please wait' page."
                         " (although RID = %s)" % repr(rid))

    try:
        return rid, int(rtoe)
    except ValueError:
        raise ValueError("A non-integer RTOE found in "
                         "the 'please wait' page, %s" % repr(rtoe))