def __iter__(self):
    """Iterate over the file; yield (record key, start offset, length) tuples.

    Records start at an ID line (matched by self._marker_re); the key is
    taken from the first semicolon-separated field of the following AC line.
    """
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    semi_char = _as_bytes(";")
    # Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        length = len(line)
        # We cannot assume the record.id is the first word after ID,
        # normally the following AC line is used.
        line = handle.readline()
        length += len(line)
        assert line.startswith(_as_bytes("AC "))
        # First accession on the AC line, e.g. "AC   P12345; Q99999;" -> P12345
        key = line[3:].strip().split(semi_char)[0].strip()
        # Accumulate line lengths until the next record marker or EOF
        while True:
            end_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                yield _bytes_to_string(key), start_offset, length
                start_offset = end_offset
                break
            length += len(line)
    # Loop should only exit at end of file
    assert not line, repr(line)
def _get_raw_qresult(self, offset):
    """Return the raw bytes of a single QueryResult from a noncommented file.

    Starting at ``offset``, lines are collected for as long as the key
    column (``self._key_idx`` after splitting on tab) matches the key of
    the first line; the run of matching lines is returned joined together.

    Fix: lines are accumulated in a list and joined once at the end
    instead of repeated ``bytes +=`` concatenation, which is quadratic.
    """
    handle = self._handle
    handle.seek(offset)
    qresult_lines = []
    tab_char = _as_bytes('\t')
    key_idx = self._key_idx
    qresult_key = None
    while True:
        line = handle.readline()
        # get the key if the first line (qresult key)
        if qresult_key is None:
            qresult_key = line.split(tab_char)[key_idx]
        else:
            try:
                curr_key = line.split(tab_char)[key_idx]
            except IndexError:
                # short/empty line (e.g. EOF) -> treat as a different key
                curr_key = _as_bytes('')
            # only break when qresult is finished (key is different)
            if curr_key != qresult_key:
                break
        # append to the raw result as long as the qresult is the same
        qresult_lines.append(line)
    return _as_bytes('').join(qresult_lines)
def __iter__(self):
    """Iterate over the file; yield (record key, start offset, length) tuples.

    Records start at an ID line (matched by self._marker_re); the key is
    taken from the first semicolon-separated field of the following AC line.
    """
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    semi_char = _as_bytes(";")
    # Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        length = len(line)
        # We cannot assume the record.id is the first word after ID,
        # normally the following AC line is used.
        line = handle.readline()
        length += len(line)
        assert line.startswith(_as_bytes("AC "))
        # First accession on the AC line, e.g. "AC   P12345; Q99999;" -> P12345
        key = line[3:].strip().split(semi_char)[0].strip()
        # Accumulate line lengths until the next record marker or EOF
        while True:
            end_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                yield _bytes_to_string(key), start_offset, length
                start_offset = end_offset
                break
            length += len(line)
    # Loop should only exit at end of file
    assert not line, repr(line)
def _qresult_index_commented(self):
    """Indexer for commented BLAST tabular files.

    Yields (query key, start offset, length) for each query result.  The
    first line of the file (e.g. "# TBLASTN 2.2.25+") is remembered as the
    marker that separates consecutive query results.
    """
    handle = self._handle
    handle.seek(0)
    start_offset = 0
    # mark of a new query -- captured from the file's first line
    query_mark = None
    # mark of the query's ID
    qid_mark = _as_bytes('# Query: ')
    # mark of the last line
    end_mark = _as_bytes('# BLAST processed')
    while True:
        end_offset = handle.tell()
        line = handle.readline()
        if query_mark is None:
            # first line of the file defines the per-query separator
            query_mark = line
            start_offset = end_offset
        elif line.startswith(qid_mark):
            # query key is the first word after "# Query: "
            qresult_key = line[len(qid_mark):].split()[0]
        elif line == query_mark or line.startswith(end_mark):
            # new query (or end of search): emit the one just finished
            yield qresult_key, start_offset, end_offset - start_offset
            start_offset = end_offset
        elif not line:
            break
def _qresult_index(self):
    """Indexer for noncommented BLAST tabular files.

    Yields (query key, start offset, length) for each query result; the
    key column is ``self._key_idx`` after splitting each line on tab.
    """
    handle = self._handle
    handle.seek(0)
    start_offset = 0
    qresult_key = None
    key_idx = self._key_idx
    tab_char = _as_bytes('\t')
    while True:
        # get end offset here since we only know a qresult ends after
        # encountering the next one
        end_offset = handle.tell()
        line = handle.readline()
        if qresult_key is None:
            qresult_key = line.split(tab_char)[key_idx]
        else:
            try:
                curr_key = line.split(tab_char)[key_idx]
            except IndexError:
                # short/empty line (e.g. EOF) -> treat as a different key
                curr_key = _as_bytes('')
            if curr_key != qresult_key:
                yield qresult_key, start_offset, end_offset - start_offset
                qresult_key = curr_key
                start_offset = end_offset
        # break if we've reached EOF
        if not line:
            break
def __iter__(self):
    """Iterate over the file; yield (query key, start offset, length) tuples.

    The yielded length is always 0 -- get_raw() re-scans for the record
    end instead of using a stored length.
    """
    handle = self._handle
    handle.seek(0)
    start_offset = handle.tell()
    # matches e.g. "Query sequence: X" or "Query HMM: X"
    regex_id = re.compile(_as_bytes(r'Query\s*(?:sequence|HMM)?:\s*(.*)'))
    # determine flag for hmmsearch
    is_hmmsearch = False
    line = read_forward(handle)
    if line.startswith(_as_bytes('hmmsearch')):
        is_hmmsearch = True
    while True:
        end_offset = handle.tell()
        if line.startswith(self.qresult_start):
            regx = re.search(regex_id, line)
            qresult_key = regx.group(1).strip()
            # qresult start offset is the offset of this line
            # (starts with the start mark)
            start_offset = end_offset - len(line)
        elif line.startswith(self.qresult_end):
            yield _bytes_to_string(qresult_key), start_offset, 0
            start_offset = end_offset
        elif not line:
            # HACK: since hmmsearch can only have one query result
            if is_hmmsearch:
                yield _bytes_to_string(qresult_key), start_offset, 0
            break
        line = read_forward(handle)
def get_raw(self, offset):
    """Return the raw bytes of one QueryResult, prefixed with the file header.

    The file header (everything before the first '>>>' query mark) is read
    first, then the query result starting at ``offset``; a mock '>>><<<'
    end marker is appended since it is not always present in the file.

    Fix: parts are accumulated in a list and joined once at the end
    instead of repeated ``bytes +=`` concatenation, which is quadratic.
    """
    handle = self._handle
    parts = []
    query_mark = _as_bytes('>>>')
    # read header first
    handle.seek(0)
    while True:
        line = handle.readline()
        peekline = handle.peekline()
        parts.append(line)
        # header ends just before a line that contains, but does not
        # start with, the query mark
        if not peekline.startswith(query_mark) and query_mark in peekline:
            break
    # and read the qresult raw string
    handle.seek(offset)
    while True:
        # preserve whitespace, don't use read_forward
        line = handle.readline()
        peekline = handle.peekline()
        parts.append(line)
        # break when we've reached qresult end
        if (not peekline.startswith(query_mark) and query_mark in peekline) or \
                not line:
            break
    # append mock end marker, since it's not always present
    parts.append(_as_bytes('>>><<<\n'))
    return _as_bytes('').join(parts)
class Hmmer3TextIndexer(_BaseHmmerTextIndexer):
    """Indexer class for HMMER plain text output."""

    _parser = Hmmer3TextParser
    # start/end markers of one query result in the text output
    qresult_start = _as_bytes('Query: ')
    qresult_end = _as_bytes('//')

    def __iter__(self):
        """Iterate over the file; yield (query key, start offset, 0) tuples.

        The yielded length is always 0 -- get_raw() re-scans for the
        record end instead of using a stored length.
        """
        handle = self._handle
        handle.seek(0)
        start_offset = handle.tell()
        regex_id = re.compile(_as_bytes(_QRE_ID_LEN_PTN))
        while True:
            line = read_forward(handle)
            end_offset = handle.tell()
            if line.startswith(self.qresult_start):
                regx = re.search(regex_id, line)
                qresult_key = regx.group(1).strip()
                # qresult start offset is the offset of this line
                # (starts with the start mark)
                start_offset = end_offset - len(line)
            elif line.startswith(self.qresult_end):
                yield _bytes_to_string(qresult_key), start_offset, 0
                start_offset = end_offset
            elif not line:
                break
def crc32(seq): """Returns the crc32 checksum for a sequence (string or Seq object).""" #NOTE - On Python 2 returns a signed int, on Python 3 it is unsigned #Docs suggest should use crc32(x) & 0xffffffff for consistency. #TODO - Should we return crc32(x) & 0xffffffff here? try: #Assume its a Seq object return _crc32(_as_bytes(str(seq))) except AttributeError: #Assume its a string/unicode return _crc32(_as_bytes(seq))
def __iter__(self):
    """Iterate over a FASTQ file; yield (id, start offset, record length).

    Tracks sequence and quality lengths so that '@' characters inside the
    quality string are not mistaken for the start of the next record.
    """
    handle = self._handle
    handle.seek(0)
    id = None  # NOTE(review): shadows the builtin id(); kept for byte-compatibility
    start_offset = handle.tell()
    line = handle.readline()
    if not line:
        # Empty file!
        return
    at_char = _as_bytes("@")
    plus_char = _as_bytes("+")
    if line[0:1] != at_char:
        raise ValueError("Problem with FASTQ @ line:\n%r" % line)
    while line:
        # assert line[0]=="@"
        # This record seems OK (so far)
        id = line[1:].rstrip().split(None, 1)[0]
        # Find the seq line(s)
        seq_len = 0
        length = len(line)
        while line:
            line = handle.readline()
            length += len(line)
            if line.startswith(plus_char):
                break
            seq_len += len(line.strip())
        if not line:
            raise ValueError("Premature end of file in seq section")
        # assert line[0]=="+"
        # Find the qual line(s)
        qual_len = 0
        while line:
            if seq_len == qual_len:
                if seq_len == 0:
                    # Special case, quality line should be just "\n"
                    line = handle.readline()
                    if line.strip():
                        raise ValueError(
                            "Expected blank quality line, not %r" % line)
                # Should be end of record...
                end_offset = handle.tell()
                line = handle.readline()
                if line and line[0:1] != at_char:
                    raise ValueError("Problem with line %r" % line)
                break
            else:
                line = handle.readline()
                qual_len += len(line.strip())
                length += len(line)
        if seq_len != qual_len:
            raise ValueError("Problem with quality section")
        yield _bytes_to_string(id), start_offset, length
        start_offset = end_offset
def __iter__(self):
    """Iterate over a FASTQ file; yield (id, start offset, record length).

    Tracks sequence and quality lengths so that '@' characters inside the
    quality string are not mistaken for the start of the next record.
    """
    handle = self._handle
    handle.seek(0)
    id = None  # NOTE(review): shadows the builtin id(); kept for byte-compatibility
    start_offset = handle.tell()
    line = handle.readline()
    if not line:
        # Empty file!
        return
    at_char = _as_bytes("@")
    plus_char = _as_bytes("+")
    if line[0:1] != at_char:
        raise ValueError("Problem with FASTQ @ line:\n%r" % line)
    while line:
        # assert line[0]=="@"
        # This record seems OK (so far)
        id = line[1:].rstrip().split(None, 1)[0]
        # Find the seq line(s)
        seq_len = 0
        length = len(line)
        while line:
            line = handle.readline()
            length += len(line)
            if line.startswith(plus_char):
                break
            seq_len += len(line.strip())
        if not line:
            raise ValueError("Premature end of file in seq section")
        # assert line[0]=="+"
        # Find the qual line(s)
        qual_len = 0
        while line:
            if seq_len == qual_len:
                if seq_len == 0:
                    # Special case, quality line should be just "\n"
                    line = handle.readline()
                    if line.strip():
                        raise ValueError("Expected blank quality line, not %r" % line)
                # Should be end of record...
                end_offset = handle.tell()
                line = handle.readline()
                if line and line[0:1] != at_char:
                    raise ValueError("Problem with line %r" % line)
                break
            else:
                line = handle.readline()
                qual_len += len(line.strip())
                length += len(line)
        if seq_len != qual_len:
            raise ValueError("Problem with quality section")
        yield _bytes_to_string(id), start_offset, length
        start_offset = end_offset
def get_raw(self, offset):
    """Return the raw bytes of the record starting at the given offset.

    A record is a run of leading comment lines (starting with ";")
    followed by data lines up to, but not including, the next ";" line
    or end of file.

    Fix: dropped the unused local ``marker_re`` (fetched from
    ``self._marker_re`` but never referenced).
    """
    handle = self._handle
    handle.seek(offset)
    lines = []
    line = handle.readline()
    semi_char = _as_bytes(";")
    # leading comment/header lines of this record
    while line.startswith(semi_char):
        lines.append(line)
        line = handle.readline()
    # data lines, up to EOF or the start of the next record
    while line and not line.startswith(semi_char):
        lines.append(line)
        line = handle.readline()
    return _as_bytes("").join(lines)
def get_raw(self, offset):
    """Return the raw bytes of one QueryResult, prefixed with the file header.

    The header is either the cached ``self._preamble`` or, failing that,
    everything before the first ``self.qresult_start`` line; the query
    result itself runs from ``offset`` to a ``self.qresult_end`` line
    (inclusive) or EOF.

    Fix: parts are accumulated in a list and joined once at the end
    instead of repeated ``bytes +=`` concatenation, which is quadratic.
    """
    handle = self._handle
    parts = []
    # read header first
    if not self._preamble:
        handle.seek(0)
        while True:
            line = handle.readline()
            if line.startswith(self.qresult_start):
                break
            parts.append(line)
    else:
        parts.append(self._preamble)
    # and read the qresult raw string
    handle.seek(offset)
    while True:
        # preserve whitespace, don't use read_forward
        line = handle.readline()
        parts.append(line)
        # break when we've reached qresult end
        if line.startswith(self.qresult_end) or not line:
            break
    return _as_bytes('').join(parts)
def get_raw(self, offset):
    """Return the raw bytes of one QueryResult, prefixed with the file header.

    The header is either the cached ``self._preamble`` or, failing that,
    everything before the first ``self.qresult_start`` line; the query
    result itself runs from ``offset`` to a ``self.qresult_end`` line
    (inclusive) or EOF.

    Fix: parts are accumulated in a list and joined once at the end
    instead of repeated ``bytes +=`` concatenation, which is quadratic.
    """
    handle = self._handle
    parts = []
    # read header first
    if not self._preamble:
        handle.seek(0)
        while True:
            line = handle.readline()
            if line.startswith(self.qresult_start):
                break
            parts.append(line)
    else:
        parts.append(self._preamble)
    # and read the qresult raw string
    handle.seek(offset)
    while True:
        # preserve whitespace, don't use read_forward
        line = handle.readline()
        parts.append(line)
        # break when we've reached qresult end
        if line.startswith(self.qresult_end) or not line:
            break
    return _as_bytes("").join(parts)
def _open(url, post=None):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to TogoWS, will raise an IOError if it encounters an error.

    In the absense of clear guidelines, this function enforces a limit of
    "up to three queries per second" to avoid abusing the TogoWS servers.
    """
    delay = 0.333333333  # one third of a second
    current = time.time()
    # _open.previous is a function attribute recording the last request time;
    # sleep if the previous request was less than `delay` seconds ago.
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current
    # print(url)
    if post:
        # POST request with the given payload, otherwise a plain GET
        handle = _urlopen(url, _as_bytes(post))
    else:
        handle = _urlopen(url)
    # We now trust TogoWS to have set an HTTP error code, that
    # suffices for my current unit tests. Previously we would
    # examine the start of the data returned back.
    return _binary_to_string_handle(handle)
def seguid(seq):
    """Returns the SEGUID (string) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid secuence (or any string), returns the
    SEGUID string (A SEquence Globally Unique IDentifier). seq type = str.

    For more information about SEGUID, see:
    http://bioinformatics.anl.gov/seguid/
    DOI: 10.1002/pmic.200600032
    """
    import hashlib
    import base64
    m = hashlib.sha1()
    try:
        # Assume it's a Seq object
        seq = str(seq)
    except AttributeError:
        # Assume it's a string
        pass
    # SEGUID is defined on the upper-cased sequence
    m.update(_as_bytes(seq.upper()))
    try:
        # For Python 3+ (base64.encodebytes does not exist on Python 2)
        return base64.encodebytes(m.digest()).decode().replace("\n", "").rstrip("=")
    except AttributeError:
        pass
    # For all other Pythons
    return base64.b64encode(m.digest()).rstrip("=")
def __iter__(self):
    """Iterate over a UniProt XML file; yield (accession, start offset, length).

    The key is taken from the first <accession>...</accession> element
    inside each <entry>.
    """
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    start_acc_marker = _as_bytes("<accession>")
    end_acc_marker = _as_bytes("</accession>")
    end_entry_marker = _as_bytes("</entry>")
    less_than = _as_bytes("<")
    # Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        length = len(line)
        # We expect the next line to be <accession>xxx</accession>
        # (possibly with leading spaces)
        # but allow it to be later on within the <entry>
        key = None
        while True:
            line = handle.readline()
            if key is None and start_acc_marker in line:
                assert end_acc_marker in line, line
                # 11 == len("<accession>"); the key is the text up to
                # the next "<" (the closing tag)
                key = line[line.find(start_acc_marker) + 11:].split(
                    less_than, 1)[0]
                length += len(line)
            elif end_entry_marker in line:
                # 8 == len("</entry>"); end offset is just past the close tag
                end_offset = handle.tell() - len(line) \
                    + line.find(end_entry_marker) + 8
                break
            elif marker_re.match(line) or not line:
                # Start of next record or end of file
                raise ValueError("Didn't find end of record")
            else:
                length += len(line)
        if not key:
            raise ValueError(
                "Did not find <accession> line in bytes %i to %i"
                % (start_offset, end_offset))
        yield _bytes_to_string(key), start_offset, length
        # Find start of next record
        while not marker_re.match(line) and line:
            start_offset = handle.tell()
            line = handle.readline()
    assert not line, repr(line)
def __iter__(self):
    """Iterate over an EMBL file; yield (accession key, start offset, length).

    The key comes from the ID line, but an SV line (if present) overrides
    it with "accession.version".
    """
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    semi_char = _as_bytes(";")
    dot_char = _as_bytes(".")
    sv_marker = _as_bytes("SV ")
    # Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        # We cannot assume the record.id is the first word after ID,
        # normally the SV line is used.
        length = len(line)
        if line[2:].count(semi_char) == 6:
            # Looks like the semi colon separated style introduced in 2006
            parts = line[3:].rstrip().split(semi_char)
            if parts[1].strip().startswith(sv_marker):
                # The SV bit gives the version
                key = parts[0].strip() + dot_char + \
                    parts[1].strip().split()[1]
            else:
                key = parts[0].strip()
        elif line[2:].count(semi_char) == 3:
            # Looks like the pre 2006 style, take first word only
            key = line[3:].strip().split(None, 1)[0]
        else:
            raise ValueError('Did not recognise the ID line layout:\n' + line)
        while True:
            end_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                end_offset = handle.tell() - len(line)
                yield _bytes_to_string(key), start_offset, length
                start_offset = end_offset
                break
            elif line.startswith(sv_marker):
                # a standalone SV line overrides the ID-line key
                key = line.rstrip().split()[1]
            length += len(line)
    # Loop should only exit at end of file
    assert not line, repr(line)
def get_raw(self, offset): """Similar to the get method, but returns the record as a raw string.""" #TODO - Refactor this and the __init__ method to reduce code duplication? handle = self._handle handle.seek(offset) line = handle.readline() data = line at_char = _as_bytes("@") plus_char = _as_bytes("+") if line[0:1] != at_char: raise ValueError("Problem with FASTQ @ line:\n%r" % line) #Find the seq line(s) seq_len = 0 while line: line = handle.readline() data += line if line.startswith(plus_char): break seq_len += len(line.strip()) if not line: raise ValueError("Premature end of file in seq section") assert line[0:1] == plus_char #Find the qual line(s) qual_len = 0 while line: if seq_len == qual_len: if seq_len == 0: #Special case, quality line should be just "\n" line = handle.readline() if line.strip(): raise ValueError( "Expected blank quality line, not %r" % line) data += line #Should be end of record... line = handle.readline() if line and line[0:1] != at_char: raise ValueError("Problem with line %r" % line) break else: line = handle.readline() data += line qual_len += len(line.strip()) if seq_len != qual_len: raise ValueError("Problem with quality section") return data
def get_raw(self, offset):
    """Similar to the get method, but returns the record as a raw string.

    Reads from ``offset`` up to and including the closing </entry> tag.
    """
    handle = self._handle
    marker_re = self._marker_re
    end_entry_marker = _as_bytes("</entry>")
    handle.seek(offset)
    chunks = [handle.readline()]
    while True:
        line = handle.readline()
        idx = line.find(end_entry_marker)
        if idx >= 0:
            # 8 == len("</entry>"): keep up to and including the close tag
            chunks.append(line[:idx + 8])
            break
        if not line or marker_re.match(line):
            # End of file, or start of next record
            raise ValueError("Didn't find end of record")
        chunks.append(line)
    return _as_bytes("").join(chunks)
def get_raw(self, offset):
    """Similar to the get method, but returns the record as a raw string.

    Reads from ``offset`` up to and including the closing </entry> tag.
    """
    handle = self._handle
    marker_re = self._marker_re
    end_entry_marker = _as_bytes("</entry>")
    handle.seek(offset)
    chunks = [handle.readline()]
    while True:
        line = handle.readline()
        idx = line.find(end_entry_marker)
        if idx >= 0:
            # 8 == len("</entry>"): keep up to and including the close tag
            chunks.append(line[:idx + 8])
            break
        if not line or marker_re.match(line):
            # End of file, or start of next record
            raise ValueError("Didn't find end of record")
        chunks.append(line)
    return _as_bytes("").join(chunks)
def get_raw(self, offset): """Similar to the get method, but returns the record as a raw string.""" # TODO - Refactor this and the __init__ method to reduce code duplication? handle = self._handle handle.seek(offset) line = handle.readline() data = line at_char = _as_bytes("@") plus_char = _as_bytes("+") if line[0:1] != at_char: raise ValueError("Problem with FASTQ @ line:\n%r" % line) # Find the seq line(s) seq_len = 0 while line: line = handle.readline() data += line if line.startswith(plus_char): break seq_len += len(line.strip()) if not line: raise ValueError("Premature end of file in seq section") assert line[0:1] == plus_char # Find the qual line(s) qual_len = 0 while line: if seq_len == qual_len: if seq_len == 0: # Special case, quality line should be just "\n" line = handle.readline() if line.strip(): raise ValueError("Expected blank quality line, not %r" % line) data += line # Should be end of record... line = handle.readline() if line and line[0:1] != at_char: raise ValueError("Problem with line %r" % line) break else: line = handle.readline() data += line qual_len += len(line.strip()) if seq_len != qual_len: raise ValueError("Problem with quality section") return data
def __iter__(self):
    """Iterate over an EMBL file; yield (accession key, start offset, length).

    The key comes from the ID line, but an SV line (if present) overrides
    it with "accession.version".
    """
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    semi_char = _as_bytes(";")
    dot_char = _as_bytes(".")
    sv_marker = _as_bytes("SV ")
    # Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        # We cannot assume the record.id is the first word after ID,
        # normally the SV line is used.
        length = len(line)
        if line[2:].count(semi_char) == 6:
            # Looks like the semi colon separated style introduced in 2006
            parts = line[3:].rstrip().split(semi_char)
            if parts[1].strip().startswith(sv_marker):
                # The SV bit gives the version
                key = parts[0].strip() + dot_char + parts[1].strip().split()[1]
            else:
                key = parts[0].strip()
        elif line[2:].count(semi_char) == 3:
            # Looks like the pre 2006 style, take first word only
            key = line[3:].strip().split(None, 1)[0]
        else:
            raise ValueError("Did not recognise the ID line layout:\n" + line)
        while True:
            end_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                end_offset = handle.tell() - len(line)
                yield _bytes_to_string(key), start_offset, length
                start_offset = end_offset
                break
            elif line.startswith(sv_marker):
                # a standalone SV line overrides the ID-line key
                key = line.rstrip().split()[1]
            length += len(line)
    # Loop should only exit at end of file
    assert not line, repr(line)
def __iter__(self):
    """Iterate over a UniProt XML file; yield (accession, start offset, length).

    The key is taken from the first <accession>...</accession> element
    inside each <entry>.
    """
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    start_acc_marker = _as_bytes("<accession>")
    end_acc_marker = _as_bytes("</accession>")
    end_entry_marker = _as_bytes("</entry>")
    less_than = _as_bytes("<")
    # Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        length = len(line)
        # We expect the next line to be <accession>xxx</accession>
        # (possibly with leading spaces)
        # but allow it to be later on within the <entry>
        key = None
        while True:
            line = handle.readline()
            if key is None and start_acc_marker in line:
                assert end_acc_marker in line, line
                # 11 == len("<accession>"); the key is the text up to
                # the next "<" (the closing tag)
                key = line[line.find(start_acc_marker) + 11 :].split(less_than, 1)[0]
                length += len(line)
            elif end_entry_marker in line:
                # 8 == len("</entry>"); end offset is just past the close tag
                end_offset = handle.tell() - len(line) + line.find(end_entry_marker) + 8
                break
            elif marker_re.match(line) or not line:
                # Start of next record or end of file
                raise ValueError("Didn't find end of record")
            else:
                length += len(line)
        if not key:
            raise ValueError("Did not find <accession> line in bytes %i to %i" % (start_offset, end_offset))
        yield _bytes_to_string(key), start_offset, length
        # Find start of next record
        while not marker_re.match(line) and line:
            start_offset = handle.tell()
            line = handle.readline()
    assert not line, repr(line)
def __iter__(self):
    """Iterates over the file handle; yields key, start offset, and length.

    Lines are split on spaces; the query key is column
    ``self._query_id_idx``.  Consecutive lines with the same key form one
    query result.
    """
    handle = self._handle
    handle.seek(0)
    query_id_idx = self._query_id_idx
    qresult_key = None
    header_mark = _as_bytes('#')
    split_mark = _as_bytes(' ')
    # set line with initial mock value, to emulate header
    line = header_mark
    # read through header
    while line.startswith(header_mark):
        start_offset = handle.tell()
        line = handle.readline()
    # and index the qresults
    while True:
        end_offset = handle.tell()
        if not line:
            break
        # split on spaces, dropping empty columns from repeated separators
        cols = [x for x in line.strip().split(split_mark) if x]
        if qresult_key is None:
            qresult_key = cols[query_id_idx]
        else:
            curr_key = cols[query_id_idx]
            if curr_key != qresult_key:
                # new key: the previous qresult ends just before this line
                adj_end = end_offset - len(line)
                yield _bytes_to_string(qresult_key), start_offset, \
                    adj_end - start_offset
                qresult_key = curr_key
                start_offset = adj_end
        line = handle.readline()
        if not line:
            # EOF: emit the final qresult
            yield _bytes_to_string(qresult_key), start_offset, \
                end_offset - start_offset
            break
class Hmmer2TextIndexer(_BaseHmmerTextIndexer):
    """Indexer for hmmer2-text format."""

    _parser = Hmmer2TextParser
    qresult_start = _as_bytes('Query')
    # qresults_ends for hmmpfam and hmmsearch
    # need to anticipate both since hmmsearch have different query end mark
    qresult_end = _as_bytes('//')

    def __iter__(self):
        """Iterate over the file; yield (query key, start offset, 0) tuples.

        The yielded length is always 0 -- get_raw() re-scans for the
        record end instead of using a stored length.
        """
        handle = self._handle
        handle.seek(0)
        start_offset = handle.tell()
        # matches e.g. "Query sequence: X" or "Query HMM: X"
        regex_id = re.compile(_as_bytes(r'Query\s*(?:sequence|HMM)?:\s*(.*)'))
        # determine flag for hmmsearch
        is_hmmsearch = False
        line = read_forward(handle)
        if line.startswith(_as_bytes('hmmsearch')):
            is_hmmsearch = True
        while True:
            end_offset = handle.tell()
            if line.startswith(self.qresult_start):
                regx = re.search(regex_id, line)
                qresult_key = regx.group(1).strip()
                # qresult start offset is the offset of this line
                # (starts with the start mark)
                start_offset = end_offset - len(line)
            elif line.startswith(self.qresult_end):
                yield _bytes_to_string(qresult_key), start_offset, 0
                start_offset = end_offset
            elif not line:
                # HACK: since hmmsearch can only have one query result
                if is_hmmsearch:
                    yield _bytes_to_string(qresult_key), start_offset, 0
                break
            line = read_forward(handle)
def __iter__(self):
    """Iterate over a GenBank file; yield (key, start offset, length).

    The key is the first entry on the VERSION line (when it looks like
    "accession.version") or failing that the ACCESSION line.
    """
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    dot_char = _as_bytes(".")
    accession_marker = _as_bytes("ACCESSION ")
    version_marker = _as_bytes("VERSION ")
    # Skip and header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        # We cannot assume the record.id is the first word after LOCUS,
        # normally the first entry on the VERSION or ACCESSION line is used.
        key = None
        length = len(line)
        while True:
            end_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                if not key:
                    raise ValueError(
                        "Did not find ACCESSION/VERSION lines")
                yield _bytes_to_string(key), start_offset, length
                start_offset = end_offset
                break
            elif line.startswith(accession_marker):
                key = line.rstrip().split()[1]
            elif line.startswith(version_marker):
                version_id = line.rstrip().split()[1]
                # only use VERSION when it looks like "accession.version"
                if version_id.count(dot_char) == 1 and version_id.split(
                        dot_char)[1].isdigit():
                    # This should mimic the GenBank parser...
                    key = version_id
            length += len(line)
    # Loop should only exit at end of file
    assert not line, repr(line)
def get_raw(self, offset):
    """Returns the raw bytes of a QueryResult object from the given offset.

    Starting at ``offset``, lines are collected for as long as the query
    ID column (``self._query_id_idx`` after a whitespace split) matches
    the ID of the first line.

    Fix: lines are accumulated in a list and joined once at the end
    instead of repeated ``bytes +=`` concatenation, which is quadratic.
    """
    handle = self._handle
    handle.seek(offset)
    query_id_idx = self._query_id_idx
    qresult_key = None
    qresult_lines = []
    split_mark = _as_bytes(' ')
    while True:
        line = handle.readline()
        if not line:
            break
        # split on spaces, dropping empty columns from repeated separators
        cols = [x for x in line.strip().split(split_mark) if x]
        if qresult_key is None:
            qresult_key = cols[query_id_idx]
        else:
            curr_key = cols[query_id_idx]
            if curr_key != qresult_key:
                break
        qresult_lines.append(line)
    return _as_bytes('').join(qresult_lines)
def get_raw(self, offset):
    """Similar to the get method, but returns the record as a raw string."""
    # For non-trivial file formats this must be over-ridden in the subclass
    handle = self._handle
    marker_re = self._marker_re
    handle.seek(offset)
    # first line of the record, then everything up to (but excluding) the
    # next record marker or end of file
    chunks = [handle.readline()]
    line = handle.readline()
    while line and not marker_re.match(line):
        chunks.append(line)
        line = handle.readline()
    return _as_bytes("").join(chunks)
def get_raw(self, offset):
    """Similar to the get method, but returns the record as a raw string."""
    # For non-trivial file formats this must be over-ridden in the subclass
    handle = self._handle
    marker_re = self._marker_re
    handle.seek(offset)
    # first line of the record, then everything up to (but excluding) the
    # next record marker or end of file
    chunks = [handle.readline()]
    line = handle.readline()
    while line and not marker_re.match(line):
        chunks.append(line)
        line = handle.readline()
    return _as_bytes("").join(chunks)
def __iter__(self):
    """Iterate over a GenBank file; yield (key, start offset, length).

    The key is the first entry on the VERSION line (when it looks like
    "accession.version") or failing that the ACCESSION line.
    """
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    dot_char = _as_bytes(".")
    accession_marker = _as_bytes("ACCESSION ")
    version_marker = _as_bytes("VERSION ")
    # Skip and header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        # We cannot assume the record.id is the first word after LOCUS,
        # normally the first entry on the VERSION or ACCESSION line is used.
        key = None
        length = len(line)
        while True:
            end_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                if not key:
                    raise ValueError("Did not find ACCESSION/VERSION lines")
                yield _bytes_to_string(key), start_offset, length
                start_offset = end_offset
                break
            elif line.startswith(accession_marker):
                key = line.rstrip().split()[1]
            elif line.startswith(version_marker):
                version_id = line.rstrip().split()[1]
                # only use VERSION when it looks like "accession.version"
                if version_id.count(dot_char) == 1 and version_id.split(dot_char)[1].isdigit():
                    # This should mimic the GenBank parser...
                    key = version_id
            length += len(line)
    # Loop should only exit at end of file
    assert not line, repr(line)
def write(self, data): #TODO - Check bytes vs unicode data = _as_bytes(data) #block_size = 2**16 = 65536 data_len = len(data) if len(self._buffer) + data_len < 65536: #print("Cached %r" % data) self._buffer += data return else: #print("Got %r, writing out some data..." % data) self._buffer += data while len(self._buffer) >= 65536: self._write_block(self._buffer[:65536]) self._buffer = self._buffer[65536:]
def get_qresult_id(self, pos):
    """Returns the query ID from the nearest "Query:" line."""
    handle = self._handle
    handle.seek(pos)
    sentinel = _as_bytes('Query:')
    # scan forward for the next "Query:" line; an empty (stripped) line
    # before that point ends the search
    line = handle.readline().strip()
    while not line.startswith(sentinel):
        if not line:
            raise StopIteration
        line = handle.readline().strip()
    qid, desc = _parse_hit_or_query_line(_bytes_to_string(line))
    return qid
def _get_raw_qresult_commented(self, offset):
    """Return the raw bytes of a single QueryResult from a commented file.

    Starting at ``offset``, lines are collected until either the search
    end mark ('# BLAST processed') or a repeat of the program line that
    began this query result (e.g. '# TBLASTN 2.2.25+') is seen.

    Fix: lines are accumulated in a list and joined once at the end
    instead of repeated ``bytes +=`` concatenation, which is quadratic.
    """
    handle = self._handle
    handle.seek(offset)
    qresult_lines = []
    end_mark = _as_bytes('# BLAST processed')
    # query mark is the line marking a new query
    # something like '# TBLASTN 2.2.25+'
    query_mark = None
    line = handle.readline()
    while line:
        # since query_mark depends on the BLAST search, we need to obtain it
        # first
        if query_mark is None:
            query_mark = line
        # break when we've reached the next qresult or the search ends
        elif line == query_mark or line.startswith(end_mark):
            break
        qresult_lines.append(line)
        line = handle.readline()
    return _as_bytes('').join(qresult_lines)
class ExonerateCigarIndexer(ExonerateVulgarIndexer):
    """Indexer class for exonerate cigar lines."""

    _parser = ExonerateCigarParser
    _query_mark = _as_bytes('cigar')

    def get_qresult_id(self, pos):
        """Returns the query ID of the nearest cigar line.

        Fixes: the local result variable no longer shadows the builtin
        ``id()``, and the comment wrongly said "vulgar line" -- this
        class checks cigar lines.
        """
        handle = self._handle
        handle.seek(pos)
        # get line, check it's a cigar line, and extract the query ID
        line = handle.readline()
        assert line.startswith(self._query_mark), line
        match = re.search(_RE_CIGAR, _bytes_to_string(line))
        return match.group(1)
def __iter__(self):
    """Iterate over BLAST XML; yield (query ID, start offset, block length).

    One <Iteration>...</Iteration> XML block is indexed per query result.
    Auto-assigned BLAST IDs ("Query_N") are replaced by the first word of
    the query description.

    Fixes: all regex fragments are now raw strings -- the original mixed
    raw and plain literals, leaving "\\s" as an invalid escape sequence
    (an error on Python 3.12+); the unused locals ``block_size`` and
    ``counter`` have been removed.
    """
    qstart_mark = self.qstart_mark
    qend_mark = self.qend_mark
    blast_id_mark = _as_bytes('Query_')
    handle = self._handle
    handle.seek(0)
    re_desc = re.compile(
        _as_bytes(r'<Iteration_query-ID>(.*?)'
                  r'</Iteration_query-ID>\s+?<Iteration_query-def>'
                  r'(.*?)</Iteration_query-def>'))
    re_desc_end = re.compile(_as_bytes(r'</Iteration_query-def>'))
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if not line:
            break
        if qstart_mark not in line:
            continue
        # The following requirements are to make supporting BGZF compressed
        # BLAST XML files simpler (avoids complex offset manipulations):
        assert line.count(qstart_mark) == 1, "XML without line breaks?"
        assert line.lstrip().startswith(qstart_mark), line
        if qend_mark in line:
            # Should cope with <Iteration>...</Iteration> on one long line
            block = line
        else:
            # Load the rest of this block up to and including </Iteration>
            block = [line]
            while line and qend_mark not in line:
                line = handle.readline()
                assert qstart_mark not in line, line
                block.append(line)
            assert line.rstrip().endswith(qend_mark), line
            block = _empty_bytes_string.join(block)
        assert block.count(qstart_mark) == 1, \
            "XML without line breaks? %r" % block
        assert block.count(qend_mark) == 1, \
            "XML without line breaks? %r" % block
        # Now we have a full <Iteration>...</Iteration> block, find the ID
        regx = re.search(re_desc, block)
        try:
            qstart_desc = regx.group(2)
            qstart_id = regx.group(1)
        except AttributeError:
            # use the fallback values
            assert re.search(re_desc_end, block)
            qstart_desc = _as_bytes(self._fallback['description'])
            qstart_id = _as_bytes(self._fallback['id'])
        if qstart_id.startswith(blast_id_mark):
            # BLAST-assigned ID; real ID is the first word of the description
            qstart_id = qstart_desc.split(_as_bytes(' '), 1)[0]
        yield _bytes_to_string(qstart_id), start_offset, len(block)
def read(self, handle): """Set up the parser and let it parse the XML results""" # HACK: remove Bio._py3k handle conversion, since the Entrez XML parser # expects binary data if handle.__class__.__name__ == 'EvilHandleHack': handle = handle._handle if handle.__class__.__name__ == 'TextIOWrapper': handle = handle.buffer if hasattr(handle, "closed") and handle.closed: # Should avoid a possible Segmentation Fault, see: # http://bugs.python.org/issue4877 raise IOError("Can't parse a closed handle") if sys.version_info[0] >= 3: # Another nasty hack to cope with a unicode StringIO handle # since the Entrez XML parser expects binary data (bytes) from io import StringIO if isinstance(handle, StringIO): from io import BytesIO from SAP.Bio._py3k import _as_bytes handle = BytesIO(_as_bytes(handle.read())) try: self.parser.ParseFile(handle) except expat.ExpatError as e: if self.parser.StartElementHandler: # We saw the initial <!xml declaration, so we can be sure that # we are parsing XML data. Most likely, the XML file is # corrupted. raise CorruptedXMLError(e) else: # We have not seen the initial <!xml declaration, so probably # the input data is not in XML format. raise NotXMLError(e) try: return self.object except AttributeError: if self.parser.StartElementHandler: # We saw the initial <!xml declaration, and expat didn't notice # any errors, so self.object should be defined. If not, this is # a bug. raise RuntimeError( "Failed to parse the XML file correctly, possibly due to a bug in Bio.Entrez. Please contact the Biopython developers at [email protected] for assistance." ) else: # We did not see the initial <!xml declaration, so probably # the input data is not in XML format. raise NotXMLError("XML declaration not found")
class ExonerateTextIndexer(_BaseExonerateIndexer):
    """Indexer class for Exonerate plain text."""

    _parser = ExonerateTextParser
    _query_mark = _as_bytes('C4 Alignment')

    def get_qresult_id(self, pos):
        """Return the query ID taken from the nearest "Query:" line."""
        handle = self._handle
        handle.seek(pos)
        query_prefix = _as_bytes('Query:')
        line = handle.readline().strip()
        # Scan forward until a "Query:" line; an empty (stripped) line
        # means we ran out of data.
        while not line.startswith(query_prefix):
            if not line:
                raise StopIteration
            line = handle.readline().strip()
        qid, _desc = _parse_hit_or_query_line(_bytes_to_string(line))
        return qid

    def get_raw(self, offset):
        """Return the raw string of one QueryResult starting at the given offset."""
        handle = self._handle
        handle.seek(offset)
        marker = self._query_mark
        first_key = None
        raw = _as_bytes('')
        line = handle.readline()
        while line:
            if line.startswith(marker):
                # Peek ahead for this alignment block's query ID, then
                # rewind so the block itself is accumulated too.
                after_marker = handle.tell()
                if first_key is None:
                    first_key = self.get_qresult_id(after_marker)
                elif self.get_qresult_id(after_marker) != first_key:
                    # Next query's block begins here; stop accumulating.
                    break
                handle.seek(after_marker)
            raw += line
            line = handle.readline()
        return raw
def __iter__(self):
    """Iterate over the BLAST XML file, yielding (query id, offset, length).

    Finds each <Iteration>...</Iteration> block (one per query) and
    yields its key plus byte offset/length so the record can be fetched
    again later.
    """
    qstart_mark = self.qstart_mark
    qend_mark = self.qend_mark
    blast_id_mark = _as_bytes('Query_')
    handle = self._handle
    handle.seek(0)
    # Use raw strings for every concatenated segment so \s stays a regex
    # escape rather than a deprecated Python string escape.
    re_desc = re.compile(
        _as_bytes(r'<Iteration_query-ID>(.*?)'
                  r'</Iteration_query-ID>\s+?<Iteration_query-def>'
                  r'(.*?)</Iteration_query-def>'))
    re_desc_end = re.compile(_as_bytes(r'</Iteration_query-def>'))
    # Note: dropped the unused block_size and counter locals present in
    # the original.
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if not line:
            break
        if qstart_mark not in line:
            continue
        # The following requirements are to make supporting BGZF compressed
        # BLAST XML files simpler (avoids complex offset manipulations):
        assert line.count(qstart_mark) == 1, "XML without line breaks?"
        assert line.lstrip().startswith(qstart_mark), line
        if qend_mark in line:
            # Should cope with <Iteration>...</Iteration> on one long line
            block = line
        else:
            # Load the rest of this block up to and including </Iteration>
            block = [line]
            while line and qend_mark not in line:
                line = handle.readline()
                assert qstart_mark not in line, line
                block.append(line)
            assert line.rstrip().endswith(qend_mark), line
            block = _empty_bytes_string.join(block)
        assert block.count(qstart_mark) == 1, "XML without line breaks? %r" % block
        assert block.count(qend_mark) == 1, "XML without line breaks? %r" % block
        # Now we have a full <Iteration>...</Iteration> block, find the ID
        regx = re.search(re_desc, block)
        try:
            qstart_desc = regx.group(2)
            qstart_id = regx.group(1)
        except AttributeError:
            # No match; use the fallback values
            assert re.search(re_desc_end, block)
            qstart_desc = _as_bytes(self._fallback['description'])
            qstart_id = _as_bytes(self._fallback['id'])
        if qstart_id.startswith(blast_id_mark):
            # Auto-assigned BLAST id; take the first word of the query
            # description instead
            qstart_id = qstart_desc.split(_as_bytes(' '), 1)[0]
        yield _bytes_to_string(qstart_id), start_offset, len(block)
def __init__(self, filename, format, alphabet):
    """Index a sequential record file via a format-specific record marker.

    filename, format and alphabet are passed straight to
    SeqFileRandomAccess.__init__; the marker is the line prefix that
    starts each new record for the given format (KeyError if the format
    is not supported).
    """
    SeqFileRandomAccess.__init__(self, filename, format, alphabet)
    # Line prefix marking the start of a record per format.
    # The original dict listed the "qual" key twice; the duplicate
    # (identical) entry has been removed.
    marker = {
        "ace": "CO ",
        "embl": "ID ",
        "fasta": ">",
        "genbank": "LOCUS ",
        "gb": "LOCUS ",
        "imgt": "ID ",
        "phd": "BEGIN_SEQUENCE",
        "pir": ">..;",
        "qual": ">",
        "swiss": "ID ",
        "uniprot-xml": "<entry ",
    }[format]
    self._marker = marker
    # Anchored regex used to spot the start of each record as bytes.
    self._marker_re = re.compile(_as_bytes("^%s" % marker))
def read(self, handle):
    """Parse the XML data from the handle and return the built object."""
    # HACK: bypass the Bio._py3k handle wrappers, since the Entrez XML
    # parser expects binary data
    handle_cls = handle.__class__.__name__
    if handle_cls == 'EvilHandleHack':
        handle = handle._handle
    if handle.__class__.__name__ == 'TextIOWrapper':
        handle = handle.buffer
    closed = getattr(handle, "closed", None)
    if closed:
        # Should avoid a possible Segmentation Fault, see:
        # http://bugs.python.org/issue4877
        raise IOError("Can't parse a closed handle")
    if sys.version_info[0] >= 3:
        # Another nasty hack: a unicode StringIO handle has to be
        # converted to bytes for expat
        from io import StringIO
        if isinstance(handle, StringIO):
            from io import BytesIO
            from SAP.Bio._py3k import _as_bytes
            handle = BytesIO(_as_bytes(handle.read()))
    try:
        self.parser.ParseFile(handle)
    except expat.ExpatError as err:
        if not self.parser.StartElementHandler:
            # The initial <!xml declaration was never seen, so the
            # input data is probably not XML at all.
            raise NotXMLError(err)
        # We did see the <!xml declaration, so this is XML data;
        # most likely the file is corrupted.
        raise CorruptedXMLError(err)
    try:
        return self.object
    except AttributeError:
        if self.parser.StartElementHandler:
            # Expat saw the <!xml declaration and reported no errors,
            # yet self.object is missing -- that would be a bug.
            raise RuntimeError(
                "Failed to parse the XML file correctly, possibly due to a bug in Bio.Entrez. Please contact the Biopython developers at [email protected] for assistance."
            )
        # No <!xml declaration: the input data is probably not XML.
        raise NotXMLError("XML declaration not found")
def __iter__(self):
    """Iterate over the file, yielding (key, offset, line length) tuples.

    Each non-blank line is one record; the key is its first
    tab-separated field.
    """
    handle = self._handle
    handle.seek(0)
    tab_char = _as_bytes("\t")
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if not line:
            break  # End of file
        # Bug fix: str.split never raises ValueError, so the original
        # try/except ValueError intended to skip blank lines was dead
        # code and blank lines were indexed with a bogus key.  Skip
        # them explicitly instead.
        if not line.strip():
            continue
        key = line.split(tab_char)[0]
        yield _bytes_to_string(key), start_offset, len(line)
def _open(cgi, params=None, post=None, ecitmatch=False):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL for the cgi script to access.
    params is a dictionary with the options to pass to it.  Does some
    simple error checking, and will raise an IOError if it encounters one.

    The argument post should be a boolean to explicitly control if an HTTP
    POST should be used rather than an HTTP GET based on the query length.
    By default (post=None), POST is used if the URL encoded parameters would
    be over 1000 characters long.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.
    """
    # NCBI requirement: At most three queries per second.
    # Equivalently, at least a third of second between queries
    delay = 0.333333334
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        # Record when this (delayed) request effectively runs
        _open.previous = current + wait
    else:
        _open.previous = current
    params = _construct_params(params)
    options = _encode_options(ecitmatch, params)
    # By default, post is None. Set to a boolean to over-ride length choice:
    if post is None and len(options) > 1000:
        post = True
    cgi = _construct_cgi(cgi, post, options)
    # Note: the original wrapped these calls in a try/except that only
    # re-raised the caught _HTTPError unchanged; that dead handler has
    # been removed (behavior is identical).
    if post:
        handle = _urlopen(cgi, data=_as_bytes(options))
    else:
        handle = _urlopen(cgi)
    return _binary_to_string_handle(handle)
def __iter__(self):
    """Yield (query id, start offset, 0) for each query result.

    The third tuple element is a length placeholder of 0; the raw
    record is presumably re-read up to its end marker rather than by
    stored length -- NOTE(review): confirm against get_raw.
    """
    handle = self._handle
    handle.seek(0)
    start_offset = handle.tell()
    id_regex = re.compile(_as_bytes(_QRE_ID_LEN_PTN))
    while True:
        line = read_forward(handle)
        end_offset = handle.tell()
        if not line:
            break
        if line.startswith(self.qresult_start):
            # Extract the query key from the start-mark line
            qresult_key = id_regex.search(line).group(1).strip()
            # qresult start offset is the offset of this line
            # (which begins with the start mark)
            start_offset = end_offset - len(line)
        elif line.startswith(self.qresult_end):
            yield _bytes_to_string(qresult_key), start_offset, 0
            start_offset = end_offset
def __iter__(self):
    """Iterates over the file handle; yields key, start offset, and length.

    Consecutive rows sharing the same query identifier (column index 9
    of the tab-separated output) form one record; for each record this
    yields (query id, byte offset of its first row, byte length).
    """
    handle = self._handle
    handle.seek(0)
    # denotes column location for query identifier
    query_id_idx = 9
    qresult_key = None
    tab_char = _as_bytes('\t')

    start_offset = handle.tell()
    line = handle.readline()
    # Read through the header; this assumes result rows match the regex
    while not re.search(_RE_ROW_CHECK_IDX, line.strip()):
        start_offset = handle.tell()
        line = handle.readline()
        if not line:
            # Bug fix: raising StopIteration inside a generator becomes
            # RuntimeError under PEP 479 (Python 3.7+); a plain return
            # ends the iteration cleanly.
            return

    # and index the qresults
    while True:
        end_offset = handle.tell()
        cols = [x for x in line.strip().split(tab_char) if x]
        if qresult_key is None:
            qresult_key = cols[query_id_idx]
        else:
            curr_key = cols[query_id_idx]
            if curr_key != qresult_key:
                # end_offset is past the current line, which belongs to
                # the NEXT record -- subtract its length.  (The original
                # omitted "- len(line)" and over-reported the record
                # length by one line.)
                yield (_bytes_to_string(qresult_key), start_offset,
                       end_offset - len(line) - start_offset)
                qresult_key = curr_key
                start_offset = end_offset - len(line)
        line = handle.readline()
        if not line:
            # Last record runs to the end of the final data line
            yield (_bytes_to_string(qresult_key), start_offset,
                   end_offset - start_offset)
            break