def __iter__(self):
    """Yield (qresult_id, start_offset, 0) for each query result in the file."""
    handle = self._handle
    handle.seek(0)
    start_offset = handle.tell()
    # Pattern capturing the query ID from a "Query:" header line
    query_id_re = re.compile(_as_bytes(r'Query\s*(?:sequence|HMM)?:\s*(.*)'))
    # hmmsearch output carries a single query result, so it needs special
    # handling when end-of-file is reached
    line = read_forward(handle)
    is_hmmsearch = line.startswith(_as_bytes('hmmsearch'))
    while True:
        end_offset = handle.tell()
        if line.startswith(self.qresult_start):
            match = re.search(query_id_re, line)
            qresult_key = match.group(1).strip()
            # the query result begins on this very line, so rewind by its length
            start_offset = end_offset - len(line)
        elif line.startswith(self.qresult_end):
            yield _bytes_to_string(qresult_key), start_offset, 0
            start_offset = end_offset
        elif not line:
            # HACK: hmmsearch files can only hold one query result
            if is_hmmsearch:
                yield _bytes_to_string(qresult_key), start_offset, 0
            break
        line = read_forward(handle)
def __iter__(self):
    """Return (id, offset, length) tuples for each sequential record."""
    marker_offset = len(self._marker)
    marker_re = self._marker_re
    handle = self._handle
    handle.seek(0)
    # Advance past any header material before the first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Either positioned on the first record marker, or at end of file
    while marker_re.match(line):
        # Assume the identifier is the first word after the marker; this
        # holds for simple formats (not for GenBank, EMBL, Swiss)
        rec_id = line[marker_offset:].strip().split(None, 1)[0]
        length = len(line)
        while True:
            end_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                yield _bytes_to_string(rec_id), start_offset, length
                start_offset = end_offset
                break
            # Track the byte length explicitly, since file offsets
            # cannot be subtracted when reading BGZF
            length += len(line)
    assert not line, repr(line)
def __iter__(self):
    """Yield (accession, start_offset, length) for each SwissProt record."""
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    semi_char = _as_bytes(";")
    # Skip any header text before the first ID line
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Positioned at the first record, or at end of file
    while marker_re.match(line):
        length = len(line)
        # The record.id cannot be taken from the ID line; the following
        # AC line carries the accession used as the key
        line = handle.readline()
        length += len(line)
        assert line.startswith(_as_bytes("AC "))
        key = line[3:].strip().split(semi_char)[0].strip()
        while True:
            end_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                yield _bytes_to_string(key), start_offset, length
                start_offset = end_offset
                break
            length += len(line)
    assert not line, repr(line)
def __iter__(self):
    """Yield one (id, offset, length) tuple per record in the file."""
    marker_offset = len(self._marker)
    marker_re = self._marker_re
    handle = self._handle
    handle.seek(0)
    # Skip anything (headers etc.) preceding the first marker line
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # From here each iteration indexes one complete record
    while marker_re.match(line):
        # The record key is taken as the first word after the marker;
        # fine for simple formats, wrong for GenBank/EMBL/Swiss
        record_key = line[marker_offset:].strip().split(None, 1)[0]
        length = len(line)
        while True:
            end_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                yield _bytes_to_string(record_key), start_offset, length
                start_offset = end_offset
                break
            # Sum line lengths rather than diffing offsets (BGZF-safe)
            length += len(line)
    assert not line, repr(line)
def __iter__(self):
    """Yield (accession, offset, length) tuples for SwissProt records."""
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    semi_char = _as_bytes(";")
    # Move past any leading header before the first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Each pass of this loop indexes one record
    while marker_re.match(line):
        length = len(line)
        # The identifier comes from the AC line following the ID line,
        # not from the ID line itself
        line = handle.readline()
        length += len(line)
        assert line.startswith(_as_bytes("AC "))
        accession = line[3:].strip().split(semi_char)[0].strip()
        while True:
            end_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                yield _bytes_to_string(accession), start_offset, length
                start_offset = end_offset
                break
            length += len(line)
    assert not line, repr(line)
def get_qresult_id(self, pos):
    """Return the query ID parsed from the cigar line at position ``pos``."""
    handle = self._handle
    handle.seek(pos)
    # The caller must position us on a query-mark line
    line = handle.readline()
    assert line.startswith(self._query_mark), line
    match = re.search(_RE_CIGAR, _bytes_to_string(line))
    return match.group(1)
def __iter__(self):
    """Yield (query_id, start_offset, block_length) per <Iteration> block.

    Fixes: the middle fragment of the description regex was not a raw
    string, so ``\\s`` was an invalid string escape (SyntaxWarning on
    modern Python, a future SyntaxError); all fragments are raw now.
    The unused locals ``block_size`` and ``counter`` are removed.
    """
    qstart_mark = self.qstart_mark
    qend_mark = self.qend_mark
    blast_id_mark = _as_bytes('Query_')
    handle = self._handle
    handle.seek(0)
    re_desc = re.compile(
        _as_bytes(r'<Iteration_query-ID>(.*?)'
                  r'</Iteration_query-ID>\s+?<Iteration_query-def>'
                  r'(.*?)</Iteration_query-def>'))
    re_desc_end = re.compile(_as_bytes(r'</Iteration_query-def>'))
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if not line:
            break
        if qstart_mark not in line:
            continue
        # The following requirements keep BGZF-compressed BLAST XML
        # support simple (avoids complex offset manipulations):
        assert line.count(qstart_mark) == 1, "XML without line breaks?"
        assert line.lstrip().startswith(qstart_mark), line
        if qend_mark in line:
            # Copes with <Iteration>...</Iteration> on one long line
            block = line
        else:
            # Load the rest of this block up to and including </Iteration>
            block = [line]
            while line and qend_mark not in line:
                line = handle.readline()
                assert qstart_mark not in line, line
                block.append(line)
            assert line.rstrip().endswith(qend_mark), line
            block = _empty_bytes_string.join(block)
        assert block.count(qstart_mark) == 1, "XML without line breaks? %r" % block
        assert block.count(qend_mark) == 1, "XML without line breaks? %r" % block
        # Now we have a full <Iteration>...</Iteration> block, find the ID
        regx = re.search(re_desc, block)
        try:
            qstart_desc = regx.group(2)
            qstart_id = regx.group(1)
        except AttributeError:
            # No ID/def pair found; use the fallback values
            assert re.search(re_desc_end, block)
            qstart_desc = _as_bytes(self._fallback['description'])
            qstart_id = _as_bytes(self._fallback['id'])
        if qstart_id.startswith(blast_id_mark):
            # Machine-generated BLAST+ IDs (Query_1, ...) make poor keys;
            # fall back to the first word of the description
            qstart_id = qstart_desc.split(_as_bytes(' '), 1)[0]
        yield _bytes_to_string(qstart_id), start_offset, len(block)
def __iter__(self):
    """Iterate over the handle, yielding (key, start offset, length)."""
    handle = self._handle
    handle.seek(0)
    query_id_idx = self._query_id_idx
    qresult_key = None
    header_mark = _as_bytes('#')
    split_mark = _as_bytes(' ')
    # Seed with the header marker so the skip loop runs at least once
    line = header_mark
    while line.startswith(header_mark):
        start_offset = handle.tell()
        line = handle.readline()
    # Index the query results
    while True:
        end_offset = handle.tell()
        if not line:
            break
        cols = [col for col in line.strip().split(split_mark) if col]
        if qresult_key is None:
            qresult_key = cols[query_id_idx]
        else:
            curr_key = cols[query_id_idx]
            if curr_key != qresult_key:
                # A new query starts on this line: close off the previous
                # one just before it
                adj_end = end_offset - len(line)
                yield (_bytes_to_string(qresult_key), start_offset,
                       adj_end - start_offset)
                qresult_key = curr_key
                start_offset = adj_end
        line = handle.readline()
        if not line:
            yield (_bytes_to_string(qresult_key), start_offset,
                   end_offset - start_offset)
            break
def __iter__(self):
    """Yield (id, start_offset, length) tuples for each FASTQ record.

    Handles multi-line sequence and quality sections by matching the
    running quality length against the sequence length.
    """
    handle = self._handle
    handle.seek(0)
    id = None
    start_offset = handle.tell()
    line = handle.readline()
    if not line:
        # Empty file!
        return
    at_char = _as_bytes("@")
    plus_char = _as_bytes("+")
    if line[0:1] != at_char:
        raise ValueError("Problem with FASTQ @ line:\n%r" % line)
    while line:
        # assert line[0]=="@"
        # This record seems OK (so far)
        id = line[1:].rstrip().split(None, 1)[0]
        # Find the seq line(s); sum their stripped lengths so the quality
        # section can be matched against it below
        seq_len = 0
        length = len(line)
        while line:
            line = handle.readline()
            length += len(line)
            if line.startswith(plus_char):
                break
            seq_len += len(line.strip())
        if not line:
            raise ValueError("Premature end of file in seq section")
        # assert line[0]=="+"
        # Find the qual line(s); quality is complete once its stripped
        # length equals the sequence length
        qual_len = 0
        while line:
            if seq_len == qual_len:
                if seq_len == 0:
                    # Special case, quality line should be just "\n"
                    # NOTE(review): this blank line is not added to
                    # ``length`` — confirm that is intended
                    line = handle.readline()
                    if line.strip():
                        raise ValueError(
                            "Expected blank quality line, not %r" % line)
                # Should be end of record...
                end_offset = handle.tell()
                line = handle.readline()
                if line and line[0:1] != at_char:
                    raise ValueError("Problem with line %r" % line)
                break
            else:
                line = handle.readline()
                qual_len += len(line.strip())
                length += len(line)
        if seq_len != qual_len:
            raise ValueError("Problem with quality section")
        yield _bytes_to_string(id), start_offset, length
        start_offset = end_offset
def __iter__(self):
    """Iterate over the file handle; yield key, start offset, and length."""
    handle = self._handle
    handle.seek(0)
    # Dispatch to the indexer matching whether comment lines are present
    if self._kwargs['comments']:
        iterfunc = self._qresult_index_commented
    else:
        iterfunc = self._qresult_index
    for key, offset, length in iterfunc():
        yield _bytes_to_string(key), offset, length
def __iter__(self):
    """Iterate over the file handle; yield key, start offset, and length.

    Fix: the header-scanning loop used ``raise StopIteration`` to end the
    generator when the file held no result rows; under PEP 479 (Python
    3.7+) that is converted into a RuntimeError.  A bare ``return`` is
    the correct way to finish a generator early.
    """
    handle = self._handle
    handle.seek(0)
    # Column location of the query identifier in a result row
    query_id_idx = 9
    qresult_key = None
    tab_char = _as_bytes('\t')
    start_offset = handle.tell()
    line = handle.readline()
    # Read through the header; assumes result rows match the regex
    while not re.search(_RE_ROW_CHECK_IDX, line.strip()):
        start_offset = handle.tell()
        line = handle.readline()
        if not line:
            # No result rows at all — end the generator cleanly
            return
    # Index the query results
    while True:
        end_offset = handle.tell()
        cols = [x for x in line.strip().split(tab_char) if x]
        if qresult_key is None:
            qresult_key = cols[query_id_idx]
        else:
            curr_key = cols[query_id_idx]
            if curr_key != qresult_key:
                yield (_bytes_to_string(qresult_key), start_offset,
                       end_offset - start_offset)
                qresult_key = curr_key
                start_offset = end_offset - len(line)
        line = handle.readline()
        if not line:
            yield (_bytes_to_string(qresult_key), start_offset,
                   end_offset - start_offset)
            break
def _parse_tag_data(elem_code, elem_num, raw_data):
    """Return a single data value unpacked from ``raw_data``.

    elem_code - what kind of data
    elem_num - how many data points
    raw_data - raw bytes read from the abi file
    """
    if elem_code not in _BYTEFMT:
        return None
    # '>1s' unpacks differently from '>s', so omit the count when it is 1
    count = "" if elem_num == 1 else str(elem_num)
    fmt = ">" + count + _BYTEFMT[elem_code]
    assert len(raw_data) == struct.calcsize(fmt)
    data = struct.unpack(fmt, raw_data)
    # Collapse one-element tuples, except for date (10) / time (11) codes
    # which need the full tuple below
    if elem_code not in [10, 11] and len(data) == 1:
        data = data[0]
    # Convert according to the element type code
    if elem_code == 2:
        return _bytes_to_string(data)
    elif elem_code == 10:
        return str(datetime.date(*data))
    elif elem_code == 11:
        return str(datetime.time(*data[:3]))
    elif elem_code == 13:
        return bool(data)
    elif elem_code == 18:
        # Pascal-style string: first byte is the length, skip it
        return _bytes_to_string(data[1:])
    elif elem_code == 19:
        # C-style string: drop the trailing NUL byte
        return _bytes_to_string(data[:-1])
    else:
        return data
def _parse_tag_data(elem_code, elem_num, raw_data):
    """Unpack and return one value from ``raw_data``.

    elem_code - what kind of data
    elem_num - how many data points
    raw_data - raw bytes read from the abi file
    """
    if elem_code not in _BYTEFMT:
        return None
    # '>1s' unpacks differently from '>s', hence the special case
    if elem_num == 1:
        repeat = ''
    else:
        repeat = str(elem_num)
    pack_fmt = '>' + repeat + _BYTEFMT[elem_code]
    assert len(raw_data) == struct.calcsize(pack_fmt)
    data = struct.unpack(pack_fmt, raw_data)
    # Single values need not stay wrapped in a tuple, but date (10) and
    # time (11) codes are consumed as tuples below
    if elem_code not in [10, 11] and len(data) == 1:
        data = data[0]
    # Post-process based on the element type code
    if elem_code == 2:
        return _bytes_to_string(data)
    if elem_code == 10:
        return str(datetime.date(*data))
    if elem_code == 11:
        return str(datetime.time(*data[:3]))
    if elem_code == 13:
        return bool(data)
    if elem_code == 18:
        # Pascal string: leading length byte is stripped
        return _bytes_to_string(data[1:])
    if elem_code == 19:
        # C string: trailing NUL byte is stripped
        return _bytes_to_string(data[:-1])
    return data
def __iter__(self):
    """Yield (query_id, start_offset, block_length) per <Iteration> block.

    Fixes: one fragment of the description regex was a non-raw string
    containing ``\\s`` — an invalid string escape that warns on modern
    Python and will eventually be an error; all fragments are now raw
    strings.  The never-used ``block_size`` and ``counter`` locals are
    dropped.
    """
    qstart_mark = self.qstart_mark
    qend_mark = self.qend_mark
    blast_id_mark = _as_bytes('Query_')
    handle = self._handle
    handle.seek(0)
    re_desc = re.compile(
        _as_bytes(r'<Iteration_query-ID>(.*?)'
                  r'</Iteration_query-ID>\s+?<Iteration_query-def>'
                  r'(.*?)</Iteration_query-def>'))
    re_desc_end = re.compile(_as_bytes(r'</Iteration_query-def>'))
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if not line:
            break
        if qstart_mark not in line:
            continue
        # These requirements keep BGZF-compressed BLAST XML support
        # simple (no complex offset manipulations needed):
        assert line.count(qstart_mark) == 1, "XML without line breaks?"
        assert line.lstrip().startswith(qstart_mark), line
        if qend_mark in line:
            # Should cope with <Iteration>...</Iteration> on one long line
            block = line
        else:
            # Load the rest of this block up to and including </Iteration>
            block = [line]
            while line and qend_mark not in line:
                line = handle.readline()
                assert qstart_mark not in line, line
                block.append(line)
            assert line.rstrip().endswith(qend_mark), line
            block = _empty_bytes_string.join(block)
        assert block.count(qstart_mark) == 1, "XML without line breaks? %r" % block
        assert block.count(qend_mark) == 1, "XML without line breaks? %r" % block
        # Now we have a full <Iteration>...</Iteration> block, find the ID
        regx = re.search(re_desc, block)
        try:
            qstart_desc = regx.group(2)
            qstart_id = regx.group(1)
        except AttributeError:
            # No ID/def pair present; use the fallback values
            assert re.search(re_desc_end, block)
            qstart_desc = _as_bytes(self._fallback['description'])
            qstart_id = _as_bytes(self._fallback['id'])
        if qstart_id.startswith(blast_id_mark):
            # BLAST+ synthetic IDs (Query_1, ...) are replaced with the
            # first word of the description
            qstart_id = qstart_desc.split(_as_bytes(' '), 1)[0]
        yield _bytes_to_string(qstart_id), start_offset, len(block)
def __iter__(self):
    """Yield (id, start_offset, length) tuples for each FASTQ record.

    Multi-line sequence and quality sections are supported by comparing
    the accumulated quality length against the sequence length.
    """
    handle = self._handle
    handle.seek(0)
    id = None
    start_offset = handle.tell()
    line = handle.readline()
    if not line:
        # Empty file!
        return
    at_char = _as_bytes("@")
    plus_char = _as_bytes("+")
    if line[0:1] != at_char:
        raise ValueError("Problem with FASTQ @ line:\n%r" % line)
    while line:
        # assert line[0]=="@"
        # This record seems OK (so far)
        id = line[1:].rstrip().split(None, 1)[0]
        # Find the seq line(s); accumulate their stripped lengths to
        # compare with the quality section later
        seq_len = 0
        length = len(line)
        while line:
            line = handle.readline()
            length += len(line)
            if line.startswith(plus_char):
                break
            seq_len += len(line.strip())
        if not line:
            raise ValueError("Premature end of file in seq section")
        # assert line[0]=="+"
        # Find the qual line(s); quality section ends once its stripped
        # length matches the sequence length
        qual_len = 0
        while line:
            if seq_len == qual_len:
                if seq_len == 0:
                    # Special case, quality line should be just "\n"
                    # NOTE(review): this blank line's bytes are not added
                    # to ``length`` — confirm that is intended
                    line = handle.readline()
                    if line.strip():
                        raise ValueError("Expected blank quality line, not %r" % line)
                # Should be end of record...
                end_offset = handle.tell()
                line = handle.readline()
                if line and line[0:1] != at_char:
                    raise ValueError("Problem with line %r" % line)
                break
            else:
                line = handle.readline()
                qual_len += len(line.strip())
                length += len(line)
        if seq_len != qual_len:
            raise ValueError("Problem with quality section")
        yield _bytes_to_string(id), start_offset, length
        start_offset = end_offset
def _abi_parse_header(header, handle):
    """Generator yielding (tag_name, tag_number, tag_data) directory entries."""
    # header structure (after ABIF marker):
    # file version, tag name, tag number,
    # element type code, element size, number of elements
    # data size, data offset, handle (not file handle)
    head_elem_size = header[4]
    head_elem_num = header[5]
    head_offset = header[7]
    dir_size = struct.calcsize(_DIRFMT)
    for index in range(head_elem_num):
        start = head_offset + index * head_elem_size
        handle.seek(start)
        # The directory start offset is appended to the tuple so entries
        # whose data is packed inline (size <= 4 bytes) can be located
        dir_entry = struct.unpack(_DIRFMT, handle.read(dir_size)) + (start,)
        # Only parse the directories we were asked for
        key = _bytes_to_string(dir_entry[0]) + str(dir_entry[1])
        if key not in (list(_EXTRACT) + _SPCTAGS):
            continue
        tag_name = _bytes_to_string(dir_entry[0])
        tag_number = dir_entry[1]
        elem_code = dir_entry[2]
        elem_num = dir_entry[4]
        data_size = dir_entry[5]
        data_offset = dir_entry[6]
        tag_offset = dir_entry[8]
        # Data of <= 4 bytes is stored inside the tag itself, 20 bytes
        # past the directory start, so redirect the offset there
        if data_size <= 4:
            data_offset = tag_offset + 20
        handle.seek(data_offset)
        data = handle.read(data_size)
        yield tag_name, tag_number, _parse_tag_data(elem_code, elem_num, data)
def get(self, offset):
    """Return the SeqRecord parsed from the entry at ``offset``."""
    # TODO - Can we handle this directly in the parser?
    # Hack: fetch the raw <entry>...</entry> via get_raw and wrap it in
    # the XML header and footer the UniProt parser apparently requires.
    raw_entry = _bytes_to_string(self.get_raw(offset))
    data = """<?xml version='1.0' encoding='UTF-8'?>
<uniprot xmlns="http://uniprot.org/uniprot" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd">
%s
</uniprot>
""" % raw_entry
    # TODO - For consistency, this function should not accept a string:
    return next(SeqIO.UniprotIO.UniprotIterator(data))
def _abi_parse_header(header, handle):
    """Generator that yields (tag_name, tag_number, parsed_data) tuples."""
    # header structure (after ABIF marker):
    # file version, tag name, tag number,
    # element type code, element size, number of elements
    # data size, data offset, handle (not file handle)
    entry_size = header[4]
    entry_count = header[5]
    table_offset = header[7]
    wanted = list(_EXTRACT) + _SPCTAGS
    entry_fmt_size = struct.calcsize(_DIRFMT)
    index = 0
    while index < entry_count:
        start = table_offset + index * entry_size
        handle.seek(start)
        # Append the directory offset so inline data (<= 4 bytes) can be
        # addressed relative to the entry itself
        dir_entry = struct.unpack(_DIRFMT, handle.read(entry_fmt_size)) + (start,)
        index += 1
        # Skip directories we were not asked to extract
        key = _bytes_to_string(dir_entry[0]) + str(dir_entry[1])
        if key not in wanted:
            continue
        tag_name = _bytes_to_string(dir_entry[0])
        tag_number = dir_entry[1]
        elem_code = dir_entry[2]
        elem_num = dir_entry[4]
        data_size = dir_entry[5]
        data_offset = dir_entry[6]
        tag_offset = dir_entry[8]
        # Small payloads (<= 4 bytes) live inside the tag, 20 bytes in
        if data_size <= 4:
            data_offset = tag_offset + 20
        handle.seek(data_offset)
        data = handle.read(data_size)
        yield tag_name, tag_number, _parse_tag_data(elem_code, elem_num, data)
def get_qresult_id(self, pos):
    """Return the query ID from the nearest "Query:" line."""
    handle = self._handle
    handle.seek(pos)
    query_mark = _as_bytes('Query:')
    line = handle.readline().strip()
    while not line.startswith(query_mark):
        if not line:
            # NOTE(review): raising StopIteration outside a generator is
            # unusual; callers presumably catch it — confirm before changing
            raise StopIteration
        line = handle.readline().strip()
    qid, desc = _parse_hit_or_query_line(_bytes_to_string(line))
    return qid
def get(self, offset):
    """Parse and return the SeqRecord stored at ``offset``."""
    # TODO - Can we handle this directly in the parser?
    # This is a hack - use get_raw for <entry>...</entry> and wrap it
    # with the apparently required XML header and footer.
    entry_text = _bytes_to_string(self.get_raw(offset))
    wrapped = """<?xml version='1.0' encoding='UTF-8'?>
<uniprot xmlns="http://uniprot.org/uniprot" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd">
%s
</uniprot>
""" % entry_text
    # TODO - For consistency, this function should not accept a string:
    return next(SeqIO.UniprotIO.UniprotIterator(wrapped))
def __iter__(self):
    """Yield (accession, start_offset, length) for each <entry> element."""
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    start_acc_marker = _as_bytes("<accession>")
    end_acc_marker = _as_bytes("</accession>")
    end_entry_marker = _as_bytes("</entry>")
    less_than = _as_bytes("<")
    # Skip any header before the first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Now at the start of a record, or at end of file
    while marker_re.match(line):
        length = len(line)
        # Usually the next line is <accession>xxx</accession>, possibly
        # with leading spaces, but it may also appear later in the entry
        key = None
        while True:
            line = handle.readline()
            if key is None and start_acc_marker in line:
                assert end_acc_marker in line, line
                key = line[line.find(start_acc_marker) + 11:].split(
                    less_than, 1)[0]
                length += len(line)
            elif end_entry_marker in line:
                end_offset = (handle.tell() - len(line)
                              + line.find(end_entry_marker) + 8)
                break
            elif marker_re.match(line) or not line:
                # Hit the next record or EOF without closing this entry
                raise ValueError("Didn't find end of record")
            else:
                length += len(line)
        if not key:
            raise ValueError("Did not find <accession> line in bytes %i to %i"
                             % (start_offset, end_offset))
        yield _bytes_to_string(key), start_offset, length
        # Advance to the start of the next record
        while not marker_re.match(line) and line:
            start_offset = handle.tell()
            line = handle.readline()
    assert not line, repr(line)
def __iter__(self):
    """Yield (record_id, start_offset, length) for each EMBL record.

    Fix: the unrecognised-ID-line error built its message by
    concatenating a str literal with the ``bytes`` line, which raises
    TypeError on Python 3 instead of the intended ValueError; the
    message now uses %r formatting, which works for bytes.
    """
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    semi_char = _as_bytes(";")
    dot_char = _as_bytes(".")
    sv_marker = _as_bytes("SV ")
    # Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        # We cannot assume the record.id is the first word after ID,
        # normally the SV line is used.
        length = len(line)
        if line[2:].count(semi_char) == 6:
            # Looks like the semi colon separated style introduced in 2006
            parts = line[3:].rstrip().split(semi_char)
            if parts[1].strip().startswith(sv_marker):
                # The SV bit gives the version
                key = parts[0].strip() + dot_char + \
                    parts[1].strip().split()[1]
            else:
                key = parts[0].strip()
        elif line[2:].count(semi_char) == 3:
            # Looks like the pre 2006 style, take first word only
            key = line[3:].strip().split(None, 1)[0]
        else:
            # %r renders the bytes safely (str + bytes would TypeError)
            raise ValueError("Did not recognise the ID line layout:\n%r" % line)
        while True:
            end_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                end_offset = handle.tell() - len(line)
                yield _bytes_to_string(key), start_offset, length
                start_offset = end_offset
                break
            elif line.startswith(sv_marker):
                key = line.rstrip().split()[1]
            length += len(line)
    assert not line, repr(line)
def __iter__(self):
    """Yield (key, start_offset, length) for each tab-separated line.

    Fix: the old ``except ValueError`` branch meant to skip blank lines
    was dead code — ``split`` with a non-empty separator never raises
    ValueError — so blank lines were yielded with a bogus whitespace
    key.  Blank lines are now skipped explicitly.
    """
    handle = self._handle
    handle.seek(0)
    tab_char = _as_bytes("\t")
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if not line:
            break  # End of file
        if not line.strip():
            # Ignore blank lines
            continue
        key = line.split(tab_char)[0]
        yield _bytes_to_string(key), start_offset, len(line)
def __iter__(self):
    """Yield one (accession, start_offset, length) per UniProt <entry>."""
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    start_acc_marker = _as_bytes("<accession>")
    end_acc_marker = _as_bytes("</accession>")
    end_entry_marker = _as_bytes("</entry>")
    less_than = _as_bytes("<")
    # Move past anything preceding the first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Each pass of this loop indexes one entry
    while marker_re.match(line):
        length = len(line)
        # We expect <accession>xxx</accession> on the next line (maybe
        # indented), but tolerate it appearing later within the <entry>
        key = None
        while True:
            line = handle.readline()
            if key is None and start_acc_marker in line:
                assert end_acc_marker in line, line
                acc_start = line.find(start_acc_marker) + 11
                key = line[acc_start:].split(less_than, 1)[0]
                length += len(line)
            elif end_entry_marker in line:
                end_offset = (handle.tell() - len(line)
                              + line.find(end_entry_marker) + 8)
                break
            elif marker_re.match(line) or not line:
                # Start of next record or end of file
                raise ValueError("Didn't find end of record")
            else:
                length += len(line)
        if not key:
            raise ValueError("Did not find <accession> line in bytes %i to %i"
                             % (start_offset, end_offset))
        yield _bytes_to_string(key), start_offset, length
        # Find start of next record
        while not marker_re.match(line) and line:
            start_offset = handle.tell()
            line = handle.readline()
    assert not line, repr(line)
def __iter__(self):
    """Yield (record_id, start_offset, length) for each EMBL record.

    Fix: the "did not recognise" error message concatenated a str
    literal with the ``bytes`` line — a TypeError on Python 3 rather
    than the intended ValueError.  The message now uses %r formatting.
    """
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    semi_char = _as_bytes(";")
    dot_char = _as_bytes(".")
    sv_marker = _as_bytes("SV ")
    # Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        # We cannot assume the record.id is the first word after ID,
        # normally the SV line is used.
        length = len(line)
        if line[2:].count(semi_char) == 6:
            # Looks like the semi colon separated style introduced in 2006
            parts = line[3:].rstrip().split(semi_char)
            if parts[1].strip().startswith(sv_marker):
                # The SV bit gives the version
                key = parts[0].strip() + dot_char + parts[1].strip().split()[1]
            else:
                key = parts[0].strip()
        elif line[2:].count(semi_char) == 3:
            # Looks like the pre 2006 style, take first word only
            key = line[3:].strip().split(None, 1)[0]
        else:
            # %r renders the bytes safely (str + bytes would TypeError)
            raise ValueError("Did not recognise the ID line layout:\n%r" % line)
        while True:
            end_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                end_offset = handle.tell() - len(line)
                yield _bytes_to_string(key), start_offset, length
                start_offset = end_offset
                break
            elif line.startswith(sv_marker):
                key = line.rstrip().split()[1]
            length += len(line)
    assert not line, repr(line)
def __iter__(self):
    """Yield (key, start_offset, length) for each tab-separated line.

    Fix: the ``except ValueError`` intended to skip blank lines could
    never fire (``split`` with a non-empty separator does not raise
    ValueError), so blank lines produced bogus whitespace keys.  They
    are now skipped with an explicit check.
    """
    handle = self._handle
    handle.seek(0)
    tab_char = _as_bytes("\t")
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if not line:
            break  # End of file
        if not line.strip():
            # Ignore blank lines
            continue
        key = line.split(tab_char)[0]
        yield _bytes_to_string(key), start_offset, len(line)
def __iter__(self):
    """Yield (qresult_key, start_offset, 0) for each query result."""
    handle = self._handle
    handle.seek(0)
    start_offset = handle.tell()
    query_id_re = re.compile(_as_bytes(_QRE_ID_LEN_PTN))
    while True:
        line = read_forward(handle)
        end_offset = handle.tell()
        if line.startswith(self.qresult_start):
            match = re.search(query_id_re, line)
            qresult_key = match.group(1).strip()
            # the query result begins on this line (which carries the
            # start mark), so step back over its length
            start_offset = end_offset - len(line)
        elif line.startswith(self.qresult_end):
            yield _bytes_to_string(qresult_key), start_offset, 0
            start_offset = end_offset
        elif not line:
            break
def __iter__(self):
    """Yield (version_or_accession, start_offset, length) per GenBank record."""
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    dot_char = _as_bytes(".")
    accession_marker = _as_bytes("ACCESSION ")
    version_marker = _as_bytes("VERSION ")
    # Skip any header before the first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Now at the start of a record, or at end of file
    while marker_re.match(line):
        # The LOCUS line cannot be trusted for the identifier; use the
        # first entry on the VERSION (preferred) or ACCESSION line
        key = None
        length = len(line)
        while True:
            end_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                if not key:
                    raise ValueError("Did not find ACCESSION/VERSION lines")
                yield _bytes_to_string(key), start_offset, length
                start_offset = end_offset
                break
            elif line.startswith(accession_marker):
                key = line.rstrip().split()[1]
            elif line.startswith(version_marker):
                version_id = line.rstrip().split()[1]
                # Only accept ACC.VERSION style IDs, mimicking the
                # GenBank parser
                if (version_id.count(dot_char) == 1
                        and version_id.split(dot_char)[1].isdigit()):
                    key = version_id
            length += len(line)
    assert not line, repr(line)
def __iter__(self):
    """Yield (key, offset, length) for each IntelliGenetics record."""
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    semi_char = _as_bytes(";")
    while True:
        offset = handle.tell()
        line = handle.readline()
        length = len(line)
        if not line:
            break  # End of file
        if marker_re.match(line):
            # Scan for the first following line not starting with ";"
            while True:
                line = handle.readline()
                if line[0:1] != semi_char and line.strip():
                    # Its first word is the record key
                    key = line.split()[0]
                    yield _bytes_to_string(key), offset, length
                    break
                if not line:
                    raise ValueError("Premature end of file?")
                length += len(line)
def __iter__(self):
    """Iterate the handle, yielding (key, offset, length) per record."""
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    semi_char = _as_bytes(";")
    while True:
        record_offset = handle.tell()
        line = handle.readline()
        record_length = len(line)
        if not line:
            # End of file
            break
        if marker_re.match(line):
            # Look for the first line which doesn't start ";"
            while True:
                line = handle.readline()
                if line[0:1] != semi_char and line.strip():
                    # First word of that line is the key
                    record_key = line.split()[0]
                    yield _bytes_to_string(record_key), record_offset, record_length
                    break
                if not line:
                    raise ValueError("Premature end of file?")
                record_length += len(line)
def __iter__(self):
    """Yield one (key, start_offset, length) tuple per GenBank record."""
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    dot_char = _as_bytes(".")
    accession_marker = _as_bytes("ACCESSION ")
    version_marker = _as_bytes("VERSION ")
    # Move past any header preceding the first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Each pass indexes one record starting at its LOCUS line
    while marker_re.match(line):
        # The identifier comes from the VERSION (preferred) or
        # ACCESSION line rather than the LOCUS line
        key = None
        length = len(line)
        while True:
            end_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                if not key:
                    raise ValueError("Did not find ACCESSION/VERSION lines")
                yield _bytes_to_string(key), start_offset, length
                start_offset = end_offset
                break
            elif line.startswith(accession_marker):
                key = line.rstrip().split()[1]
            elif line.startswith(version_marker):
                version_id = line.rstrip().split()[1]
                parts = version_id.split(dot_char)
                # Accept only ACC.VERSION style IDs, as the parser does
                if version_id.count(dot_char) == 1 and parts[1].isdigit():
                    key = version_id
            length += len(line)
    assert not line, repr(line)
def __iter__(self):
    """Yield (qresult_key, start_offset, length) for fasta-m10 queries.

    Fix: if the file contained no '>>>' query mark at all (e.g. an
    empty or truncated file), ``qresult_key`` stayed None so the
    ``break`` guarded by it was never reached and the loop spun forever
    at EOF.  End-of-file now terminates the loop unconditionally, after
    yielding the final query result when one is pending.
    """
    handle = self._handle
    handle.seek(0)
    start_offset = handle.tell()
    qresult_key = None
    query_mark = _as_bytes('>>>')
    while True:
        line = handle.readline()
        peekline = handle.peekline()
        end_offset = handle.tell()
        # A line containing (but not starting with) '>>>' opens a query
        if not line.startswith(query_mark) and query_mark in line:
            regx = re.search(_RE_ID_DESC_SEQLEN_IDX, line)
            qresult_key = _bytes_to_string(regx.group(1))
            start_offset = end_offset - len(line)
        # Yield when the next line opens a new query, or at end of file
        if qresult_key is not None:
            if (not peekline.startswith(query_mark)
                    and query_mark in peekline) or not line:
                yield qresult_key, start_offset, end_offset - start_offset
                start_offset = end_offset
        if not line:
            # EOF always terminates, even if no query was ever seen
            break
def get(self, offset):
    """Return the SeqRecord stored at ``offset``."""
    # Binary file formats should override this text-based default
    raw = self.get_raw(offset)
    return self._parse(StringIO(_bytes_to_string(raw)))
def get(self, offset):
    """Return the record at ``offset``, parsed via ``self._parse``."""
    return self._parse(StringIO(_bytes_to_string(self.get_raw(offset))))
def get(self, offset):
    """Parse and return the SeqRecord located at ``offset``."""
    # Text-based default; binary file formats should override this
    text = _bytes_to_string(self.get_raw(offset))
    return self._parse(StringIO(text))