def __iter__(self):
    """Iterate over Hmmer2TextIndexer; yields query results' key, offsets, 0.

    The handle is rewound to offset 0 and read line by line through
    ``read_forward``. Every time a line starting with ``self.qresult_end``
    is met, a ``(key, start_offset, 0)`` tuple is yielded, where ``key`` is
    the decoded text captured after the ``Query`` label of the most recent
    line starting with ``self.qresult_start``.
    """
    handle = self._handle
    handle.seek(0)
    block_start = handle.tell()
    key_pattern = re.compile(rb"Query\s*(?:sequence|HMM)?:\s*(.*)")

    line = read_forward(handle)
    # the hmmsearch flag is decided from the first line read
    from_hmmsearch = line.startswith(b"hmmsearch")

    while True:
        line_end = handle.tell()

        if line.startswith(self.qresult_start):
            key = key_pattern.search(line).group(1).strip()
            # the block begins where this start-marker line begins
            block_start = line_end - len(line)
        elif line.startswith(self.qresult_end):
            yield key.decode(), block_start, 0
            block_start = line_end
        elif not line:
            # HACK: since hmmsearch can only have one query result
            if from_hmmsearch:
                yield key.decode(), block_start, 0
            break

        line = read_forward(handle)
def _parse_hit_block(self):
    """Parse a hit block (PRIVATE).

    Reads the description line, then the score line, then keeps handing
    lines to ``_parse_hit_match_block`` until the block ends.

    Returns the ``hit_data`` dict. ``self.done`` is set to True when the
    block ended on a blank/empty line or on ``_END_OF_FILE_MARKER`` rather
    than on the start of another hit block.

    Raises RuntimeError if the description line does not match
    ``_RE_HIT_BLOCK_DESC``.
    """
    self.line = read_forward(self.handle)
    desc_match = re.search(_RE_HIT_BLOCK_DESC, self.line)
    if not desc_match:
        raise RuntimeError(
            f"Unexpected content in HIT_BLOCK_DESC line'{self.line}'"
        )

    # keyword order below fixes the key order of the resulting dict
    hit_data = dict(
        hit_id=desc_match.group(1),
        description=desc_match.group(2).lstrip(" ;"),
        evalue=None,
        hit_start=None,
        hit_end=None,
        hit_seq="",
        prob=None,
        query_start=None,
        query_end=None,
        query_seq="",
        score=None,
    )

    # the score line is taken with a plain readline, not read_forward
    self.line = self.handle.readline()
    self._process_score_line(self.line, hit_data)

    while True:
        self.line = read_forward(self.handle)
        # _END_OF_FILE_MARKER isn't always present
        reached_end = (
            not self.line.strip()
            or self.line.startswith(_END_OF_FILE_MARKER)
        )
        if reached_end:
            self.done = True
            return hit_data
        if re.search(_RE_HIT_BLOCK_START, self.line):
            return hit_data
        self._parse_hit_match_block(hit_data)
def __iter__(self):
    """Iterate over the indexed file; yields (key, start offset, 0) tuples.

    The handle is rewound to offset 0 and read through ``read_forward``.
    A tuple is yielded for every line starting with ``self.qresult_end``;
    the key is the text captured after the ``Query`` label of the latest
    line starting with ``self.qresult_start``, converted with
    ``_bytes_to_string``.
    """
    handle = self._handle
    handle.seek(0)
    block_start = handle.tell()
    key_pattern = re.compile(
        _as_bytes(r'Query\s*(?:sequence|HMM)?:\s*(.*)'))

    line = read_forward(handle)
    # the hmmsearch flag is decided from the first line read
    from_hmmsearch = bool(line.startswith(_as_bytes('hmmsearch')))

    while True:
        line_end = handle.tell()

        if line.startswith(self.qresult_start):
            key = key_pattern.search(line).group(1).strip()
            # the block begins where this start-marker line begins
            block_start = line_end - len(line)
        elif line.startswith(self.qresult_end):
            yield _bytes_to_string(key), block_start, 0
            block_start = line_end
        elif not line:
            # HACK: since hmmsearch can only have one query result
            if from_hmmsearch:
                yield _bytes_to_string(key), block_start, 0
            break

        line = read_forward(handle)
def _parse_hit(self, qid):
    """Parse a HMMER3 hit block, beginning with the hit table.

    ``qid`` is stored as ``query_id`` in every parsed hit row and is passed
    on to ``_create_hits``.

    Returns whatever ``_create_hits`` returns once the
    'Domain annotation for each ' line is reached. If the handle is
    exhausted or no hits were reported, the list collected so far is
    returned instead.
    """
    # get to the end of the hit table delimiter and read one more line
    self._read_until(lambda line: line.startswith(' ------- ------ -----'))
    self.line = read_forward(self.handle)

    # assume every hit is in inclusion threshold until the inclusion
    # threshold line is encountered
    is_included = True

    # parse the hit table
    hit_list = []
    while True:
        if not self.line:
            return hit_list
        elif self.line.startswith(' ------ inclusion'):
            is_included = False
            self.line = read_forward(self.handle)
            # re-examine the freshly read line instead of assuming it is
            # a hit row (it may be empty or a section header)
            continue
        # if there are no hits, then there are no hsps
        # so we forward-read until 'Internal pipeline..'
        elif self.line.startswith(' [No hits detected that satisfy reporting'):
            while True:
                self.line = read_forward(self.handle)
                # an empty line is treated as end of input above; without
                # this guard a truncated file would loop forever here
                if not self.line:
                    return hit_list
                if self.line.startswith('Internal pipeline'):
                    assert len(hit_list) == 0
                    return hit_list
        elif self.line.startswith('Domain annotation for each '):
            return self._create_hits(hit_list, qid)

        # entering hit results row
        # parse the columns into a real list: on Python 3 ``filter``
        # returns an iterator, which supports neither len() nor indexing
        row = [col for col in self.line.strip().split(' ') if col]
        # join the description words if it's >1 word
        if len(row) > 10:
            row[9] = ' '.join(row[9:])
        # if there's no description, set it to an empty string
        elif len(row) < 10:
            row.append('')
            assert len(row) == 10

        # create the hit attributes
        hit_attrs = {
            'id': row[8],
            'query_id': qid,
            'evalue': float(row[0]),
            'bitscore': float(row[1]),
            'bias': float(row[2]),
            # row[3:6] is not parsed, since the info is available
            # at the HSP level
            'domain_exp_num': float(row[6]),
            'domain_obs_num': int(row[7]),
            'description': row[9],
            'is_included': is_included,
        }
        hit_list.append(hit_attrs)

        self.line = read_forward(self.handle)
def _parse_hit(self, qid, qdesc):
    """Parse a HMMER3 hit block, beginning with the hit table (PRIVATE).

    ``qid`` is stored as ``query_id`` in every parsed hit row; both ``qid``
    and ``qdesc`` are passed on to ``_create_hits``.

    Returns whatever ``_create_hits`` returns once the
    'Domain annotation for each ' line is reached. An empty list is
    returned if the handle is exhausted or no hits were reported.
    """
    # get to the end of the hit table delimiter and read one more line
    self._read_until(
        lambda line: line.startswith(" ------- ------ -----"))
    self.line = read_forward(self.handle)

    # assume every hit is in inclusion threshold until the inclusion
    # threshold line is encountered
    is_included = True

    # parse the hit table
    hit_attr_list = []
    while True:
        if not self.line:
            return []
        elif self.line.startswith(" ------ inclusion"):
            is_included = False
            self.line = read_forward(self.handle)
            # re-examine the freshly read line instead of assuming it is
            # a hit row (it may be empty or a section header)
            continue
        # if there are no hits, then there are no hsps
        # so we forward-read until 'Internal pipeline..'
        elif self.line.startswith(
                " [No hits detected that satisfy reporting"):
            while True:
                self.line = read_forward(self.handle)
                # an empty line is treated as end of input above; without
                # this guard a truncated file would loop forever here
                if not self.line:
                    return []
                if self.line.startswith("Internal pipeline"):
                    assert len(hit_attr_list) == 0
                    return []
        elif self.line.startswith("Domain annotation for each "):
            return self._create_hits(hit_attr_list, qid, qdesc)

        # entering hit results row
        # parse the columns into a list
        row = [x for x in self.line.strip().split(" ") if x]
        # join the description words if it's >1 word
        if len(row) > 10:
            row[9] = " ".join(row[9:])
        # if there's no description, set it to an empty string
        elif len(row) < 10:
            row.append("")
            assert len(row) == 10

        # create the hit attributes
        hit_attrs = {
            "id": row[8],
            "query_id": qid,
            "evalue": float(row[0]),
            "bitscore": float(row[1]),
            "bias": float(row[2]),
            # row[3:6] is not parsed, since the info is available
            # at the HSP level
            "domain_exp_num": float(row[6]),
            "domain_obs_num": int(row[7]),
            "description": row[9],
            "is_included": is_included,
        }
        hit_attr_list.append(hit_attrs)

        self.line = read_forward(self.handle)
def _read_until(self, bool_func):
    """Advance ``self.line`` until ``bool_func`` accepts it.

    Stops without reading anything when ``self.line`` is already empty or
    already satisfies ``bool_func``. Otherwise keeps replacing
    ``self.line`` with the result of ``read_forward(self.handle)``.
    Returns None.
    """
    while self.line and not bool_func(self.line):
        self.line = read_forward(self.handle)
def __init__(self, handle):
    """Initialize the class.

    Stores ``handle``, primes ``self.line`` with the first line returned
    by ``read_forward`` and resets the parser state attributes.
    """
    self.done = False
    self.query_id = self.seq_len = None
    self.handle = handle
    self.line = read_forward(handle)
def _read_until(self, bool_func):
    """Read the file handle until the given function returns True (PRIVATE).

    Reading also stops as soon as ``self.line`` is empty. The line that
    ended the scan is left in ``self.line``; nothing is returned.
    """
    while True:
        # an empty line ends the scan before bool_func is consulted
        if not self.line:
            break
        if bool_func(self.line):
            break
        self.line = read_forward(self.handle)
def _parse_qresult(self):
    """Parse HMMER3 query blocks, yielding one QueryResult per block.

    Starting from the first line beginning with 'Query:', each block's id
    and length are taken from ``_QRE_ID_LEN``, optional accession and
    description lines are collected, and hits are parsed through
    ``_parse_hit``. The collected attributes are set on the QueryResult
    with ``setattr`` before it is yielded.
    """
    self._read_until(lambda line: line.startswith('Query:'))

    while self.line:
        # query id and length come from the current line
        header = re.search(_QRE_ID_LEN, self.line)
        qid = header.group(1).strip()
        attrs = {
            'seq_len': int(header.group(2)),
            'program': self._meta.get('program'),
            'version': self._meta.get('version'),
            'target': self._meta.get('target'),
        }

        # pick up accession and description, if present, on the way to
        # the 'Scores for ' line
        while not self.line.startswith('Scores for '):
            self.line = read_forward(self.handle)
            if self.line.startswith('Accession:'):
                accession = self.line.strip().split(' ', 1)[1]
                attrs['accession'] = accession.strip()
            elif self.line.startswith('Description:'):
                description = self.line.strip().split(' ', 1)[1]
                attrs['description'] = description.strip()

        # parse the query hits
        while self.line and '//' not in self.line:
            hit_list = self._parse_hit(qid)
            # skip over the statistics summary
            # TODO: parse and store this information?
            if self.line.startswith('Internal pipeline'):
                while self.line and '//' not in self.line:
                    self.line = read_forward(self.handle)

        # build the query result and attach the collected attributes
        qresult = QueryResult(qid, hits=hit_list)
        for name, value in attrs.items():
            setattr(qresult, name, value)
        yield qresult
        self.line = read_forward(self.handle)
def _read_until(self, bool_func, stop_on_blank=True,
                max_read_until=MAX_READ_UNTIL):
    """Read the file handle until the given function returns True (PRIVATE).

    When ``stop_on_blank`` is true, an empty ``self.line`` also ends the
    scan, and it is checked before ``bool_func`` is called.

    Raises RuntimeError once ``max_read_until`` lines have been read
    without the scan ending.
    """
    lines_read = 0
    while not (stop_on_blank and not self.line) and not bool_func(self.line):
        self.line = read_forward(self.handle)
        lines_read += 1
        if lines_read >= max_read_until:
            raise RuntimeError("Exceeded max_read_until in _read_until")
def _parse_hit_block(self):
    """Parse a hit block (PRIVATE).

    Reads the description line, then the score line, then keeps handing
    lines to ``_parse_hit_match_block`` until the block ends.

    Returns the ``hit_data`` dict. ``self.done`` is set to True when the
    block ended on a blank/empty line or on ``_END_OF_FILE_MARKER`` rather
    than on the start of another hit block.

    Raises RuntimeError if the description line does not match
    ``_RE_HIT_BLOCK_DESC``.
    """
    self.line = read_forward(self.handle)
    desc_match = re.search(_RE_HIT_BLOCK_DESC, self.line)
    if not desc_match:
        message = "Unexpected content in HIT_BLOCK_DESC line'{}'".format(
            self.line)
        raise RuntimeError(message)

    # keyword order below fixes the key order of the resulting dict
    hit_data = dict(
        hit_id=desc_match.group(1),
        description=desc_match.group(2).lstrip(' ;'),
        evalue=None,
        hit_start=None,
        hit_end=None,
        hit_seq='',
        hit_seq_len=None,
        prob=None,
        query_start=None,
        query_end=None,
        query_seq='',
        score=None,
        text='',
    )

    # the score line is taken with a plain readline, not read_forward
    self.line = self.handle.readline()
    self._process_score_line(self.line, hit_data)

    while True:
        self.line = read_forward(self.handle)
        # _END_OF_FILE_MARKER isn't always present
        reached_end = (
            not self.line.strip()
            or self.line.startswith(_END_OF_FILE_MARKER)
        )
        if reached_end:
            self.done = True
            return hit_data
        if re.search(_RE_HIT_BLOCK_START, self.line):
            return hit_data
        self._parse_hit_match_block(hit_data)
def _parse_preamble(self):
    """Parse HMMER preamble (lines beginning with '#') (PRIVATE).

    Returns a dict that may hold 'program' and 'version' (from the lines
    before the options section) plus one entry per option line. An option
    whose name contains 'target' is stored under the key 'target' with
    its value stripped.
    """
    meta = {}
    # True while the cursor is inside the options section
    in_options = False

    # a line without a leading pound sign means the preamble is over
    while self.line.startswith("#"):
        if "- - -" in self.line:
            # a dashed line opens the options section the first time it
            # is seen and closes the preamble the second time
            if in_options:
                break
            in_options = True
        elif in_options:
            option = re.search(_RE_OPT, self.line)
            if "target" in option.group(1):
                meta["target"] = option.group(2).strip()
            else:
                meta[option.group(1)] = option.group(2)
        else:
            program = re.search(_RE_PROGRAM, self.line)
            if program:
                meta["program"] = program.group(1)
            version = re.search(_RE_VERSION, self.line)
            if version:
                meta["version"] = version.group(1)

        self.line = read_forward(self.handle)

    return meta
def _parse_preamble(self):
    """Parse the HMMER preamble (lines beginning with '#').

    Returns a dict that may hold 'program' and 'version' (from the lines
    before the options section) plus one entry per option line. An option
    whose name contains 'target' is stored under the key 'target'.
    """
    meta = {}
    # True while the cursor is inside the options section
    in_options = False

    # a line without a leading pound sign means the preamble is over
    while self.line.startswith('#'):
        if '- - -' in self.line:
            # a dashed line opens the options section the first time it
            # is seen and closes the preamble the second time
            if in_options:
                break
            in_options = True
        elif in_options:
            option = re.search(_RE_OPT, self.line)
            if 'target' in option.group(1):
                meta['target'] = option.group(2)
            else:
                meta[option.group(1)] = option.group(2)
        else:
            program = re.search(_RE_PROGRAM, self.line)
            if program:
                meta['program'] = program.group(1)
            version = re.search(_RE_VERSION, self.line)
            if version:
                meta['version'] = version.group(1)

        self.line = read_forward(self.handle)

    return meta
def __iter__(self):
    """Iterate over the indexed file; yields (key, start offset, 0) tuples.

    The handle is rewound to offset 0 and read through ``read_forward``
    until an empty line is returned. A tuple is yielded for every line
    starting with ``self.qresult_end``; the key is group 1 of
    ``_QRE_ID_LEN_PTN`` matched on the latest line starting with
    ``self.qresult_start``, converted with ``_bytes_to_string``.
    """
    handle = self._handle
    handle.seek(0)
    block_start = handle.tell()
    key_pattern = re.compile(_as_bytes(_QRE_ID_LEN_PTN))

    while True:
        line = read_forward(handle)
        line_end = handle.tell()

        if line.startswith(self.qresult_start):
            key = key_pattern.search(line).group(1).strip()
            # the block begins where this start-marker line begins
            block_start = line_end - len(line)
        elif line.startswith(self.qresult_end):
            yield _bytes_to_string(key), block_start, 0
            block_start = line_end
        elif not line:
            break
def __iter__(self):
    """Iterate over Hmmer3TextIndexer; yields query results' key, offsets, 0.

    The handle is rewound to offset 0 and read through ``read_forward``
    until an empty line is returned. A tuple is yielded for every line
    starting with ``self.qresult_end``; the key is the decoded group 1 of
    ``_QRE_ID_LEN_PTN`` matched on the latest line starting with
    ``self.qresult_start``.
    """
    handle = self._handle
    handle.seek(0)
    block_start = handle.tell()
    key_pattern = re.compile(_QRE_ID_LEN_PTN.encode())

    while True:
        line = read_forward(handle)
        line_end = handle.tell()

        if line.startswith(self.qresult_start):
            key = key_pattern.search(line).group(1).strip()
            # the block begins where this start-marker line begins
            block_start = line_end - len(line)
        elif line.startswith(self.qresult_end):
            yield key.decode(), block_start, 0
            block_start = line_end
        elif not line:
            break
def _parse_qresult(self):
    """Parse HMMER3 query blocks, yielding one QueryResult each (PRIVATE).

    Starting from the first line beginning with 'Query:', each block's id
    and length are taken from ``_QRE_ID_LEN``, optional accession and
    description lines are collected, and hits are parsed through
    ``_parse_hit``. The collected attributes are set on the QueryResult
    with ``setattr`` before it is yielded. Iteration stops when the line
    following a block is empty or contains '[ok]'.
    """
    self._read_until(lambda line: line.startswith("Query:"))

    while self.line:
        # move forward until a line carrying the query id and length
        header = re.search(_QRE_ID_LEN, self.line)
        while not header:
            self.line = read_forward(self.handle)
            header = re.search(_QRE_ID_LEN, self.line)

        qid = header.group(1).strip()
        attrs = {
            "seq_len": int(header.group(2)),
            "program": self._meta.get("program"),
            "version": self._meta.get("version"),
            "target": self._meta.get("target"),
        }

        # pick up accession and description, if present, on the way to
        # the 'Scores for ' line
        qdesc = "<unknown description>"  # placeholder
        while not self.line.startswith("Scores for "):
            self.line = read_forward(self.handle)
            if self.line.startswith("Accession:"):
                accession = self.line.strip().split(" ", 1)[1]
                attrs["accession"] = accession.strip()
            elif self.line.startswith("Description:"):
                qdesc = self.line.strip().split(" ", 1)[1].strip()
                attrs["description"] = qdesc

        # parse the query hits
        while self.line and "//" not in self.line:
            hit_list = self._parse_hit(qid, qdesc)
            # skip over the statistics summary
            # TODO: parse and store this information?
            if self.line.startswith("Internal pipeline"):
                while self.line and "//" not in self.line:
                    self.line = read_forward(self.handle)

        # attributes are applied after construction rather than passed to
        # the constructor, so they are also set when hit_list is empty
        qresult = QueryResult(id=qid, hits=hit_list)
        for name, value in attrs.items():
            setattr(qresult, name, value)
        yield qresult
        self.line = read_forward(self.handle)

        # Skip line beginning with '# Alignment of', which are output
        # when running phmmer with the '-A' flag.
        if self.line.startswith("#"):
            self.line = self.handle.readline()

        # HMMER >= 3.1 outputs '[ok]' at the end of all results file,
        # which means we can break the main loop when we see the line
        if "[ok]" in self.line:
            break
def __init__(self, handle):
    """Initialize the class.

    Stores ``handle``, primes ``self.line`` with the first line returned
    by ``read_forward`` and parses the preamble into ``self._meta``.
    """
    self.handle = handle
    self.line = read_forward(handle)
    # _parse_preamble works from self.line, so it must run last
    self._meta = self._parse_preamble()
def _parse_aln_block(self, hid, hsp_list):
    """Parse a HMMER3 HSP alignment block (PRIVATE).

    Walks the ' == domain N' sections in order and fills in the first
    fragment of ``hsp_list[N - 1]`` in place: ``aln_annotation`` plus the
    ``hit`` and ``query`` sequences. Which of the two collected sequences
    is the hit and which the query depends on ``self._meta['program']``.

    Returns ``hsp_list`` once a line starting with '>>' or
    'Internal pipeline' is reached.
    """
    self.line = read_forward(self.handle)
    domain_idx = 0
    while True:
        if self.line.startswith((">>", "Internal pipeline")):
            return hsp_list
        assert self.line.startswith(" == domain %i" % (domain_idx + 1))
        # not a copy: attributes set below land on the fragment that is
        # stored inside hsp_list
        fragment = hsp_list[domain_idx][0]
        hmm_seq = ""
        ali_seq = ""
        annotations = {}
        self.line = self.handle.readline()

        # consume every alignment sub-block of this domain
        while True:
            # hit/query lines are recognised by shape only, never by id,
            # so that query id == hit id is not a special case
            seq_match = re.search(_HRE_ID_LINE, self.line)
            if seq_match:
                # equal lengths: this is the hmm row of a new pair;
                # hmm longer: this is the matching ali row
                if len(hmm_seq) == len(ali_seq):
                    hmm_seq += seq_match.group(2)
                elif len(hmm_seq) > len(ali_seq):
                    ali_seq += seq_match.group(2)
                assert len(hmm_seq) >= len(ali_seq)
            elif self.line.startswith(
                    (" == domain", ">>", "Internal pipeline")):
                # end of this domain: store what has been collected
                fragment.aln_annotation = annotations
                program = self._meta.get("program")
                if program == "hmmscan":
                    fragment.hit = hmm_seq
                    fragment.query = ali_seq
                elif program in ["hmmsearch", "phmmer"]:
                    fragment.hit = ali_seq
                    fragment.query = hmm_seq
                domain_idx += 1
                break
            elif len(hmm_seq) == len(ali_seq):
                # lengths differ only on the similarity row between the
                # two sequence rows, which is not parsed; anything seen
                # while they are equal may be an annotation row
                annot_match = re.search(_HRE_ANNOT_LINE, self.line)
                if annot_match:
                    annot_name = annot_match.group(3)
                    if annot_name not in annotations:
                        annotations[annot_name] = annot_match.group(2)
                    else:
                        annotations[annot_name] += annot_match.group(2)

            self.line = self.handle.readline()
def _create_hits(self, hit_attrs, qid, qdesc): """Parse a HMMER3 hsp block, beginning with the hsp table (PRIVATE).""" # read through until the beginning of the hsp block self._read_until(lambda line: line.startswith("Internal pipeline") or line.startswith(">>")) # start parsing the hsp block hit_list = [] while True: if self.line.startswith("Internal pipeline"): # by this time we should've emptied the hit attr list assert len(hit_attrs) == 0 return hit_list assert self.line.startswith(">>") hid, hdesc = self.line[len(">> "):].split(" ", 1) hdesc = hdesc.strip() # read through the hsp table header and move one more line self._read_until( lambda line: line.startswith(" --- ------ ----- --------") or line.startswith(" [No individual domains")) self.line = read_forward(self.handle) # parse the hsp table for the current hit hsp_list = [] while True: # break out of hsp parsing if there are no hits, it's the last hsp # or it's the start of a new hit if (self.line.startswith( " [No targets detected that satisfy") or self.line.startswith(" [No individual domains") or self.line.startswith( "Internal pipeline statistics summary:") or self.line.startswith(" Alignments for each domain:") or self.line.startswith(">>")): hit_attr = hit_attrs.pop(0) hit = Hit(hsp_list) for attr, value in hit_attr.items(): if attr == "description": cur_val = getattr(hit, attr) if cur_val and value and cur_val.startswith(value): continue setattr(hit, attr, value) if not hit: hit.query_description = qdesc hit_list.append(hit) break parsed = [x for x in self.line.strip().split(" ") if x] assert len(parsed) == 16 # parsed column order: # index, is_included, bitscore, bias, evalue_cond, evalue # hmmfrom, hmmto, query_ends, hit_ends, alifrom, alito, # envfrom, envto, acc_avg frag = HSPFragment(hid, qid) # set query and hit descriptions if they are defined / nonempty string if qdesc: frag.query_description = qdesc if hdesc: frag.hit_description = hdesc # HMMER3 results are always protein 
frag.molecule_type = "protein" # depending on whether the program is hmmsearch, hmmscan, or phmmer # {hmm,ali}{from,to} can either be hit_{from,to} or query_{from,to} # for hmmscan, hit is the hmm profile, query is the sequence if self._meta.get("program") == "hmmscan": # adjust 'from' and 'to' coordinates to 0-based ones frag.hit_start = int(parsed[6]) - 1 frag.hit_end = int(parsed[7]) frag.query_start = int(parsed[9]) - 1 frag.query_end = int(parsed[10]) elif self._meta.get("program") in ["hmmsearch", "phmmer"]: # adjust 'from' and 'to' coordinates to 0-based ones frag.hit_start = int(parsed[9]) - 1 frag.hit_end = int(parsed[10]) frag.query_start = int(parsed[6]) - 1 frag.query_end = int(parsed[7]) # strand is always 0, since HMMER now only handles protein frag.hit_strand = frag.query_strand = 0 hsp = HSP([frag]) hsp.domain_index = int(parsed[0]) hsp.is_included = parsed[1] == "!" hsp.bitscore = float(parsed[2]) hsp.bias = float(parsed[3]) hsp.evalue_cond = float(parsed[4]) hsp.evalue = float(parsed[5]) if self._meta.get("program") == "hmmscan": # adjust 'from' and 'to' coordinates to 0-based ones hsp.hit_endtype = parsed[8] hsp.query_endtype = parsed[11] elif self._meta.get("program") in ["hmmsearch", "phmmer"]: # adjust 'from' and 'to' coordinates to 0-based ones hsp.hit_endtype = parsed[11] hsp.query_endtype = parsed[8] # adjust 'from' and 'to' coordinates to 0-based ones hsp.env_start = int(parsed[12]) - 1 hsp.env_end = int(parsed[13]) hsp.env_endtype = parsed[14] hsp.acc_avg = float(parsed[15]) hsp_list.append(hsp) self.line = read_forward(self.handle) # parse the hsp alignments if self.line.startswith(" Alignments for each domain:"): self._parse_aln_block(hid, hit.hsps)
def _create_hits(self, hit_attrs, qid):
    """Parse a HMMER3 hsp block, beginning with the hsp table.

    ``hit_attrs`` is a list of attribute dicts, one per hit, in the order
    the hits appear in the file. It is consumed from the front: by the
    time 'Internal pipeline' is reached it must be empty. ``qid`` is the
    query id given to every fragment.

    Returns the list of Hit objects built from the '>>' sections.
    """
    # read through until the beginning of the hsp block
    self._read_until(lambda line: line.startswith('Internal pipeline')
                     or line.startswith('>>'))

    # start parsing the hsp block
    hit_list = []
    while True:
        if self.line.startswith('Internal pipeline'):
            # by this time we should've emptied the hit attr list
            assert len(hit_attrs) == 0
            return hit_list
        assert self.line.startswith('>>')
        # the text after '>> ' is split once; only the id part is used
        hid, _ = self.line[len('>> '):].split(' ', 1)

        # read through the hsp table header and move one more line
        self._read_until(
            lambda line: line.startswith(' --- ------ ----- --------'))
        self.line = read_forward(self.handle)

        # parse the hsp table for the current hit
        hsp_list = []
        while True:
            # break out of hsp parsing if there are no hits, it's the last hsp
            # or it's the start of a new hit
            if self.line.startswith((
                    ' [No targets detected that satisfy',
                    'Internal pipeline statistics summary:',
                    ' Alignments for each domain:',
                    '>>')):
                # the next unused attribute dict belongs to this hit
                hit_attr = hit_attrs.pop(0)
                hit = Hit(hsp_list)
                for attr, value in hit_attr.items():
                    setattr(hit, attr, value)
                hit_list.append(hit)
                break

            # build a real list: on Python 3 ``filter`` returns an
            # iterator, which supports neither len() nor indexing
            parsed = [col for col in self.line.strip().split(' ') if col]
            assert len(parsed) == 16
            # parsed columns, as indexed below:
            # 0 domain index, 1 inclusion mark, 2 bitscore, 3 bias,
            # 4 evalue_cond, 5 evalue, 6-7 hmm from/to, 8 endtype,
            # 9-10 ali from/to, 11 endtype, 12-13 env from/to,
            # 14 env endtype, 15 acc_avg
            frag = HSPFragment(hid, qid)
            # HMMER3 alphabets are always protein alphabets
            frag.alphabet = generic_protein
            # depending on whether the program is hmmsearch, hmmscan, or phmmer
            # {hmm,ali}{from,to} can either be hit_{from,to} or query_{from,to}
            # for hmmscan, hit is the hmm profile, query is the sequence
            if self._meta.get('program') == 'hmmscan':
                # adjust 'from' and 'to' coordinates to 0-based ones
                frag.hit_start = int(parsed[6]) - 1
                frag.hit_end = int(parsed[7])
                frag.query_start = int(parsed[9]) - 1
                frag.query_end = int(parsed[10])
            elif self._meta.get('program') in ['hmmsearch', 'phmmer']:
                # adjust 'from' and 'to' coordinates to 0-based ones
                frag.hit_start = int(parsed[9]) - 1
                frag.hit_end = int(parsed[10])
                frag.query_start = int(parsed[6]) - 1
                frag.query_end = int(parsed[7])
            # strand is always 0, since HMMER now only handles protein
            frag.hit_strand = frag.query_strand = 0

            hsp = HSP([frag])
            hsp.domain_index = int(parsed[0])
            hsp.is_included = parsed[1] == '!'
            hsp.bitscore = float(parsed[2])
            hsp.bias = float(parsed[3])
            hsp.evalue_cond = float(parsed[4])
            hsp.evalue = float(parsed[5])
            if self._meta.get('program') == 'hmmscan':
                # endtype columns are stored as-is; column 8 follows the
                # hmm coordinates, column 11 the ali coordinates
                hsp.hit_endtype = parsed[8]
                hsp.query_endtype = parsed[11]
            elif self._meta.get('program') in ['hmmsearch', 'phmmer']:
                # same columns, with hit and query swapped
                hsp.hit_endtype = parsed[11]
                hsp.query_endtype = parsed[8]
            # adjust 'from' and 'to' coordinates to 0-based ones
            hsp.env_start = int(parsed[12]) - 1
            hsp.env_end = int(parsed[13])
            hsp.env_endtype = parsed[14]
            hsp.acc_avg = float(parsed[15])

            hsp_list.append(hsp)
            self.line = read_forward(self.handle)

        # parse the hsp alignments
        if self.line.startswith(' Alignments for each domain:'):
            self._parse_aln_block(hid, hit.hsps)
def _parse_aln_block(self, hid, hsp_list):
    """Parse a HMMER3 HSP alignment block.

    Walks the ' == domain N' sections in order and fills in the first
    fragment of ``hsp_list[N - 1]`` in place: ``aln_annotation`` plus the
    ``hit`` and ``query`` sequences. Which of the two collected sequences
    is the hit and which the query depends on ``self._meta['program']``.

    Returns ``hsp_list`` once a line starting with '>>' or
    'Internal pipeline' is reached.
    """
    self.line = read_forward(self.handle)
    domain_idx = 0
    while True:
        if self.line.startswith(('>>', 'Internal pipeline')):
            return hsp_list
        assert self.line.startswith(' == domain %i' % (domain_idx + 1))
        # not a copy: attributes set below land on the fragment that is
        # stored inside hsp_list
        fragment = hsp_list[domain_idx][0]
        hmm_seq = ''
        ali_seq = ''
        annotations = {}
        self.line = self.handle.readline()

        # consume every alignment sub-block of this domain
        while True:
            # hit/query lines are recognised by shape only, never by id,
            # so that query id == hit id is not a special case
            seq_match = re.search(_HRE_ID_LINE, self.line)
            if seq_match:
                # equal lengths: this is the hmm row of a new pair;
                # hmm longer: this is the matching ali row
                if len(hmm_seq) == len(ali_seq):
                    hmm_seq += seq_match.group(2)
                elif len(hmm_seq) > len(ali_seq):
                    ali_seq += seq_match.group(2)
                assert len(hmm_seq) >= len(ali_seq)
            elif self.line.startswith(
                    (' == domain', '>>', 'Internal pipeline')):
                # end of this domain: store what has been collected
                fragment.aln_annotation = annotations
                program = self._meta.get('program')
                if program == 'hmmscan':
                    fragment.hit = hmm_seq
                    fragment.query = ali_seq
                elif program in ['hmmsearch', 'phmmer']:
                    fragment.hit = ali_seq
                    fragment.query = hmm_seq
                domain_idx += 1
                break
            elif len(hmm_seq) == len(ali_seq):
                # lengths differ only on the homology row between the two
                # sequence rows, which is not parsed; anything seen while
                # they are equal may be an annotation row
                annot_match = re.search(_HRE_ANNOT_LINE, self.line)
                if annot_match:
                    annot_name = annot_match.group(3)
                    if annot_name not in annotations:
                        annotations[annot_name] = annot_match.group(2)
                    else:
                        annotations[annot_name] += annot_match.group(2)

            self.line = self.handle.readline()
def __init__(self, handle):
    """Store the handle, read the first line and parse the preamble.

    ``self.line`` is primed with the result of ``read_forward`` and the
    dict returned by ``_parse_preamble`` is kept in ``self._meta``.
    """
    self.handle = handle
    self.line = read_forward(handle)
    # _parse_preamble works from self.line, so it must run last
    self._meta = self._parse_preamble()