def _set_qresult_hits(qresult, hit_rows=()): """Helper function for appending Hits without alignments into QueryResults.""" for hit_row in hit_rows: hit_id, remainder = hit_row.split(' ', 1) # TODO: parse hit and hsp properties properly; by dealing with: # - any character in the description (brackets, spaces, etc.) # - possible [f] or [r] presence (for frame info) # - possible presence of E2() column # - possible incomplete hit_id due to column length limit # The current method only looks at the Hit ID, none of the things above if hit_id not in qresult: frag = HSPFragment(hit_id, qresult.id) hsp = HSP([frag]) hit = Hit([hsp]) qresult.append(hit) return qresult
def _parse_hit(self, query_id): while True: self.line = self.handle.readline() if self.line.startswith('>>'): break state = _STATE_NONE strand = None hsp_list = [] while True: peekline = self.handle.peekline() # yield hit if we've reached the start of a new query or # the end of the search if peekline.strip() in [">>><<<", ">>>///"] or \ (not peekline.startswith('>>>') and '>>>' in peekline): # append last parsed_hsp['hit']['seq'] line if state == _STATE_HIT_BLOCK: parsed_hsp['hit']['seq'] += self.line.strip() elif state == _STATE_CONS_BLOCK: hsp.aln_annotation['similarity'] += \ self.line.strip('\r\n') # process HSP alignment and coordinates _set_hsp_seqs(hsp, parsed_hsp, self._preamble['program']) hit = Hit(hsp_list) hit.description = hit_desc hit.seq_len = seq_len yield hit, strand hsp_list = [] break # yield hit and create a new one if we're still in the same query elif self.line.startswith('>>'): # try yielding, if we have hsps if hsp_list: _set_hsp_seqs(hsp, parsed_hsp, self._preamble['program']) hit = Hit(hsp_list) hit.description = hit_desc hit.seq_len = seq_len yield hit, strand hsp_list = [] # try to get the hit id and desc, and handle cases without descs try: hit_id, hit_desc = self.line[2:].strip().split(' ', 1) except ValueError: hit_id = self.line[2:].strip().split(' ', 1)[0] hit_desc = '' # create the HSP object for Hit frag = HSPFragment(hit_id, query_id) hsp = HSP([frag]) hsp_list.append(hsp) # set or reset the state to none state = _STATE_NONE parsed_hsp = {'query': {}, 'hit': {}} # create and append a new HSP if line starts with '>--' elif self.line.startswith('>--'): # set seq attributes of previous hsp _set_hsp_seqs(hsp, parsed_hsp, self._preamble['program']) # and create a new one frag = HSPFragment(hit_id, query_id) hsp = HSP([frag]) hsp_list.append(hsp) # set the state ~ none yet state = _STATE_NONE parsed_hsp = {'query': {}, 'hit': {}} # this is either query or hit data in the HSP, depending on the state elif self.line.startswith('>'): if state == _STATE_NONE: # make sure it's the correct query assert query_id.startswith(self.line[1:].split(' ')[0]), \ "%r vs %r" % (query_id, self.line) state = _STATE_QUERY_BLOCK parsed_hsp['query']['seq'] = '' elif state == _STATE_QUERY_BLOCK: # make sure it's the correct hit assert hit_id.startswith(self.line[1:].split(' ')[0]) state = _STATE_HIT_BLOCK parsed_hsp['hit']['seq'] = '' # check for conservation block elif self.line.startswith('; al_cons'): state = _STATE_CONS_BLOCK hsp.fragment.aln_annotation['similarity'] = '' elif self.line.startswith(';'): # Fasta outputs do not make a clear distinction between Hit # and HSPs, so we check the attribute names to determine # whether it belongs to a Hit or HSP regx = re.search(_RE_ATTR, self.line.strip()) name = regx.group(1) value = regx.group(2) # for values before the '>...' query block if state == _STATE_NONE: if name in _HSP_ATTR_MAP: attr_name, caster = _HSP_ATTR_MAP[name] if caster is not str: value = caster(value) if name in ['_ident', '_sim']: value *= 100 setattr(hsp, attr_name, value) # otherwise, pool the values for processing later elif state == _STATE_QUERY_BLOCK: parsed_hsp['query'][name] = value elif state == _STATE_HIT_BLOCK: if name == '_len': seq_len = int(value) else: parsed_hsp['hit'][name] = value # for values in the hit block else: raise ValueError("Unexpected line: %r" % self.line) # otherwise, it must be lines containing the sequences else: assert '>' not in self.line # if we're in hit, parse into hsp.hit if state == _STATE_HIT_BLOCK: parsed_hsp['hit']['seq'] += self.line.strip() elif state == _STATE_QUERY_BLOCK: parsed_hsp['query']['seq'] += self.line.strip() elif state == _STATE_CONS_BLOCK: hsp.fragment.aln_annotation['similarity'] += \ self.line.strip('\r\n') # we should not get here! else: raise ValueError("Unexpected line: %r" % self.line) self.line = self.handle.readline()
def _parse_qresult(self): """Generator function that returns QueryResult objects.""" # state values, determines what to do for each line state_EOF = 0 state_QRES_NEW = 1 state_QRES_SAME = 3 # initial value dummies qres_state = None file_state = None prev_qid = None cur, prev = None, None # container for Hit objects, used to create QueryResult hit_list = [] while True: # store previous line's parsed values for all lines after the first if cur is not None: prev = cur prev_qid = cur_qid # only parse the result row if it's not EOF # NOTE: we are not parsing the extra '#' lines appended to the end # of hmmer31b1 tabular results since storing them in qresult # objects means we can not do a single-pass parsing if self.line and not self.line.startswith('#'): cur = self._parse_row() cur_qid = cur['qresult']['id'] else: file_state = state_EOF # mock value for cur_qid, since we have nothing to parse cur_qid = None if prev_qid != cur_qid: qres_state = state_QRES_NEW else: qres_state = state_QRES_SAME if prev is not None: # since domain tab formats only have 1 Hit per line # we always create HSPFragment, HSP, and Hit per line prev_hid = prev['hit']['id'] # create fragment and HSP and set their attributes frag = HSPFragment(prev_hid, prev_qid) for attr, value in prev['frag'].items(): setattr(frag, attr, value) hsp = HSP([frag]) for attr, value in prev['hsp'].items(): setattr(hsp, attr, value) # create Hit and set its attributes hit = Hit([hsp]) for attr, value in prev['hit'].items(): setattr(hit, attr, value) hit_list.append(hit) # create qresult and yield if we're at a new qresult or at EOF if qres_state == state_QRES_NEW or file_state == state_EOF: qresult = QueryResult(hit_list, prev_qid) for attr, value in prev['qresult'].items(): setattr(qresult, attr, value) yield qresult # if we're at EOF, break if file_state == state_EOF: break hit_list = [] self.line = self.handle.readline()
def _create_hits(self, hit_attrs, qid, qdesc): """Parses a HMMER3 hsp block, beginning with the hsp table.""" # read through until the beginning of the hsp block self._read_until(lambda line: line.startswith('Internal pipeline') or line.startswith('>>')) # start parsing the hsp block hit_list = [] while True: if self.line.startswith('Internal pipeline'): # by this time we should've emptied the hit attr list assert len(hit_attrs) == 0 return hit_list assert self.line.startswith('>>') hid, hdesc = self.line[len('>> '):].split(' ', 1) hdesc = hdesc.strip() # read through the hsp table header and move one more line self._read_until(lambda line: line.startswith(' --- ------ ----- --------') or line.startswith(' [No individual domains')) self.line = read_forward(self.handle) # parse the hsp table for the current hit hsp_list = [] while True: # break out of hsp parsing if there are no hits, it's the last hsp # or it's the start of a new hit if self.line.startswith(' [No targets detected that satisfy') or \ self.line.startswith(' [No individual domains') or \ self.line.startswith('Internal pipeline statistics summary:') or \ self.line.startswith(' Alignments for each domain:') or \ self.line.startswith('>>'): hit_attr = hit_attrs.pop(0) hit = Hit(hsp_list) for attr, value in hit_attr.items(): if attr == "description": cur_val = getattr(hit, attr) if cur_val and value and cur_val.startswith(value): continue setattr(hit, attr, value) if not hit: hit.query_description = qdesc hit_list.append(hit) break parsed = [x for x in self.line.strip().split(' ') if x] assert len(parsed) == 16 # parsed column order: # index, is_included, bitscore, bias, evalue_cond, evalue # hmmfrom, hmmto, query_ends, hit_ends, alifrom, alito, # envfrom, envto, acc_avg frag = HSPFragment(hid, qid) # set query and hit descriptions if they are defined / nonempty string if qdesc: frag.query_description = qdesc if hdesc: frag.hit_description = hdesc # HMMER3 alphabets are always protein alphabets frag.alphabet = generic_protein # depending on whether the program is hmmsearch, hmmscan, or phmmer # {hmm,ali}{from,to} can either be hit_{from,to} or query_{from,to} # for hmmscan, hit is the hmm profile, query is the sequence if self._meta.get('program') == 'hmmscan': # adjust 'from' and 'to' coordinates to 0-based ones frag.hit_start = int(parsed[6]) - 1 frag.hit_end = int(parsed[7]) frag.query_start = int(parsed[9]) - 1 frag.query_end = int(parsed[10]) elif self._meta.get('program') in ['hmmsearch', 'phmmer']: # adjust 'from' and 'to' coordinates to 0-based ones frag.hit_start = int(parsed[9]) - 1 frag.hit_end = int(parsed[10]) frag.query_start = int(parsed[6]) - 1 frag.query_end = int(parsed[7]) # strand is always 0, since HMMER now only handles protein frag.hit_strand = frag.query_strand = 0 hsp = HSP([frag]) hsp.domain_index = int(parsed[0]) hsp.is_included = parsed[1] == '!' hsp.bitscore = float(parsed[2]) hsp.bias = float(parsed[3]) hsp.evalue_cond = float(parsed[4]) hsp.evalue = float(parsed[5]) if self._meta.get('program') == 'hmmscan': # adjust 'from' and 'to' coordinates to 0-based ones hsp.hit_endtype = parsed[8] hsp.query_endtype = parsed[11] elif self._meta.get('program') in ['hmmsearch', 'phmmer']: # adjust 'from' and 'to' coordinates to 0-based ones hsp.hit_endtype = parsed[11] hsp.query_endtype = parsed[8] # adjust 'from' and 'to' coordinates to 0-based ones hsp.env_start = int(parsed[12]) - 1 hsp.env_end = int(parsed[13]) hsp.env_endtype = parsed[14] hsp.acc_avg = float(parsed[15]) hsp_list.append(hsp) self.line = read_forward(self.handle) # parse the hsp alignments if self.line.startswith(' Alignments for each domain:'): self._parse_aln_block(hid, hit.hsps)
def parse_hsps(self, hit_placeholders): """Parse a HMMER2 hsp block, beginning with the hsp table.""" # HSPs may occur in different order than the hits # so store Hit objects separately first unordered_hits = {} while self.read_next(): if self.line.startswith('Alignments') or \ self.line.startswith('Histogram') or \ self.line == '//': break if self.line.startswith('Model') or \ self.line.startswith('Sequence') or \ self.line.startswith('--------'): continue id_, domain, seq_f, seq_t, seq_compl, hmm_f, hmm_t, hmm_compl, \ score, evalue = self.line.split() frag = HSPFragment(id_, self.qresult.id) frag.alphabet = generic_protein if self._meta['program'] == 'hmmpfam': frag.hit_start = int(hmm_f) - 1 frag.hit_end = int(hmm_t) frag.query_start = int(seq_f) - 1 frag.query_end = int(seq_t) elif self._meta['program'] == 'hmmsearch': frag.query_start = int(hmm_f) - 1 frag.query_end = int(hmm_t) frag.hit_start = int(seq_f) - 1 frag.hit_end = int(seq_t) hsp = HSP([frag]) hsp.evalue = float(evalue) hsp.bitscore = float(score) hsp.domain_index = int(domain.split('/')[0]) if self._meta['program'] == 'hmmpfam': hsp.hit_endtype = hmm_compl hsp.query_endtype = seq_compl elif self._meta['program'] == 'hmmsearch': hsp.query_endtype = hmm_compl hsp.hit_endtype = seq_compl if id_ not in unordered_hits: placeholder = [p for p in hit_placeholders if p.id_ == id_][0] hit = placeholder.createHit([hsp]) unordered_hits[id_] = hit else: hit = unordered_hits[id_] hsp.hit_description = hit.description hit.append(hsp) # The placeholder list is in the correct order, so use that order for # the Hit objects in the qresult for p in hit_placeholders: self.qresult.append(unordered_hits[p.id_])
def _create_hsp(hid, qid, psl): # protein flag is_protein = _is_protein(psl) # strand # if query is protein, strand is 0 if is_protein: qstrand = 0 else: qstrand = 1 if psl['strand'][0] == '+' else -1 # try to get hit strand, if it exists try: hstrand = 1 if psl['strand'][1] == '+' else -1 except IndexError: hstrand = 1 # hit strand defaults to plus # query block starts qstarts = _reorient_starts(psl['qstarts'], psl['blocksizes'], psl['qsize'], qstrand) # hit block starts if len(psl['strand']) == 2: hstarts = _reorient_starts(psl['tstarts'], psl['blocksizes'], psl['tsize'], hstrand) else: hstarts = psl['tstarts'] # set query and hit coords # this assumes each block has no gaps (which seems to be the case) assert len(qstarts) == len(hstarts) == len(psl['blocksizes']) query_range_all = list(zip(qstarts, [x + y for x, y in zip(qstarts, psl['blocksizes'])])) hit_range_all = list(zip(hstarts, [x + y for x, y in zip(hstarts, psl['blocksizes'])])) # check length of sequences and coordinates, all must match if 'tseqs' in psl and 'qseqs' in psl: assert len(psl['tseqs']) == len(psl['qseqs']) == \ len(query_range_all) == len(hit_range_all) else: assert len(query_range_all) == len(hit_range_all) frags = [] # iterating over query_range_all, but hit_range_all works just as well for idx, qcoords in enumerate(query_range_all): hseqlist = psl.get('tseqs') hseq = '' if not hseqlist else hseqlist[idx] qseqlist = psl.get('qseqs') qseq = '' if not qseqlist else qseqlist[idx] frag = HSPFragment(hid, qid, hit=hseq, query=qseq) # set alphabet frag.alphabet = generic_dna # set coordinates frag.query_start = qcoords[0] frag.query_end = qcoords[1] frag.hit_start = hit_range_all[idx][0] frag.hit_end = hit_range_all[idx][1] # and strands frag.query_strand = qstrand frag.hit_strand = hstrand frags.append(frag) # create hsp object hsp = HSP(frags) # check if start and end are set correctly assert hsp.query_start == psl['qstart'] assert hsp.query_end == psl['qend'] assert hsp.hit_start == psl['tstart'] assert hsp.hit_end == psl['tend'] # and check block spans as well assert hsp.query_span_all == hsp.hit_span_all == psl['blocksizes'] # set its attributes hsp.match_num = psl['matches'] hsp.mismatch_num = psl['mismatches'] hsp.match_rep_num = psl['repmatches'] hsp.n_num = psl['ncount'] hsp.query_gapopen_num = psl['qnuminsert'] hsp.query_gap_num = psl['qbaseinsert'] hsp.hit_gapopen_num = psl['tnuminsert'] hsp.hit_gap_num = psl['tbaseinsert'] hsp.ident_num = psl['matches'] + psl['repmatches'] hsp.gapopen_num = psl['qnuminsert'] + psl['tnuminsert'] hsp.gap_num = psl['qbaseinsert'] + psl['tbaseinsert'] hsp.query_is_protein = is_protein hsp.ident_pct = 100.0 - _calc_millibad(psl, is_protein) * 0.1 hsp.score = _calc_score(psl, is_protein) # helper flag, for writing hsp._has_hit_strand = len(psl['strand']) == 2 return hsp