def createHit(self, hsp_list):
    """Create a ``Hit`` from ``hsp_list`` carrying this object's stored values.

    Arguments:
    hsp_list -- HSP objects handed to the ``Hit`` constructor.

    Returns the new ``Hit``.  ``id_``, ``evalue``, ``bitscore`` and
    ``domain_obs_num`` are always copied from ``self``; ``description`` is
    copied only when it is truthy.
    """
    new_hit = Hit(hsp_list)
    # copied unconditionally, in the order listed
    for field in ('id_', 'evalue', 'bitscore'):
        setattr(new_hit, field, getattr(self, field))
    # an empty / missing description leaves the Hit's own value untouched
    if self.description:
        new_hit.description = self.description
    new_hit.domain_obs_num = self.domain_obs_num
    return new_hit
def _set_qresult_hits(qresult, hit_rows=()):
    """Helper function for appending Hits without alignments into QueryResults.

    Arguments:
    qresult -- container that is tested with ``in``, appended to, and whose
               ``id`` attribute is used as the query ID of new fragments.
    hit_rows -- iterable of hit table rows (strings); defaults to an empty
                tuple so that no mutable object is shared between calls.

    For every row whose hit ID is not yet in ``qresult``, a Hit holding one
    HSP with one (sequence-less) HSPFragment is appended.

    Returns the same ``qresult`` object.

    Raises ``ValueError`` if a row contains no space, since the row is split
    into exactly two fields.
    """
    for hit_row in hit_rows:
        # the part after the first space is not parsed (see TODO below)
        hit_id, _ = hit_row.split(' ', 1)
        # TODO: parse hit and hsp properties properly; by dealing with:
        #   - any character in the description (brackets, spaces, etc.)
        #   - possible [f] or [r] presence (for frame info)
        #   - possible presence of E2() column
        #   - possible incomplete hit_id due to column length limit
        # The current method only looks at the Hit ID, none of the things above
        if hit_id not in qresult:
            frag = HSPFragment(hit_id, qresult.id)
            hsp = HSP([frag])
            hit = Hit([hsp])
            qresult.append(hit)

    return qresult
def _parse_hit(self, query_id):
    """Generator that yields ``(hit, strand)`` pairs for the hits of one query.

    Reading starts at the next line beginning with ``'>>'`` and ends once the
    peeked following line marks a new query or the end of the search.

    Arguments:
    query_id -- ID of the query the hits belong to; the first token of each
                alignment's query header must be a prefix of it.

    The ``strand`` member of every yielded pair is always ``None``.

    Nothing is yielded if the input ends before a ``'>>'`` line is found.
    Raises ``ValueError`` for a line that does not fit the current parsing
    state, or when the input ends in the middle of a hit.
    """
    # skip forward to the first hit header of this query
    while True:
        self.line = self.handle.readline()
        if self.line.startswith('>>'):
            break
        if not self.line:
            # readline() returned nothing more: there is no hit to parse.
            # Without this check the loop would never terminate.
            return

    strand = None
    hsp_list = []
    while True:
        peekline = self.handle.peekline()
        # yield hit if we've reached the start of a new query or
        # the end of the search
        if peekline.strip() in [">>><<<", ">>>///"] or \
                (not peekline.startswith('>>>') and '>>>' in peekline):
            # the current line is the last one of this hit; add it to
            # whichever block is still open
            if state == _STATE_HIT_BLOCK:
                parsed_hsp['hit']['seq'] += self.line.strip()
            elif state == _STATE_CONS_BLOCK:
                hsp.aln_annotation['similarity'] += \
                    self.line.strip('\r\n')
            # process HSP alignment and coordinates
            _set_hsp_seqs(hsp, parsed_hsp, self._preamble['program'])
            hit = Hit(hsp_list)
            hit.description = hit_desc
            hit.seq_len = seq_len
            yield hit, strand
            hsp_list = []
            break
        # yield hit and create a new one if we're still in the same query
        elif self.line.startswith('>>'):
            # try yielding, if we have hsps
            if hsp_list:
                _set_hsp_seqs(hsp, parsed_hsp, self._preamble['program'])
                hit = Hit(hsp_list)
                hit.description = hit_desc
                hit.seq_len = seq_len
                yield hit, strand
                hsp_list = []
            # try to get the hit id and desc, and handle cases without descs
            try:
                hit_id, hit_desc = self.line[2:].strip().split(' ', 1)
            except ValueError:
                hit_id = self.line[2:].strip().split(' ', 1)[0]
                hit_desc = ''
            # create the HSP object for Hit
            frag = HSPFragment(hit_id, query_id)
            hsp = HSP([frag])
            hsp_list.append(hsp)
            # set or reset the state to none
            state = _STATE_NONE
            parsed_hsp = {'query': {}, 'hit': {}}
        # create and append a new HSP if line starts with '>--'
        elif self.line.startswith('>--'):
            # set seq attributes of previous hsp
            _set_hsp_seqs(hsp, parsed_hsp, self._preamble['program'])
            # and create a new one
            frag = HSPFragment(hit_id, query_id)
            hsp = HSP([frag])
            hsp_list.append(hsp)
            # set the state ~ none yet
            state = _STATE_NONE
            parsed_hsp = {'query': {}, 'hit': {}}
        # this is either query or hit data in the HSP, depending on the state
        elif self.line.startswith('>'):
            if state == _STATE_NONE:
                # make sure it's the correct query
                assert query_id.startswith(self.line[1:].split(' ')[0]), \
                    "%r vs %r" % (query_id, self.line)
                state = _STATE_QUERY_BLOCK
                parsed_hsp['query']['seq'] = ''
            elif state == _STATE_QUERY_BLOCK:
                # make sure it's the correct hit
                assert hit_id.startswith(self.line[1:].split(' ')[0])
                state = _STATE_HIT_BLOCK
                parsed_hsp['hit']['seq'] = ''
        # check for conservation block
        elif self.line.startswith('; al_cons'):
            state = _STATE_CONS_BLOCK
            hsp.fragment.aln_annotation['similarity'] = ''
        elif self.line.startswith(';'):
            # Fasta outputs do not make a clear distinction between Hit
            # and HSPs, so we check the attribute names to determine
            # whether it belongs to a Hit or HSP
            regx = re.search(_RE_ATTR, self.line.strip())
            name = regx.group(1)
            value = regx.group(2)

            # for values before the '>...' query block
            if state == _STATE_NONE:
                if name in _HSP_ATTR_MAP:
                    attr_name, caster = _HSP_ATTR_MAP[name]
                    if caster is not str:
                        value = caster(value)
                    if name in ['_ident', '_sim']:
                        value *= 100
                    setattr(hsp, attr_name, value)
            # otherwise, pool the values for processing later
            elif state == _STATE_QUERY_BLOCK:
                parsed_hsp['query'][name] = value
            # values in the hit block; '_len' is kept for the Hit itself
            elif state == _STATE_HIT_BLOCK:
                if name == '_len':
                    seq_len = int(value)
                else:
                    parsed_hsp['hit'][name] = value
            # attribute lines are not expected in any other state
            else:
                raise ValueError("Unexpected line: %r" % self.line)
        # otherwise, it must be lines containing the sequences
        else:
            assert '>' not in self.line
            # if we're in hit, parse into hsp.hit
            if state == _STATE_HIT_BLOCK:
                parsed_hsp['hit']['seq'] += self.line.strip()
            elif state == _STATE_QUERY_BLOCK:
                parsed_hsp['query']['seq'] += self.line.strip()
            elif state == _STATE_CONS_BLOCK:
                hsp.fragment.aln_annotation['similarity'] += \
                    self.line.strip('\r\n')
            # we should not get here!
            else:
                raise ValueError("Unexpected line: %r" % self.line)

        self.line = self.handle.readline()
        if not self.line:
            # The input ran out before an end marker was peeked; no later
            # iteration could finish this hit, so fail instead of looping
            # forever on empty reads.
            raise ValueError("Unexpected end of input while parsing hit %r"
                             % hit_id)
def _parse_hit(self, query_id):
    """Generator that yields ``(hit, strand)`` pairs for the hits of one query.

    Reading starts at the next line beginning with ``'>>'`` and ends once the
    peeked following line marks a new query or the end of the search.

    Arguments:
    query_id -- ID of the query the hits belong to; the first token of each
                alignment's query header must be a prefix of it.

    The ``strand`` member of every yielded pair is always ``None``.

    Nothing is yielded if the input ends before a ``'>>'`` line is found.
    Raises ``ValueError`` for a line that does not fit the current parsing
    state, or when the input ends in the middle of a hit.
    """
    # skip forward to the first hit header of this query
    while True:
        self.line = self.handle.readline()
        if self.line.startswith('>>'):
            break
        if not self.line:
            # readline() returned nothing more: there is no hit to parse.
            # Without this check the loop would never terminate.
            return

    strand = None
    hsp_list = []
    while True:
        peekline = self.handle.peekline()
        # yield hit if we've reached the start of a new query or
        # the end of the search
        if peekline.strip() in [">>><<<", ">>>///"] or \
                (not peekline.startswith('>>>') and '>>>' in peekline):
            # the current line is the last one of this hit; add it to
            # whichever block is still open
            if state == _STATE_HIT_BLOCK:
                parsed_hsp['hit']['seq'] += self.line.strip()
            elif state == _STATE_CONS_BLOCK:
                hsp.aln_annotation['similarity'] += \
                    self.line.strip('\r\n')
            # process HSP alignment and coordinates
            _set_hsp_seqs(hsp, parsed_hsp, self._preamble['program'])
            hit = Hit(hsp_list)
            hit.description = hit_desc
            hit.seq_len = seq_len
            yield hit, strand
            hsp_list = []
            break
        # yield hit and create a new one if we're still in the same query
        elif self.line.startswith('>>'):
            # try yielding, if we have hsps
            if hsp_list:
                _set_hsp_seqs(hsp, parsed_hsp, self._preamble['program'])
                hit = Hit(hsp_list)
                hit.description = hit_desc
                hit.seq_len = seq_len
                yield hit, strand
                hsp_list = []
            # try to get the hit id and desc, and handle cases without descs
            try:
                hit_id, hit_desc = self.line[2:].strip().split(' ', 1)
            except ValueError:
                hit_id = self.line[2:].strip().split(' ', 1)[0]
                hit_desc = ''
            # create the HSP object for Hit
            frag = HSPFragment(hit_id, query_id)
            hsp = HSP([frag])
            hsp_list.append(hsp)
            # set or reset the state to none
            state = _STATE_NONE
            parsed_hsp = {'query': {}, 'hit': {}}
        # create and append a new HSP if line starts with '>--'
        elif self.line.startswith('>--'):
            # set seq attributes of previous hsp
            _set_hsp_seqs(hsp, parsed_hsp, self._preamble['program'])
            # and create a new one
            frag = HSPFragment(hit_id, query_id)
            hsp = HSP([frag])
            hsp_list.append(hsp)
            # set the state ~ none yet
            state = _STATE_NONE
            parsed_hsp = {'query': {}, 'hit': {}}
        # this is either query or hit data in the HSP, depending on the state
        elif self.line.startswith('>'):
            if state == _STATE_NONE:
                # make sure it's the correct query
                assert query_id.startswith(self.line[1:].split(' ')[0]), \
                    "%r vs %r" % (query_id, self.line)
                state = _STATE_QUERY_BLOCK
                parsed_hsp['query']['seq'] = ''
            elif state == _STATE_QUERY_BLOCK:
                # make sure it's the correct hit
                assert hit_id.startswith(self.line[1:].split(' ')[0])
                state = _STATE_HIT_BLOCK
                parsed_hsp['hit']['seq'] = ''
        # check for conservation block
        elif self.line.startswith('; al_cons'):
            state = _STATE_CONS_BLOCK
            hsp.fragment.aln_annotation['similarity'] = ''
        elif self.line.startswith(';'):
            # Fasta outputs do not make a clear distinction between Hit
            # and HSPs, so we check the attribute names to determine
            # whether it belongs to a Hit or HSP
            regx = re.search(_RE_ATTR, self.line.strip())
            name = regx.group(1)
            value = regx.group(2)

            # for values before the '>...' query block
            if state == _STATE_NONE:
                if name in _HSP_ATTR_MAP:
                    attr_name, caster = _HSP_ATTR_MAP[name]
                    if caster is not str:
                        value = caster(value)
                    if name in ['_ident', '_sim']:
                        value *= 100
                    setattr(hsp, attr_name, value)
            # otherwise, pool the values for processing later
            elif state == _STATE_QUERY_BLOCK:
                parsed_hsp['query'][name] = value
            # values in the hit block; '_len' is kept for the Hit itself
            elif state == _STATE_HIT_BLOCK:
                if name == '_len':
                    seq_len = int(value)
                else:
                    parsed_hsp['hit'][name] = value
            # attribute lines are not expected in any other state
            else:
                raise ValueError("Unexpected line: %r" % self.line)
        # otherwise, it must be lines containing the sequences
        else:
            assert '>' not in self.line
            # if we're in hit, parse into hsp.hit
            if state == _STATE_HIT_BLOCK:
                parsed_hsp['hit']['seq'] += self.line.strip()
            elif state == _STATE_QUERY_BLOCK:
                parsed_hsp['query']['seq'] += self.line.strip()
            elif state == _STATE_CONS_BLOCK:
                hsp.fragment.aln_annotation['similarity'] += \
                    self.line.strip('\r\n')
            # we should not get here!
            else:
                raise ValueError("Unexpected line: %r" % self.line)

        self.line = self.handle.readline()
        if not self.line:
            # The input ran out before an end marker was peeked; no later
            # iteration could finish this hit, so fail instead of looping
            # forever on empty reads.
            raise ValueError("Unexpected end of input while parsing hit %r"
                             % hit_id)
def _parse_qresult(self):
    """Generator function that returns QueryResult objects.

    Rows are taken one at a time from ``self.line`` (refilled from
    ``self.handle``).  Objects are always built for the *previous* row, once
    the current row shows whether the hit and/or the query has changed.
    Parsing stops at the end of input or at the first line starting with
    ``'#'``; if that happens before any row was parsed, nothing is yielded.
    """
    # state values, used to determine what to do with each line
    state_EOF = 0
    state_QRES_NEW = 1
    state_QRES_SAME = 3
    state_HIT_NEW = 2
    state_HIT_SAME = 4
    # dummy for the initial file state
    file_state = None
    # dummies for initial id caches
    prev_qid = None
    prev_hid = None
    # dummies for initial parsed value containers
    cur, prev = None, None
    hit_list, hsp_list = [], []

    while True:
        # store previous line's parsed values if we've past the first line
        if cur is not None:
            prev = cur
            prev_qid = cur_qid
            prev_hid = cur_hid
        # only parse the line if it's not EOF or not a comment line
        if self.line and not self.line.startswith('#'):
            cur = self._parse_result_row()
            cur_qid = self._get_id(cur['qresult'])
            cur_hid = self._get_id(cur['hit'])
        else:
            file_state = state_EOF
            # mock values for cur_qid and cur_hid since the line is empty
            cur_qid, cur_hid = None, None

        # get the state of hit and qresult
        if prev_qid != cur_qid:
            qres_state = state_QRES_NEW
        else:
            qres_state = state_QRES_SAME
        # new hits are hits with different id or hits in a new qresult
        if prev_hid != cur_hid or qres_state == state_QRES_NEW:
            hit_state = state_HIT_NEW
        else:
            hit_state = state_HIT_SAME

        # we're creating objects for the previously parsed line(s),
        # so nothing is done in the first parsed line (prev == None)
        if prev is not None:
            # every line is essentially an HSP with one fragment, so we
            # create both of these for every line
            frag = HSPFragment(prev_hid, prev_qid)
            for attr, value in prev['frag'].items():
                # adjust coordinates to Python range
                # NOTE: this requires both start and end coords to be
                # present, otherwise a KeyError will be raised.
                # Without this limitation, we might misleadingly set the
                # start / end coords
                for seq_type in ('query', 'hit'):
                    if attr == seq_type + '_start':
                        value = min(value,
                                    prev['frag'][seq_type + '_end']) - 1
                    elif attr == seq_type + '_end':
                        value = max(value,
                                    prev['frag'][seq_type + '_start'])
                setattr(frag, attr, value)
            # strand and frame setattr require the full parsed values
            # to be set first
            for seq_type in ('hit', 'query'):
                # try to set hit and query frame
                frame = self._get_frag_frame(frag, seq_type, prev['frag'])
                setattr(frag, '%s_frame' % seq_type, frame)
                # try to set hit and query strand
                strand = self._get_frag_strand(frag, seq_type, prev['frag'])
                setattr(frag, '%s_strand' % seq_type, strand)

            hsp = HSP([frag])
            for attr, value in prev['hsp'].items():
                setattr(hsp, attr, value)
            hsp_list.append(hsp)

            # create hit and append to temp hit container if hit_state
            # says we're not at the same hit or at a new query
            if hit_state == state_HIT_NEW:
                hit = Hit(hsp_list)
                for attr, value in prev['hit'].items():
                    setattr(hit, attr, value)
                hit_list.append(hit)
                hsp_list = []
            # create qresult and yield if we're at a new qresult or EOF
            if qres_state == state_QRES_NEW or file_state == state_EOF:
                qresult = QueryResult(hit_list, prev_qid)
                for attr, value in prev['qresult'].items():
                    setattr(qresult, attr, value)
                yield qresult
                # if current line is EOF, break
                if file_state == state_EOF:
                    break
                hit_list = []
        elif file_state == state_EOF:
            # the input ended (or a comment line was hit) before any row
            # was parsed; without this the loop would spin forever on
            # empty reads
            break

        self.line = self.handle.readline().strip()
def _parse_hit(self, root_hit_elem, query_id):
    """Generator that transforms Iteration_hits XML elements into Hit objects.

    Arguments:
    root_hit_elem -- Element object of the Iteration_hits tag, or None when
                     there are no hits (nothing is yielded then).
    query_id -- String of QueryResult ID of this Hit

    Each hit element is cleared after it has been converted.
    """
    # Hit level processing
    # Hits are stored in the Iteration_hits tag, with the following
    # DTD
    # <!ELEMENT Hit (
    #        Hit_num,
    #        Hit_id,
    #        Hit_def,
    #        Hit_accession,
    #        Hit_len,
    #        Hit_hsps?)>

    # feed the loop below an empty list so iteration still works
    if root_hit_elem is None:
        root_hit_elem = []

    for hit_elem in root_hit_elem:
        hit_id = hit_elem.findtext('Hit_id')
        hit_desc = hit_elem.findtext('Hit_def')
        # handle blast searches against databases with Blast's IDs: the
        # real ID is then the first word of the definition line and the
        # rest (possibly empty) is the description
        if hit_id.startswith('gnl|BL_ORD_ID|'):
            blast_hit_id = hit_id
            hit_id, _, hit_desc = hit_desc.partition(' ')
        else:
            blast_hit_id = ''

        hsps = list(self._parse_hsp(hit_elem.find('Hit_hsps'),
                                    query_id, hit_id))

        hit = Hit(hsps)
        hit.description = hit_desc
        # blast_hit_id is only set if the hit ID is Blast-generated
        hit._blast_id = blast_hit_id

        for key, val_info in _ELEM_HIT.items():
            value = hit_elem.findtext(key)
            # elements missing from the XML leave the attribute unset
            if value is None:
                continue
            caster = val_info[1]
            # recast only if value is not intended to be str
            if caster is not str:
                value = caster(value)
            setattr(hit, val_info[0], value)

        # delete element after we finish parsing it
        hit_elem.clear()
        yield hit
def _create_hits(self, hit_attrs, qid, qdesc):
    """Parse the per-hit hsp tables of one query into a list of Hit objects.

    Arguments:
    hit_attrs -- list of attribute dicts, one per hit and in file order;
                 entries are popped from the front as hits are created and
                 the list must be empty when the summary line is reached.
    qid -- query ID given to every created HSPFragment.
    qdesc -- query description, stored on hits that evaluate as false
             (i.e. hits created without any hsp).

    Returns the list of created Hit objects once a line starting with
    'Internal pipeline' is reached.
    """
    # NOTE(review): the literal line prefixes in this function contain only
    # single spaces; confirm they match the spacing of the real program output

    # read through until the beginning of the hsp block
    self._read_until(
        lambda line: line.startswith(('Internal pipeline', '>>')))

    # any of these prefixes ends the hsp table of the current hit
    table_end_marks = (
        ' [No targets detected that satisfy',
        ' [No individual domains',
        'Internal pipeline statistics summary:',
        ' Alignments for each domain:',
        '>>',
    )

    hits = []
    while True:
        if self.line.startswith('Internal pipeline'):
            # by this time we should've emptied the hit attr list
            assert len(hit_attrs) == 0
            return hits
        assert self.line.startswith('>>')
        # hdesc is not used further, but the two-way unpacking still
        # requires the separator to be present
        hid, hdesc = self.line[len('>> '):].split(' ', 1)

        # read through the hsp table header and move one more line
        self._read_until(
            lambda line: line.startswith((' --- ------ ----- --------',
                                          ' [No individual domains')))
        self.line = read_forward(self.handle)

        # parse the hsp table for the current hit
        hsps = []
        while True:
            # stop if there are no hits, it's the last hsp
            # or it's the start of a new hit
            if self.line.startswith(table_end_marks):
                attrs = hit_attrs.pop(0)
                hit = Hit(hsps)
                for name, val in attrs.items():
                    setattr(hit, name, val)
                if not hit:
                    hit.query_description = qdesc
                hits.append(hit)
                break

            cols = [tok for tok in self.line.strip().split(' ') if tok]
            assert len(cols) == 16
            # column order:
            # index, is_included, bitscore, bias, evalue_cond, evalue
            # hmmfrom, hmmto, query_ends, hit_ends, alifrom, alito,
            # envfrom, envto, acc_avg
            program = self._meta.get('program')
            hmm_is_hit = program == 'hmmscan'
            hmm_is_query = program in ['hmmsearch', 'phmmer']

            frag = HSPFragment(hid, qid)
            # HMMER3 alphabets are always protein alphabets
            frag.alphabet = generic_protein
            # depending on whether the program is hmmsearch, hmmscan, or
            # phmmer, {hmm,ali}{from,to} can either be hit_{from,to} or
            # query_{from,to}; for hmmscan, hit is the hmm profile and
            # query is the sequence.  'from' values become 0-based.
            if hmm_is_hit:
                frag.hit_start = int(cols[6]) - 1
                frag.hit_end = int(cols[7])
                frag.query_start = int(cols[9]) - 1
                frag.query_end = int(cols[10])
            elif hmm_is_query:
                frag.hit_start = int(cols[9]) - 1
                frag.hit_end = int(cols[10])
                frag.query_start = int(cols[6]) - 1
                frag.query_end = int(cols[7])
            # strand is always 0, since HMMER now only handles protein
            frag.hit_strand = frag.query_strand = 0

            hsp = HSP([frag])
            hsp.domain_index = int(cols[0])
            hsp.is_included = cols[1] == '!'
            hsp.bitscore = float(cols[2])
            hsp.bias = float(cols[3])
            hsp.evalue_cond = float(cols[4])
            hsp.evalue = float(cols[5])
            # the end-type columns follow the same hit/query assignment
            # as the coordinates above
            if hmm_is_hit:
                hsp.hit_endtype = cols[8]
                hsp.query_endtype = cols[11]
            elif hmm_is_query:
                hsp.hit_endtype = cols[11]
                hsp.query_endtype = cols[8]
            # envelope start is shifted to 0-based as well
            hsp.env_start = int(cols[12]) - 1
            hsp.env_end = int(cols[13])
            hsp.env_endtype = cols[14]
            hsp.acc_avg = float(cols[15])

            hsps.append(hsp)
            self.line = read_forward(self.handle)

        # parse the hsp alignments
        if self.line.startswith(' Alignments for each domain:'):
            self._parse_aln_block(hid, hit.hsps)
def _parse_qresult(self):
    """Generator function that returns QueryResult objects.

    Every parsed row becomes one HSPFragment, one HSP and one Hit.  Objects
    are built for the *previous* row once the current row shows whether the
    query has changed.  Parsing stops at the end of input or at the first
    line starting with ``'#'``; if that happens before any row was parsed,
    nothing is yielded.
    """
    # state values, determines what to do for each line
    state_EOF = 0
    state_QRES_NEW = 1
    state_QRES_SAME = 3
    # initial value dummies
    file_state = None
    prev_qid = None
    cur, prev = None, None
    # container for Hit objects, used to create QueryResult
    hit_list = []

    while True:
        # store previous line's parsed values for all lines after the first
        if cur is not None:
            prev = cur
            prev_qid = cur_qid

        # only parse the result row if it's not EOF
        # NOTE: we are not parsing the extra '#' lines appended to the end
        # of hmmer31b1 tabular results since storing them in qresult
        # objects means we can not do a single-pass parsing
        if self.line and not self.line.startswith('#'):
            cur = self._parse_row()
            cur_qid = cur['qresult']['id']
        else:
            file_state = state_EOF
            # mock value for cur_qid, since we have nothing to parse
            cur_qid = None

        if prev_qid != cur_qid:
            qres_state = state_QRES_NEW
        else:
            qres_state = state_QRES_SAME

        if prev is not None:
            # since domain tab formats only have 1 Hit per line
            # we always create HSPFragment, HSP, and Hit per line
            prev_hid = prev['hit']['id']

            # create fragment and HSP and set their attributes
            frag = HSPFragment(prev_hid, prev_qid)
            for attr, value in prev['frag'].items():
                setattr(frag, attr, value)
            hsp = HSP([frag])
            for attr, value in prev['hsp'].items():
                setattr(hsp, attr, value)

            # create Hit and set its attributes
            hit = Hit([hsp])
            for attr, value in prev['hit'].items():
                setattr(hit, attr, value)
            hit_list.append(hit)

            # create qresult and yield if we're at a new qresult or at EOF
            if qres_state == state_QRES_NEW or file_state == state_EOF:
                qresult = QueryResult(hit_list, prev_qid)
                for attr, value in prev['qresult'].items():
                    setattr(qresult, attr, value)
                yield qresult
                # if we're at EOF, break
                if file_state == state_EOF:
                    break
                hit_list = []
        elif file_state == state_EOF:
            # the input ended (or a comment line was hit) before any row
            # was parsed; without this the loop would spin forever on
            # empty reads
            break

        self.line = self.handle.readline()
def _create_hits(self, hit_attrs, qid, qdesc):
    """Parse the per-hit hsp tables of one query into a list of Hit objects.

    Arguments:
    hit_attrs -- list of attribute dicts, one per hit and in file order;
                 entries are popped from the front as hits are created and
                 the list must be empty when the summary line is reached.
    qid -- query ID given to every created HSPFragment.
    qdesc -- query description, stored on hits that evaluate as false
             (i.e. hits created without any hsp).

    Returns the list of created Hit objects once a line starting with
    "Internal pipeline" is reached.
    """
    # NOTE(review): the literal line prefixes in this function contain only
    # single spaces; confirm they match the spacing of the real program output

    # read through until the beginning of the hsp block
    self._read_until(
        lambda line: line.startswith(("Internal pipeline", ">>"))
    )

    # any of these prefixes ends the hsp table of the current hit
    table_end_marks = (
        " [No targets detected that satisfy",
        " [No individual domains",
        "Internal pipeline statistics summary:",
        " Alignments for each domain:",
        ">>",
    )

    hits = []
    while True:
        if self.line.startswith("Internal pipeline"):
            # by this time we should've emptied the hit attr list
            assert len(hit_attrs) == 0
            return hits
        assert self.line.startswith(">>")
        # hdesc is not used further, but the two-way unpacking still
        # requires the separator to be present
        hid, hdesc = self.line[len(">> ") :].split(" ", 1)

        # read through the hsp table header and move one more line
        self._read_until(
            lambda line: line.startswith(
                (" --- ------ ----- --------", " [No individual domains")
            )
        )
        self.line = read_forward(self.handle)

        # parse the hsp table for the current hit
        hsps = []
        while True:
            # stop if there are no hits, it's the last hsp
            # or it's the start of a new hit
            if self.line.startswith(table_end_marks):
                attrs = hit_attrs.pop(0)
                hit = Hit(hsps)
                for name, val in attrs.items():
                    setattr(hit, name, val)
                if not hit:
                    hit.query_description = qdesc
                hits.append(hit)
                break

            cols = [tok for tok in self.line.strip().split(" ") if tok]
            assert len(cols) == 16
            # column order:
            # index, is_included, bitscore, bias, evalue_cond, evalue
            # hmmfrom, hmmto, query_ends, hit_ends, alifrom, alito,
            # envfrom, envto, acc_avg
            program = self._meta.get("program")
            hmm_is_hit = program == "hmmscan"
            hmm_is_query = program in ["hmmsearch", "phmmer"]

            frag = HSPFragment(hid, qid)
            # HMMER3 alphabets are always protein alphabets
            frag.alphabet = generic_protein
            # depending on whether the program is hmmsearch, hmmscan, or
            # phmmer, {hmm,ali}{from,to} can either be hit_{from,to} or
            # query_{from,to}; for hmmscan, hit is the hmm profile and
            # query is the sequence.  'from' values become 0-based.
            if hmm_is_hit:
                frag.hit_start = int(cols[6]) - 1
                frag.hit_end = int(cols[7])
                frag.query_start = int(cols[9]) - 1
                frag.query_end = int(cols[10])
            elif hmm_is_query:
                frag.hit_start = int(cols[9]) - 1
                frag.hit_end = int(cols[10])
                frag.query_start = int(cols[6]) - 1
                frag.query_end = int(cols[7])
            # strand is always 0, since HMMER now only handles protein
            frag.hit_strand = frag.query_strand = 0

            hsp = HSP([frag])
            hsp.domain_index = int(cols[0])
            hsp.is_included = cols[1] == "!"
            hsp.bitscore = float(cols[2])
            hsp.bias = float(cols[3])
            hsp.evalue_cond = float(cols[4])
            hsp.evalue = float(cols[5])
            # the end-type columns follow the same hit/query assignment
            # as the coordinates above
            if hmm_is_hit:
                hsp.hit_endtype = cols[8]
                hsp.query_endtype = cols[11]
            elif hmm_is_query:
                hsp.hit_endtype = cols[11]
                hsp.query_endtype = cols[8]
            # envelope start is shifted to 0-based as well
            hsp.env_start = int(cols[12]) - 1
            hsp.env_end = int(cols[13])
            hsp.env_endtype = cols[14]
            hsp.acc_avg = float(cols[15])

            hsps.append(hsp)
            self.line = read_forward(self.handle)

        # parse the hsp alignments
        if self.line.startswith(" Alignments for each domain:"):
            self._parse_aln_block(hid, hit.hsps)
def __iter__(self):
    """Yield one QueryResult for every record of ``self.blast_iter``.

    The records are modified in place: a leading '>' is removed from
    ``rec.query`` and a leading '> ' or '>' from each alignment title.

    The fragment alphabet is chosen from the program name; for a program
    that is not one of blastn, blastp, blastx, tblastn or tblastx the
    fragment's alphabet is not assigned at all.
    """
    for rec in self.blast_iter:
        # set attributes to SearchIO's
        # get id and desc
        if rec.query.startswith('>'):
            rec.query = rec.query[1:]
        # text after the first space (if any) is the description
        qid, _, qdesc = rec.query.partition(' ')
        qdesc = qdesc.replace('\n', '').replace('\r', '')

        qresult = QueryResult(id=qid)
        qresult.program = rec.application.lower()
        qresult.target = rec.database
        qresult.seq_len = rec.query_letters
        qresult.version = rec.version

        # determine alphabet based on program; decided anew for every
        # record so that an unlisted program neither fails on an unbound
        # name nor silently reuses the previous record's alphabet
        if qresult.program == 'blastn':
            alphabet = generic_dna
        elif qresult.program in ['blastp', 'blastx', 'tblastn', 'tblastx']:
            alphabet = generic_protein
        else:
            alphabet = None

        # iterate over the 'alignments' (hits) and the hit table
        for aln in rec.alignments:
            # get id and desc
            if aln.title.startswith('> '):
                aln.title = aln.title[2:]
            elif aln.title.startswith('>'):
                aln.title = aln.title[1:]
            hid, _, hdesc = aln.title.partition(' ')
            hdesc = hdesc.replace('\n', '').replace('\r', '')

            # iterate over the hsps and group them in a list
            hsp_list = []
            for bhsp in aln.hsps:
                frag = HSPFragment(hid, qid)
                if alphabet is not None:
                    frag.alphabet = alphabet
                # set alignment length
                frag.aln_span = bhsp.identities[1]
                # set frames; when the record has no frame value, fall
                # back to 0 for blastp / tblastn and 1 otherwise
                try:
                    frag.query_frame = int(bhsp.frame[0])
                except IndexError:
                    if qresult.program in ('blastp', 'tblastn'):
                        frag.query_frame = 0
                    else:
                        frag.query_frame = 1
                try:
                    frag.hit_frame = int(bhsp.frame[1])
                except IndexError:
                    if qresult.program in ('blastp', 'tblastn'):
                        frag.hit_frame = 0
                    else:
                        frag.hit_frame = 1
                # set query coordinates (0-based start, ordered)
                frag.query_start = min(bhsp.query_start,
                                       bhsp.query_end) - 1
                frag.query_end = max(bhsp.query_start, bhsp.query_end)
                # set hit coordinates (0-based start, ordered)
                frag.hit_start = min(bhsp.sbjct_start,
                                     bhsp.sbjct_end) - 1
                frag.hit_end = max(bhsp.sbjct_start, bhsp.sbjct_end)
                # set query, hit sequences and its annotation; columns
                # where query or hit is a blank must be blank in all three
                # rows and are dropped
                qchars, hchars, mchars = [], [], []
                for seqtrio in zip(bhsp.query, bhsp.sbjct, bhsp.match):
                    qchar, hchar, mchar = seqtrio
                    if qchar == ' ' or hchar == ' ':
                        assert all(' ' == x for x in seqtrio)
                    else:
                        qchars.append(qchar)
                        hchars.append(hchar)
                        mchars.append(mchar)
                frag.query, frag.hit = ''.join(qchars), ''.join(hchars)
                frag.aln_annotation['similarity'] = ''.join(mchars)

                # create HSP object with the fragment
                hsp = HSP([frag])
                hsp.evalue = bhsp.expect
                hsp.bitscore = bhsp.bits
                hsp.bitscore_raw = bhsp.score
                # set gap
                try:
                    hsp.gap_num = bhsp.gaps[0]
                except IndexError:
                    hsp.gap_num = 0
                # set identity
                hsp.ident_num = bhsp.identities[0]
                hsp.pos_num = bhsp.positives[0]
                if hsp.pos_num is None:
                    hsp.pos_num = hsp[0].aln_span

                hsp_list.append(hsp)

            hit = Hit(hsp_list)
            hit.seq_len = aln.length
            hit.description = hdesc
            qresult.append(hit)

        qresult.description = qdesc
        yield qresult
def _parse_qresult(self):
    """Generator function that returns QueryResult objects.

    Rows are taken one at a time from ``self.line`` (refilled from
    ``self.handle``).  Objects are always built for the *previous* row, once
    the current row shows whether the hit and/or the query has changed.
    Parsing stops at the end of input; if the input is already exhausted
    before any row was parsed, nothing is yielded.
    """
    # state values, determines what to do for each line
    state_EOF = 0
    state_QRES_NEW = 1
    state_QRES_SAME = 3
    state_HIT_NEW = 2
    state_HIT_SAME = 4
    # initial dummy values
    file_state = None
    prev_qid, prev_hid = None, None
    cur, prev = None, None
    hit_list, hsp_list = [], []

    while True:
        # store previous line's parsed values for all lines after the first
        if cur is not None:
            prev = cur
            prev_qid = cur_qid
            prev_hid = cur_hid
        # only parse the result row if it's not EOF
        if self.line:
            cur = self._parse_row()
            cur_qid = cur['qname']
            cur_hid = cur['tname']
        else:
            file_state = state_EOF
            # mock values, since we have nothing to parse
            cur_qid, cur_hid = None, None

        # get the state of hit and qresult
        if prev_qid != cur_qid:
            qres_state = state_QRES_NEW
        else:
            qres_state = state_QRES_SAME
        # new hits are hits with different ids or hits in a new qresult
        if prev_hid != cur_hid or qres_state == state_QRES_NEW:
            hit_state = state_HIT_NEW
        else:
            hit_state = state_HIT_SAME

        if prev is not None:
            # create fragment and HSP and set their attributes
            hsp = _create_hsp(prev_hid, prev_qid, prev)
            hsp_list.append(hsp)

            if hit_state == state_HIT_NEW:
                # create Hit and set its attributes
                hit = Hit(hsp_list)
                hit.seq_len = prev['tsize']
                hit_list.append(hit)
                hsp_list = []

            # create qresult and yield if we're at a new qresult or at EOF
            if qres_state == state_QRES_NEW or file_state == state_EOF:
                qresult = QueryResult(id=prev_qid)
                # hits are merged in with absorb() rather than appended
                for hit in hit_list:
                    qresult.absorb(hit)
                qresult.seq_len = prev['qsize']
                yield qresult
                # if we're at EOF, break
                if file_state == state_EOF:
                    break
                hit_list = []
        elif file_state == state_EOF:
            # the input ended before any row was parsed; without this the
            # loop would spin forever on empty reads
            break

        self.line = self.handle.readline()
def _parse_hit(self, root_hit_elem, query_id):
    """Generator that transforms Iteration_hits XML elements into Hit objects.

    Arguments:
    root_hit_elem -- Element object of the Iteration_hits tag, or None when
                     there are no hits (nothing is yielded then).
    query_id -- String of QueryResult ID of this Hit

    Each hit element is cleared after it has been converted.
    """
    # Hit level processing
    # Hits are stored in the Iteration_hits tag, with the following
    # DTD
    # <!ELEMENT Hit (
    #        Hit_num,
    #        Hit_id,
    #        Hit_def,
    #        Hit_accession,
    #        Hit_len,
    #        Hit_hsps?)>

    # feed the loop below an empty list so iteration still works
    if root_hit_elem is None:
        root_hit_elem = []

    for hit_elem in root_hit_elem:
        hit_id = hit_elem.findtext('Hit_id')
        hit_desc = hit_elem.findtext('Hit_def')
        # handle blast searches against databases with Blast's IDs: the
        # real ID is then the first word of the definition line and the
        # rest (possibly empty) is the description
        if hit_id.startswith('gnl|BL_ORD_ID|'):
            blast_hit_id = hit_id
            hit_id, _, hit_desc = hit_desc.partition(' ')
        else:
            blast_hit_id = ''

        hsps = list(
            self._parse_hsp(hit_elem.find('Hit_hsps'), query_id, hit_id)
        )

        hit = Hit(hsps)
        hit.description = hit_desc
        # blast_hit_id is only set if the hit ID is Blast-generated
        hit._blast_id = blast_hit_id

        for key, val_info in _ELEM_HIT.items():
            value = hit_elem.findtext(key)
            # elements missing from the XML leave the attribute unset
            if value is None:
                continue
            caster = val_info[1]
            # recast only if value is not intended to be str
            if caster is not str:
                value = caster(value)
            setattr(hit, val_info[0], value)

        # delete element after we finish parsing it
        hit_elem.clear()
        yield hit
def _parse_qresult(self):
    """Generator function that returns QueryResult objects.

    Rows are taken one at a time from ``self.line`` (refilled from
    ``self.handle``).  Objects are always built for the *previous* row, once
    the current row shows whether the hit and/or the query has changed.
    Parsing stops at the end of input or at the first line starting with
    ``'#'``; if that happens before any row was parsed, nothing is yielded.
    """
    # state values, determines what to do for each line
    state_EOF = 0
    state_QRES_NEW = 1
    state_QRES_SAME = 3
    state_HIT_NEW = 2
    state_HIT_SAME = 4
    # dummy for the initial file state
    file_state = None
    # dummies for initial id caches
    prev_qid = None
    prev_hid = None
    # dummies for initial parsed value containers
    cur, prev = None, None
    hit_list, hsp_list = [], []

    while True:
        # store previous line's parsed values, for every line after the 1st
        if cur is not None:
            prev = cur
            prev_qid = cur_qid
            prev_hid = cur_hid
        # only parse the line if it's not EOF
        if self.line and not self.line.startswith('#'):
            cur = self._parse_row()
            cur_qid = cur['qresult']['id']
            cur_hid = cur['hit']['id']
        else:
            file_state = state_EOF
            # mock ID values since the line is empty
            cur_qid, cur_hid = None, None

        # get the state of hit and qresult
        if prev_qid != cur_qid:
            qres_state = state_QRES_NEW
        else:
            qres_state = state_QRES_SAME
        # new hits are hits with different ids or hits in a new qresult
        if prev_hid != cur_hid or qres_state == state_QRES_NEW:
            hit_state = state_HIT_NEW
        else:
            hit_state = state_HIT_SAME

        # start creating objects after the first line (i.e. prev is filled)
        if prev is not None:
            # each line is basically an HSP with one HSPFragment
            frag = HSPFragment(prev_hid, prev_qid)
            for attr, value in prev['frag'].items():
                setattr(frag, attr, value)
            hsp = HSP([frag])
            for attr, value in prev['hsp'].items():
                setattr(hsp, attr, value)
            hsp_list.append(hsp)

            # create hit object when we've finished parsing all its hsps
            # i.e. when hit state is state_HIT_NEW
            if hit_state == state_HIT_NEW:
                hit = Hit(hsp_list)
                for attr, value in prev['hit'].items():
                    setattr(hit, attr, value)
                hit_list.append(hit)
                hsp_list = []

            # create qresult and yield if we're at a new qresult or EOF
            if qres_state == state_QRES_NEW or file_state == state_EOF:
                qresult = QueryResult(hit_list, prev_qid)
                for attr, value in prev['qresult'].items():
                    setattr(qresult, attr, value)
                yield qresult
                # if current line is EOF, break
                if file_state == state_EOF:
                    break
                hit_list = []
        elif file_state == state_EOF:
            # the input ended (or a comment line was hit) before any row
            # was parsed; without this the loop would spin forever on
            # empty reads
            break

        self.line = self.handle.readline()
def _parse_qresult(self):
    """Generator function that returns QueryResult objects.

    Each pass first calls ``self.read_until`` with a test for the alignment
    mark, then parses one alignment block if ``self.line`` is non-empty.
    Objects are always built for the *previous* block, once the current one
    shows whether the hit and/or the query has changed.  If ``self.line`` is
    empty before any block was parsed, nothing is yielded.
    """
    # state values
    state_EOF = 0
    state_QRES_NEW = 1
    state_QRES_SAME = 3
    state_HIT_NEW = 2
    state_HIT_SAME = 4
    # initial dummies
    qres_state, hit_state = None, None
    file_state = None
    prev_qid, prev_hid = None, None
    cur, prev = None, None
    hit_list, hsp_list = [], []

    # if the file has c4 alignments, use that as the alignment mark
    if self.has_c4_alignment:
        self._ALN_MARK = 'C4 Alignment:'

    while True:
        self.read_until(lambda line: line.startswith(self._ALN_MARK))
        if cur is not None:
            prev = cur
            prev_qid = cur_qid
            prev_hid = cur_hid
        # only parse the result row if it's not EOF
        if self.line:
            assert self.line.startswith(self._ALN_MARK), self.line
            # create temp dicts for storing parsed values
            header = {'qresult': {}, 'hit': {}, 'hsp': {}}
            # if the file has c4 alignments, try to parse the header
            if self.has_c4_alignment:
                self.read_until(
                    lambda line: line.strip().startswith('Query:'))
                header = self._parse_alignment_header()
            # parse the block contents
            cur = self.parse_alignment_block(header)
            cur_qid = cur['qresult']['id']
            cur_hid = cur['hit']['id']
        # reached only when self.line is empty, so the first operand
        # already decides this test
        elif not self.line or self.line.startswith('-- completed '):
            file_state = state_EOF
            cur_qid, cur_hid = None, None

        # get the state of hit and qresult
        if prev_qid != cur_qid:
            qres_state = state_QRES_NEW
        else:
            qres_state = state_QRES_SAME
        # new hits are hits with different ids or hits in a new query
        if prev_hid != cur_hid or qres_state == state_QRES_NEW:
            hit_state = state_HIT_NEW
        else:
            hit_state = state_HIT_SAME

        if prev is not None:
            hsp = _create_hsp(prev_hid, prev_qid, prev['hsp'])
            hsp_list.append(hsp)

            if hit_state == state_HIT_NEW:
                hit = Hit(hsp_list)
                for attr, value in prev['hit'].items():
                    setattr(hit, attr, value)
                hit_list.append(hit)
                hsp_list = []

            if qres_state == state_QRES_NEW or file_state == state_EOF:
                qresult = QueryResult(id=prev_qid)
                for hit in hit_list:
                    # not using append since Exonerate may separate the
                    # same hit if it has different strands
                    qresult.absorb(hit)
                for attr, value in prev['qresult'].items():
                    setattr(qresult, attr, value)
                yield qresult
                if file_state == state_EOF:
                    break
                hit_list = []
        elif file_state == state_EOF:
            # the input ended before any alignment block was parsed;
            # without this the loop would spin forever on empty reads
            break

        # only readline() here if we're not parsing C4 alignments
        # C4 alignments readline() is handled by its parse_alignment_block
        # function
        if not self.has_c4_alignment:
            self.line = self.handle.readline()