def _parse_qresult(self):
    """Parse query results (PRIVATE).

    Walks the ``(event, element)`` pairs produced by ``self.xml_iter`` and
    yields one ``QueryResult`` for every closed ``Iteration`` element.

    Query ID, description and length that are absent from the element are
    taken from ``self._fallback``. Unless ``self._use_raw_query_ids`` is
    set, a query ID starting with ``Query_`` or ``lcl|`` is replaced by the
    first word of the query description; the original ID is always kept in
    ``qresult.blast_id``. Every item of ``self._meta`` is copied onto each
    result as an attribute.

    When a hit ID was already seen within the same query, a
    ``BiopythonParserWarning`` is issued and the later hit (and its HSPs)
    falls back to its BLAST-generated ID.
    """
    # parse the queries
    for event, qresult_elem in self.xml_iter:
        # </Iteration> marks the end of a single query
        # which means we can process it
        if event != 'end' or qresult_elem.tag != 'Iteration':
            continue

        # we'll use the following schema
        # <!ELEMENT Iteration (
        #        Iteration_iter-num,
        #        Iteration_query-ID?,
        #        Iteration_query-def?,
        #        Iteration_query-len?,
        #        Iteration_hits?,
        #        Iteration_stat?,
        #        Iteration_message?)>

        # assign query attributes with fallbacks
        query_id = qresult_elem.findtext('Iteration_query-ID')
        if query_id is None:
            query_id = self._fallback['id']

        query_desc = qresult_elem.findtext('Iteration_query-def')
        if query_desc is None:
            query_desc = self._fallback['description']

        query_len = qresult_elem.findtext('Iteration_query-len')
        if query_len is None:
            query_len = self._fallback['len']

        blast_query_id = query_id
        # handle blast searches against databases with Blast's IDs
        # 'Query_' marks the beginning of a BLAST+-generated ID,
        # 'lcl|' marks the beginning of a BLAST legacy-generated ID
        if not self._use_raw_query_ids and \
                query_id.startswith(('Query_', 'lcl|')):
            # the real ID is the first word of the description, the rest
            # (if any) is the actual description
            id_desc = query_desc.split(' ', 1)
            query_id = id_desc[0]
            try:
                query_desc = id_desc[1]
            except IndexError:
                query_desc = ''

        hit_list = []
        # a set keeps the duplicate check O(1) per hit
        seen_ids = set()
        for hit in self._parse_hit(qresult_elem.find('Iteration_hits'),
                                   query_id):
            if not hit:
                continue
            # need to keep track of hit IDs, since there could be duplicates
            if hit.id in seen_ids:
                warnings.warn(
                    "Renaming hit ID %r to a BLAST-generated ID "
                    "%r since the ID was already matched "
                    "by your query %r. Your BLAST database "
                    "may contain duplicate entries."
                    % (hit.id, hit.blast_id, query_id),
                    BiopythonParserWarning)
                # fallback to Blast-generated IDs, if the ID is already
                # present and restore the desc, too
                hit.description = '%s %s' % (hit.id, hit.description)
                hit.id = hit.blast_id
                # and change the hit_id of the HSPs contained
                for hsp in hit:
                    hsp.hit_id = hit.blast_id
            else:
                seen_ids.add(hit.id)

            hit_list.append(hit)

        # create qresult and assign its attributes
        qresult = QueryResult(hit_list, query_id)
        qresult.description = query_desc
        qresult.seq_len = int(query_len)
        qresult.blast_id = blast_query_id
        for key, value in self._meta.items():
            setattr(qresult, key, value)

        # statistics are stored in Iteration_stat's 'grandchildren' with the
        # following DTD
        # <!ELEMENT Statistics (
        #        Statistics_db-num,
        #        Statistics_db-len,
        #        Statistics_hsp-len,
        #        Statistics_eff-space,
        #        Statistics_kappa,
        #        Statistics_lambda,
        #        Statistics_entropy)>
        stat_iter_elem = qresult_elem.find('Iteration_stat')
        stat_elem = None
        if stat_iter_elem is not None:
            stat_elem = stat_iter_elem.find('Statistics')

        # an Iteration_stat without a Statistics child is treated like a
        # missing Iteration_stat instead of failing on None.findtext
        if stat_elem is not None:
            for key, val_info in _ELEM_QRESULT_OPT.items():
                value = stat_elem.findtext(key)
                if value is None:
                    continue
                caster = val_info[1]
                # recast only if value is not intended to be str
                if caster is not str:
                    value = caster(value)
                setattr(qresult, val_info[0], value)

        # delete element after we finish parsing it
        qresult_elem.clear()
        yield qresult
def _parse_qresult(self):
    """Parse query result (PRIVATE).

    Generator that classifies the current ``self.line`` into one of four
    states (new query header, hit table header, query content, end of
    query/file), acts on it, and then advances with
    ``self.handle.readline()``.

    Each finished query is passed through ``_set_qresult_hits`` together
    with the hit table rows collected for it before being yielded.
    Iteration stops at a ``>>>///`` line or when the handle is exhausted.
    """
    # initial qresult value
    qresult = None
    hit_rows = []
    # state values
    state_QRES_NEW = 1
    state_QRES_HITTAB = 3
    state_QRES_CONTENT = 5
    state_QRES_END = 7

    while True:
        line = self.line

        # one line before the hit table
        if line.startswith("The best scores are:"):
            qres_state = state_QRES_HITTAB
        # the end of a query or the file altogether
        elif line.strip() == ">>>///" or not line:
            qres_state = state_QRES_END
        # the beginning of a new query
        elif not line.startswith(">>>") and ">>>" in line:
            qres_state = state_QRES_NEW
        # the beginning of the query info and its hits + hsps
        elif line.startswith(">>>") and line.strip() != ">>><<<":
            qres_state = state_QRES_CONTENT
        # default qres mark
        else:
            qres_state = None

        if qres_state == state_QRES_HITTAB:
            # parse hit table if flag is set
            hit_rows = self.__parse_hit_table()

        elif qres_state == state_QRES_END:
            yield _set_qresult_hits(qresult, hit_rows)
            break

        elif qres_state == state_QRES_NEW:
            # if qresult is filled, yield it first
            if qresult is not None:
                yield _set_qresult_hits(qresult, hit_rows)
            # the collected rows belonged to the previous query; without
            # this reset a query that has no hit table would be finalised
            # with the previous query's rows
            hit_rows = []

            regx = re.search(_RE_ID_DESC_SEQLEN, self.line)
            query_id = regx.group(1)
            seq_len = regx.group(3)
            desc = regx.group(2)
            qresult = QueryResult(id=query_id)
            qresult.seq_len = int(seq_len)
            # get target from the next line (its second non-empty,
            # space-separated token)
            self.line = self.handle.readline()
            qresult.target = [x for x in self.line.split(" ") if x][1].strip()
            if desc is not None:
                qresult.description = desc
            # set values from preamble
            for key, value in self._preamble.items():
                setattr(qresult, key, value)

        elif qres_state == state_QRES_CONTENT:
            assert self.line[3:].startswith(qresult.id), self.line
            for hit, strand in self._parse_hit(query_id):
                # HACK: re-set desc, for hsp hit and query description
                hit.description = hit.description
                hit.query_description = qresult.description
                # if hit is not in qresult, append it
                if hit.id not in qresult:
                    qresult.append(hit)
                # otherwise, it might be the same hit with a different strand
                else:
                    # make sure strand is different and then append hsp to
                    # existing hit
                    for hsp in hit.hsps:
                        assert strand != hsp.query_strand
                        qresult[hit.id].append(hsp)

        # advance in every state, including lines that matched none
        self.line = self.handle.readline()
def _parse_qresult(self):
    """Parse query results.

    Walks the ``(event, element)`` pairs produced by ``self.xml_iter`` and
    yields one ``QueryResult`` for every closed ``Iteration`` element.

    Query ID, description and length that are absent from the element are
    taken from ``self._fallback``. A query ID starting with ``Query_`` or
    ``lcl|`` is replaced by the first word of the query description and
    the original ID is stored in ``qresult._blast_id`` (an empty string
    otherwise). Every item of ``self._meta`` is copied onto each result as
    an attribute.

    When a hit ID was already seen within the same query, a
    ``BiopythonParserWarning`` is issued and the later hit (and its HSPs)
    falls back to its BLAST-generated ID.
    """
    # parse the queries
    for event, qresult_elem in self.xml_iter:
        # </Iteration> marks the end of a single query
        # which means we can process it
        if event != 'end' or qresult_elem.tag != 'Iteration':
            continue

        # we'll use the following schema
        # <!ELEMENT Iteration (
        #        Iteration_iter-num,
        #        Iteration_query-ID?,
        #        Iteration_query-def?,
        #        Iteration_query-len?,
        #        Iteration_hits?,
        #        Iteration_stat?,
        #        Iteration_message?)>

        # assign query attributes with fallbacks
        query_id = qresult_elem.findtext('Iteration_query-ID')
        if query_id is None:
            query_id = self._fallback['id']

        query_desc = qresult_elem.findtext('Iteration_query-def')
        if query_desc is None:
            query_desc = self._fallback['description']

        query_len = qresult_elem.findtext('Iteration_query-len')
        if query_len is None:
            query_len = self._fallback['len']

        # handle blast searches against databases with Blast's IDs
        # 'Query_' marks the beginning of a BLAST+-generated ID,
        # 'lcl|' marks the beginning of a BLAST legacy-generated ID
        if query_id.startswith(('Query_', 'lcl|')):
            # store the Blast-generated query ID
            blast_query_id = query_id
            # the real ID is the first word of the description, the rest
            # (if any) is the actual description
            id_desc = query_desc.split(' ', 1)
            query_id = id_desc[0]
            try:
                query_desc = id_desc[1]
            except IndexError:
                query_desc = ''
        else:
            blast_query_id = ''

        hit_list = []
        # a set keeps the duplicate check O(1) per hit
        seen_ids = set()
        for hit in self._parse_hit(qresult_elem.find('Iteration_hits'),
                                   query_id):
            if not hit:
                continue
            # need to keep track of hit IDs, since there could be duplicates
            if hit.id in seen_ids:
                warnings.warn("Adding hit with BLAST-generated ID "
                              "%r since hit ID %r is already present "
                              "in query %r. Your BLAST database may contain "
                              "duplicate entries."
                              % (hit._blast_id, hit.id, query_id),
                              BiopythonParserWarning)
                # fallback to Blast-generated IDs, if the ID is already
                # present and restore the desc, too
                hit.description = '%s %s' % (hit.id, hit.description)
                hit.id = hit._blast_id
                # and change the hit_id of the HSPs contained
                for hsp in hit:
                    hsp.hit_id = hit._blast_id
            else:
                seen_ids.add(hit.id)

            hit_list.append(hit)

        # create qresult and assign its attributes
        qresult = QueryResult(hit_list, query_id)
        qresult.description = query_desc
        qresult.seq_len = int(query_len)
        qresult._blast_id = blast_query_id
        for key, value in self._meta.items():
            setattr(qresult, key, value)

        # statistics are stored in Iteration_stat's 'grandchildren' with the
        # following DTD
        # <!ELEMENT Statistics (
        #        Statistics_db-num,
        #        Statistics_db-len,
        #        Statistics_hsp-len,
        #        Statistics_eff-space,
        #        Statistics_kappa,
        #        Statistics_lambda,
        #        Statistics_entropy)>
        stat_iter_elem = qresult_elem.find('Iteration_stat')
        stat_elem = None
        if stat_iter_elem is not None:
            stat_elem = stat_iter_elem.find('Statistics')

        # an Iteration_stat without a Statistics child is treated like a
        # missing Iteration_stat instead of failing on None.findtext
        if stat_elem is not None:
            for key, val_info in _ELEM_QRESULT_OPT.items():
                value = stat_elem.findtext(key)
                if value is None:
                    continue
                caster = val_info[1]
                # recast only if value is not intended to be str
                if caster is not str:
                    value = caster(value)
                setattr(qresult, val_info[0], value)

        # delete element after we finish parsing it
        qresult_elem.clear()
        yield qresult
def _parse_qresult(self):
    """Parse query results, yielding one ``QueryResult`` per query.

    Generator that classifies the current ``self.line`` into one of four
    states (new query header, hit table header, query content, end of
    query/file), acts on it, and then advances with
    ``self.handle.readline()``.

    Each finished query is passed through ``_set_qresult_hits`` together
    with the hit table rows collected for it before being yielded.
    Iteration stops at a ``>>>///`` line or when the handle is exhausted.
    """
    # initial qresult value
    qresult = None
    hit_rows = []
    # state values
    state_QRES_NEW = 1
    state_QRES_HITTAB = 3
    state_QRES_CONTENT = 5
    state_QRES_END = 7

    while True:
        line = self.line

        # one line before the hit table
        if line.startswith('The best scores are:'):
            qres_state = state_QRES_HITTAB
        # the end of a query or the file altogether
        elif line.strip() == '>>>///' or not line:
            qres_state = state_QRES_END
        # the beginning of a new query
        elif not line.startswith('>>>') and '>>>' in line:
            qres_state = state_QRES_NEW
        # the beginning of the query info and its hits + hsps
        elif line.startswith('>>>') and line.strip() != '>>><<<':
            qres_state = state_QRES_CONTENT
        # default qres mark
        else:
            qres_state = None

        if qres_state == state_QRES_HITTAB:
            # parse hit table if flag is set
            hit_rows = self.__parse_hit_table()

        elif qres_state == state_QRES_END:
            yield _set_qresult_hits(qresult, hit_rows)
            break

        elif qres_state == state_QRES_NEW:
            # if qresult is filled, yield it first
            if qresult is not None:
                yield _set_qresult_hits(qresult, hit_rows)
            # the collected rows belonged to the previous query; without
            # this reset a query that has no hit table would be finalised
            # with the previous query's rows
            hit_rows = []

            regx = re.search(_RE_ID_DESC_SEQLEN, self.line)
            query_id = regx.group(1)
            seq_len = regx.group(3)
            desc = regx.group(2)
            qresult = QueryResult(id=query_id)
            qresult.seq_len = int(seq_len)
            # get target from the next line (its second non-empty,
            # space-separated token)
            self.line = self.handle.readline()
            qresult.target = [x for x in self.line.split(' ') if x][1].strip()
            if desc is not None:
                qresult.description = desc
            # set values from preamble
            for key, value in self._preamble.items():
                setattr(qresult, key, value)

        elif qres_state == state_QRES_CONTENT:
            assert self.line[3:].startswith(qresult.id), self.line
            for hit, strand in self._parse_hit(query_id):
                # HACK: re-set desc, for hsp hit and query description
                hit.description = hit.description
                hit.query_description = qresult.description
                # if hit is not in qresult, append it
                if hit.id not in qresult:
                    qresult.append(hit)
                # otherwise, it might be the same hit with a different strand
                else:
                    # make sure strand is different and then append hsp to
                    # existing hit
                    for hsp in hit.hsps:
                        assert strand != hsp.query_strand
                        qresult[hit.id].append(hsp)

        # advance in every state, including lines that matched none
        self.line = self.handle.readline()
def __iter__(self):
    """Iterate over ``self.blast_iter``, yielding one QueryResult per record.

    Every record is converted into a ``QueryResult``; each of its
    alignments becomes a ``Hit`` and each alignment HSP becomes an ``HSP``
    wrapping a single ``HSPFragment``. Coordinates are stored with the
    smaller value minus one as start and the larger value as end.
    """
    for rec in self.blast_iter:
        # set attributes to SearchIO's
        # get id and desc
        if rec.query.startswith('>'):
            rec.query = rec.query[1:]
        try:
            qid, qdesc = rec.query.split(' ', 1)
        except ValueError:
            qid, qdesc = rec.query, ''
        qdesc = qdesc.replace('\n', '').replace('\r', '')

        qresult = QueryResult(id=qid)
        qresult.program = rec.application.lower()
        qresult.target = rec.database
        qresult.seq_len = rec.query_letters
        qresult.version = rec.version

        # determine alphabet based on program
        # NOTE(review): for any other program name `alphabet` stays unbound
        # on the first record (or keeps the previous record's value) --
        # confirm which default is wanted before adding an else branch
        if qresult.program == 'blastn':
            alphabet = generic_dna
        elif qresult.program in ['blastp', 'blastx', 'tblastn', 'tblastx']:
            alphabet = generic_protein

        # frame used when the parsed HSP carries no frame value; it only
        # depends on the program, so compute it once per record
        if qresult.program in ('blastp', 'tblastn'):
            default_frame = 0
        else:
            default_frame = 1

        # iterate over the 'alignments' (hits) and the hit table
        for aln in rec.alignments:
            # get id and desc
            if aln.title.startswith('> '):
                aln.title = aln.title[2:]
            elif aln.title.startswith('>'):
                aln.title = aln.title[1:]
            try:
                hid, hdesc = aln.title.split(' ', 1)
            except ValueError:
                hid, hdesc = aln.title, ''
            hdesc = hdesc.replace('\n', '').replace('\r', '')

            # iterate over the hsps and group them in a list
            hsp_list = []
            for bhsp in aln.hsps:
                frag = HSPFragment(hid, qid)
                frag.alphabet = alphabet
                # set alignment length
                frag.aln_span = bhsp.identities[1]
                # set frames
                try:
                    frag.query_frame = int(bhsp.frame[0])
                except IndexError:
                    frag.query_frame = default_frame
                try:
                    frag.hit_frame = int(bhsp.frame[1])
                except IndexError:
                    frag.hit_frame = default_frame
                # set query coordinates
                frag.query_start = min(bhsp.query_start, bhsp.query_end) - 1
                frag.query_end = max(bhsp.query_start, bhsp.query_end)
                # set hit coordinates
                frag.hit_start = min(bhsp.sbjct_start, bhsp.sbjct_end) - 1
                frag.hit_end = max(bhsp.sbjct_start, bhsp.sbjct_end)
                # set query, hit sequences and its annotation; columns are
                # collected in lists and joined once instead of growing
                # three strings character by character
                qchars, hchars, mchars = [], [], []
                for seqtrio in zip(bhsp.query, bhsp.sbjct, bhsp.match):
                    qchar, hchar, mchar = seqtrio
                    if qchar == ' ' or hchar == ' ':
                        # a blank in either sequence is only expected in
                        # an all-blank column, which is dropped
                        assert all(' ' == x for x in seqtrio)
                    else:
                        qchars.append(qchar)
                        hchars.append(hchar)
                        mchars.append(mchar)
                frag.query, frag.hit = ''.join(qchars), ''.join(hchars)
                frag.aln_annotation['similarity'] = ''.join(mchars)

                # create HSP object with the fragment
                hsp = HSP([frag])
                hsp.evalue = bhsp.expect
                hsp.bitscore = bhsp.bits
                hsp.bitscore_raw = bhsp.score
                # set gap
                try:
                    hsp.gap_num = bhsp.gaps[0]
                except IndexError:
                    hsp.gap_num = 0
                # set identity
                hsp.ident_num = bhsp.identities[0]
                hsp.pos_num = bhsp.positives[0]
                if hsp.pos_num is None:
                    hsp.pos_num = hsp[0].aln_span

                hsp_list.append(hsp)

            hit = Hit(hsp_list)
            hit.seq_len = aln.length
            hit.description = hdesc
            qresult.append(hit)

        qresult.description = qdesc
        yield qresult
def __iter__(self):
    """Iterate over BlastTextParser, yields query results.

    Every record from ``self.blast_iter`` is converted into a
    ``QueryResult``; each of its alignments becomes a ``Hit`` and each
    alignment HSP becomes an ``HSP`` wrapping a single ``HSPFragment``.
    Coordinates are stored with the smaller value minus one as start and
    the larger value as end.
    """
    for rec in self.blast_iter:
        # set attributes to SearchIO's
        # get id and desc
        if rec.query.startswith(">"):
            rec.query = rec.query[1:]
        try:
            qid, qdesc = rec.query.split(" ", 1)
        except ValueError:
            qid, qdesc = rec.query, ""
        qdesc = qdesc.replace("\n", "").replace("\r", "")

        qresult = QueryResult(id=qid)
        qresult.program = rec.application.lower()
        qresult.target = rec.database
        qresult.seq_len = rec.query_letters
        qresult.version = rec.version

        # determine molecule_type based on program
        # NOTE(review): for any other program name `molecule_type` stays
        # unbound on the first record (or keeps the previous record's
        # value) -- confirm which default is wanted before adding an else
        if qresult.program == "blastn":
            molecule_type = "DNA"
        elif qresult.program in ["blastp", "blastx", "tblastn", "tblastx"]:
            molecule_type = "protein"

        # frame used when the parsed HSP carries no frame value; it only
        # depends on the program, so compute it once per record
        if qresult.program in ("blastp", "tblastn"):
            default_frame = 0
        else:
            default_frame = 1

        # iterate over the 'alignments' (hits) and the hit table
        for aln in rec.alignments:
            # get id and desc
            if aln.title.startswith("> "):
                aln.title = aln.title[2:]
            elif aln.title.startswith(">"):
                aln.title = aln.title[1:]
            try:
                hid, hdesc = aln.title.split(" ", 1)
            except ValueError:
                hid, hdesc = aln.title, ""
            hdesc = hdesc.replace("\n", "").replace("\r", "")

            # iterate over the hsps and group them in a list
            hsp_list = []
            for bhsp in aln.hsps:
                frag = HSPFragment(hid, qid)
                frag.molecule_type = molecule_type
                # set alignment length
                frag.aln_span = bhsp.identities[1]
                # set frames
                try:
                    frag.query_frame = int(bhsp.frame[0])
                except IndexError:
                    frag.query_frame = default_frame
                try:
                    frag.hit_frame = int(bhsp.frame[1])
                except IndexError:
                    frag.hit_frame = default_frame
                # set query coordinates
                frag.query_start = min(bhsp.query_start, bhsp.query_end) - 1
                frag.query_end = max(bhsp.query_start, bhsp.query_end)
                # set hit coordinates
                frag.hit_start = min(bhsp.sbjct_start, bhsp.sbjct_end) - 1
                frag.hit_end = max(bhsp.sbjct_start, bhsp.sbjct_end)
                # set query, hit sequences and its annotation; columns are
                # collected in lists and joined once instead of growing
                # three strings character by character
                qchars, hchars, mchars = [], [], []
                for seqtrio in zip(bhsp.query, bhsp.sbjct, bhsp.match):
                    qchar, hchar, mchar = seqtrio
                    if qchar == " " or hchar == " ":
                        # a blank in either sequence is only expected in
                        # an all-blank column, which is dropped
                        assert all(" " == x for x in seqtrio)
                    else:
                        qchars.append(qchar)
                        hchars.append(hchar)
                        mchars.append(mchar)
                frag.query, frag.hit = "".join(qchars), "".join(hchars)
                frag.aln_annotation["similarity"] = "".join(mchars)

                # create HSP object with the fragment
                hsp = HSP([frag])
                hsp.evalue = bhsp.expect
                hsp.bitscore = bhsp.bits
                hsp.bitscore_raw = bhsp.score
                # set gap
                try:
                    hsp.gap_num = bhsp.gaps[0]
                except IndexError:
                    hsp.gap_num = 0
                # set identity
                hsp.ident_num = bhsp.identities[0]
                hsp.pos_num = bhsp.positives[0]
                if hsp.pos_num is None:
                    hsp.pos_num = hsp[0].aln_span

                hsp_list.append(hsp)

            hit = Hit(hsp_list)
            hit.seq_len = aln.length
            hit.description = hdesc
            qresult.append(hit)

        qresult.description = qdesc
        yield qresult