def execute_query(self, query_file, blast_output_file = None, work_directory = os.getcwd()):
    '''
    Execute BLAST given a fasta file that holds a single query sequence.

    @param:    query_file
    @pdef:     Fasta file with the query sequence.
    @ptype:    {String} or {File} or {Fasta}

    @param:    blast_output_file
    @pdef:     name of the temporary BLAST output file.
    @pdefault: query_file.prefix + job.pid + .blast.xml.out
    @ptype:    {String}

    @param:    work_directory
    @pdef:     Directory to which the temporary files will be created.
    @pdefault: Current working directory.
    @ptype:    {String}

    @raises: {AttributeError} if query_file is multi-fasta or if it is
             neither a {String}, a {File} nor a {Fasta}.
    @raises: {BlastError} in BLAST execution or output parsing errors.

    @returns: {BlastResult}
    '''
    # NOTE: the default of work_directory is evaluated once, when the
    # function is defined, not on every call.
    if isinstance(query_file, (basestring, File)):
        newFasta = Fasta(fasta_file = query_file)
    elif isinstance(query_file, Fasta):
        newFasta = query_file
    else:
        # Without this branch newFasta stays unbound and the failure
        # surfaces later as an unrelated UnboundLocalError.
        msg = 'query_file must be a file name, a File or a Fasta object.'
        raise AttributeError(msg)

    if newFasta.is_multifasta:
        msg = 'Blasts can only be executed one at a time due to XML output restrictions.'
        raise AttributeError(msg)

    newFasta.load()
    query_sequence = newFasta.sequence

    # If all the sequence is unknown (only X), it will crash blast:
    # return an empty result instead of running it.
    if len(re.sub(r'[Xx]', '', query_sequence.sequence)) == 0:
        SBIg.warn(self, 'Created an empty BlastResult.')
        return BlastResult(query_name     = query_sequence.id,
                           query_sequence = query_sequence.sequence)

    Path.mkdir(work_directory)
    if blast_output_file is None:
        file_prefixes = ".".join([newFasta.file.prefix, str(os.getpid())])
        tmp_output    = os.path.join(work_directory, file_prefixes) + ".blast.xml.out"
    else:
        tmp_output    = os.path.join(work_directory, blast_output_file)

    self._execute(input_file = newFasta, output_file = tmp_output)

    blast_result = self._parse_blast(newFasta.sequence.sequence, tmp_output)

    self._clean([tmp_output, ])

    return blast_result
def parse(query_sequence, blast_output_file, self_hit, hitid_format):
    '''
    Processes a blast xml formated output into a {BlastResult} object.

    @param:    query_sequence
    @pdef:     sequence of the query protein/nucleotide.
    @ptype:    {String}

    @param:    blast_output_file
    @pdef:     output file from BLAST.
    @ptype:    {String}

    @param:    self_hit
    @pdef:     when _True_ if the query is found in the database, it is
               retrieved.
    @ptype:    {Boolean}

    @param:    hitid_format
    @pdef:     format of the name of the hit. It is passed as is to
               BlastParser.hit_name.
    @poptions: 'single' -> first word of the name,
               'double' -> first two words of the hit name,
               'all'    -> all the text in the hit name
    @ptype:    {String}

    @raises: {BlastError} if there are problems while parsing the XML file.

    @returns: {BlastResult}
    '''
    f = File(blast_output_file)
    s = BeautifulSoup(f.read())

    h = BlastHeader(version    = str(s.find('blastoutput_version').string),
                    matrix     = str(s.find('parameters_matrix').string),
                    gap_open   = int(s.find('parameters_gap-open').string),
                    gap_extend = int(s.find('parameters_gap-extend').string),
                    database   = str(s.find('blastoutput_db').string),
                    self_hit   = self_hit)
    b = BlastResult(query_name     = str(s.find('blastoutput_query-def').string),
                    query_sequence = query_sequence,
                    header         = h)

    SBIg.alert('debug', BlastParser(), b.str_blast_header())

    # Collects the description of every hit whose segments are not ok; a
    # non-empty list at the end means the parsing has failed.
    error_str = []
    for iteration in s.find_all('iteration'):
        iternum = int(iteration.find('iteration_iter-num').string)
        for hit in iteration.find_all('hit'):
            hit_name  = BlastParser.hit_name(str(hit.find('hit_def').string), hitid_format)
            hit_lenth = int(hit.find("hit_len").string)
            for subhit in hit.find_all("hsp"):
                data = BlastParser.parse_subhit(subhit)
                r = BlastHit(hit            = [hit_name, hit_lenth],
                             sequences      = [data['qs'], data['hs'], data['sc']],
                             sequence_inits = [data['qp'], data['hp']],
                             iteration      = iternum,
                             stats          = [data['hi'], data['h+'],
                                               data['hg'], data['ev']])
                if BlastParser.same_query_hit_names(b.query, hit_name, self_hit):
                    continue

                dbug_info = 'Added hit {0} in iteration {1}'
                SBIg.alert('debug', BlastParser(), dbug_info.format(hit_name, r.iteration))
                b.add_hit(r)
                # NOTE(review): the indentation of this block was lost in
                # the source under review; the segment check is assumed to
                # apply only to hits that are added -- confirm.
                if not r.are_segments_ok:
                    error_str.append("Check the alignment's fragmentation")
                    # The original message used %s placeholders with
                    # str.format, so the names were never filled in.
                    error_str.append("for the query {0} with {1}\n".format(b.query, hit_name))
                    error_str.append("{0}\n".format(r))

    b.set_last_iteration()

    if error_str:
        SBIg.warn(BlastParser(), error_str)
        be = BlastError()
        raise be.parse_error()

    return b
def execute_query_seq(self, sequenceID = None, sequence = None, blast_input_file = None, blast_output_file = None, work_directory = os.getcwd()):
    '''
    Execute BLAST given a query sequence.

    @param:    sequenceID
    @pdef:     name of the query sequence.
    @pdefault: 'QuerySequence'
    @pclash:   If sequence is not provided, it assumes that the sequenceID
               belongs to a protein in the database and, thus, it searches
               for it. Either sequenceID or sequence needs to be provided.
    @ptype:    {String}

    @param:    sequence
    @pdef:     query sequence.
    @pdefault: _None_
    @pclash:   Either sequenceID or sequence needs to be provided.
    @ptype:    {String}

    @param:    blast_input_file
    @pdef:     name of the temporary fasta file to use as BLAST input.
    @pdefault: job.pid + timestamp + .tmp.fa
    @ptype:    {String}

    @param:    blast_output_file
    @pdef:     name of the temporary BLAST output file.
    @pdefault: job.pid + timestamp + .blast.xml.out
    @ptype:    {String}

    @param:    work_directory
    @pdef:     Directory to which the temporary files will be created.
    @pdefault: Current working directory.
    @ptype:    {String}

    @raises: {AttributeError} if neither sequenceID nor sequence are
             provided or if sequenceID is a list of sequence names.
    @raises: {BlastError} in BLAST execution or output parsing errors.

    @returns: {BlastResult}
    '''
    # NOTE: the default of work_directory is evaluated once, when the
    # function is defined, not on every call.
    if sequenceID is None and sequence is None:
        msg = 'Either a sequence or sequenceID is needed to perform the blast.'
        raise AttributeError(msg)

    if isinstance(sequenceID, (list, set, tuple)):
        msg = 'Blasts can only be executed one at a time due to XML output restrictions.'
        raise AttributeError(msg)

    if sequenceID is None:
        sequenceID = 'QuerySequence'

    # Given only a code implies that the protein of interest is in the
    # database itself
    if sequence is None:
        grabbedSequence = self._database.retrieve(sequenceID)
        sequenceID      = grabbedSequence[0].id
        sequence        = grabbedSequence[0].sequence

    # If all the sequence is unknown (only X), it will crash blast:
    # return an empty result instead of running it.
    if len(re.sub(r'[Xx]', '', sequence)) == 0:
        SBIg.warn(self, 'Created an empty BlastResult.')
        return BlastResult(query_name = sequenceID, query_sequence = sequence)

    Path.mkdir(work_directory)

    # time.time() instead of time.clock(): the latter measures CPU time on
    # Unix and no longer exists in Python >= 3.8.
    file_prefixes = ".".join([str(os.getpid()), str(int(time.time() * 100000))])
    file_prefixes = os.path.join(work_directory, file_prefixes)

    if blast_input_file is None:
        tmp_input = file_prefixes + ".tmp.fa"
    else:
        tmp_input = os.path.join(work_directory, blast_input_file)

    if blast_output_file is None:
        tmp_output = file_prefixes + ".blast.xml.out"
    else:
        tmp_output = os.path.join(work_directory, blast_output_file)

    QueryFasta = Fasta.build(file_name   = tmp_input,
                             sequence_id = sequenceID,
                             sequence    = sequence,
                             force       = True)

    self._execute(input_file = QueryFasta, output_file = tmp_output)

    blast_result = self._parse_blast(sequence, tmp_output)

    self._clean([tmp_input, tmp_output])

    return blast_result
def parse_blastall_output(fd_blastall_output, temporalOutputFile_fd=None, return_only_ids=False, limit_to_sequenceIDs=None):
    """
    Parse the plain-text output of blastall into a list of results.

    "fd_blastall_output" is the output fd of the blast process (input for this method)
    "temporalOutputFile_fd" is a file where all the input of fd_blastall_output is saved
    "return_only_ids" is used to store only ids, not complete blast results
    "limit_to_sequenceIDs" is used to filter blast parsing to only those sequenceids
                           (None or an empty container means "no filter"; it is
                           not applied when return_only_ids is set)

    Returns a list with the subject ids (return_only_ids) or with the
    BlastResult objects of the parsed hits.

    Raises ValueError when the query, match and subject lines of an
    alignment do not have the same length.
    """
    if limit_to_sequenceIDs is None:
        limit_to_sequenceIDs = ()

    query_re = re.compile(r"Query=\s*(.+)\s*")
    # NOTE: according to the original author an earlier version of this
    # pattern was incorrect and may have affected previous results.
    letters_re = re.compile(r"\(\s*([\,\d]+)\s*letters\s*\)")
    sbjct_re = re.compile(r"^>([\w\d\_\.\|]+)")
    length_re = re.compile(r"Length \= ([\,\d]+)")
    score_re = re.compile(r"Score\s+=\s+([\.\d]+)\s+bits\s+\((\d+)\),\s+Expect\s+=\s+([\d\.e\-]+)")
    identities_re = re.compile(r"Identities\s+=\s+\d+\/(\d+)\s+\((\d+)%\)")
    positives_re = re.compile(r"Positives\s+=\s+\d+\/\d+\s+\((\d+)%\)")
    gaps_re = re.compile(r"Gaps\s+=\s+\d+\/\d+\s+\((\d+)%\)")
    intervals_query_re = re.compile(r"Query:\s+(\d+)\s+(\S+)\s+(\d+)$")
    sbjct_intervals_re = re.compile(r"Sbjct:\s+(\d+)\s+(\S+)\s+(\d+)$")

    blast_results = []

    def parse_alignment_line(alignment_line_list, blastResult_obj, aligned_query, aligned_sbjct):
        """
        Fill the exact/similar match lists of blastResult_obj from the
        match line of the alignment and the two aligned sequences.
        """
        # The original author flagged this routine as having a bug with gaps.
        alignment_line = "".join(alignment_line_list)
        aligned_query = "".join(aligned_query)
        aligned_sbjct = "".join(aligned_sbjct)

        if not (len(alignment_line) == len(aligned_query) == len(aligned_sbjct)):
            print(aligned_query)
            print(alignment_line)
            print(aligned_sbjct)
            raise ValueError("Alignments must be of the same size")

        query_gaps = 0
        sbjct_gaps = 0
        columns = zip(alignment_line, aligned_query, aligned_sbjct)
        for x, (value, query_char, sbjct_char) in enumerate(columns):
            # Gaps do not consume a residue, so they shift the position
            # of every later column in that sequence.
            if query_char == "-":
                query_gaps += 1
            if sbjct_char == "-":
                sbjct_gaps += 1
            if value == " ":
                continue
            query_position = x + blastResult_obj.query_start - query_gaps
            sbjct_position = x + blastResult_obj.sbjct_start - sbjct_gaps
            if value != "+":
                # A residue letter in the match line is an identity; it
                # also counts as a similarity.
                blastResult_obj.query_exact_match_list.append(query_position)
                blastResult_obj.sbjct_exact_match_list.append(sbjct_position)
            blastResult_obj.query_similar_match_list.append(query_position)
            blastResult_obj.sbjct_similar_match_list.append(sbjct_position)

    def store_hit(hit, match_lines, query_lines, sbjct_lines, snapshot):
        """Append the finished hit to blast_results (copied if snapshot)."""
        # NOTE(review): the indentation was lost in the source under
        # review; the self-hit check is assumed to guard both the
        # alignment parsing and the storing of the hit -- confirm.
        if hit.e_value is None or hit.sequenceID_A == hit.sequenceID_B:
            return
        parse_alignment_line(match_lines, hit, query_lines, sbjct_lines)
        if return_only_ids:
            blast_results.append(hit.sequenceID_B)
        elif not limit_to_sequenceIDs or hit.sequenceID_B in limit_to_sequenceIDs:
            blast_results.append(copy.copy(hit) if snapshot else hit)

    # Temporary variables to store information to read exact alignment
    alignment_start_index = None
    subalignment_length = 0
    capture_matching_line = False
    sbjct_matching = False
    alignment_summary = []
    aligned_query = []
    aligned_sbjct = []

    blastResult_obj = BlastResult(method="blastall", mode="F")

    for line in fd_blastall_output:

        if temporalOutputFile_fd:
            temporalOutputFile_fd.write(line)

        if capture_matching_line:
            # The line right after a "Query:" line is the match line.
            # Drop the line break and pad with blanks so that it always
            # spans exactly the columns of the query segment.
            match_line = line.rstrip("\r\n")[alignment_start_index:alignment_start_index + subalignment_length]
            alignment_summary.append(match_line.ljust(subalignment_length))
            capture_matching_line = False
            continue

        m = query_re.search(line)
        if m:
            blastResult_obj.sequenceID_A = m.group(1)

        m = letters_re.search(line)
        if m:
            blastResult_obj.query_length = int(m.group(1).replace(",", ''))

        sequenceID_B_search = sbjct_re.search(line)
        if sequenceID_B_search:
            # New sequenceID_B: close the previous hit, if any
            store_hit(blastResult_obj, alignment_summary, aligned_query, aligned_sbjct, True)
            alignment_start_index = None
            alignment_summary = []
            aligned_query = []
            aligned_sbjct = []
            blastResult_obj.reset()
            blastResult_obj.sequenceID_B = sequenceID_B_search.group(1)

        m = length_re.search(line)
        if m:
            blastResult_obj.sbjct_length = int(m.group(1).replace(",", ''))

        if line.startswith("Matrix"):
            # Query finished. The object is replaced right after, so it
            # is stored without copying it.
            store_hit(blastResult_obj, alignment_summary, aligned_query, aligned_sbjct, False)
            alignment_start_index = None
            alignment_summary = []
            aligned_query = []
            aligned_sbjct = []
            blastResult_obj = BlastResult(method="blastall", mode="F")
            continue

        get_evalue = score_re.search(line)
        if get_evalue:
            # New hit found
            if blastResult_obj.e_value is not None:
                # There was another hit for the same subject before
                store_hit(blastResult_obj, alignment_summary, aligned_query, aligned_sbjct, True)
                blastResult_obj.reset()
                alignment_start_index = None
                alignment_summary = []
                aligned_query = []
                aligned_sbjct = []
            blastResult_obj.set_evalue(get_evalue.group(3))
            blastResult_obj.score_bits = str(get_evalue.group(1))
            blastResult_obj.score = str(get_evalue.group(2))

        get_identities = identities_re.search(line)
        if get_identities:
            blastResult_obj.align_length = get_identities.group(1)
            blastResult_obj.identities = str(get_identities.group(2))

        get_positives = positives_re.search(line)
        if get_positives:
            blastResult_obj.positives = str(get_positives.group(1))

        get_gaps = gaps_re.search(line)
        if get_gaps:
            blastResult_obj.gaps = str(get_gaps.group(1))

        get_intervals_query = intervals_query_re.search(line)
        if get_intervals_query:
            if blastResult_obj.query_start is None:
                blastResult_obj.query_start = int(get_intervals_query.group(1))
            # Remember the columns of the aligned segment to cut the
            # match line (the next line) at the same place.
            alignment_start_index = line.index(get_intervals_query.group(2))
            subalignment_length = len(get_intervals_query.group(2))
            capture_matching_line = True
            sbjct_matching = True
            aligned_query.append(get_intervals_query.group(2))
            blastResult_obj.query_end = int(get_intervals_query.group(3))

        get_intervals_Sbjct = sbjct_intervals_re.search(line)
        if get_intervals_Sbjct and sbjct_matching:
            if blastResult_obj.sbjct_start is None:
                blastResult_obj.sbjct_start = int(get_intervals_Sbjct.group(1))
            blastResult_obj.sbjct_end = int(get_intervals_Sbjct.group(3))
            aligned_sbjct.append(get_intervals_Sbjct.group(2))
            sbjct_matching = False

    return blast_results
def next(self):
    """
    Read lines from self.fd until a hit is complete and return it.

    A hit is complete when the next hit of the same subject, the next
    subject or the end of the query (the "Matrix" line) is found. The
    information already read for the following hit is kept in
    self.current_blastResult_obj for the next call.

    Raises StopIteration when self.fd is exhausted.
    """
    patterns = BlastallParserIterator

    # Temporary variables to store information to read exact alignment
    alignment_start_index = None
    subalignment_length = 0
    capture_matching_line = False
    sbjct_matching = False
    alignment_summary = []
    aligned_query = []
    aligned_sbjct = []

    for line in self.fd:

        if capture_matching_line:
            # The line right after a "Query:" line is the match line.
            # Drop the line break and pad with blanks so that it always
            # spans exactly the columns of the query segment.
            match_line = line.rstrip("\r\n")[alignment_start_index:alignment_start_index + subalignment_length]
            alignment_summary.append(match_line.ljust(subalignment_length))
            capture_matching_line = False
            continue

        m = patterns.query_re.search(line)
        if m:
            self.current_blastResult_obj = BlastResult(method="blastall", mode="F")
            self.current_blastResult_obj.sequenceID_A = m.group(1)

        if self.current_blastResult_obj is None:
            # Nothing can be stored before the first "Query=" line.
            continue

        m = patterns.letters_re.search(line)
        if m:
            self.current_blastResult_obj.query_length = int(m.group(1).replace(",", ''))

        # NOTE(review): the indentation was lost in the source under
        # review. In the three places below where a hit is closed, the
        # self-hit check is assumed to guard only the alignment parsing
        # (self hits are still returned); otherwise the parser state
        # would not advance after a self hit -- confirm.
        sequenceID_B_search = patterns.sbjct_re.search(line)
        if sequenceID_B_search:
            # New sequenceID_B
            finished = self.current_blastResult_obj
            if finished.e_value is None:
                finished.sequenceID_B = sequenceID_B_search.group(1)
            else:
                if self.parse_lines and finished.sequenceID_A != finished.sequenceID_B:
                    self.parse_alignment_line(alignment_summary, aligned_query, aligned_sbjct)
                upcoming = BlastResult(method="blastall", mode="F")
                self.current_blastResult_obj = upcoming
                upcoming.sequenceID_A = finished.sequenceID_A
                upcoming.sequenceID_B = sequenceID_B_search.group(1)
                upcoming.query_length = finished.query_length
                return finished

        m = patterns.length_re.search(line)
        if m:
            self.current_blastResult_obj.sbjct_length = int(m.group(1).replace(",", ''))

        if line.startswith("Matrix"):
            # Query finished
            finished = self.current_blastResult_obj
            if finished.e_value is not None:
                if self.parse_lines and finished.sequenceID_A != finished.sequenceID_B:
                    self.parse_alignment_line(alignment_summary, aligned_query, aligned_sbjct)
                self.current_blastResult_obj = BlastResult(method="blastall", mode="F")
                return finished
            continue

        get_evalue = patterns.score_re.search(line)
        if not get_evalue:
            get_evalue = patterns.score_option2_re.search(line)
        if get_evalue:
            # New hit found
            finished = self.current_blastResult_obj
            if finished.e_value is not None:
                # There was another hit for the same subject before
                if self.parse_lines and finished.sequenceID_A != finished.sequenceID_B:
                    self.parse_alignment_line(alignment_summary, aligned_query, aligned_sbjct)
                upcoming = BlastResult(method="blastall", mode="F")
                self.current_blastResult_obj = upcoming
                upcoming.sequenceID_A = finished.sequenceID_A
                upcoming.query_length = finished.query_length
                upcoming.set_evalue(get_evalue.group(3))
                upcoming.score_bits = str(get_evalue.group(1))
                upcoming.score = str(get_evalue.group(2))
                upcoming.sequenceID_B = finished.sequenceID_B
                upcoming.sbjct_length = finished.sbjct_length
                return finished
            finished.set_evalue(get_evalue.group(3))
            finished.score_bits = str(get_evalue.group(1))
            finished.score = str(get_evalue.group(2))

        get_identities = patterns.identities_re.search(line)
        if get_identities:
            self.current_blastResult_obj.align_length = get_identities.group(1)
            self.current_blastResult_obj.identities = str(get_identities.group(2))

        get_positives = patterns.positives_re.search(line)
        if get_positives:
            self.current_blastResult_obj.positives = str(get_positives.group(1))

        get_gaps = patterns.gaps_re.search(line)
        if get_gaps:
            self.current_blastResult_obj.gaps = str(get_gaps.group(1))

        get_intervals_query = patterns.intervals_query_re.search(line)
        if get_intervals_query:
            if self.current_blastResult_obj.query_start is None:
                self.current_blastResult_obj.query_start = int(get_intervals_query.group(1))
            # Remember the columns of the aligned segment to cut the
            # match line (the next line) at the same place.
            alignment_start_index = line.index(get_intervals_query.group(2))
            subalignment_length = len(get_intervals_query.group(2))
            capture_matching_line = True
            sbjct_matching = True
            aligned_query.append(get_intervals_query.group(2))
            self.current_blastResult_obj.query_end = int(get_intervals_query.group(3))

        get_intervals_Sbjct = patterns.sbjct_intervals_re.search(line)
        if get_intervals_Sbjct and sbjct_matching:
            if self.current_blastResult_obj.sbjct_start is None:
                self.current_blastResult_obj.sbjct_start = int(get_intervals_Sbjct.group(1))
            self.current_blastResult_obj.sbjct_end = int(get_intervals_Sbjct.group(3))
            aligned_sbjct.append(get_intervals_Sbjct.group(2))
            sbjct_matching = False

    raise StopIteration
class BlastallParserIterator(object):
    """
    Iterator over the hits found in the plain-text output of blastall.

    Every call to next() reads lines from the given file descriptor until
    a hit is complete and returns it as a BlastResult. The exact/similar
    match lists of the hits are only filled when the iterator is created
    with parse_detailed_alignments=True.
    """

    query_re = re.compile(r"Query=\s*(.+)\s*")
    letters_re = re.compile(r"\(\s*([\,\d]+)\s*letters\s*")
    sbjct_re = re.compile(r">([\w\d\_\.\|]+)")
    length_re = re.compile(r"Length \= ([\,\d]+)")
    score_re = re.compile(r"Score\s+=\s+([\.\d]+)\s+bits\s+\((\d+)\),\s+Expect\s+=\s+([\d\.e\-]+)")
    score_option2_re = re.compile(r"Score\s+=\s+([\.\d]+)\s+\(([\.\d]+)\s+bits\)\,\s+Expect\s+=\s+([\d\.e\-]+)")
    identities_re = re.compile(r"Identities\s+=\s+\d+\/(\d+)\s+\((\d+)%\)")
    positives_re = re.compile(r"Positives\s+=\s+\d+\/\d+\s+\((\d+)%\)")
    gaps_re = re.compile(r"Gaps\s+=\s+\d+\/\d+\s+\((\d+)%\)")
    intervals_query_re = re.compile(r"Query:\s+(\d+)\s+(\S+)\s+(\d+)$")
    sbjct_intervals_re = re.compile(r"Sbjct:\s+(\d+)\s+(\S+)\s+(\d+)$")

    def __init__(self, fd_blastall_output, parse_detailed_alignments=False):
        """
        "fd_blastall_output" is an iterable over the lines of the blastall output
        "parse_detailed_alignments" is used to fill the exact and similar
                                    match lists of every returned hit
        """
        self.fd = fd_blastall_output
        # Hit being built; None until the first "Query=" line is read
        self.current_blastResult_obj = None
        self.parse_lines = parse_detailed_alignments

    def __iter__(self):
        return self

    def parse_alignment_line(self, alignment_line_list, aligned_query, aligned_sbjct):
        """
        Fill the exact/similar match lists of the current hit from the
        match line of the alignment and the two aligned sequences.

        Raises ValueError when the three of them do not have the same
        length.
        """
        # The original author flagged this routine as having a bug with gaps.
        alignment_line = "".join(alignment_line_list)
        aligned_query = "".join(aligned_query)
        aligned_sbjct = "".join(aligned_sbjct)

        if not (len(alignment_line) == len(aligned_query) == len(aligned_sbjct)):
            print(aligned_query)
            print(alignment_line)
            print(aligned_sbjct)
            raise ValueError("Alignments must be of the same size")

        hit = self.current_blastResult_obj
        query_gaps = 0
        sbjct_gaps = 0
        columns = zip(alignment_line, aligned_query, aligned_sbjct)
        for x, (value, query_char, sbjct_char) in enumerate(columns):
            # Gaps do not consume a residue, so they shift the position
            # of every later column in that sequence.
            if query_char == "-":
                query_gaps += 1
            if sbjct_char == "-":
                sbjct_gaps += 1
            if value == " ":
                continue
            query_position = x + hit.query_start - query_gaps
            sbjct_position = x + hit.sbjct_start - sbjct_gaps
            if value != "+":
                # A residue letter in the match line is an identity; it
                # also counts as a similarity.
                hit.query_exact_match_list.append(query_position)
                hit.sbjct_exact_match_list.append(sbjct_position)
            hit.query_similar_match_list.append(query_position)
            hit.sbjct_similar_match_list.append(sbjct_position)

    def _close_current_hit(self, alignment_summary, aligned_query, aligned_sbjct):
        """Parse the alignment of the current hit, if requested, and return the hit."""
        # NOTE(review): the indentation was lost in the source under
        # review. The self-hit check is assumed to guard only the
        # alignment parsing (self hits are still returned); otherwise the
        # parser state would not advance after a self hit -- confirm.
        hit = self.current_blastResult_obj
        if self.parse_lines and hit.sequenceID_A != hit.sequenceID_B:
            self.parse_alignment_line(alignment_summary, aligned_query, aligned_sbjct)
        return hit

    def next(self):
        """
        Read lines from self.fd until a hit is complete and return it.

        A hit is complete when the next hit of the same subject, the next
        subject or the end of the query (the "Matrix" line) is found. The
        information already read for the following hit is kept in
        self.current_blastResult_obj for the next call.

        Raises StopIteration when self.fd is exhausted.
        """
        patterns = BlastallParserIterator

        # Temporary variables to store information to read exact alignment
        alignment_start_index = None
        subalignment_length = 0
        capture_matching_line = False
        sbjct_matching = False
        alignment_summary = []
        aligned_query = []
        aligned_sbjct = []

        for line in self.fd:

            if capture_matching_line:
                # The line right after a "Query:" line is the match line.
                # Drop the line break and pad with blanks so that it
                # always spans exactly the columns of the query segment.
                match_line = line.rstrip("\r\n")[alignment_start_index:alignment_start_index + subalignment_length]
                alignment_summary.append(match_line.ljust(subalignment_length))
                capture_matching_line = False
                continue

            m = patterns.query_re.search(line)
            if m:
                self.current_blastResult_obj = BlastResult(method="blastall", mode="F")
                self.current_blastResult_obj.sequenceID_A = m.group(1)

            if self.current_blastResult_obj is None:
                # Nothing can be stored before the first "Query=" line.
                continue

            m = patterns.letters_re.search(line)
            if m:
                self.current_blastResult_obj.query_length = int(m.group(1).replace(",", ''))

            sequenceID_B_search = patterns.sbjct_re.search(line)
            if sequenceID_B_search:
                # New sequenceID_B
                if self.current_blastResult_obj.e_value is None:
                    self.current_blastResult_obj.sequenceID_B = sequenceID_B_search.group(1)
                else:
                    finished = self._close_current_hit(alignment_summary, aligned_query, aligned_sbjct)
                    upcoming = BlastResult(method="blastall", mode="F")
                    self.current_blastResult_obj = upcoming
                    upcoming.sequenceID_A = finished.sequenceID_A
                    upcoming.sequenceID_B = sequenceID_B_search.group(1)
                    upcoming.query_length = finished.query_length
                    return finished

            m = patterns.length_re.search(line)
            if m:
                self.current_blastResult_obj.sbjct_length = int(m.group(1).replace(",", ''))

            if line.startswith("Matrix"):
                # Query finished
                if self.current_blastResult_obj.e_value is not None:
                    finished = self._close_current_hit(alignment_summary, aligned_query, aligned_sbjct)
                    self.current_blastResult_obj = BlastResult(method="blastall", mode="F")
                    return finished
                continue

            get_evalue = patterns.score_re.search(line)
            if not get_evalue:
                get_evalue = patterns.score_option2_re.search(line)
            if get_evalue:
                # New hit found
                if self.current_blastResult_obj.e_value is not None:
                    # There was another hit for the same subject before
                    finished = self._close_current_hit(alignment_summary, aligned_query, aligned_sbjct)
                    upcoming = BlastResult(method="blastall", mode="F")
                    self.current_blastResult_obj = upcoming
                    upcoming.sequenceID_A = finished.sequenceID_A
                    upcoming.query_length = finished.query_length
                    upcoming.set_evalue(get_evalue.group(3))
                    upcoming.score_bits = str(get_evalue.group(1))
                    upcoming.score = str(get_evalue.group(2))
                    upcoming.sequenceID_B = finished.sequenceID_B
                    upcoming.sbjct_length = finished.sbjct_length
                    return finished
                self.current_blastResult_obj.set_evalue(get_evalue.group(3))
                self.current_blastResult_obj.score_bits = str(get_evalue.group(1))
                self.current_blastResult_obj.score = str(get_evalue.group(2))

            get_identities = patterns.identities_re.search(line)
            if get_identities:
                self.current_blastResult_obj.align_length = get_identities.group(1)
                self.current_blastResult_obj.identities = str(get_identities.group(2))

            get_positives = patterns.positives_re.search(line)
            if get_positives:
                self.current_blastResult_obj.positives = str(get_positives.group(1))

            get_gaps = patterns.gaps_re.search(line)
            if get_gaps:
                self.current_blastResult_obj.gaps = str(get_gaps.group(1))

            get_intervals_query = patterns.intervals_query_re.search(line)
            if get_intervals_query:
                if self.current_blastResult_obj.query_start is None:
                    self.current_blastResult_obj.query_start = int(get_intervals_query.group(1))
                # Remember the columns of the aligned segment to cut the
                # match line (the next line) at the same place.
                alignment_start_index = line.index(get_intervals_query.group(2))
                subalignment_length = len(get_intervals_query.group(2))
                capture_matching_line = True
                sbjct_matching = True
                aligned_query.append(get_intervals_query.group(2))
                self.current_blastResult_obj.query_end = int(get_intervals_query.group(3))

            get_intervals_Sbjct = patterns.sbjct_intervals_re.search(line)
            if get_intervals_Sbjct and sbjct_matching:
                if self.current_blastResult_obj.sbjct_start is None:
                    self.current_blastResult_obj.sbjct_start = int(get_intervals_Sbjct.group(1))
                self.current_blastResult_obj.sbjct_end = int(get_intervals_Sbjct.group(3))
                aligned_sbjct.append(get_intervals_Sbjct.group(2))
                sbjct_matching = False

        raise StopIteration

    # Python 3 name of the iterator protocol method
    __next__ = next
def parse_bl2seq_output(sequenceID_A, sequenceID_B, bl2seq_output=None, fd_output_file=None):
    """
    Parse the plain-text output of bl2seq and write the hits found to a file.

    "sequenceID_A" and "sequenceID_B" are the ids stored in every result
    "bl2seq_output" is a string with the complete output of bl2seq; when it
                    is None nothing is done
    "fd_output_file" is the file where the hits with an e-value lower than
                     0.1 are written (sys.stdout by default)

    Returns None.
    """
    if bl2seq_output is None:
        return

    if fd_output_file is None:
        fd_output_file = sys.stdout

    score_re = re.compile(r"Score\s+=\s+([\.\d]+)\s+bits\s+\((\d+)\),\s+Expect\s+=\s+([\d\.e\-]+)")
    identities_re = re.compile(r"Identities\s+=\s+\d+\/(\d+)\s+\((\d+)%\)")
    positives_re = re.compile(r"Positives\s+=\s+\d+\/\d+\s+\((\d+)%\)")
    gaps_re = re.compile(r"Gaps\s+=\s+\d+\/\d+\s+\((\d+)%\)")
    intervals_query_re = re.compile(r"Query:\s+(\d+)\s+.+\s(\d+)$")
    sbjct_intervals_re = re.compile(r"Sbjct:\s+(\d+)\s+.+\s(\d+)$")
    letters_re = re.compile(r"\(\s*([\d\,]+)\s*letters\s*\)")
    length_re = re.compile(r"Length\s+\=\s+([\d\,]+)")

    def close_hit(hit):
        """Write the finished hit if it is significant and reset the object."""
        # NOTE(review): the indentation was lost in the source under
        # review; reset() is assumed to be called for every finished hit,
        # written or not -- confirm.
        if hit.e_value is not None:
            if hit.e_value < 0.1:
                fd_output_file.write(str(hit))
            hit.reset()

    blastResult_obj = BlastResult(method="bl2seq", mode="F")
    blastResult_obj.sequenceID_A = sequenceID_A
    blastResult_obj.sequenceID_B = sequenceID_B

    for line in bl2seq_output.split("\n"):

        if "Lambda" in line:
            # Useful information is finished: close the last result
            close_hit(blastResult_obj)
            continue

        get_evalue = score_re.search(line)
        if get_evalue:
            # New hit found: close the previous one, if any
            close_hit(blastResult_obj)
            blastResult_obj.set_evalue(get_evalue.group(3))
            blastResult_obj.score = get_evalue.group(2)
            blastResult_obj.score_bits = get_evalue.group(1)

        m = letters_re.search(line)
        if m:
            blastResult_obj.query_length = int(m.group(1).replace(',', ''))

        m = length_re.search(line)
        if m:
            blastResult_obj.sbjct_length = int(m.group(1).replace(',', ''))

        get_identities = identities_re.search(line)
        if get_identities:
            blastResult_obj.align_length = get_identities.group(1)
            blastResult_obj.identities = get_identities.group(2)

        get_positives = positives_re.search(line)
        if get_positives:
            blastResult_obj.positives = get_positives.group(1)

        get_gaps = gaps_re.search(line)
        if get_gaps:
            blastResult_obj.gaps = get_gaps.group(1)

        get_intervals_query = intervals_query_re.search(line)
        if get_intervals_query:
            # The start is taken from the first segment of the alignment
            # and the end from the last one.
            if blastResult_obj.query_start is None:
                blastResult_obj.query_start = get_intervals_query.group(1)
            blastResult_obj.query_end = get_intervals_query.group(2)

        get_intervals_Sbjct = sbjct_intervals_re.search(line)
        if get_intervals_Sbjct:
            if blastResult_obj.sbjct_start is None:
                blastResult_obj.sbjct_start = get_intervals_Sbjct.group(1)
            blastResult_obj.sbjct_end = get_intervals_Sbjct.group(2)