def parse_blastall_output(fd_blastall_output, temporalOutputFile_fd=None, return_only_ids=False, limit_to_sequenceIDs=None):
    """
    Parse the text report written by blastall and return the hits it contains.

    "fd_blastall_output" is the output fd of the blast process (input for this method);
                         it is iterated line by line
    "temporalOutputFile_fd" is a file where all the input of fd_blastall_output is saved
                         (every line read is written to it when it is given)
    "return_only_ids" is used to store only ids (the sequenceID_B of each hit), not
                         complete blast results
    "limit_to_sequenceIDs" is used to filter blast parsing to only those sequenceids.
                         None or an empty collection disables the filter. The filter
                         is only applied when "return_only_ids" is False

    Hits in which sequenceID_A equals sequenceID_B are never reported.

    Returns a list with one element per reported hit: the subject id when
    "return_only_ids" is True, a BlastResult object otherwise.

    Raises ValueError when the query, match and subject alignment strings collected
    for a hit do not have the same length.
    """

    blast_results = []

    query_re = re.compile(r"Query=\s*(.+)\s*")
    # NOTE(review): the original author left a remark here that this pattern "was
    # incorrect" at some point and wondered whether results were affected.
    letters_re = re.compile(r"\(\s*([\,\d]+)\s*letters\s*\)")
    sbjct_re = re.compile(r"^>([\w\d\_\.\|]+)")
    length_re = re.compile(r"Length \= ([\,\d]+)")
    # "Expect(n) = ..." is accepted as well as "Expect = ...": without the optional
    # "(n)" those score lines were not seen as a new hit and their alignment was
    # appended to the previous one.
    score_re = re.compile(r"Score\s+=\s+([\.\d]+)\s+bits\s+\((\d+)\),\s+Expect(?:\(\d+\))?\s+=\s+([\d\.e\-]+)")
    identities_re = re.compile(r"Identities\s+=\s+\d+\/(\d+)\s+\((\d+)%\)")
    positives_re = re.compile(r"Positives\s+=\s+\d+\/\d+\s+\((\d+)%\)")
    gaps_re = re.compile(r"Gaps\s+=\s+\d+\/\d+\s+\((\d+)%\)")
    intervals_query_re = re.compile(r"Query:\s+(\d+)\s+(\S+)\s+(\d+)$")
    sbjct_intervals_re = re.compile(r"Sbjct:\s+(\d+)\s+(\S+)\s+(\d+)$")

    # Temporal variables to store information to read exact alignment
    alignment_start_index = None    # column at which the aligned segment starts
    subalignment_length = 0         # length of the aligned segment of the last "Query:" line
    capture_matching_line = False   # True when the next line read is the match line
    sbjct_matching = False          # True when a "Sbjct:" line is expected
    alignment_summary = []          # match-line segments of the current hit
    aligned_query = []              # query segments of the current hit
    aligned_sbjct = []              # subject segments of the current hit

    blastResult_obj = BlastResult(method="blastall",mode="F")

    def parse_alignment_line(alignment_line_list, blastResult_obj, aligned_query, aligned_sbjct):
        """
        Fill the exact and similar match position lists of "blastResult_obj".

        The three lists of segments are joined and walked column by column. A blank
        in the match line is skipped, a "+" is recorded as a similar match, and any
        other character is recorded as both an exact and a similar match. Recorded
        positions are the column index shifted by the start of the hit, minus the
        number of "-" characters seen so far in the corresponding sequence.
        """
        # NOTE(review): the original author flagged an unresolved bug with gaps in
        # this routine; the gap offsets below are kept exactly as they were.
        alignment_line = "".join(alignment_line_list)
        aligned_query = "".join(aligned_query)
        aligned_sbjct = "".join(aligned_sbjct)

        if not (len(alignment_line) == len(aligned_query) == len(aligned_sbjct)):
            # The three strings go in the message (they used to be printed to stdout)
            raise ValueError("Alignments must be of the same size:\n%s\n%s\n%s"
                             % (aligned_query, alignment_line, aligned_sbjct))

        query_gaps = 0
        sbjct_gaps = 0

        for x, (value, query_char, sbjct_char) in enumerate(zip(alignment_line, aligned_query, aligned_sbjct)):
            if query_char == "-":
                query_gaps += 1
            if sbjct_char == "-":
                sbjct_gaps += 1
            if value == " ":
                continue
            query_position = x + blastResult_obj.query_start - query_gaps
            sbjct_position = x + blastResult_obj.sbjct_start - sbjct_gaps
            if value != "+":
                blastResult_obj.query_exact_match_list.append(query_position)
                blastResult_obj.sbjct_exact_match_list.append(sbjct_position)
            blastResult_obj.query_similar_match_list.append(query_position)
            blastResult_obj.sbjct_similar_match_list.append(sbjct_position)

    def store_hit(hit, alignment_line_list, aligned_query, aligned_sbjct, copy_hit=True):
        """
        Append "hit" to blast_results if it is complete and is not a self hit.

        "copy_hit" must be True whenever the caller keeps reusing "hit" afterwards.
        """
        if hit.e_value is None or hit.sequenceID_A == hit.sequenceID_B:
            return
        parse_alignment_line(alignment_line_list, hit, aligned_query, aligned_sbjct)
        if return_only_ids:
            blast_results.append(hit.sequenceID_B)
        elif not limit_to_sequenceIDs or hit.sequenceID_B in limit_to_sequenceIDs:
            if copy_hit:
                blast_results.append(copy.copy(hit))
            else:
                blast_results.append(hit)

    for line in fd_blastall_output:

        if temporalOutputFile_fd:
            temporalOutputFile_fd.write(line)

        if capture_matching_line:
            # The line that follows a "Query:" line is the match line. It is cut at
            # the columns of the aligned segment; the line terminator is removed and
            # the slice is padded with blanks so that a match line with trimmed
            # trailing whitespace still has the same length as the query segment.
            matching = line.rstrip("\r\n")[alignment_start_index:alignment_start_index+subalignment_length]
            alignment_summary.append(matching.ljust(subalignment_length))
            capture_matching_line = False
            continue

        m = query_re.search(line)
        if m:
            blastResult_obj.sequenceID_A = m.group(1)

        m = letters_re.search(line)
        if m:
            blastResult_obj.query_length = int(m.group(1).replace(",",''))

        sequenceID_B_search = sbjct_re.search(line)
        if sequenceID_B_search:
            # New sequenceID_B: store the hit read for the previous one, if any
            store_hit(blastResult_obj, alignment_summary, aligned_query, aligned_sbjct)
            alignment_start_index = None
            alignment_summary = []
            aligned_query = []
            aligned_sbjct = []
            blastResult_obj.reset()
            blastResult_obj.sequenceID_B = sequenceID_B_search.group(1)

        m = length_re.search(line)
        if m:
            blastResult_obj.sbjct_length = int(m.group(1).replace(",",''))

        if line.startswith("Matrix"):
            # Query finished. The object itself is stored (no copy) because a new
            # one is created right after.
            store_hit(blastResult_obj, alignment_summary, aligned_query, aligned_sbjct, copy_hit=False)
            alignment_start_index = None
            alignment_summary = []
            aligned_query = []
            aligned_sbjct = []
            blastResult_obj = BlastResult(method="blastall",mode="F")
        else:
            get_evalue = score_re.search(line)
            if get_evalue:
                # New hit found
                if blastResult_obj.e_value is not None:
                    ## There was another hit before for the same sequenceID_B
                    store_hit(blastResult_obj, alignment_summary, aligned_query, aligned_sbjct)
                    blastResult_obj.reset()
                    alignment_start_index = None
                    alignment_summary = []
                    aligned_query = []
                    aligned_sbjct = []
                blastResult_obj.set_evalue(get_evalue.group(3))
                blastResult_obj.score_bits = str(get_evalue.group(1))
                blastResult_obj.score = str(get_evalue.group(2))

            get_identities = identities_re.search(line)
            if get_identities:
                blastResult_obj.align_length = get_identities.group(1)
                blastResult_obj.identities = str(get_identities.group(2))

            get_positives = positives_re.search(line)
            if get_positives:
                blastResult_obj.positives = str(get_positives.group(1))

            get_gaps = gaps_re.search(line)
            if get_gaps:
                blastResult_obj.gaps = str(get_gaps.group(1))

            get_intervals_query = intervals_query_re.search(line)
            if get_intervals_query:
                if blastResult_obj.query_start is None:
                    blastResult_obj.query_start = int(get_intervals_query.group(1))
                # Column of the aligned segment, taken from the match itself:
                # searching the segment text in the line could find an earlier
                # occurrence (for example inside the "Query:" label).
                alignment_start_index = get_intervals_query.start(2)
                subalignment_length = len(get_intervals_query.group(2))
                capture_matching_line = True
                sbjct_matching = True
                aligned_query.append(get_intervals_query.group(2))
                blastResult_obj.query_end = int(get_intervals_query.group(3))

            get_intervals_Sbjct = sbjct_intervals_re.search(line)
            if get_intervals_Sbjct and sbjct_matching:
                if blastResult_obj.sbjct_start is None:
                    blastResult_obj.sbjct_start = int(get_intervals_Sbjct.group(1))
                blastResult_obj.sbjct_end = int(get_intervals_Sbjct.group(3))
                aligned_sbjct.append(get_intervals_Sbjct.group(2))
                sbjct_matching = False

    return blast_results
def parse_bl2seq_output(sequenceID_A, sequenceID_B, bl2seq_output=None, fd_output_file=None, max_evalue=0.1):
    """
    Parse the text report of a bl2seq run and write its significant hits.

    "sequenceID_A" and "sequenceID_B" are the ids stored in every hit
    "bl2seq_output" is the whole bl2seq report as a single string. When it is None
                    nothing is done
    "fd_output_file" is the file object where hits are written (sys.stdout by default)
    "max_evalue" is the significance cutoff: a hit is written only when its e-value
                    is lower than this value (0.1 by default)

    Each hit is written as str(BlastResult). A hit is written when the next
    "Score = ..." line is found or when a line containing "Lambda" is reached.

    Returns None.
    """

    if bl2seq_output is None:
        return

    if fd_output_file is None:
        fd_output_file = sys.stdout

    # "Expect(n) = ..." is accepted as well as "Expect = ...": without the optional
    # "(n)" those score lines were not recognised as the start of a new hit.
    score_re = re.compile(r"Score\s+=\s+([\.\d]+)\s+bits\s+\((\d+)\),\s+Expect(?:\(\d+\))?\s+=\s+([\d\.e\-]+)")
    identities_re = re.compile(r"Identities\s+=\s+\d+\/(\d+)\s+\((\d+)%\)")
    positives_re = re.compile(r"Positives\s+=\s+\d+\/\d+\s+\((\d+)%\)")
    gaps_re = re.compile(r"Gaps\s+=\s+\d+\/\d+\s+\((\d+)%\)")
    intervals_query_re = re.compile(r"Query:\s+(\d+)\s+.+\s(\d+)$")
    sbjct_intervals_re = re.compile(r"Sbjct:\s+(\d+)\s+.+\s(\d+)$")
    letters_re = re.compile(r"\(\s*([\d\,]+)\s*letters\s*\)")
    length_re = re.compile(r"Length\s+\=\s+([\d\,]+)")

    def write_if_significant(hit):
        """ Write "hit" when it has an e-value and it is below the cutoff """
        if hit.e_value is not None and hit.e_value < max_evalue:
            fd_output_file.write(str(hit))

    blastResult_obj = BlastResult(method="bl2seq",mode="F")
    blastResult_obj.sequenceID_A = sequenceID_A
    blastResult_obj.sequenceID_B = sequenceID_B

    # Split the output in lines
    for line in bl2seq_output.split("\n"):

        if "Lambda" in line:
            # Useful information is finished
            # Appending the last result
            write_if_significant(blastResult_obj)
            blastResult_obj.reset()
            continue

        get_evalue = score_re.search(line)
        if get_evalue:
            # New hit found: the previous one, if any, is complete
            if blastResult_obj.e_value is not None:
                write_if_significant(blastResult_obj)
                blastResult_obj.reset()
            blastResult_obj.set_evalue(get_evalue.group(3))
            blastResult_obj.score = get_evalue.group(2)
            blastResult_obj.score_bits = get_evalue.group(1)

        m = letters_re.search(line)
        if m:
            blastResult_obj.query_length = int(m.group(1).replace(',',''))

        m = length_re.search(line)
        if m:
            blastResult_obj.sbjct_length = int(m.group(1).replace(',',''))

        get_identities = identities_re.search(line)
        if get_identities:
            blastResult_obj.align_length = get_identities.group(1)
            blastResult_obj.identities = get_identities.group(2)

        get_positives = positives_re.search(line)
        if get_positives:
            blastResult_obj.positives = get_positives.group(1)

        get_gaps = gaps_re.search(line)
        if get_gaps:
            blastResult_obj.gaps = get_gaps.group(1)

        get_intervals_query = intervals_query_re.search(line)
        if get_intervals_query:
            # The start is taken from the first alignment row only; the end is
            # updated with every row.
            if blastResult_obj.query_start is None:
                blastResult_obj.query_start = get_intervals_query.group(1)
            blastResult_obj.query_end = get_intervals_query.group(2)

        get_intervals_Sbjct = sbjct_intervals_re.search(line)
        if get_intervals_Sbjct:
            if blastResult_obj.sbjct_start is None:
                blastResult_obj.sbjct_start = get_intervals_Sbjct.group(1)
            blastResult_obj.sbjct_end = get_intervals_Sbjct.group(2)