示例#1
0
def parse_blastall_output(fd_blastall_output, temporalOutputFile_fd=None, return_only_ids=False, limit_to_sequenceIDs=None):
    """
    Parse the text report written by "blastall" and return the hits found in it.

    "fd_blastall_output" is the output fd of the blast process (input for this method).
    It is only iterated line by line.

    "temporalOutputFile_fd" is a file where all the input of fd_blastall_output is saved
    (every line read is written to it unchanged). Nothing is saved if it is None.

    "return_only_ids" is used to store only the subject ids, not complete blast results

    "limit_to_sequenceIDs" is used to filter blast parsing to only those sequenceids.
    None or an empty collection disables the filter. The filter is not applied
    when "return_only_ids" is set.

    Returns a list with one entry for every "Score = ..." block whose query id
    differs from its subject id: the subject id if "return_only_ids" is set,
    a BlastResult object otherwise.

    Raises ValueError if the query, match and subject rows collected for one
    alignment do not add up to the same length.
    """

    blast_results = []

    # NOTE: the group captures the rest of the "Query=" line (up to the line ending)
    query_re = re.compile(r"Query=\s*(.+)\s*")
    letters_re = re.compile(r"\(\s*([\,\d]+)\s*letters\s*\)")
    sbjct_re = re.compile(r"^>([\w\d\_\.\|]+)")
    length_re = re.compile(r"Length \= ([\,\d]+)")
    score_re = re.compile(r"Score\s+=\s+([\.\d]+)\s+bits\s+\((\d+)\),\s+Expect\s+=\s+([\d\.e\-]+)")
    identities_re = re.compile(r"Identities\s+=\s+\d+\/(\d+)\s+\((\d+)%\)")
    positives_re = re.compile(r"Positives\s+=\s+\d+\/\d+\s+\((\d+)%\)")
    gaps_re = re.compile(r"Gaps\s+=\s+\d+\/\d+\s+\((\d+)%\)")
    intervals_query_re = re.compile(r"Query:\s+(\d+)\s+(\S+)\s+(\d+)$")
    sbjct_intervals_re = re.compile(r"Sbjct:\s+(\d+)\s+(\S+)\s+(\d+)$")

    # Temporal variables to store information to read exact alignment.
    # All of them are initialised here so that a "Query:" row found before
    # any ">" or "Score" line cannot hit an unbound name.
    alignment_start_index = None
    subalignment_length = 0
    capture_matching_line = False
    sbjct_matching = False
    alignment_summary = []
    aligned_query = []
    aligned_sbjct = []

    blastResult_obj = BlastResult(method="blastall", mode="F")

    def parse_alignment_line(alignment_line_list, blastResult_obj, aligned_query, aligned_sbjct):
        """
        Fill the exact/similar match lists of "blastResult_obj" from the
        collected alignment rows (each argument is a list of row fragments).

        Positions are the column index plus the start coordinate, minus the
        number of gap characters ("-") seen so far in the same sequence.
        """

        alignment_line = "".join(alignment_line_list)
        aligned_query = "".join(aligned_query)
        aligned_sbjct = "".join(aligned_sbjct)

        if not (len(alignment_line) == len(aligned_query) == len(aligned_sbjct)):
            # Single-argument print(...) behaves the same as the print statement
            print(aligned_query)
            print(alignment_line)
            print(aligned_sbjct)
            raise ValueError("Alignments must be of the same size")

        query_gaps = 0
        sbjct_gaps = 0

        for column, symbol in enumerate(alignment_line):
            if aligned_query[column] == "-":
                query_gaps += 1
            if aligned_sbjct[column] == "-":
                sbjct_gaps += 1
            if symbol == " ":
                # Blank in the match row: neither identical nor similar
                continue

            query_position = column + blastResult_obj.query_start - query_gaps
            sbjct_position = column + blastResult_obj.sbjct_start - sbjct_gaps

            # Anything but "+" is an identity; identities are stored in the
            # similar lists as well
            if symbol != "+":
                blastResult_obj.query_exact_match_list.append(query_position)
                blastResult_obj.sbjct_exact_match_list.append(sbjct_position)
            blastResult_obj.query_similar_match_list.append(query_position)
            blastResult_obj.sbjct_similar_match_list.append(sbjct_position)

    def store_hit(hit, match_rows, query_rows, sbjct_rows, duplicate):
        """
        Append the pending hit to "blast_results" if it has an e-value and
        its two ids differ. "duplicate" stores a shallow copy, needed when
        the same object keeps being reused for the following hit.
        """
        if hit.e_value is None:
            # No "Score = ..." line seen since the last reset: nothing pending
            return
        if hit.sequenceID_A != hit.sequenceID_B:
            parse_alignment_line(match_rows, hit, query_rows, sbjct_rows)
            if return_only_ids:
                blast_results.append(hit.sequenceID_B)
            elif not limit_to_sequenceIDs or hit.sequenceID_B in limit_to_sequenceIDs:
                if duplicate:
                    blast_results.append(copy.copy(hit))
                else:
                    blast_results.append(hit)

    for line in fd_blastall_output:

        if temporalOutputFile_fd:
            temporalOutputFile_fd.write(line)

        if capture_matching_line:
            # The row right after a "Query:" row holds the match symbols.
            # Drop the line ending and pad with blanks so that a row whose
            # trailing blanks were trimmed still lines up with the residues
            # (and the newline is never taken for a match symbol).
            match_row = line.rstrip("\r\n")[alignment_start_index:alignment_start_index + subalignment_length]
            alignment_summary.append(match_row.ljust(subalignment_length))
            capture_matching_line = False
            continue

        m = query_re.search(line)
        if m:
            blastResult_obj.sequenceID_A = m.group(1)

        m = letters_re.search(line)
        if m:
            blastResult_obj.query_length = int(m.group(1).replace(",", ""))

        sequenceID_B_search = sbjct_re.search(line)
        if sequenceID_B_search:
            # New sequenceID_B: store the hit of the previous subject first
            store_hit(blastResult_obj, alignment_summary, aligned_query, aligned_sbjct, True)

            alignment_summary = []
            aligned_query = []
            aligned_sbjct = []

            blastResult_obj.reset()
            blastResult_obj.sequenceID_B = sequenceID_B_search.group(1)

        m = length_re.search(line)
        if m:
            blastResult_obj.sbjct_length = int(m.group(1).replace(",", ""))

        if line.startswith("Matrix"):
            # Query finished. The object is replaced right below, so it can
            # be stored without copying it.
            store_hit(blastResult_obj, alignment_summary, aligned_query, aligned_sbjct, False)

            alignment_summary = []
            aligned_query = []
            aligned_sbjct = []

            blastResult_obj = BlastResult(method="blastall", mode="F")
            continue

        get_evalue = score_re.search(line)
        if get_evalue:
            # New hit found: store the previous one, if there was any
            store_hit(blastResult_obj, alignment_summary, aligned_query, aligned_sbjct, True)

            blastResult_obj.reset()

            alignment_summary = []
            aligned_query = []
            aligned_sbjct = []

            blastResult_obj.set_evalue(get_evalue.group(3))
            blastResult_obj.score_bits = str(get_evalue.group(1))
            blastResult_obj.score = str(get_evalue.group(2))

        get_identities = identities_re.search(line)
        if get_identities:
            blastResult_obj.align_length = get_identities.group(1)
            blastResult_obj.identities = str(get_identities.group(2))

        get_positives = positives_re.search(line)
        if get_positives:
            blastResult_obj.positives = str(get_positives.group(1))

        get_gaps = gaps_re.search(line)
        if get_gaps:
            blastResult_obj.gaps = str(get_gaps.group(1))

        get_intervals_query = intervals_query_re.search(line)
        if get_intervals_query:
            if blastResult_obj.query_start is None:
                blastResult_obj.query_start = int(get_intervals_query.group(1))
            # Column where the residues start, taken from the match itself.
            # Searching the residue text in the line could find it earlier,
            # e.g. inside the "Query:" label.
            alignment_start_index = get_intervals_query.start(2)
            subalignment_length = len(get_intervals_query.group(2))
            capture_matching_line = True
            sbjct_matching = True
            aligned_query.append(get_intervals_query.group(2))
            blastResult_obj.query_end = int(get_intervals_query.group(3))

        get_intervals_Sbjct = sbjct_intervals_re.search(line)
        # A "Sbjct:" row is only used when it follows a "Query:" row
        if get_intervals_Sbjct and sbjct_matching:
            if blastResult_obj.sbjct_start is None:
                blastResult_obj.sbjct_start = int(get_intervals_Sbjct.group(1))
            blastResult_obj.sbjct_end = int(get_intervals_Sbjct.group(3))
            aligned_sbjct.append(get_intervals_Sbjct.group(2))
            sbjct_matching = False

    return blast_results
示例#2
0
def parse_bl2seq_output(sequenceID_A, sequenceID_B, bl2seq_output=None, fd_output_file=None, evalue_threshold=0.1):
    """
    Parse the text report of a "bl2seq" run and write the accepted hits.

    "sequenceID_A" and "sequenceID_B" are stored in the result object as
    its sequenceID_A and sequenceID_B

    "bl2seq_output" is the complete report as a single string. Nothing is
    done if it is None.

    "fd_output_file" is the file where str() of every accepted hit is
    written (sys.stdout if it is None)

    "evalue_threshold" only hits with an e-value below it are written
    (0.1 by default)

    Returns None.
    """

    score_re = re.compile(r"Score\s+=\s+([\.\d]+)\s+bits\s+\((\d+)\),\s+Expect\s+=\s+([\d\.e\-]+)")
    identities_re = re.compile(r"Identities\s+=\s+\d+\/(\d+)\s+\((\d+)%\)")
    positives_re = re.compile(r"Positives\s+=\s+\d+\/\d+\s+\((\d+)%\)")
    gaps_re = re.compile(r"Gaps\s+=\s+\d+\/\d+\s+\((\d+)%\)")
    intervals_query_re = re.compile(r"Query:\s+(\d+)\s+.+\s(\d+)$")
    sbjct_intervals_re = re.compile(r"Sbjct:\s+(\d+)\s+.+\s(\d+)$")
    letters_re = re.compile(r"\(\s*([\d\,]+)\s*letters\s*\)")
    length_re = re.compile(r"Length\s+\=\s+([\d\,]+)")

    if bl2seq_output is None:
        return

    if fd_output_file is None:
        fd_output_file = sys.stdout

    def flush_hit(hit):
        """
        Write the pending hit if it passes the e-value cut and clear it.
        """
        if hit.e_value is None:
            # No "Score = ..." line seen yet: nothing pending
            return
        if hit.e_value < evalue_threshold:
            fd_output_file.write(str(hit))
        # Reset rejected hits too. The start positions are only assigned
        # while they are None, so those of a rejected hit would otherwise
        # be kept for the next hit.
        hit.reset()

    blastResult_obj = BlastResult(method="bl2seq", mode="F")
    blastResult_obj.sequenceID_A = sequenceID_A
    blastResult_obj.sequenceID_B = sequenceID_B

    # Split the output in lines
    for line in bl2seq_output.split("\n"):

        if "Lambda" in line:
            # Useful information is finished: handle the last result
            flush_hit(blastResult_obj)
            continue

        get_evalue = score_re.search(line)
        if get_evalue:
            # New hit found: handle the previous one before overwriting it
            flush_hit(blastResult_obj)

            blastResult_obj.set_evalue(get_evalue.group(3))
            blastResult_obj.score = get_evalue.group(2)
            blastResult_obj.score_bits = get_evalue.group(1)

        m = letters_re.search(line)
        if m:
            blastResult_obj.query_length = int(m.group(1).replace(',', ''))

        m = length_re.search(line)
        if m:
            blastResult_obj.sbjct_length = int(m.group(1).replace(',', ''))

        get_identities = identities_re.search(line)
        if get_identities:
            blastResult_obj.align_length = get_identities.group(1)
            blastResult_obj.identities = get_identities.group(2)

        get_positives = positives_re.search(line)
        if get_positives:
            blastResult_obj.positives = get_positives.group(1)

        get_gaps = gaps_re.search(line)
        if get_gaps:
            blastResult_obj.gaps = get_gaps.group(1)

        get_intervals_query = intervals_query_re.search(line)
        if get_intervals_query:
            # The first "Query:" row of a hit gives the start, the last one the end
            if blastResult_obj.query_start is None:
                blastResult_obj.query_start = get_intervals_query.group(1)
            blastResult_obj.query_end = get_intervals_query.group(2)

        get_intervals_Sbjct = sbjct_intervals_re.search(line)
        if get_intervals_Sbjct:
            if blastResult_obj.sbjct_start is None:
                blastResult_obj.sbjct_start = get_intervals_Sbjct.group(1)
            blastResult_obj.sbjct_end = get_intervals_Sbjct.group(2)