示例#1
0
class BlastallParserIterator(object):
    """
    Iterate over the hits contained in the text output of blastall.

    Every call to next() reads lines from the line iterable given to the
    constructor until one complete hit has been read, and returns the record
    describing it.  Hits whose query identifier equals their subject
    identifier (self-hits) are skipped.
    """

    query_re = re.compile(r"Query=\s*(.+)\s*")
    letters_re = re.compile(r"\(\s*([\,\d]+)\s*letters\s*")
    sbjct_re = re.compile(r">([\w\d\_\.\|]+)")
    length_re = re.compile(r"Length \= ([\,\d]+)")
    # "Score = <bits> bits (<raw score>), Expect = <e-value>"
    score_re = re.compile(r"Score\s+=\s+([\.\d]+)\s+bits\s+\((\d+)\),\s+Expect\s+=\s+([\d\.e\-]+)")
    # "Score = <raw score> (<bits> bits), Expect = <e-value>"
    score_option2_re = re.compile(r"Score\s+=\s+([\.\d]+)\s+\(([\.\d]+)\s+bits\)\,\s+Expect\s+=\s+([\d\.e\-]+)")
    identities_re = re.compile(r"Identities\s+=\s+\d+\/(\d+)\s+\((\d+)%\)")
    positives_re = re.compile(r"Positives\s+=\s+\d+\/\d+\s+\((\d+)%\)")
    gaps_re = re.compile(r"Gaps\s+=\s+\d+\/\d+\s+\((\d+)%\)")
    intervals_query_re = re.compile(r"Query:\s+(\d+)\s+(\S+)\s+(\d+)$")
    sbjct_intervals_re = re.compile(r"Sbjct:\s+(\d+)\s+(\S+)\s+(\d+)$")

    def __init__(self, fd_blastall_output, parse_detailed_alignments=False):
        """
        "fd_blastall_output" is an iterable yielding the lines of the blastall
        output (typically an open file)

        "parse_detailed_alignments" enables the column by column analysis of
        the alignments (see parse_alignment_line)
        """

        self.fd = fd_blastall_output
        self.current_blastResult_obj = None
        self.parse_lines = parse_detailed_alignments

    def __iter__(self):
        return self

    def _new_result(self):
        """
        Build the empty record used to store one hit.
        """
        return BlastResult(method="blastall", mode="F")

    @classmethod
    def _match_score(cls, line):
        """
        Return (e_value, score_bits, score) as strings when "line" is a
        "Score = ..." line in any of the two supported layouts, else None.
        """
        found = cls.score_re.search(line)
        if found:
            return found.group(3), found.group(1), found.group(2)

        found = cls.score_option2_re.search(line)
        if found:
            # In this layout the raw score comes first and the bit score is
            # the value between parentheses, so the groups are swapped
            return found.group(3), found.group(2), found.group(1)

        return None

    @staticmethod
    def _midline_segment(line, start, length):
        """
        Return the "length" columns of the match line starting at "start".

        The line terminator is never part of the result, and the segment is
        padded with blanks when trailing spaces were stripped from the line,
        so it always has the same width as the aligned sequences.
        """
        return line.rstrip("\r\n")[start:start + length].ljust(length)

    def _finish_hit(self, alignment_summary, aligned_query, aligned_sbjct):
        """
        Close the hit stored in the current record.

        Returns the record when it has to be reported, or None when it is a
        self-hit.  Must only be called when the record holds a hit.
        """
        finished = self.current_blastResult_obj
        if finished.sequenceID_A == finished.sequenceID_B:
            return None
        if self.parse_lines:
            self.parse_alignment_line(alignment_summary, aligned_query, aligned_sbjct)
        return finished

    def parse_alignment_line(self, alignment_line_list, aligned_query, aligned_sbjct):
        """
        Fill the exact and similar match lists of the current record.

        "alignment_line_list", "aligned_query" and "aligned_sbjct" are the
        lists of segments of the match line, the query row and the subject
        row of the alignment.  A column whose match symbol is "+" is stored
        as similar; any other non blank symbol is stored as exact and similar.

        Raises ValueError when the three rows do not have the same length.
        """

        alignment_line = "".join(alignment_line_list)
        query_row = "".join(aligned_query)
        sbjct_row = "".join(aligned_sbjct)

        if not (len(alignment_line) == len(query_row) == len(sbjct_row)):
            raise ValueError("Alignments must be of the same size\n%s\n%s\n%s"
                             % (query_row, alignment_line, sbjct_row))

        result = self.current_blastResult_obj

        # NOTE(review): the original author flagged a bug with gaps in this
        # method. Positions are computed assuming that coordinates grow by
        # one on every non-gap column - confirm for other kinds of alignment.
        query_gaps = 0
        sbjct_gaps = 0

        for column, (symbol, query_char, sbjct_char) in enumerate(zip(alignment_line, query_row, sbjct_row)):
            if query_char == "-":
                query_gaps += 1
            if sbjct_char == "-":
                sbjct_gaps += 1
            if symbol == " ":
                continue

            query_position = column + result.query_start - query_gaps
            sbjct_position = column + result.sbjct_start - sbjct_gaps

            if symbol != "+":
                result.query_exact_match_list.append(query_position)
                result.sbjct_exact_match_list.append(sbjct_position)
            result.query_similar_match_list.append(query_position)
            result.sbjct_similar_match_list.append(sbjct_position)

    def next(self):
        """
        Return the record of the next hit found in the output.

        Raises StopIteration when the input is exhausted.
        """

        # Temporal variables to store information to read exact alignment.
        # They are local because a hit is returned as soon as the header of
        # the following one is found, before any of its alignment lines.
        alignment_start_index = 0
        subalignment_length = 0
        capture_matching_line = False
        sbjct_matching = False
        alignment_summary = []
        aligned_query = []
        aligned_sbjct = []

        for line in self.fd:

            if capture_matching_line:
                # Line following a "Query:" row: symbols of the alignment
                alignment_summary.append(self._midline_segment(line, alignment_start_index, subalignment_length))
                capture_matching_line = False
                continue

            m = self.query_re.search(line)
            if m:
                self.current_blastResult_obj = self._new_result()
                self.current_blastResult_obj.sequenceID_A = m.group(1)
                alignment_summary, aligned_query, aligned_sbjct = [], [], []
                sbjct_matching = False

            if self.current_blastResult_obj is None:
                # Nothing can be stored before the first "Query=" line
                continue

            m = self.letters_re.search(line)
            if m:
                self.current_blastResult_obj.query_length = int(m.group(1).replace(",", ''))

            sequenceID_B_search = self.sbjct_re.search(line)
            if sequenceID_B_search:
                # New sequenceID_B
                previous = self.current_blastResult_obj
                if previous.e_value is None:
                    previous.sequenceID_B = sequenceID_B_search.group(1)
                else:
                    finished = self._finish_hit(alignment_summary, aligned_query, aligned_sbjct)
                    fresh = self._new_result()
                    fresh.sequenceID_A = previous.sequenceID_A
                    fresh.sequenceID_B = sequenceID_B_search.group(1)
                    fresh.query_length = previous.query_length
                    self.current_blastResult_obj = fresh
                    if finished is not None:
                        return finished
                    # A self-hit was discarded: forget its alignment so that
                    # it is not mixed with the hits of the new subject
                    alignment_summary, aligned_query, aligned_sbjct = [], [], []
                    sbjct_matching = False

            m = self.length_re.search(line)
            if m:
                self.current_blastResult_obj.sbjct_length = int(m.group(1).replace(",", ''))

            if line.startswith("Matrix"):
                # Query finished
                if self.current_blastResult_obj.e_value is not None:
                    finished = self._finish_hit(alignment_summary, aligned_query, aligned_sbjct)
                    self.current_blastResult_obj = self._new_result()
                    if finished is not None:
                        return finished
                    alignment_summary, aligned_query, aligned_sbjct = [], [], []
                    sbjct_matching = False
                continue

            score_fields = self._match_score(line)
            if score_fields is not None:
                # New hit found
                e_value, score_bits, score = score_fields
                previous = self.current_blastResult_obj
                finished = None
                if previous.e_value is not None:    ## Check if there were other hits before
                    finished = self._finish_hit(alignment_summary, aligned_query, aligned_sbjct)
                    fresh = self._new_result()
                    fresh.sequenceID_A = previous.sequenceID_A
                    fresh.query_length = previous.query_length
                    fresh.sequenceID_B = previous.sequenceID_B
                    fresh.sbjct_length = previous.sbjct_length
                    self.current_blastResult_obj = fresh
                    alignment_summary, aligned_query, aligned_sbjct = [], [], []
                    sbjct_matching = False

                self.current_blastResult_obj.set_evalue(e_value)
                self.current_blastResult_obj.score_bits = score_bits
                self.current_blastResult_obj.score = score

                if finished is not None:
                    return finished

            current = self.current_blastResult_obj

            get_identities = self.identities_re.search(line)
            if get_identities:
                current.align_length = get_identities.group(1)
                current.identities = get_identities.group(2)

            get_positives = self.positives_re.search(line)
            if get_positives:
                current.positives = get_positives.group(1)

            get_gaps = self.gaps_re.search(line)
            if get_gaps:
                current.gaps = get_gaps.group(1)

            get_intervals_query = self.intervals_query_re.search(line)
            if get_intervals_query:
                if current.query_start is None:
                    current.query_start = int(get_intervals_query.group(1))
                # Column where the aligned residues start; the match line
                # that follows is cut at the same column. The position of the
                # group is used because searching the sequence text in the
                # line could find it before its real position.
                alignment_start_index = get_intervals_query.start(2)
                subalignment_length = len(get_intervals_query.group(2))
                capture_matching_line = True
                sbjct_matching = True
                aligned_query.append(get_intervals_query.group(2))
                current.query_end = int(get_intervals_query.group(3))

            get_intervals_Sbjct = self.sbjct_intervals_re.search(line)
            if get_intervals_Sbjct and sbjct_matching:
                if current.sbjct_start is None:
                    current.sbjct_start = int(get_intervals_Sbjct.group(1))
                current.sbjct_end = int(get_intervals_Sbjct.group(3))
                aligned_sbjct.append(get_intervals_Sbjct.group(2))
                sbjct_matching = False

        raise StopIteration

    # Iterator protocol name used by Python 3
    __next__ = next
示例#2
0
def parse_blastall_output(fd_blastall_output, temporalOutputFile_fd=None, return_only_ids=False, limit_to_sequenceIDs=None):
    """
    Parse the text output of blastall and return the list of hits found.

    "fd_blastall_output" is the output fd of the blast process (input for this method)

    "temporalOutputFile_fd" is a file where all the input of fd_blastall_output is saved

    "return_only_ids" is used to store only ids (the subject identifier of
    each hit), not complete blast results. "limit_to_sequenceIDs" is not
    applied in this mode.

    "limit_to_sequenceIDs" is used to filter blast parsing to only those
    sequenceids. None or an empty container disables the filter.

    Hits whose query and subject identifiers are equal are not returned.

    Raises ValueError when the rows of an alignment have different lengths.
    """

    if limit_to_sequenceIDs is None:
        limit_to_sequenceIDs = ()

    blast_results = []

    query_re = re.compile(r"Query=\s*(.+)\s*")
    letters_re = re.compile(r"\(\s*([\,\d]+)\s*letters\s*\)")
    sbjct_re = re.compile(r"^>([\w\d\_\.\|]+)")
    length_re = re.compile(r"Length \= ([\,\d]+)")
    score_re = re.compile(r"Score\s+=\s+([\.\d]+)\s+bits\s+\((\d+)\),\s+Expect\s+=\s+([\d\.e\-]+)")
    identities_re = re.compile(r"Identities\s+=\s+\d+\/(\d+)\s+\((\d+)%\)")
    positives_re = re.compile(r"Positives\s+=\s+\d+\/\d+\s+\((\d+)%\)")
    gaps_re = re.compile(r"Gaps\s+=\s+\d+\/\d+\s+\((\d+)%\)")
    intervals_query_re = re.compile(r"Query:\s+(\d+)\s+(\S+)\s+(\d+)$")
    sbjct_intervals_re = re.compile(r"Sbjct:\s+(\d+)\s+(\S+)\s+(\d+)$")

    # Temporal variables to store information to read exact alignment.
    # All of them are defined here so that an alignment line found before
    # any hit header cannot hit an undefined name.
    alignment_start_index = 0
    subalignment_length = 0
    capture_matching_line = False
    sbjct_matching = False
    alignment_summary = []
    aligned_query = []
    aligned_sbjct = []

    blastResult_obj = BlastResult(method="blastall",mode="F")

    def parse_alignment_line(alignment_line_list, blastResult_obj, aligned_query, aligned_sbjct):
        """
        Fill the exact and similar match lists of "blastResult_obj" from the
        segments of the match line and of both aligned rows.
        """

        alignment_line = "".join(alignment_line_list)
        query_row = "".join(aligned_query)
        sbjct_row = "".join(aligned_sbjct)

        if not (len(alignment_line) == len(query_row) == len(sbjct_row)):
            raise ValueError("Alignments must be of the same size\n%s\n%s\n%s"
                             % (query_row, alignment_line, sbjct_row))

        # NOTE(review): the original author flagged a bug with gaps here.
        # Positions assume coordinates growing by one on every non-gap column.
        query_gaps = 0
        sbjct_gaps = 0

        for column, (symbol, query_char, sbjct_char) in enumerate(zip(alignment_line, query_row, sbjct_row)):
            if query_char == "-":
                query_gaps += 1
            if sbjct_char == "-":
                sbjct_gaps += 1
            if symbol == " ":
                continue

            query_position = column + blastResult_obj.query_start - query_gaps
            sbjct_position = column + blastResult_obj.sbjct_start - sbjct_gaps

            # "+" marks a similar column, any other symbol an identical one
            if symbol != "+":
                blastResult_obj.query_exact_match_list.append(query_position)
                blastResult_obj.sbjct_exact_match_list.append(sbjct_position)
            blastResult_obj.query_similar_match_list.append(query_position)
            blastResult_obj.sbjct_similar_match_list.append(sbjct_position)

    def store_hit(hit, summary, query_rows, sbjct_rows, duplicate):
        """
        Add "hit" to the results if it holds a hit and is not a self-hit.
        "duplicate" asks for a copy, needed when the object is reused later.
        """
        if hit.e_value is None or hit.sequenceID_A == hit.sequenceID_B:
            return
        parse_alignment_line(summary, hit, query_rows, sbjct_rows)
        if return_only_ids:
            blast_results.append(hit.sequenceID_B)
        elif len(limit_to_sequenceIDs) == 0 or hit.sequenceID_B in limit_to_sequenceIDs:
            if duplicate:
                blast_results.append(copy.copy(hit))
            else:
                blast_results.append(hit)

    for line in fd_blastall_output:

        if temporalOutputFile_fd:
            temporalOutputFile_fd.write(line)

        if capture_matching_line:
            # Line following a "Query:" row. The line terminator is removed
            # and stripped trailing blanks are restored so that the segment
            # is as wide as the aligned sequences.
            segment = line.rstrip("\r\n")[alignment_start_index:alignment_start_index + subalignment_length]
            alignment_summary.append(segment.ljust(subalignment_length))
            capture_matching_line = False
            continue

        m = query_re.search(line)
        if m:
            blastResult_obj.sequenceID_A = m.group(1)

        m = letters_re.search(line)
        if m:
            blastResult_obj.query_length = int(m.group(1).replace(",",''))

        sequenceID_B_search = sbjct_re.search(line)
        if sequenceID_B_search:
            # New sequenceID_B: the previous hit, if any, is complete
            store_hit(blastResult_obj, alignment_summary, aligned_query, aligned_sbjct, True)

            alignment_summary = []
            aligned_query = []
            aligned_sbjct = []

            blastResult_obj.reset()
            blastResult_obj.sequenceID_B = sequenceID_B_search.group(1)

        m = length_re.search(line)
        if m:
            blastResult_obj.sbjct_length = int(m.group(1).replace(",",''))

        if line.startswith("Matrix"):
            # Query finished. No copy is needed because a new object is
            # created right after.
            store_hit(blastResult_obj, alignment_summary, aligned_query, aligned_sbjct, False)

            alignment_summary = []
            aligned_query = []
            aligned_sbjct = []

            blastResult_obj = BlastResult(method="blastall",mode="F")
            continue

        get_evalue = score_re.search(line)
        if get_evalue:
            # New hit found: store the previous one of the same subject
            store_hit(blastResult_obj, alignment_summary, aligned_query, aligned_sbjct, True)

            blastResult_obj.reset()

            alignment_summary = []
            aligned_query = []
            aligned_sbjct = []

            blastResult_obj.set_evalue(get_evalue.group(3))
            blastResult_obj.score_bits = str(get_evalue.group(1))
            blastResult_obj.score = str(get_evalue.group(2))

        get_identities = identities_re.search(line)
        if get_identities:
            blastResult_obj.align_length = get_identities.group(1)
            blastResult_obj.identities= str(get_identities.group(2))

        get_positives = positives_re.search(line)
        if get_positives:
            blastResult_obj.positives = str(get_positives.group(1))

        get_gaps = gaps_re.search(line)
        if get_gaps:
            blastResult_obj.gaps = str(get_gaps.group(1))

        get_intervals_query = intervals_query_re.search(line)
        if get_intervals_query:
            if blastResult_obj.query_start is None:
                blastResult_obj.query_start = int(get_intervals_query.group(1))
            # Column where the aligned residues start. The position of the
            # group is used because searching the sequence text in the line
            # could find it before its real position.
            alignment_start_index = get_intervals_query.start(2)
            subalignment_length = len(get_intervals_query.group(2))
            capture_matching_line = True
            sbjct_matching = True
            aligned_query.append(get_intervals_query.group(2))
            blastResult_obj.query_end = int(get_intervals_query.group(3))

        get_intervals_Sbjct = sbjct_intervals_re.search(line)
        if get_intervals_Sbjct and sbjct_matching:
            if blastResult_obj.sbjct_start is None:
                blastResult_obj.sbjct_start = int(get_intervals_Sbjct.group(1))
            blastResult_obj.sbjct_end = int(get_intervals_Sbjct.group(3))
            aligned_sbjct.append(get_intervals_Sbjct.group(2))
            sbjct_matching = False

    return blast_results
示例#3
0
def parse_bl2seq_output(sequenceID_A, sequenceID_B, bl2seq_output=None, fd_output_file=None, evalue_threshold=0.1):
    """
    Parse the text output of bl2seq and write the significant hits.

    "sequenceID_A" and "sequenceID_B" are the identifiers stored in the
    result as query and subject

    "bl2seq_output" is the complete output of bl2seq as a single string.
    Nothing is done when it is None

    "fd_output_file" is the file where the string form of every hit with an
    e-value lower than "evalue_threshold" is written (sys.stdout by default)

    "evalue_threshold" is the exclusive upper bound of the e-value of the
    hits written (0.1 by default)
    """

    if bl2seq_output is None:
        return

    if fd_output_file is None:
        fd_output_file = sys.stdout

    score_re = re.compile(r"Score\s+=\s+([\.\d]+)\s+bits\s+\((\d+)\),\s+Expect\s+=\s+([\d\.e\-]+)")
    identities_re = re.compile(r"Identities\s+=\s+\d+\/(\d+)\s+\((\d+)%\)")
    positives_re = re.compile(r"Positives\s+=\s+\d+\/\d+\s+\((\d+)%\)")
    gaps_re = re.compile(r"Gaps\s+=\s+\d+\/\d+\s+\((\d+)%\)")
    intervals_query_re = re.compile(r"Query:\s+(\d+)\s+.+\s(\d+)$")
    sbjct_intervals_re = re.compile(r"Sbjct:\s+(\d+)\s+.+\s(\d+)$")
    letters_re = re.compile(r"\(\s*([\d\,]+)\s*letters\s*\)")
    length_re = re.compile(r"Length\s+\=\s+([\d\,]+)")

    blastResult_obj = BlastResult(method="bl2seq",mode="F")
    blastResult_obj.sequenceID_A = sequenceID_A
    blastResult_obj.sequenceID_B = sequenceID_B

    def close_hit():
        """
        Write the hit stored in blastResult_obj if it is significant, and
        leave the object ready for the next hit.
        """
        if blastResult_obj.e_value is None:
            return
        if blastResult_obj.e_value < evalue_threshold:
            fd_output_file.write(str(blastResult_obj))
        # The reset is done also for the hits that are not written.
        # Otherwise the fields that are only set when empty or when present
        # in the output (query_start, sbjct_start, gaps) would be inherited
        # by the next hit.
        blastResult_obj.reset()

    # Split the output in lines
    for line in bl2seq_output.split("\n"):

        if "Lambda" in line:
            # Useful information is finished
            # Appending the last result
            close_hit()
            continue

        get_evalue = score_re.search(line)
        if get_evalue:
            # New hit found
            close_hit()
            blastResult_obj.set_evalue(get_evalue.group(3))
            blastResult_obj.score= get_evalue.group(2)
            blastResult_obj.score_bits = get_evalue.group(1)

        m = letters_re.search(line)
        if m:
            blastResult_obj.query_length = int(m.group(1).replace(',',''))

        m = length_re.search(line)
        if m:
            blastResult_obj.sbjct_length = int(m.group(1).replace(',',''))

        get_identities = identities_re.search(line)
        if get_identities:
            blastResult_obj.align_length = get_identities.group(1)
            blastResult_obj.identities= get_identities.group(2)

        get_positives = positives_re.search(line)
        if get_positives:
            blastResult_obj.positives = get_positives.group(1)

        get_gaps = gaps_re.search(line)
        if get_gaps:
            blastResult_obj.gaps = get_gaps.group(1)

        get_intervals_query = intervals_query_re.search(line)
        if get_intervals_query:
            # The start is taken from the first row of the alignment, the
            # end from the last one
            if blastResult_obj.query_start is None:
                blastResult_obj.query_start = get_intervals_query.group(1)
            blastResult_obj.query_end = get_intervals_query.group(2)

        get_intervals_Sbjct = sbjct_intervals_re.search(line)
        if get_intervals_Sbjct:
            if blastResult_obj.sbjct_start is None:
                blastResult_obj.sbjct_start = get_intervals_Sbjct.group(1)
            blastResult_obj.sbjct_end = get_intervals_Sbjct.group(2)