示例#1
0
    def execute_query(self, query_file, blast_output_file = None,
                      work_directory = os.getcwd()):
        '''
        Execute BLAST given a fasta file holding a single query sequence.

        @param:    query_file
        @pdef:     Fasta file with the query sequence.
        @ptype:    {String} or {File} or {Fasta}

        @param:    blast_output_file
        @pdef:     name of the temporary BLAST output file.
        @pdefault: query_file.prefix + job.pid + .blast.xml.out
        @ptype:    {String}

        @param:    work_directory
        @pdef:     Directory to which the temporary files will be created.
        @pdefault: Current working directory. NOTE: the default expression
                   is evaluated once, when the method is defined, not on
                   every call.
        @ptype:    {String}

        @raises: {AttributeError} if query_file is multi-fasta or is not a
                 {String}, {File} or {Fasta}.
        @raises: {BlastError} in BLAST execution or output parsing errors.

        @returns: {BlastResult}
        '''
        # The order of the type checks is kept as it was: strings and File
        # objects are wrapped in a new Fasta, Fasta objects are used as given.
        if isinstance(query_file, (basestring, File)):
            newFasta = Fasta(fasta_file = query_file)
        elif isinstance(query_file, Fasta):
            newFasta = query_file
        else:
            # Without this branch newFasta stayed unbound and the method
            # died a few lines below with an unrelated UnboundLocalError.
            msg = 'query_file must be a file name, a File or a Fasta object.'
            raise AttributeError(msg)

        if newFasta.is_multifasta:
            msg = 'Blasts can only be executed one at a time due to XML output restrictions.'
            raise AttributeError(msg)

        # All the sequence is unknown, it will crash blast
        newFasta.load()
        query_sequence = newFasta.sequence
        if len(re.sub(r'[Xx]', '', query_sequence.sequence)) == 0:
            SBIg.warn(self, 'Created an empty BlastResult.')
            return BlastResult(query_name     = query_sequence.id,
                               query_sequence = query_sequence.sequence)

        Path.mkdir(work_directory)
        if blast_output_file is None:
            # Default name: <work_directory>/<fasta prefix>.<pid>.blast.xml.out
            file_prefixes = ".".join([newFasta.file.prefix, str(os.getpid())])
            tmp_output    = os.path.join(work_directory, file_prefixes) + ".blast.xml.out"
        else:
            tmp_output = os.path.join(work_directory, blast_output_file)

        self._execute(input_file = newFasta, output_file = tmp_output)

        blast_result = self._parse_blast(newFasta.sequence.sequence, tmp_output)

        # The XML output is only an intermediate file; remove it once parsed.
        self._clean([tmp_output, ])

        return blast_result
示例#2
0
    def parse(query_sequence, blast_output_file, self_hit, hitid_format):
        '''
        Processes a blast xml formated output into a {BlastResult} object.

        @param:    query_sequence
        @pdef:     sequence of the query protein/nucleotide.
        @ptype:    {String}

        @param:    blast_output_file
        @pdef:     output file from BLAST.
        @ptype:    {String}

        @param:    self_hit
        @pdef:     when _True_ if the query is found in the database, it is
                   retrieved. The value is stored in the header and passed
                   to BlastParser.same_query_hit_names for every hit.
        @ptype:    {Boolean}

        @param:    hitid_format
        @pdef:     format of the name of the hit; it is forwarded unchanged
                   to BlastParser.hit_name.
        @poptions: 'single' -> first word of the name,
                   'double' -> first two words of the hit name,
                   'all'    -> all the text in the hit name
        @ptype:    {String}

        @raises: {BlastError} if there are problems while parsing the XML file.
        @returns: {BlastResult}
        '''
        f = File(blast_output_file)
        s = BeautifulSoup(f.read())

        h = BlastHeader(version    = str(s.find('blastoutput_version').string),
                        matrix     = str(s.find('parameters_matrix').string),
                        gap_open   = int(s.find('parameters_gap-open').string),
                        gap_extend = int(s.find('parameters_gap-extend').string),
                        database   = str(s.find('blastoutput_db').string),
                        self_hit   = self_hit)
        b = BlastResult(query_name     = str(s.find('blastoutput_query-def').string),
                        query_sequence = query_sequence,
                        header         = h)

        SBIg.alert('debug', BlastParser(), b.str_blast_header())

        # Fragmentation problems are collected for every hit and reported
        # together once the whole file has been walked.
        error_bool = False
        error_str  = []
        for iteration in s.find_all('iteration'):
            iternum = int(iteration.find('iteration_iter-num').string)
            for hit in iteration.find_all('hit'):
                hit_name   = BlastParser.hit_name(str(hit.find('hit_def').string), hitid_format)
                hit_length = int(hit.find("hit_len").string)
                # One BlastHit is built per HSP of the hit.
                for subhit in hit.find_all("hsp"):
                    data = BlastParser.parse_subhit(subhit)
                    r = BlastHit(hit            = [hit_name, hit_length],
                                 sequences      = [data['qs'], data['hs'], data['sc']],
                                 sequence_inits = [data['qp'], data['hp']],
                                 iteration      = iternum,
                                 stats          = [data['hi'], data['h+'],
                                                   data['hg'], data['ev']])
                    if BlastParser.same_query_hit_names(b.query, hit_name, self_hit):
                        continue
                    dbug_info = 'Added hit {0} in iteration {1}'
                    SBIg.alert('debug', BlastParser(), dbug_info.format(hit_name, r.iteration))
                    b.add_hit(r)
                    if not r.are_segments_ok:
                        error_bool = True
                        error_str.append("Check the alignment's fragmentation")
                        # The placeholders used to be '%s', which str.format
                        # leaves untouched, so the names never showed up.
                        error_str.append("for the query {0} with {1}\n".format(b.query, hit_name))
                        error_str.append("{0}\n".format(r))
        b.set_last_iteration()
        if error_bool:
            SBIg.warn(BlastParser(), error_str)
            be = BlastError()
            raise be.parse_error()
        return b
示例#3
0
    def execute_query_seq(self, sequenceID = None, sequence          = None,
                          blast_input_file = None, blast_output_file = None,
                          work_directory   = os.getcwd()):
        '''
        Run BLAST for a single query given as a sequence and/or an identifier.

        @param:    sequenceID
        @pdef:     name of the query sequence.
        @pdefault: 'QuerySequence'
        @pclash:   When no sequence is given, the sequenceID is looked up in
                   the database and the retrieved entry is used as query.
                   Either sequenceID or sequence needs to be provided.
        @ptype:    {String}

        @param:    sequence
        @pdef:     query sequence.
        @pdefault: _None_
        @pclash:   Either sequenceID or sequence needs to be provided.
        @ptype:    {String}

        @param:    blast_input_file
        @pdef:     name of the temporary fasta file to use as BLAST input.
        @pdefault: job.pid + clock + .tmp.fa
        @ptype:    {String}

        @param:    blast_output_file
        @pdef:     name of the temporary BLAST output file.
        @pdefault: job.pid + clock + .blast.xml.out
        @ptype:    {String}

        @param:    work_directory
        @pdef:     Directory in which the temporary files are created.
        @pdefault: Current working directory.
        @ptype:    {String}

        @raises: {AttributeError} if neither sequenceID nor sequence are
                  provided or if sequenceID is a list of sequence names.
        @raises: {BlastError} in BLAST execution or output parsing errors.

        @returns: {BlastResult}
        '''
        if sequence is None and sequenceID is None:
            raise AttributeError('Either a sequence or sequenceID is needed to perform the blast.')

        if isinstance(sequenceID, (list, set, tuple)):
            raise AttributeError('Blasts can only be executed one at a time due to XML output restrictions.')

        if sequenceID is None:
            sequenceID = 'QuerySequence'

        if sequence is None:
            # Only a name was given: the query is expected to live in the
            # database, so its first retrieved entry supplies id and sequence.
            db_entry   = self._database.retrieve(sequenceID)[0]
            sequenceID = db_entry.id
            sequence   = db_entry.sequence

        # A query made only of unknown residues (X) would crash blast, so an
        # empty result is returned instead of running it.
        if not re.sub(r'[Xx]', '', sequence):
            SBIg.warn(self, 'Created an empty BlastResult.')
            return BlastResult(query_name = sequenceID, query_sequence = sequence)

        Path.mkdir(work_directory)
        job_tag   = "{0}.{1}".format(os.getpid(), int(time.clock()*100000))
        base_path = os.path.join(work_directory, job_tag)

        if blast_input_file is None:
            tmp_input = base_path + ".tmp.fa"
        else:
            tmp_input = os.path.join(work_directory, blast_input_file)

        if blast_output_file is None:
            tmp_output = base_path + ".blast.xml.out"
        else:
            tmp_output = os.path.join(work_directory, blast_output_file)

        QueryFasta = Fasta.build(file_name = tmp_input, sequence_id = sequenceID,
                                 sequence  = sequence,  force       = True)

        self._execute(input_file = QueryFasta, output_file = tmp_output)
        blast_result = self._parse_blast(sequence, tmp_output)

        # Both the generated fasta and the XML output are intermediate files.
        self._clean([tmp_input, tmp_output])

        return blast_result
示例#4
0
def parse_blastall_output(fd_blastall_output, temporalOutputFile_fd=None, return_only_ids=False, limit_to_sequenceIDs=sets.Set()):
    """
    Parse the plain-text report written by blastall into a list of hits.

    "fd_blastall_output" is the output fd of the blast process (input for this method);
    it is iterated line by line.

    "temporalOutputFile_fd", when given, receives a copy of every line read from
    fd_blastall_output.

    "return_only_ids" is used to store only ids (the subject id, sequenceID_B), not
    complete blast results. In this mode limit_to_sequenceIDs is not consulted.

    "limit_to_sequenceIDs" is used to filter blast parsing to only those sequenceids;
    an empty collection means no filtering.

    Hits whose subject id equals the query id are never stored.

    Returns the list of stored hits (BlastResult objects, or ids when
    return_only_ids is true).
    """

    blast_results = []

    # Patterns for the individual fields of the report.
    # Original author's note on query_re: the pattern used to be incorrect and it
    # is not known whether earlier results were affected.
    query_re = re.compile("Query=\s*(.+)\s*")    # IT WAS INCORRECT.... DID IT AFFECT ANY RESULT???
    letters_re = re.compile("\(\s*([\,\d]+)\s*letters\s*\)")
    sbjct_re = re.compile("^>([\w\d\_\.\|]+)")
    length_re = re.compile("Length \= ([\,\d]+)")
    score_re = re.compile("Score\s+=\s+([\.\d]+)\s+bits\s+\((\d+)\),\s+Expect\s+=\s+([\d\.e\-]+)")
    identities_re = re.compile("Identities\s+=\s+\d+\/(\d+)\s+\((\d+)%\)")
    positives_re = re.compile("Positives\s+=\s+\d+\/\d+\s+\((\d+)%\)")
    gaps_re = re.compile("Gaps\s+=\s+\d+\/\d+\s+\((\d+)%\)")
    intervals_query_re = re.compile("Query:\s+(\d+)\s+(\S+)\s+(\d+)$")
    sbjct_intervals_re = re.compile("Sbjct:\s+(\d+)\s+(\S+)\s+(\d+)$")

    # Temporal variables to store information to read exact alignment
    alignment_start_index = None      # column where the aligned residues start
    capture_matching_line = False     # True when the next line is the match line
    sbjct_matching = False            # True while a "Sbjct:" row is still expected
    alignment_summary = []            # collected match-line pieces

    # NOTE(review): aligned_query, aligned_sbjct and subalignment_length are not
    # initialised here; they are first bound when a ">" line, a "Score" line, a
    # "Matrix" line or a "Query:" row is processed.

    blastResult_obj = BlastResult(method="blastall",mode="F")

    def parse_alignment_line(alignment_line_list, blastResult_obj, aligned_query, aligned_sbjct):
        """
        Fill the exact/similar match lists of blastResult_obj.

        The three arguments are lists of row pieces (match line, query row,
        subject row) which are joined into one string each. A ValueError is
        raised, after printing the three strings, when their lengths differ.

        Every column whose match-line character is not a space is recorded as
        column index + start position - number of "-" seen so far in that
        sequence. "+" columns go only to the similar lists; any other
        character goes to both the exact and the similar lists.
        """

        # BUG WITH THE GAPS (note left by the original author; the nature of
        # the bug is not described).

        alignment_line = "".join(alignment_line_list)
        aligned_query = "".join(aligned_query)
        aligned_sbjct = "".join(aligned_sbjct)

        if len(alignment_line) != len(aligned_query) or len(aligned_query) != len(aligned_sbjct):
            print aligned_query
            print alignment_line
            print aligned_sbjct
            raise ValueError("Alignments must be of the same size")

        # Gap characters seen so far; used to turn a column index into a
        # position in the ungapped sequence.
        query_gaps = 0
        sbjct_gaps = 0

        for x in xrange(len(alignment_line)):
            value = alignment_line[x]
            if aligned_query[x]=="-":
                query_gaps += 1
            if aligned_sbjct[x]=="-":
                sbjct_gaps += 1
            if value == " ":
                # Blank in the match line: nothing to record for this column.
                continue
            else:
                if value != "+":
                    # Any character other than "+" is recorded as an exact
                    # match and also as a similar match.
                    blastResult_obj.query_exact_match_list.append(x+blastResult_obj.query_start-query_gaps)
                    blastResult_obj.sbjct_exact_match_list.append(x+blastResult_obj.sbjct_start-sbjct_gaps)
                    blastResult_obj.query_similar_match_list.append(x+blastResult_obj.query_start-query_gaps)
                    blastResult_obj.sbjct_similar_match_list.append(x+blastResult_obj.sbjct_start-sbjct_gaps)
                else:
                    # "+" is recorded as a similar match only.
                    blastResult_obj.query_similar_match_list.append(x+blastResult_obj.query_start-query_gaps)
                    blastResult_obj.sbjct_similar_match_list.append(x+blastResult_obj.sbjct_start-sbjct_gaps)

        #print blastResult_obj.query_similar_match_list
        #print blastResult_obj.query_exact_match_list
        #print blastResult_obj.sbjct_exact_match_list

    for line in fd_blastall_output:

        # Keep a verbatim copy of the report when a file was supplied.
        if temporalOutputFile_fd:
            temporalOutputFile_fd.write(line)

        # The line right after a "Query:" row is taken as the match line; only
        # the columns below the aligned residues are kept.
        if capture_matching_line:
            alignment_summary.append(line[alignment_start_index:alignment_start_index+subalignment_length])
            capture_matching_line = False
            continue

        m = query_re.search(line)
        if m:
            sequenceID_A = m.group(1)
            blastResult_obj.sequenceID_A = sequenceID_A

        m = letters_re.search(line)
        if m:
            # Thousands separators are removed before converting.
            blastResult_obj.query_length = int(m.group(1).replace(",",''))

        sequenceID_B_search = sbjct_re.search(line)
        if sequenceID_B_search:

            # New sequenceID_B:
            # a ">" line closes the hit read so far (if it has an e-value),
            # which is stored unless query and subject ids are equal.
            if blastResult_obj.e_value is not None:
                if blastResult_obj.sequenceID_A != blastResult_obj.sequenceID_B:
                    parse_alignment_line(alignment_summary, blastResult_obj, aligned_query, aligned_sbjct)
                    if return_only_ids:
                        blast_results.append(blastResult_obj.sequenceID_B)
                    else:
                        if len(limit_to_sequenceIDs)==0 or blastResult_obj.sequenceID_B in limit_to_sequenceIDs:
                            # A copy is stored because blastResult_obj is
                            # reset and reused just below.
                            blast_results.append(copy.copy(blastResult_obj))

            alignment_start_index = None
            alignment_summary = []
            aligned_query = []
            aligned_sbjct = []

            # NOTE(review): reset() is defined elsewhere; sequenceID_A is
            # compared again after this call, so it presumably survives the
            # reset - confirm against BlastResult.
            blastResult_obj.reset()
            blastResult_obj.sequenceID_B = sequenceID_B_search.group(1)

        m = length_re.search(line)
        if m:
            blastResult_obj.sbjct_length = int(m.group(1).replace(",",''))

        if re.search("^Matrix",line):
            # Query finished
            if blastResult_obj.e_value is not None:
                if blastResult_obj.sequenceID_A != blastResult_obj.sequenceID_B:
                    parse_alignment_line(alignment_summary, blastResult_obj, aligned_query, aligned_sbjct)
                    if return_only_ids:
                        blast_results.append(blastResult_obj.sequenceID_B)
                    else:
                        if len(limit_to_sequenceIDs)==0 or blastResult_obj.sequenceID_B in limit_to_sequenceIDs:
                            # No copy here: a brand-new BlastResult replaces
                            # this object a few lines below.
                            blast_results.append(blastResult_obj)

            alignment_start_index = None
            alignment_summary = []
            aligned_query = []
            aligned_sbjct = []

            blastResult_obj = BlastResult(method="blastall",mode="F")

        else:
            get_evalue = score_re.search(line)
            if get_evalue:
                # New hit found
                if blastResult_obj.e_value is not None:    ## Check if there were other hits before
                    if blastResult_obj.sequenceID_A != blastResult_obj.sequenceID_B:
                        parse_alignment_line(alignment_summary, blastResult_obj, aligned_query, aligned_sbjct)
                        if return_only_ids:
                            blast_results.append(blastResult_obj.sequenceID_B)
                        else:
                            if len(limit_to_sequenceIDs)==0 or blastResult_obj.sequenceID_B in limit_to_sequenceIDs:
                                blast_results.append(copy.copy(blastResult_obj))

                # NOTE(review): sequenceID_B and sbjct_length are not assigned
                # again for this additional "Score" block, so reset() must
                # keep them for the hit to stay attributed - confirm.
                blastResult_obj.reset()

                alignment_start_index = None
                alignment_summary = []
                aligned_query = []
                aligned_sbjct = []

                # Groups: 1 = bit score, 2 = raw score, 3 = e-value.
                blastResult_obj.set_evalue(get_evalue.group(3))
                blastResult_obj.score_bits = str(get_evalue.group(1))
                blastResult_obj.score = str(get_evalue.group(2))

            get_identities = identities_re.search(line)

            if get_identities:
                # Group 1 is the denominator of "Identities = a/b", group 2
                # the percentage; align_length is stored as a string.
                blastResult_obj.align_length = get_identities.group(1)
                blastResult_obj.identities= str(get_identities.group(2))

            get_positives = positives_re.search(line)

            if get_positives:
                blastResult_obj.positives = str(get_positives.group(1))

            get_gaps = gaps_re.search(line)
            if get_gaps:
                blastResult_obj.gaps = str(get_gaps.group(1))

            get_intervals_query = intervals_query_re.search(line)

            if get_intervals_query:
                # Only the first "Query:" row of a hit fixes the start position
                # and the column where the residues begin.
                if blastResult_obj.query_start is None:
                    blastResult_obj.query_start = int(get_intervals_query.group(1))
                    alignment_start_index = line.index(get_intervals_query.group(2))
                subalignment_length = len(get_intervals_query.group(2))
                capture_matching_line = True
                sbjct_matching = True
                aligned_query.append(get_intervals_query.group(2))
                # The end position is overwritten by every row, so the last
                # row wins.
                blastResult_obj.query_end = int(get_intervals_query.group(3))

            get_intervals_Sbjct = sbjct_intervals_re.search(line)

            # A "Sbjct:" row is only accepted after a "Query:" row.
            if get_intervals_Sbjct and sbjct_matching:
                if blastResult_obj.sbjct_start is None:
                    blastResult_obj.sbjct_start = int(get_intervals_Sbjct.group(1))
                blastResult_obj.sbjct_end = int(get_intervals_Sbjct.group(3))
                aligned_sbjct.append(get_intervals_Sbjct.group(2))
                sbjct_matching = False

    return blast_results
示例#5
0
    def next(self):
        """
        Return the next reportable hit read from the blastall report.

        Lines are consumed from self.fd until the hit being read is closed
        by the next subject header (">"), by another "Score =" line (a
        further alignment block) or by the "Matrix" line that ends the
        report of a query. Hits whose subject id equals the query id are
        dropped. When self.parse_lines is true the alignment rows are
        handed to self.parse_alignment_line before the hit is returned.

        Raises StopIteration when the input is exhausted; a hit that was not
        closed by one of the lines above is not returned.
        """
        patterns = BlastallParserIterator

        # Scratch state for the alignment rows of the hit being read. It is
        # local because every return hands back a finished hit and the next
        # call starts collecting rows for a new one.
        alignment_start_index = None
        subalignment_length = 0
        capture_matching_line = False
        sbjct_matching = False
        alignment_summary = []
        aligned_query = []
        aligned_sbjct = []

        for line in self.fd:

            if capture_matching_line:
                # The row after a "Query:" row is the match line. Drop the
                # line ending and pad with blanks so that a match line whose
                # trailing spaces were stripped neither counts the newline
                # as a match nor ends up shorter than the sequence rows.
                midline = line.rstrip("\r\n")
                midline = midline[alignment_start_index:alignment_start_index + subalignment_length]
                alignment_summary.append(midline.ljust(subalignment_length))
                capture_matching_line = False
                continue

            m = patterns.query_re.search(line)
            if m:
                # A new query starts with a clean record and clean rows.
                self.current_blastResult_obj = BlastResult(method="blastall", mode="F")
                self.current_blastResult_obj.sequenceID_A = m.group(1)
                del alignment_summary[:], aligned_query[:], aligned_sbjct[:]
                alignment_start_index = None
                sbjct_matching = False

            current = self.current_blastResult_obj
            if current is None:
                # Lines before the first "Query=": there is no record to fill.
                continue

            m = patterns.letters_re.search(line)
            if m:
                current.query_length = int(m.group(1).replace(",", ''))

            subject = patterns.sbjct_re.search(line)
            if subject:
                if current.e_value is None:
                    # First subject of the query: nothing to close yet.
                    current.sequenceID_B = subject.group(1)
                else:
                    reportable = current.sequenceID_A != current.sequenceID_B
                    if reportable and self.parse_lines:
                        self.parse_alignment_line(alignment_summary, aligned_query, aligned_sbjct)
                    fresh = BlastResult(method="blastall", mode="F")
                    fresh.sequenceID_A = current.sequenceID_A
                    fresh.sequenceID_B = subject.group(1)
                    fresh.query_length = current.query_length
                    self.current_blastResult_obj = fresh
                    if reportable:
                        return current
                    # A self-hit is dropped. The record and the collected
                    # rows must still be renewed, otherwise every following
                    # hit of this query would be taken for the self-hit.
                    current = fresh
                    del alignment_summary[:], aligned_query[:], aligned_sbjct[:]
                    alignment_start_index = None
                    sbjct_matching = False

            m = patterns.length_re.search(line)
            if m:
                current.sbjct_length = int(m.group(1).replace(",", ''))

            if line.startswith("Matrix"):
                # Query finished
                if current.e_value is not None:
                    reportable = current.sequenceID_A != current.sequenceID_B
                    if reportable and self.parse_lines:
                        self.parse_alignment_line(alignment_summary, aligned_query, aligned_sbjct)
                    self.current_blastResult_obj = BlastResult(method="blastall", mode="F")
                    if reportable:
                        return current
                    # Dropped self-hit: do not leak its rows into the next query.
                    del alignment_summary[:], aligned_query[:], aligned_sbjct[:]
                    alignment_start_index = None
                    sbjct_matching = False
                continue

            get_evalue = (patterns.score_re.search(line)
                          or patterns.score_option2_re.search(line))
            if get_evalue:
                # New alignment block. If the record already holds one, that
                # one is finished and a new record for the same subject
                # takes over.
                finished = None
                if current.e_value is not None:
                    if current.sequenceID_A != current.sequenceID_B:
                        if self.parse_lines:
                            self.parse_alignment_line(alignment_summary, aligned_query, aligned_sbjct)
                        finished = current
                    else:
                        del alignment_summary[:], aligned_query[:], aligned_sbjct[:]
                        alignment_start_index = None
                        sbjct_matching = False
                    fresh = BlastResult(method="blastall", mode="F")
                    fresh.sequenceID_A = current.sequenceID_A
                    fresh.query_length = current.query_length
                    fresh.sequenceID_B = current.sequenceID_B
                    fresh.sbjct_length = current.sbjct_length
                    self.current_blastResult_obj = fresh
                    current = fresh

                # Groups: 1 and 2 are the two score figures, 3 the e-value.
                current.set_evalue(get_evalue.group(3))
                current.score_bits = str(get_evalue.group(1))
                current.score = str(get_evalue.group(2))

                if finished is not None:
                    return finished

            get_identities = patterns.identities_re.search(line)
            if get_identities:
                current.align_length = get_identities.group(1)
                current.identities = str(get_identities.group(2))

            get_positives = patterns.positives_re.search(line)
            if get_positives:
                current.positives = str(get_positives.group(1))

            get_gaps = patterns.gaps_re.search(line)
            if get_gaps:
                current.gaps = str(get_gaps.group(1))

            get_intervals_query = patterns.intervals_query_re.search(line)
            if get_intervals_query:
                if current.query_start is None:
                    current.query_start = int(get_intervals_query.group(1))
                if alignment_start_index is None:
                    # Column of the aligned residues, taken from the match
                    # itself: searching the line for the residue text could
                    # hit the "Query:" label for very short alignments.
                    alignment_start_index = get_intervals_query.start(2)
                subalignment_length = len(get_intervals_query.group(2))
                capture_matching_line = True
                sbjct_matching = True
                aligned_query.append(get_intervals_query.group(2))
                current.query_end = int(get_intervals_query.group(3))

            get_intervals_Sbjct = patterns.sbjct_intervals_re.search(line)
            # A "Sbjct:" row only counts right after a "Query:" row.
            if get_intervals_Sbjct and sbjct_matching:
                if current.sbjct_start is None:
                    current.sbjct_start = int(get_intervals_Sbjct.group(1))
                current.sbjct_end = int(get_intervals_Sbjct.group(3))
                aligned_sbjct.append(get_intervals_Sbjct.group(2))
                sbjct_matching = False

        raise StopIteration
示例#6
0
class BlastallParserIterator(object):
    """
    Iterator over the hits of a plain-text blastall report.

    Each call to next() reads lines from the given file object until one
    hit is complete and returns it as a BlastResult. Hits whose subject id
    equals the query id are skipped.
    """

    # Patterns for the individual fields of the report.
    query_re = re.compile(r"Query=\s*(.+)\s*")
    letters_re = re.compile(r"\(\s*([\,\d]+)\s*letters\s*")
    sbjct_re = re.compile(r">([\w\d\_\.\|]+)")
    length_re = re.compile(r"Length \= ([\,\d]+)")
    score_re = re.compile(r"Score\s+=\s+([\.\d]+)\s+bits\s+\((\d+)\),\s+Expect\s+=\s+([\d\.e\-]+)")
    score_option2_re = re.compile(r"Score\s+=\s+([\.\d]+)\s+\(([\.\d]+)\s+bits\)\,\s+Expect\s+=\s+([\d\.e\-]+)")
    identities_re = re.compile(r"Identities\s+=\s+\d+\/(\d+)\s+\((\d+)%\)")
    positives_re = re.compile(r"Positives\s+=\s+\d+\/\d+\s+\((\d+)%\)")
    gaps_re = re.compile(r"Gaps\s+=\s+\d+\/\d+\s+\((\d+)%\)")
    intervals_query_re = re.compile(r"Query:\s+(\d+)\s+(\S+)\s+(\d+)$")
    sbjct_intervals_re = re.compile(r"Sbjct:\s+(\d+)\s+(\S+)\s+(\d+)$")

    def __init__(self, fd_blastall_output, parse_detailed_alignments=False):
        """
        "fd_blastall_output" is an iterable of report lines (e.g. an open
        file or pipe).

        "parse_detailed_alignments": when true, the exact/similar match
        lists of every returned hit are filled from the alignment rows.
        """
        self.fd = fd_blastall_output
        # Record being filled; created when the first "Query=" line is read.
        self.current_blastResult_obj = None
        self.parse_lines = parse_detailed_alignments

    def __iter__(self):
        return self

    def parse_alignment_line(self, alignment_line_list, aligned_query, aligned_sbjct):
        """
        Fill the exact/similar match lists of the current record.

        The three arguments are lists of row pieces (match line, query row,
        subject row); each list is joined into one string. Every column
        whose match-line character is not a blank is recorded as
        column index + start position - number of "-" seen so far in that
        sequence. "+" columns go to the similar lists only, any other
        character to both the exact and the similar lists.

        Raises ValueError when the three joined strings differ in length.
        """
        # The original author flagged an unresolved "bug with the gaps" in
        # this routine without describing it.
        alignment_line = "".join(alignment_line_list)
        aligned_query = "".join(aligned_query)
        aligned_sbjct = "".join(aligned_sbjct)

        if not (len(alignment_line) == len(aligned_query) == len(aligned_sbjct)):
            # The offending rows travel with the exception instead of being
            # printed to stdout.
            raise ValueError("Alignments must be of the same size\n%s\n%s\n%s"
                             % (aligned_query, alignment_line, aligned_sbjct))

        record = self.current_blastResult_obj
        query_gaps = 0
        sbjct_gaps = 0

        columns = zip(alignment_line, aligned_query, aligned_sbjct)
        for x, (value, query_char, sbjct_char) in enumerate(columns):
            if query_char == "-":
                query_gaps += 1
            if sbjct_char == "-":
                sbjct_gaps += 1
            if value == " ":
                continue
            query_position = x + record.query_start - query_gaps
            sbjct_position = x + record.sbjct_start - sbjct_gaps
            if value != "+":
                record.query_exact_match_list.append(query_position)
                record.sbjct_exact_match_list.append(sbjct_position)
            record.query_similar_match_list.append(query_position)
            record.sbjct_similar_match_list.append(sbjct_position)

    def next(self):
        """
        Return the next reportable hit read from the blastall report.

        Lines are consumed from self.fd until the hit being read is closed
        by the next subject header (">"), by another "Score =" line (a
        further alignment block) or by the "Matrix" line that ends the
        report of a query. Hits whose subject id equals the query id are
        dropped. When self.parse_lines is true the alignment rows are
        handed to self.parse_alignment_line before the hit is returned.

        Raises StopIteration when the input is exhausted; a hit that was not
        closed by one of the lines above is not returned.
        """
        patterns = BlastallParserIterator

        # Scratch state for the alignment rows of the hit being read. It is
        # local because every return hands back a finished hit and the next
        # call starts collecting rows for a new one.
        alignment_start_index = None
        subalignment_length = 0
        capture_matching_line = False
        sbjct_matching = False
        alignment_summary = []
        aligned_query = []
        aligned_sbjct = []

        for line in self.fd:

            if capture_matching_line:
                # The row after a "Query:" row is the match line. Drop the
                # line ending and pad with blanks so that a match line whose
                # trailing spaces were stripped neither counts the newline
                # as a match nor ends up shorter than the sequence rows.
                midline = line.rstrip("\r\n")
                midline = midline[alignment_start_index:alignment_start_index + subalignment_length]
                alignment_summary.append(midline.ljust(subalignment_length))
                capture_matching_line = False
                continue

            m = patterns.query_re.search(line)
            if m:
                # A new query starts with a clean record and clean rows.
                self.current_blastResult_obj = BlastResult(method="blastall", mode="F")
                self.current_blastResult_obj.sequenceID_A = m.group(1)
                del alignment_summary[:], aligned_query[:], aligned_sbjct[:]
                alignment_start_index = None
                sbjct_matching = False

            current = self.current_blastResult_obj
            if current is None:
                # Lines before the first "Query=": there is no record to fill.
                continue

            m = patterns.letters_re.search(line)
            if m:
                current.query_length = int(m.group(1).replace(",", ''))

            subject = patterns.sbjct_re.search(line)
            if subject:
                if current.e_value is None:
                    # First subject of the query: nothing to close yet.
                    current.sequenceID_B = subject.group(1)
                else:
                    reportable = current.sequenceID_A != current.sequenceID_B
                    if reportable and self.parse_lines:
                        self.parse_alignment_line(alignment_summary, aligned_query, aligned_sbjct)
                    fresh = BlastResult(method="blastall", mode="F")
                    fresh.sequenceID_A = current.sequenceID_A
                    fresh.sequenceID_B = subject.group(1)
                    fresh.query_length = current.query_length
                    self.current_blastResult_obj = fresh
                    if reportable:
                        return current
                    # A self-hit is dropped. The record and the collected
                    # rows must still be renewed, otherwise every following
                    # hit of this query would be taken for the self-hit.
                    current = fresh
                    del alignment_summary[:], aligned_query[:], aligned_sbjct[:]
                    alignment_start_index = None
                    sbjct_matching = False

            m = patterns.length_re.search(line)
            if m:
                current.sbjct_length = int(m.group(1).replace(",", ''))

            if line.startswith("Matrix"):
                # Query finished
                if current.e_value is not None:
                    reportable = current.sequenceID_A != current.sequenceID_B
                    if reportable and self.parse_lines:
                        self.parse_alignment_line(alignment_summary, aligned_query, aligned_sbjct)
                    self.current_blastResult_obj = BlastResult(method="blastall", mode="F")
                    if reportable:
                        return current
                    # Dropped self-hit: do not leak its rows into the next query.
                    del alignment_summary[:], aligned_query[:], aligned_sbjct[:]
                    alignment_start_index = None
                    sbjct_matching = False
                continue

            get_evalue = (patterns.score_re.search(line)
                          or patterns.score_option2_re.search(line))
            if get_evalue:
                # New alignment block. If the record already holds one, that
                # one is finished and a new record for the same subject
                # takes over.
                finished = None
                if current.e_value is not None:
                    if current.sequenceID_A != current.sequenceID_B:
                        if self.parse_lines:
                            self.parse_alignment_line(alignment_summary, aligned_query, aligned_sbjct)
                        finished = current
                    else:
                        del alignment_summary[:], aligned_query[:], aligned_sbjct[:]
                        alignment_start_index = None
                        sbjct_matching = False
                    fresh = BlastResult(method="blastall", mode="F")
                    fresh.sequenceID_A = current.sequenceID_A
                    fresh.query_length = current.query_length
                    fresh.sequenceID_B = current.sequenceID_B
                    fresh.sbjct_length = current.sbjct_length
                    self.current_blastResult_obj = fresh
                    current = fresh

                # Groups: 1 and 2 are the two score figures, 3 the e-value.
                current.set_evalue(get_evalue.group(3))
                current.score_bits = str(get_evalue.group(1))
                current.score = str(get_evalue.group(2))

                if finished is not None:
                    return finished

            get_identities = patterns.identities_re.search(line)
            if get_identities:
                current.align_length = get_identities.group(1)
                current.identities = str(get_identities.group(2))

            get_positives = patterns.positives_re.search(line)
            if get_positives:
                current.positives = str(get_positives.group(1))

            get_gaps = patterns.gaps_re.search(line)
            if get_gaps:
                current.gaps = str(get_gaps.group(1))

            get_intervals_query = patterns.intervals_query_re.search(line)
            if get_intervals_query:
                if current.query_start is None:
                    current.query_start = int(get_intervals_query.group(1))
                if alignment_start_index is None:
                    # Column of the aligned residues, taken from the match
                    # itself: searching the line for the residue text could
                    # hit the "Query:" label for very short alignments.
                    alignment_start_index = get_intervals_query.start(2)
                subalignment_length = len(get_intervals_query.group(2))
                capture_matching_line = True
                sbjct_matching = True
                aligned_query.append(get_intervals_query.group(2))
                current.query_end = int(get_intervals_query.group(3))

            get_intervals_Sbjct = patterns.sbjct_intervals_re.search(line)
            # A "Sbjct:" row only counts right after a "Query:" row.
            if get_intervals_Sbjct and sbjct_matching:
                if current.sbjct_start is None:
                    current.sbjct_start = int(get_intervals_Sbjct.group(1))
                current.sbjct_end = int(get_intervals_Sbjct.group(3))
                aligned_sbjct.append(get_intervals_Sbjct.group(2))
                sbjct_matching = False

        raise StopIteration

    # Same method under the name the Python 3 iterator protocol looks for.
    __next__ = next
示例#7
0
def parse_bl2seq_output(sequenceID_A, sequenceID_B, bl2seq_output=None, fd_output_file=None):
    """
    Parse the text output of a bl2seq run and write its significant hits.

    The output is scanned line by line with a single BlastResult object that
    is filled in as the fields of a hit are found. A "Score = ..." line opens
    a new hit, and a line containing "Lambda" marks the end of the useful
    information. At either boundary the hit collected so far is written to
    fd_output_file as str(BlastResult), but only when its e-value is below
    0.1; the BlastResult is then reset before the next hit is collected.

    Parameters:
        sequenceID_A:   value stored in the sequenceID_A attribute of the
                        BlastResult.
        sequenceID_B:   value stored in the sequenceID_B attribute of the
                        BlastResult.
        bl2seq_output:  complete bl2seq output as one string. When None the
                        function returns immediately without writing.
        fd_output_file: object with a write() method that receives the hits.
                        Defaults to sys.stdout.

    Returns:
        None. The results are only emitted through fd_output_file.
    """

    # Raw strings keep the regex escapes (\s, \d, \/ ...) away from Python's
    # own string-escape processing; the patterns themselves are unchanged.
    score_re = re.compile(r"Score\s+=\s+([\.\d]+)\s+bits\s+\((\d+)\),\s+Expect\s+=\s+([\d\.e\-]+)")
    identities_re = re.compile(r"Identities\s+=\s+\d+\/(\d+)\s+\((\d+)%\)")
    positives_re = re.compile(r"Positives\s+=\s+\d+\/\d+\s+\((\d+)%\)")
    gaps_re = re.compile(r"Gaps\s+=\s+\d+\/\d+\s+\((\d+)%\)")
    intervals_query_re = re.compile(r"Query:\s+(\d+)\s+.+\s(\d+)$")
    sbjct_intervals_re = re.compile(r"Sbjct:\s+(\d+)\s+.+\s(\d+)$")
    letters_re = re.compile(r"\(\s*([\d\,]+)\s*letters\s*\)")
    length_re = re.compile(r"Length\s+\=\s+([\d\,]+)")

    # Hits are reported only when their e-value is strictly below this value.
    evalue_cutoff = 0.1

    if fd_output_file is None:
        fd_output_file = sys.stdout

    if bl2seq_output is None:
        return

    blastResult_obj = BlastResult(method="bl2seq", mode="F")
    blastResult_obj.sequenceID_A = sequenceID_A
    blastResult_obj.sequenceID_B = sequenceID_B

    def close_current_hit():
        # Nothing has been collected until a "Score" line set the e-value.
        if blastResult_obj.e_value is None:
            return
        # NOTE(review): assumes set_evalue() stores a numeric e_value so that
        # this comparison is meaningful -- confirm against BlastResult.
        if blastResult_obj.e_value < evalue_cutoff:
            fd_output_file.write(str(blastResult_obj))
        # Reset even when the hit is discarded: the start coordinates are
        # only assigned while they are None, so leftovers from a discarded
        # hit would otherwise be reported as the start of the next one.
        blastResult_obj.reset()

    for line in bl2seq_output.split("\n"):
        if "Lambda" in line:
            # Useful information is finished: emit the last collected hit.
            close_current_hit()
            continue

        get_evalue = score_re.search(line)
        if get_evalue:
            # New hit found: emit the previous one before overwriting it.
            close_current_hit()
            blastResult_obj.set_evalue(get_evalue.group(3))
            blastResult_obj.score = get_evalue.group(2)
            blastResult_obj.score_bits = get_evalue.group(1)

        # "(1,234 letters)": thousands separators are dropped before int().
        m = letters_re.search(line)
        if m:
            blastResult_obj.query_length = int(m.group(1).replace(',', ''))

        m = length_re.search(line)
        if m:
            blastResult_obj.sbjct_length = int(m.group(1).replace(',', ''))

        get_identities = identities_re.search(line)
        if get_identities:
            # group(1) is the denominator of "Identities = x/y", group(2)
            # the percentage; both are stored as the captured strings.
            blastResult_obj.align_length = get_identities.group(1)
            blastResult_obj.identities = get_identities.group(2)

        get_positives = positives_re.search(line)
        if get_positives:
            blastResult_obj.positives = get_positives.group(1)

        get_gaps = gaps_re.search(line)
        if get_gaps:
            blastResult_obj.gaps = get_gaps.group(1)

        # An alignment may span several "Query:"/"Sbjct:" lines: the start is
        # taken from the first one only, the end is updated on every line.
        get_intervals_query = intervals_query_re.search(line)
        if get_intervals_query:
            if blastResult_obj.query_start is None:
                blastResult_obj.query_start = get_intervals_query.group(1)
            blastResult_obj.query_end = get_intervals_query.group(2)

        get_intervals_Sbjct = sbjct_intervals_re.search(line)
        if get_intervals_Sbjct:
            if blastResult_obj.sbjct_start is None:
                blastResult_obj.sbjct_start = get_intervals_Sbjct.group(1)
            blastResult_obj.sbjct_end = get_intervals_Sbjct.group(2)