示例#1
0
    def parse(query_sequence, blast_output_file, self_hit=False, hitid_format='single'):
        '''
        Processes a blast xml formatted output into a {BlastResult} object.

        Reads the header fields (version, matrix, gap penalties, database) and
        the query definition from the file, then walks every iteration / hit /
        hsp element, building one {BlastHit} per hsp and adding it to the
        result unless {BlastParser.same_query_hit_names} rejects it.

        @param:    query_sequence
        @pdef:     sequence of the query protein/nucleotide.
        @ptype:    {String}

        @param:    blast_output_file
        @pdef:     output file from BLAST.
        @ptype:    {String}

        @param:    self_hit
        @pdef:     when _True_ if the query is found in the database, it is
                   retrieved.
        @pdefault: _False_
        @ptype:    {Boolean}

        @param:    hitid_format
        @pdef:     format of the name of the hit. If given a wrong option,
                   it defaults to 'single'
        @pdefault: 'single'
        @poptions: 'single' -> first word of the name,
                   'double' -> first two words of the hit name,
                   'all'    -> all the text in the hit name
        @ptype:    {String}

        @raises: {BlastError} (the object returned by its parse_error()) when
                 at least one added hit reports a falsy are_segments_ok. A
                 warning listing the offending alignments is emitted first.
                 Missing tags or non-numeric values in the file surface as
                 the underlying AttributeError / ValueError.
        @returns: {BlastResult}
        '''
        # NOTE(review): the File handle is never closed here; whether File
        # needs an explicit close is not visible from this block - confirm.
        f = File(blast_output_file)
        # No parser is named on purpose: every lookup below uses lower-case
        # tag names, which relies on an HTML-style parser lower-casing the
        # tags. Presumably switching to a case-preserving XML parser would
        # break these lookups - confirm before changing.
        s = BeautifulSoup(f.read())

        h = BlastHeader(version    = str(s.find('blastoutput_version').string),
                        matrix     = str(s.find('parameters_matrix').string),
                        gap_open   = int(s.find('parameters_gap-open').string),
                        gap_extend = int(s.find('parameters_gap-extend').string),
                        database   = str(s.find('blastoutput_db').string),
                        self_hit   = self_hit)
        b = BlastResult(query_name     = str(s.find('blastoutput_query-def').string),
                        query_sequence = query_sequence,
                        header         = h)

        SBIg.alert('debug', BlastParser(), b.str_blast_header())

        # Messages describing every hit whose segments failed the check; a
        # non-empty list means the parse is reported as failed at the end.
        error_str = []
        for iteration in s.find_all('iteration'):
            iternum = int(iteration.find('iteration_iter-num').string)
            for hit in iteration.find_all('hit'):
                hit_name   = BlastParser.hit_name(str(hit.find('hit_def').string), hitid_format)
                hit_length = int(hit.find("hit_len").string)
                for subhit in hit.find_all("hsp"):
                    data = BlastParser.parse_subhit(subhit)
                    r = BlastHit(hit            = [hit_name, hit_length],
                                 sequences      = [data['qs'], data['hs'], data['sc']],
                                 sequence_inits = [data['qp'], data['hp']],
                                 iteration      = iternum,
                                 stats          = [data['hi'], data['h+'],
                                                   data['hg'], data['ev']])
                    if BlastParser.same_query_hit_names(b.query, hit_name, self_hit):
                        continue
                    dbug_info = 'Added hit {0} in iteration {1}'
                    SBIg.alert('debug', BlastParser(), dbug_info.format(hit_name, r.iteration))
                    b.add_hit(r)
                    # NOTE(review): are_segments_ok is read as an attribute /
                    # property; if it is actually a method this test can
                    # never be true - confirm against BlastHit.
                    if not r.are_segments_ok:
                        error_str.append("Check the alignment's fragmentation")
                        # The original used '%s' placeholders with .format(),
                        # so the names were never substituted.
                        error_str.append("for the query {0} with {1}\n".format(b.query, hit_name))
                        error_str.append("{0}\n".format(r))
        b.set_last_iteration()
        if error_str:
            SBIg.warn(BlastParser(), error_str)
            be = BlastError()
            raise be.parse_error()
        return b