Пример #1
0
 def createHit(self, hsp_list):
     hit = Hit(hsp_list)
     hit.id_ = self.id_
     hit.evalue = self.evalue
     hit.bitscore = self.bitscore
     if self.description:
         hit.description = self.description
     hit.domain_obs_num = self.domain_obs_num
     return hit
Пример #2
0
def _set_qresult_hits(qresult, hit_rows=()):
    """Helper function for appending Hits without alignments into QueryResults."""
    for hit_row in hit_rows:
        hit_id, remainder = hit_row.split(' ', 1)
        # TODO: parse hit and hsp properties properly; by dealing with:
        #   - any character in the description (brackets, spaces, etc.)
        #   - possible [f] or [r] presence (for frame info)
        #   - possible presence of E2() column
        #   - possible incomplete hit_id due to column length limit
        # The current method only looks at the Hit ID, none of the things above
        if hit_id not in qresult:
            frag = HSPFragment(hit_id, qresult.id)
            hsp = HSP([frag])
            hit = Hit([hsp])
            qresult.append(hit)

    return qresult
Пример #3
0
    def _parse_hit(self, query_id):
        while True:
            self.line = self.handle.readline()
            if self.line.startswith('>>'):
                break

        state = _STATE_NONE
        strand = None
        hsp_list = []
        while True:
            peekline = self.handle.peekline()
            # yield hit if we've reached the start of a new query or
            # the end of the search
            if peekline.strip() in [">>><<<", ">>>///"] or \
                    (not peekline.startswith('>>>') and '>>>' in peekline):
                # append last parsed_hsp['hit']['seq'] line
                if state == _STATE_HIT_BLOCK:
                    parsed_hsp['hit']['seq'] += self.line.strip()
                elif state == _STATE_CONS_BLOCK:
                    hsp.aln_annotation['similarity'] += \
                            self.line.strip('\r\n')
                # process HSP alignment and coordinates
                _set_hsp_seqs(hsp, parsed_hsp, self._preamble['program'])
                hit = Hit(hsp_list)
                hit.description = hit_desc
                hit.seq_len = seq_len
                yield hit, strand
                hsp_list = []
                break
            # yield hit and create a new one if we're still in the same query
            elif self.line.startswith('>>'):
                # try yielding,  if we have hsps
                if hsp_list:
                    _set_hsp_seqs(hsp, parsed_hsp, self._preamble['program'])
                    hit = Hit(hsp_list)
                    hit.description = hit_desc
                    hit.seq_len = seq_len
                    yield hit, strand
                    hsp_list = []
                # try to get the hit id and desc, and handle cases without descs
                try:
                    hit_id, hit_desc = self.line[2:].strip().split(' ', 1)
                except ValueError:
                    hit_id = self.line[2:].strip().split(' ', 1)[0]
                    hit_desc = ''
                # create the HSP object for Hit
                frag = HSPFragment(hit_id, query_id)
                hsp = HSP([frag])
                hsp_list.append(hsp)
                # set or reset the state to none
                state = _STATE_NONE
                parsed_hsp = {'query': {}, 'hit': {}}
            # create and append a new HSP if line starts with '>--'
            elif self.line.startswith('>--'):
                # set seq attributes of previous hsp
                _set_hsp_seqs(hsp, parsed_hsp, self._preamble['program'])
                # and create a new one
                frag = HSPFragment(hit_id, query_id)
                hsp = HSP([frag])
                hsp_list.append(hsp)
                # set the state ~ none yet
                state = _STATE_NONE
                parsed_hsp = {'query': {}, 'hit': {}}
            # this is either query or hit data in the HSP, depending on the state
            elif self.line.startswith('>'):
                if state == _STATE_NONE:
                    # make sure it's the correct query
                    assert query_id.startswith(self.line[1:].split(' ')[0]), \
                            "%r vs %r" % (query_id, self.line)
                    state = _STATE_QUERY_BLOCK
                    parsed_hsp['query']['seq'] = ''
                elif state == _STATE_QUERY_BLOCK:
                    # make sure it's the correct hit
                    assert hit_id.startswith(self.line[1:].split(' ')[0])
                    state = _STATE_HIT_BLOCK
                    parsed_hsp['hit']['seq'] = ''
            # check for conservation block
            elif self.line.startswith('; al_cons'):
                state = _STATE_CONS_BLOCK
                hsp.fragment.aln_annotation['similarity'] = ''
            elif self.line.startswith(';'):
                # Fasta outputs do not make a clear distinction between Hit
                # and HSPs, so we check the attribute names to determine
                # whether it belongs to a Hit or HSP
                regx = re.search(_RE_ATTR, self.line.strip())
                name = regx.group(1)
                value = regx.group(2)

                # for values before the '>...' query block
                if state == _STATE_NONE:
                    if name in _HSP_ATTR_MAP:
                        attr_name, caster = _HSP_ATTR_MAP[name]
                        if caster is not str:
                            value = caster(value)
                        if name in ['_ident', '_sim']:
                            value *= 100
                        setattr(hsp, attr_name, value)
                # otherwise, pool the values for processing later
                elif state == _STATE_QUERY_BLOCK:
                    parsed_hsp['query'][name] = value
                elif state == _STATE_HIT_BLOCK:
                    if name == '_len':
                        seq_len = int(value)
                    else:
                        parsed_hsp['hit'][name] = value
                # for values in the hit block
                else:
                    raise ValueError("Unexpected line: %r" % self.line)
            # otherwise, it must be lines containing the sequences
            else:
                assert '>' not in self.line
                # if we're in hit, parse into hsp.hit
                if state == _STATE_HIT_BLOCK:
                    parsed_hsp['hit']['seq'] += self.line.strip()
                elif state == _STATE_QUERY_BLOCK:
                    parsed_hsp['query']['seq'] += self.line.strip()
                elif state == _STATE_CONS_BLOCK:
                    hsp.fragment.aln_annotation['similarity'] += \
                            self.line.strip('\r\n')
                # we should not get here!
                else:
                    raise ValueError("Unexpected line: %r" % self.line)

            self.line = self.handle.readline()
Пример #4
0
    def _parse_qresult(self):
        """Generator function that returns QueryResult objects."""
        # state values, determines what to do for each line
        state_EOF = 0
        state_QRES_NEW = 1
        state_QRES_SAME = 3
        # initial value dummies
        qres_state = None
        file_state = None
        prev_qid = None
        cur, prev = None, None
        # container for Hit objects, used to create QueryResult
        hit_list = []

        while True:
            # store previous line's parsed values for all lines after the first
            if cur is not None:
                prev = cur
                prev_qid = cur_qid
            # only parse the result row if it's not EOF
            # NOTE: we are not parsing the extra '#' lines appended to the end
            # of hmmer31b1 tabular results since storing them in qresult
            # objects means we can not do a single-pass parsing
            if self.line and not self.line.startswith('#'):
                cur = self._parse_row()
                cur_qid = cur['qresult']['id']
            else:
                file_state = state_EOF
                # mock value for cur_qid, since we have nothing to parse
                cur_qid = None

            if prev_qid != cur_qid:
                qres_state = state_QRES_NEW
            else:
                qres_state = state_QRES_SAME

            if prev is not None:
                # since domain tab formats only have 1 Hit per line
                # we always create HSPFragment, HSP, and Hit per line
                prev_hid = prev['hit']['id']

                # create fragment and HSP and set their attributes
                frag = HSPFragment(prev_hid, prev_qid)
                for attr, value in prev['frag'].items():
                    setattr(frag, attr, value)
                hsp = HSP([frag])
                for attr, value in prev['hsp'].items():
                    setattr(hsp, attr, value)

                # create Hit and set its attributes
                hit = Hit([hsp])
                for attr, value in prev['hit'].items():
                    setattr(hit, attr, value)
                hit_list.append(hit)

                # create qresult and yield if we're at a new qresult or at EOF
                if qres_state == state_QRES_NEW or file_state == state_EOF:
                    qresult = QueryResult(hit_list, prev_qid)
                    for attr, value in prev['qresult'].items():
                        setattr(qresult, attr, value)
                    yield qresult
                    # if we're at EOF, break
                    if file_state == state_EOF:
                        break
                    hit_list = []

            self.line = self.handle.readline()
Пример #5
0
    def _create_hits(self, hit_attrs, qid, qdesc):
        """Parses a HMMER3 hsp block, beginning with the hsp table."""
        # read through until the beginning of the hsp block
        self._read_until(lambda line: line.startswith('Internal pipeline') or
                         line.startswith('>>'))

        # start parsing the hsp block
        hit_list = []
        while True:
            if self.line.startswith('Internal pipeline'):
                # by this time we should've emptied the hit attr list
                assert len(hit_attrs) == 0
                return hit_list
            assert self.line.startswith('>>')
            hid, hdesc = self.line[len('>> '):].split('  ', 1)
            hdesc = hdesc.strip()

            # read through the hsp table header and move one more line
            self._read_until(lambda line:
                    line.startswith(' ---   ------ ----- --------') or
                    line.startswith('   [No individual domains'))
            self.line = read_forward(self.handle)

            # parse the hsp table for the current hit
            hsp_list = []
            while True:
                # break out of hsp parsing if there are no hits, it's the last hsp
                # or it's the start of a new hit
                if self.line.startswith('   [No targets detected that satisfy') or \
                   self.line.startswith('   [No individual domains') or \
                   self.line.startswith('Internal pipeline statistics summary:') or \
                   self.line.startswith('  Alignments for each domain:') or \
                   self.line.startswith('>>'):

                    hit_attr = hit_attrs.pop(0)
                    hit = Hit(hsp_list)
                    for attr, value in hit_attr.items():
                        if attr == "description":
                            cur_val = getattr(hit, attr)
                            if cur_val and value and cur_val.startswith(value):
                                continue
                        setattr(hit, attr, value)
                    if not hit:
                        hit.query_description = qdesc
                    hit_list.append(hit)
                    break

                parsed = [x for x in self.line.strip().split(' ') if x]
                assert len(parsed) == 16
                # parsed column order:
                # index, is_included, bitscore, bias, evalue_cond, evalue
                # hmmfrom, hmmto, query_ends, hit_ends, alifrom, alito,
                # envfrom, envto, acc_avg
                frag = HSPFragment(hid, qid)
                # set query and hit descriptions if they are defined / nonempty string
                if qdesc:
                    frag.query_description = qdesc
                if hdesc:
                    frag.hit_description = hdesc
                # HMMER3 alphabets are always protein alphabets
                frag.alphabet = generic_protein
                # depending on whether the program is hmmsearch, hmmscan, or phmmer
                # {hmm,ali}{from,to} can either be hit_{from,to} or query_{from,to}
                # for hmmscan, hit is the hmm profile, query is the sequence
                if self._meta.get('program') == 'hmmscan':
                    # adjust 'from' and 'to' coordinates to 0-based ones
                    frag.hit_start = int(parsed[6]) - 1
                    frag.hit_end = int(parsed[7])
                    frag.query_start = int(parsed[9]) - 1
                    frag.query_end = int(parsed[10])
                elif self._meta.get('program') in ['hmmsearch', 'phmmer']:
                    # adjust 'from' and 'to' coordinates to 0-based ones
                    frag.hit_start = int(parsed[9]) - 1
                    frag.hit_end = int(parsed[10])
                    frag.query_start = int(parsed[6]) - 1
                    frag.query_end = int(parsed[7])
                # strand is always 0, since HMMER now only handles protein
                frag.hit_strand = frag.query_strand = 0

                hsp = HSP([frag])
                hsp.domain_index = int(parsed[0])
                hsp.is_included = parsed[1] == '!'
                hsp.bitscore = float(parsed[2])
                hsp.bias = float(parsed[3])
                hsp.evalue_cond = float(parsed[4])
                hsp.evalue = float(parsed[5])
                if self._meta.get('program') == 'hmmscan':
                    # adjust 'from' and 'to' coordinates to 0-based ones
                    hsp.hit_endtype = parsed[8]
                    hsp.query_endtype = parsed[11]
                elif self._meta.get('program') in ['hmmsearch', 'phmmer']:
                    # adjust 'from' and 'to' coordinates to 0-based ones
                    hsp.hit_endtype = parsed[11]
                    hsp.query_endtype = parsed[8]
                # adjust 'from' and 'to' coordinates to 0-based ones
                hsp.env_start = int(parsed[12]) - 1
                hsp.env_end = int(parsed[13])
                hsp.env_endtype = parsed[14]
                hsp.acc_avg = float(parsed[15])

                hsp_list.append(hsp)
                self.line = read_forward(self.handle)

            # parse the hsp alignments
            if self.line.startswith('  Alignments for each domain:'):
                self._parse_aln_block(hid, hit.hsps)
Пример #6
0
    def _parse_qresult(self):
        """Generator function that returns QueryResult objects."""
        # state values, determines what to do for each line
        state_EOF = 0
        state_QRES_NEW = 1
        state_QRES_SAME = 3
        state_HIT_NEW = 2
        state_HIT_SAME = 4
        # initial dummy values
        qres_state = None
        file_state = None
        cur_qid, cur_hid = None, None
        prev_qid, prev_hid = None, None
        cur, prev = None, None
        hit_list, hsp_list = [], []

        while True:
            # store previous line's parsed values for all lines after the first
            if cur is not None:
                prev = cur
                prev_qid = cur_qid
                prev_hid = cur_hid
            # only parse the result row if it's not EOF
            if self.line:
                cur = self._parse_row()
                cur_qid = cur['qname']
                cur_hid = cur['tname']
            else:
                file_state = state_EOF
                # mock values, since we have nothing to parse
                cur_qid, cur_hid = None, None

            # get the state of hit and qresult
            if prev_qid != cur_qid:
                qres_state = state_QRES_NEW
            else:
                qres_state = state_QRES_SAME
            # new hits are hits with different ids or hits in a new qresult
            if prev_hid != cur_hid or qres_state == state_QRES_NEW:
                hit_state = state_HIT_NEW
            else:
                hit_state = state_HIT_SAME

            if prev is not None:
                # create fragment and HSP and set their attributes
                hsp = _create_hsp(prev_hid, prev_qid, prev)
                hsp_list.append(hsp)

                if hit_state == state_HIT_NEW:
                    # create Hit and set its attributes
                    hit = Hit(hsp_list)
                    hit.seq_len = prev['tsize']
                    hit_list.append(hit)
                    hsp_list = []

                # create qresult and yield if we're at a new qresult or at EOF
                if qres_state == state_QRES_NEW or file_state == state_EOF:
                    qresult = QueryResult(id=prev_qid)
                    for hit in hit_list:
                        qresult.absorb(hit)
                    qresult.seq_len = prev['qsize']
                    yield qresult
                    # if we're at EOF, break
                    if file_state == state_EOF:
                        break
                    hit_list = []

            self.line = self.handle.readline()