示例#1
0
文件: run_fasta.py 项目: nhoffman/Seq
def get_tup(s, prefix=None):
    """Split a ``key: value`` field into a ``(key, value)`` pair.

    The key is the text before the first colon, with every '-' replaced
    by '_'.  When ``prefix`` is truthy the key is returned as
    ``'<prefix>_<key>'``.  The value is the text after the first colon,
    stripped of surrounding whitespace and passed through ``cast``.

    Only the first colon separates key from value, so a value that itself
    contains ':' is kept intact.

    Raises ValueError if ``s`` contains no colon.
    """
    # maxsplit=1: a plain split(':') cannot be unpacked into two names
    # when the value part contains another colon.
    key, val = s.strip().split(':', 1)
    key = key.replace('-', '_')

    if prefix:
        key = '%s_%s' % (prefix, key)

    return key, cast(val.strip())
示例#2
0
def parseNeedle(instr):
    """Extract alignment data from the output of needle run with
    -aformat3 markx10.

    Returns a dict of dicts, one per alignment block.  Each entry is keyed
    by a 2-tuple holding the stripped query and target names in sorted
    order.

    Every inner dict D contains the header fields, the query fields
    (keys prefixed 'q_'), the target fields (keys prefixed 't_'), plus
    'align_num' (zero-based position of the block), 'q_name', 't_name',
    'q_al_str', 't_al_str', and the keys 'q_al_enc' and 't_al_enc', which
    hold the repr() of sequtil.encode_aln applied to the aligned strings.
    As noted by the original author, the encodings are such that the
    following is true:

    aligned = D['q_al_str']
    degapped = aligned.replace('-','')
    aligned == sequtil.decode_aln(degapped, eval(D['q_al_enc']))

    All values are passed through cast() and the dict is handed to
    add_calculated_values() before being stored.

    Raises AssertionError if the input is not markx10 output, if the
    alignment numbers are not consecutive starting at 1, or if the same
    pair of names occurs twice.  Raises ValueError/IndexError when the
    expected delimiters are missing.
    """

    # do we have the correct format?
    assert 'Align_format: markx10' in instr

    # remove footer: keep only the text before the second-to-last '#'
    instr, _, _ = instr.rsplit('#', 2)

    # remove header: everything before the first '>>>' is discarded
    datablocks = instr.split('>>>')
    datablocks.pop(0)

    outputData = {}
    for i, block in enumerate(datablocks):

        # lop off the commented data and run parameters
        block = block.split('#=====')[0]
        block = block.split('>>#')[1]

        # consumes the first line, which carries the 1-based alignment number
        align_no, block = block.split('\n', 1)

        assert int(align_no) == i + 1

        header, query, target = block.split('>')

        # get the sequence names
        q_name, query = query.split('..', 1)
        t_name, target = target.split('..', 1)

        # sorted so that the key does not depend on which sequence was the query
        this_key = tuple(sorted([q_name.strip(), t_name.strip()]))

        ## process the header info
        d = dict(get_tup(e) for e in header.split(';') if e.strip())

        ## process the query and target
        q_data = dict(get_tup(e, 'q') for e in query.split(';') if e.strip())
        t_data = dict(get_tup(e, 't') for e in target.split(';') if e.strip())

        # note: zero-based, unlike the 1-based number checked above
        d['align_num'] = i

        ## add data
        d['q_name'] = q_name
        d.update(q_data)

        d['t_name'] = t_name
        d.update(t_data)

        # the al_display_start field is followed by the aligned sequence
        # on the subsequent lines; separate the two
        d['q_al_display_start'], q_al_str = d['q_al_display_start'].split('\n', 1)
        d['t_al_display_start'], t_al_str = d['t_al_display_start'].split('\n', 1)

        d['q_al_str'] = q_al_str.replace('\n', '').upper()
        d['t_al_str'] = t_al_str.replace('\n', '').upper()

        # repr() rather than backticks: same result, but backticks are
        # deprecated and are a syntax error under Python 3
        d['q_al_enc'] = repr(sequtil.encode_aln(d['q_al_str'], gapchar='-', self_check=True))
        d['t_al_enc'] = repr(sequtil.encode_aln(d['t_al_str'], gapchar='-', self_check=True))

        # each pair of sequences may appear only once
        assert this_key not in outputData

        d = dict((k, cast(v)) for k, v in d.items())
        add_calculated_values(d)

        outputData[this_key] = d

    return outputData