def get_tup(s, prefix=None):
    """Split a 'key: value' string into a (key, value) tuple.

    Leading/trailing whitespace is stripped from ``s`` and from the value,
    every '-' in the key is replaced with '_', and the value is passed
    through ``cast`` before being returned.

    Only the first ':' is treated as the separator, so a value that itself
    contains ':' is kept intact instead of raising on unpacking.

    Args:
        s: string of the form 'key: value'.
        prefix: optional; when truthy the key is returned as
            '<prefix>_<key>'.

    Returns:
        A 2-tuple ``(key, cast(value))``.

    Raises:
        ValueError: if ``s`` contains no ':'.
    """
    # maxsplit=1: everything after the first colon belongs to the value.
    key, val = s.strip().split(':', 1)
    key = key.replace('-', '_')
    if prefix:
        key = '%s_%s' % (prefix, key)
    return key, cast(val.strip())
def parseNeedle(instr):
    """Extract data from the output of needle run with -aformat3 markx10.

    Args:
        instr: the complete text output of needle as a single string.

    Returns:
        A dict of dicts, one per alignment block. Each key is the sorted
        tuple of the two stripped sequence names (query, target). Each
        value D holds:

        * the header fields of the block (parsed with ``get_tup``),
        * the query and target fields, prefixed with 'q_' and 't_',
        * 'align_num': zero-based index of the block,
        * 'q_name', 't_name': sequence names as found in the input,
        * 'q_al_str', 't_al_str': upper-cased aligned strings with
          newlines removed,
        * 'q_al_enc', 't_al_enc': repr() of ``sequtil.encode_aln`` applied
          to the corresponding aligned string.

        All values are passed through ``cast`` and the dict is then handed
        to ``add_calculated_values`` before being stored.

        According to the original author's note, the encodings are built
        so that the following holds::

            aligned = D['q_al_str']
            degapped = aligned.replace('-','')
            aligned == sequtil.decode_aln(degapped, eval(D['q_al_enc']))

    Raises:
        AssertionError: if the input is not in markx10 format, if the
            alignment numbers are not consecutive starting at 1, or if the
            same pair of sequence names occurs more than once.
        ValueError: if the input cannot be split into the expected parts.
    """
    # do we have the correct format?
    assert 'Align_format: markx10' in instr, 'input is not in markx10 format'

    # remove footer: drop everything from the second-to-last '#' onwards
    instr, _, _ = instr.rsplit('#', 2)

    # remove header: the text before the first '>>>' is not an alignment
    datablocks = instr.split('>>>')[1:]

    outputData = {}
    for i, block in enumerate(datablocks):
        # lop off the commented data and run parameters
        block = block.split('#=====')[0]
        block = block.split('>>#')[1]

        # the first line holds the (one-based) alignment number
        align_no, block = block.split('\n', 1)
        assert int(align_no) == i + 1, \
            'unexpected alignment number %r in block %i' % (align_no, i)

        header, query, target = block.split('>')

        # get the sequence names
        q_name, query = query.split('..', 1)
        t_name, target = target.split('..', 1)

        # sorted, so the key does not depend on which name is the query
        this_key = tuple(sorted([q_name.strip(), t_name.strip()]))

        # process the header info
        d = dict(get_tup(e) for e in header.split(';') if e.strip())

        # process the query and target
        q_data = dict(get_tup(e, 'q') for e in query.split(';') if e.strip())
        t_data = dict(get_tup(e, 't') for e in target.split(';') if e.strip())

        d['align_num'] = i

        # add data
        d['q_name'] = q_name
        d.update(q_data)
        d['t_name'] = t_name
        d.update(t_data)

        # the aligned sequence follows the al_display_start value on the
        # subsequent lines; separate the two
        d['q_al_display_start'], q_al_str = \
            d['q_al_display_start'].split('\n', 1)
        d['t_al_display_start'], t_al_str = \
            d['t_al_display_start'].split('\n', 1)

        d['q_al_str'] = q_al_str.replace('\n', '').upper()
        d['t_al_str'] = t_al_str.replace('\n', '').upper()

        # stored as repr() strings (see the docstring: recovered via eval)
        d['q_al_enc'] = repr(
            sequtil.encode_aln(d['q_al_str'], gapchar='-', self_check=True))
        d['t_al_enc'] = repr(
            sequtil.encode_aln(d['t_al_str'], gapchar='-', self_check=True))

        assert this_key not in outputData, \
            'duplicate sequence pair %r' % (this_key,)

        d = dict((k, cast(v)) for k, v in d.items())
        add_calculated_values(d)

        outputData[this_key] = d

    return outputData