Пример #1
0
def setAlignments( seqs = None, profiles = None, run_id = None, ali_type = 'struct',
                   **kwargs ):
    '''
Make N structural alignemnts for each of N profiles
having the same M sequences apiece. 
'''
    assert seqs; assert run_id; assert profiles
    
    ns = len(profiles)
    nq = len(seqs)
    rutils = utils

    if ali_type == 'struct':
        ali_out = [infernal.alignment(seqs, p, run_id) for p in profiles]
        alignments = [a[0] for a in ali_out]
        refs = [[a[1]]*nq for a in ali_out]
        stks =  [a[2] for a in ali_out]
        pairs = [[rutils.stk_pairs(stk)]*nq for stk in stks] 
    elif ali_type == 'muscle':
        alignment = muscle.align(seqs)
        alignments = [alignment] * ns     
        
        refs = None
        pairs = None
        stks = None
        #raise Exception('MUSCLE ALIGNMENT NOT YET IMPLEMENTED... SORRY :(')
        #ONCE IMPLEMENTED, WILL FIND HOMOLOGOUS STRUCTURES FOR EACH
        #PROFILE IN THE GIVEN MULTIPLE SEQUENCE ALIGNMENT.

        #IT REMAINS UNCLEAR HOW I WILL PUT THESE INTO A TREE AND TRACK THE
        #STRUCTURAL ELEMENTS THROUGH IT!

        #all_refs = []
        #all_pairs = []
        #for i, profile  in enumerate(profiles):
        #    these_refs = []
        #    these_pairs = []
        #    for j, q in enumerate(seqs):
        #        ali_out = [infernal.alignment([q],profile, run_id)]
        #        these_refs.append(ali_out[0][1])
        #        stk = ali_out[0][2]
        #        
        #        this_ali = ali_out[0][0]
        #        this_seq = this_ali[0]
        #        all_seqs.append(this_seq)
        #        these_pairs.append( rutils.stk_pairs(stk))
        #        raise Exception()
        #    all_refs.append(these_refs)
        #    all_pairs.append(these_pairs)
        refs = None
        pairs = None
                
    else:  
        raise Exception('sorry not implemented')
    return alignments, refs, pairs
Пример #2
0
def get_consensus(rfid = 'RF00', mweight = .5, 
                  refseq_method = 'root', sp_method = 'sample',
                  aff_type = 'pairs',  reset = True,
                  do_plot = False,  run_id = 'CONS_TEST'):

    ali, tree, infos = rfam.get_fam(rfid)
    ali_ids = [a.name for a in ali]

    for i, n in enumerate(tree.get_terminals()):
        term_id = re.compile('_([^_]*)_').search(n.name).group(1) 
        this_seq = ali[ali_ids.index(term_id)]
        n.m = {'seq':this_seq,
               'probs':[1 for j in range(len(this_seq))]}

    #if do_plot : rplots.plot_clusters(inds,{'pca embedding':pca_vecs},title = title,plot3d = True)
    

    big_refnode, big_refseq = \
        subtree_refseq(tree, method = refseq_method)
    ungapped_ref = rutils.ungapped_seq(big_refseq, rfid)
    #pca_vecs,exemplar_structs =
    return family_exemplar_structs(rfid,
                                   sp_method = sp_method,
                                   refseq_method = refseq_method,
                                   aff_type = aff_type,
                                   )
    struct_profiles = infernal.profiles(ungapped_ref,exemplar_structs, run_id)

    clades = split_tree(tree)
    all_vecs = {'all_time':[ [ [] for i in range(len(struct_profiles))] 
			     for j in range(len(clades)) ],
		'all_mut':[ [ [] for i in range(len(struct_profiles))] 
			     for j in range(len(clades)) ],
		'fiftyfifty':[ [ [] for i in range(len(struct_profiles))] 
			     for j in range(len(clades)) ]}

    aamuts, aatimes, aairr, aagaps = [], [], [], []
    for idx_clade, c in enumerate(clades):
        if len(c.get_terminals()) < 3:
		print 'SKIPPPING CUZ SUBTREE TOO SMALL'
		continue
	c_ids = [ n.m['seq'].name for n in c.get_terminals() ]
	if len(nonzero(greater([len(list(g)) for k, g in it.groupby(sorted(c_ids))],1))[0])>0:
		print 'SKIPPING CUZ THERE ARE TWO COPIES OF SOME F*****G SEQUENCE IN TREE'
		continue          
        all_muts, all_times , all_gaps, all_irr = [], [], [], []
	print
	print 'Clade: {0}'.format(idx_clade)
        for idx_struct, struct_info in enumerate( zip( struct_profiles, exemplar_structs)):
          struct_profile, ex_struct = struct_info
	  ngaps = 0

          #OLD ALIGNMENTS
          calis = ba.MultipleSeqAlignment(\
              [n.m['seq'] for n in c.get_terminals() ])
          #NEW ALIGNMENTS AND REF STRUCTURE
          c_new_ali , stk, struct = infernal.alignment(calis, struct_profile, rfid)
          #REF STRUCTURE PAIRS
          pairs = rutils.stk_pairs(struct)
	  if len(pairs) != len(ex_struct):
		  raise Exception()
           
          cterms = c.get_terminals()
          for i2, ct in enumerate(cterms):
              lilid =  'N{0}'.format(i2)
              ct.name = lilid
              ct.m['str_seq'] = c_new_ali[i2]
              ct.m['str_seq'].id = lilid
	      ct.m['probs'] = ones(len(c_new_ali[i2]))
          
          #BUILD A TREE
          tr = phy.BaseTree.Tree(c)

          #RUN PAML
          paml_run_id = 'ali_anc_c{0:04}_s{0:03}'.format(idx_clade,idx_struct)
          rstfile= paml.run_paml(tr, c_new_ali, run_id = paml_run_id)
          anc_tree = paml.rst_parser(rstfile) 

          #Label extent and internal nodes with sequences.
          for term in anc_tree.get_terminals():
              #Terminals have old (rfam) alis and new (infernal) alis
              term.m = filter( lambda x: x.name == term.name, cterms)[0].m
          for node in anc_tree.get_nonterminals():
              #Internals only have new alis. m['seq'] = m['str_seq']
              node.m['str_seq'] = node.m['seq']
              node.m['str_seq'].seq = node.m['str_seq'].seq.replace('T', 'U')
          subtree = anc_tree
              
 
          #Evaluate all of the structs on the first pass
          #to have access to mean frequencies of different
          #mutational types in the final score computation
	  
          refnode, refseq = subtree_refseq(subtree, method = refseq_method)
          muts, times, gaps, irresolvables = subtree_count_struct(subtree, pairs)
          all_muts.append(muts)
          all_times.append(times)
	  all_gaps.append(gaps)
	  all_irr.append(irresolvables)
        
	compute_signatures(all_vecs,idx_clade,
			   all_muts,all_times,
			   exemplar_structs,ungapped_ref )
				      
	aamuts.append(all_muts)
	aatimes.append(all_times)
	aairr.append(all_irr)
	aagaps.append(all_gaps)
    outputs = {
	    'all_vecs':all_vecs,
	    'all_muts':aamuts,
	    'all_times':aatimes,
	    'exemplar_structs':exemplar_structs,
	    'reference_seq':ungapped_ref,
	    'thermo_ex_inds':inds,
	    'thermo_embedding':pca_vecs,
	    'title':title,
	    'thermo_aff_type':aff_type,
	    'tree':tree,
	    'run_id':run_id
	    }
	 
    pickle.dump(outputs, open(cfg.dataPath('cs874/runs/{0}.pickle'.format(run_id)),'w'))
    return(outputs)