def setAlignments(seqs=None, profiles=None, run_id=None,
                  ali_type='struct', **kwargs):
    '''
    Build one alignment of the same sequences for each profile.

    Parameters:
      seqs      Non-empty collection of sequences; the same collection is
                aligned for every profile.
      profiles  Non-empty collection of profiles; one alignment is returned
                per profile.
      run_id    Non-empty run identifier, forwarded to infernal.alignment
                (required even for ali_type == 'muscle', where it is unused).
      ali_type  'struct' aligns seqs against each profile with
                infernal.alignment; 'muscle' runs muscle.align(seqs) once
                and reuses that single alignment for every profile.
      kwargs    Accepted for call compatibility and ignored.

    Returns a tuple (alignments, refs, pairs):
      alignments  list with len(profiles) entries.
      refs        for 'struct': per profile, a list of len(seqs) entries all
                  referring to element [1] of that profile's
                  infernal.alignment result; None for 'muscle'.
      pairs       for 'struct': per profile, a list of len(seqs) entries all
                  referring to utils.stk_pairs() of element [2] of that
                  profile's infernal.alignment result; None for 'muscle'.

    Raises:
      ValueError           if seqs, run_id or profiles is missing or empty.
      NotImplementedError  if ali_type is neither 'struct' nor 'muscle'.
    '''
    # Explicit raises instead of `assert`: asserts vanish under `python -O`.
    if not seqs:
        raise ValueError('setAlignments: `seqs` must be non-empty')
    if not run_id:
        raise ValueError('setAlignments: `run_id` must be non-empty')
    if not profiles:
        raise ValueError('setAlignments: `profiles` must be non-empty')

    if ali_type == 'struct':
        n_seqs = len(seqs)
        # Run every infernal alignment first, then derive the per-profile
        # lists, so the external calls happen in profile order.
        ali_out = [infernal.alignment(seqs, p, run_id) for p in profiles]
        alignments = [a[0] for a in ali_out]
        # NOTE: `[x] * n` repeats a reference to one object, so the n_seqs
        # entries of each inner list are the same object, not copies.
        refs = [[a[1]] * n_seqs for a in ali_out]
        pairs = [[utils.stk_pairs(a[2])] * n_seqs for a in ali_out]
    elif ali_type == 'muscle':
        # A single multiple sequence alignment is shared by every profile.
        alignment = muscle.align(seqs)
        alignments = [alignment] * len(profiles)
        # TODO: structural references/pairs are not implemented for muscle
        # alignments. The intended approach was to find the homologous
        # structure for each profile within the given multiple sequence
        # alignment (e.g. by aligning each sequence to each profile with
        # infernal); how to place those in a tree and track the structural
        # elements through it is still an open question.
        refs = None
        pairs = None
    else:
        raise NotImplementedError(
            'sorry not implemented: ali_type={0!r}'.format(ali_type))

    return alignments, refs, pairs
def get_consensus(rfid='RF00', mweight=.5,
                  refseq_method='root', sp_method='sample',
                  aff_type='pairs', reset=True,
                  do_plot=False, run_id='CONS_TEST'):
    '''
    Label the family tree with its aligned sequences and return the family's
    exemplar structures.

    Current behaviour: the family is loaded with rfam.get_fam(rfid), every
    tree terminal gets an `m` dict holding its alignment record ('seq') and
    a list of ones of the same length ('probs'), a reference sequence is
    computed for the whole tree, and the function then returns the result of
    family_exemplar_structs(...).

    NOTE(review): that return makes the per-clade consensus pipeline below
    it unreachable. The pipeline is kept (with fixes) so that it can be
    re-enabled, but it also depends on names that are never defined in this
    function (`exemplar_structs`, `inds`, `pca_vecs`, `title`).

    Parameters:
      rfid           Family id passed to rfam.get_fam, rutils.ungapped_seq
                     and family_exemplar_structs.
      mweight        Unused.
      refseq_method  Forwarded as `method`/`refseq_method` to subtree_refseq
                     and family_exemplar_structs.
      sp_method      Forwarded to family_exemplar_structs.
      aff_type       Forwarded to family_exemplar_structs.
      reset          Unused.
      do_plot        Unused (only referenced by a commented-out plot call).
      run_id         Only used by the unreachable pipeline (profile building
                     and the output pickle name).

    Returns whatever family_exemplar_structs returns.
    '''
    ali, tree, infos = rfam.get_fam(rfid)
    ali_ids = [a.name for a in ali]

    # The alignment record name is taken from the text between the first
    # pair of underscores in each terminal's name. Compiled once, not per
    # terminal.
    term_id_re = re.compile('_([^_]*)_')
    for n in tree.get_terminals():
        term_id = term_id_re.search(n.name).group(1)
        this_seq = ali[ali_ids.index(term_id)]
        n.m = {'seq': this_seq,
               'probs': [1] * len(this_seq)}

    #if do_plot : rplots.plot_clusters(inds,{'pca embedding':pca_vecs},title = title,plot3d = True)

    big_refnode, big_refseq = subtree_refseq(tree, method=refseq_method)
    ungapped_ref = rutils.ungapped_seq(big_refseq, rfid)

    # NOTE(review): this used to be an assignment
    # (`pca_vecs, exemplar_structs = ...`); it is now an early return, so
    # nothing below this statement runs.
    return family_exemplar_structs(rfid, sp_method=sp_method,
                                   refseq_method=refseq_method,
                                   aff_type=aff_type)

    # ------------------------------------------------------------------
    # UNREACHABLE from here on (see note above). `exemplar_structs` is not
    # defined in this function; it has to be restored before re-enabling.
    # ------------------------------------------------------------------
    struct_profiles = infernal.profiles(ungapped_ref, exemplar_structs, run_id)
    clades = split_tree(tree)

    n_structs = len(struct_profiles)
    n_clades = len(clades)

    def empty_grid():
        # One independent empty list per (clade, struct) cell.
        return [[[] for _ in range(n_structs)] for _ in range(n_clades)]

    all_vecs = {'all_time': empty_grid(),
                'all_mut': empty_grid(),
                'fiftyfifty': empty_grid()}

    aamuts, aatimes, aairr, aagaps = [], [], [], []
    for idx_clade, c in enumerate(clades):
        clade_terms = c.get_terminals()
        if len(clade_terms) < 3:
            print('SKIPPPING CUZ SUBTREE TOO SMALL')
            continue

        # Skip clades in which the same sequence name occurs more than once.
        c_ids = [t.m['seq'].name for t in clade_terms]
        if len(set(c_ids)) != len(c_ids):
            print('SKIPPING CUZ THERE ARE TWO COPIES OF SOME F*****G SEQUENCE IN TREE')
            continue

        all_muts, all_times, all_gaps, all_irr = [], [], [], []
        print('')
        print('Clade: {0}'.format(idx_clade))
        for idx_struct, (struct_profile, ex_struct) in enumerate(
                zip(struct_profiles, exemplar_structs)):
            cterms = c.get_terminals()

            # OLD ALIGNMENTS
            calis = ba.MultipleSeqAlignment([ct.m['seq'] for ct in cterms])
            # NEW ALIGNMENTS AND REF STRUCTURE
            # (rfid, not run_id, is what gets passed as the third argument.)
            c_new_ali, stk, struct = infernal.alignment(calis,
                                                        struct_profile, rfid)
            # REF STRUCTURE PAIRS
            pairs = rutils.stk_pairs(struct)
            if len(pairs) != len(ex_struct):
                raise Exception(
                    'pair count mismatch for clade {0}, struct {1}: '
                    '{2} from the infernal alignment vs {3} in the '
                    'exemplar struct'.format(idx_clade, idx_struct,
                                             len(pairs), len(ex_struct)))

            # Rename terminals N0, N1, ... and attach the new alignment row
            # with the matching index.
            for i2, ct in enumerate(cterms):
                lilid = 'N{0}'.format(i2)
                ct.name = lilid
                ct.m['str_seq'] = c_new_ali[i2]
                ct.m['str_seq'].id = lilid
                ct.m['probs'] = ones(len(c_new_ali[i2]))

            # BUILD A TREE
            tr = phy.BaseTree.Tree(c)

            # RUN PAML
            # Both indices go into the run id; the previous format string
            # used index 0 twice, so all structs of a clade shared one id.
            paml_run_id = 'ali_anc_c{0:04}_s{1:03}'.format(idx_clade,
                                                           idx_struct)
            rstfile = paml.run_paml(tr, c_new_ali, run_id=paml_run_id)
            anc_tree = paml.rst_parser(rstfile)

            # Label terminal and internal nodes with sequences.
            # Terminals carry both the old (rfam) and new (infernal)
            # alignments: reuse the metadata dict of the clade terminal
            # with the same name.
            meta_by_name = dict((ct.name, ct.m) for ct in cterms)
            for term in anc_tree.get_terminals():
                term.m = meta_by_name[term.name]
            # Internal nodes only have the new alignment, so
            # m['str_seq'] is m['seq'], with T rewritten to U.
            for node in anc_tree.get_nonterminals():
                node.m['str_seq'] = node.m['seq']
                node.m['str_seq'].seq = \
                    node.m['str_seq'].seq.replace('T', 'U')

            # Evaluate all of the structs on the first pass to have access
            # to mean frequencies of different mutational types in the
            # final score computation.
            # The return value of subtree_refseq is not used here; the call
            # is kept in case it has side effects -- TODO confirm and drop.
            subtree_refseq(anc_tree, method=refseq_method)
            muts, times, gaps, irresolvables = \
                subtree_count_struct(anc_tree, pairs)
            all_muts.append(muts)
            all_times.append(times)
            all_gaps.append(gaps)
            all_irr.append(irresolvables)

        compute_signatures(all_vecs, idx_clade,
                           all_muts, all_times,
                           exemplar_structs, ungapped_ref)

        aamuts.append(all_muts)
        aatimes.append(all_times)
        aairr.append(all_irr)
        aagaps.append(all_gaps)

    # NOTE(review): `inds`, `pca_vecs` and `title` are not defined anywhere
    # in this function; building this dict raises NameError unless they
    # exist at module level.
    outputs = {
        'all_vecs': all_vecs,
        'all_muts': aamuts,
        'all_times': aatimes,
        'exemplar_structs': exemplar_structs,
        'reference_seq': ungapped_ref,
        'thermo_ex_inds': inds,
        'thermo_embedding': pca_vecs,
        'title': title,
        'thermo_aff_type': aff_type,
        'tree': tree,
        'run_id': run_id
    }

    # Binary mode and a context manager: the handle is always closed and
    # the pickle is written correctly regardless of platform.
    out_path = cfg.dataPath('cs874/runs/{0}.pickle'.format(run_id))
    with open(out_path, 'wb') as fh:
        pickle.dump(outputs, fh)
    return outputs