def cluster_seqs(seqs,
                 neighbor_join=False,
                 params={},
                 add_seq_names=True,
                 WorkingDir=None,
                 SuppressStderr=None,
                 SuppressStdout=None,
                 max_chars=1000000,
                 max_hours=1.0,
                 constructor=PhyloNode,
                 clean_up=True
                 ):
    """Muscle cluster list of sequences.

    seqs: either file name or list of sequence objects or list of strings or
        single multiline string containing sequences.  NOTE: the size checks
        below use len(seqs) and len() of each element, so they are only
        meaningful when seqs is a list of sequences.
    neighbor_join: accepted for interface compatibility; currently unused.
    params: extra muscle parameters.  The mapping is copied, so neither the
        caller's dict nor the shared default is modified.
    add_seq_names, WorkingDir, SuppressStderr, SuppressStdout: passed
        through to muscle_seqs.
    max_chars: when the summed length of seqs exceeds this, faster and less
        thorough settings are used (-maxiters 2, -diags1, -sv).
    max_hours: value passed to muscle as -maxhours.
    constructor: node class handed to DndParser when building the tree.
    clean_up: when true, cleanUp() is called on the muscle result, even if
        parsing the tree fails.

    Returns the tree parsed from the muscle "Tree1Out" result.

    Raises ValueError when fewer than 2 sequences are given.
    """
    num_seqs = len(seqs)
    if num_seqs < 2:
        raise ValueError("Muscle requres 2 or more sequences to cluster.")

    # Work on a private copy: the original mutated ``params`` in place, so
    # the fast-align flags and temp file names set below leaked into the
    # shared default dict (and the caller's dict) and affected later calls.
    params = dict(params) if params else {}

    num_chars = sum(map(len, seqs))
    if num_chars > max_chars:
        params["-maxiters"] = 2
        params["-diags1"] = True
        params["-sv"] = True
        # Single pre-formatted argument prints identically on Python 2 and 3.
        print("lots of chars, using fast align %s" % num_chars)

    params["-maxhours"] = max_hours
    params["-cluster"] = True
    params["-tree1"] = get_tmp_filename(WorkingDir)

    muscle_res = muscle_seqs(seqs,
                             params=params,
                             add_seq_names=add_seq_names,
                             WorkingDir=WorkingDir,
                             SuppressStderr=SuppressStderr,
                             SuppressStdout=SuppressStdout)

    try:
        tree = DndParser(muscle_res["Tree1Out"], constructor=constructor)
    finally:
        # Run the cleanup even when tree parsing raises.
        if clean_up:
            muscle_res.cleanUp()
    return tree
def aln_tree_seqs(seqs,
                  input_handler=None,
                  tree_type='neighborjoining',
                  params={},
                  add_seq_names=True,
                  WorkingDir=None,
                  SuppressStderr=None,
                  SuppressStdout=None,
                  max_hours=5.0,
                  constructor=PhyloNode,
                  clean_up=True
                  ):
    """Muscle align sequences and report tree from iteration2.

    Unlike cluster_seqs, returns tree2 which is the tree made during the
    second muscle iteration (it should be more accurate than the cluster from
    the first iteration which is made fast based on k-mer words)

    seqs: either file name or list of sequence objects or list of strings or
        single multiline string containing sequences.
    input_handler: passed through to muscle_seqs.
    tree_type: can be either neighborjoining (default) or upgmb for UPGMA.
        When falsy, -cluster2 is not set.
    params: extra muscle parameters.  The mapping is copied, so neither the
        caller's dict nor the shared default is modified.
    add_seq_names, WorkingDir, SuppressStderr, SuppressStdout: passed
        through to muscle_seqs.
    max_hours: value passed to muscle as -maxhours.
    constructor: node class handed to DndParser when building the tree.
    clean_up: When true, will clean up output files (also when reading the
        results fails).

    Returns (tree, aln): the tree parsed from the "Tree2Out" result and the
    list of items obtained by iterating over the "MuscleOut" result.
    """
    # Work on a private copy: the original wrote into ``params`` directly,
    # so these settings persisted in the shared default dict between calls.
    params = dict(params) if params else {}

    params["-maxhours"] = max_hours
    if tree_type:
        params["-cluster2"] = tree_type
    params["-tree2"] = get_tmp_filename(WorkingDir)
    params["-out"] = get_tmp_filename(WorkingDir)

    muscle_res = muscle_seqs(seqs,
                             input_handler=input_handler,
                             params=params,
                             add_seq_names=add_seq_names,
                             WorkingDir=WorkingDir,
                             SuppressStderr=SuppressStderr,
                             SuppressStdout=SuppressStdout)

    try:
        tree = DndParser(muscle_res["Tree2Out"], constructor=constructor)
        # Materialise the alignment before cleanup runs.
        aln = list(muscle_res["MuscleOut"])
    finally:
        # Run the cleanup even when reading the results raises.
        if clean_up:
            muscle_res.cleanUp()
    return tree, aln
def get_aligned_muscle(seq1, seq2):
    """Returns aligned sequences and frac_same using MUSCLE.

    This needs to be moved to the muscle app controller

    seq1, seq2: the two sequences to align.

    Returns (seq1_aligned, seq2_aligned, frac_same), where frac_same is the
    number of matching aligned positions divided by the length of the
    shorter input sequence, as a float.
    """
    outname = get_tmp_filename()
    res = muscle_seqs([seq1, seq2],
                      add_seq_names=True,
                      WorkingDir="/tmp",
                      out_filename=outname)
    try:
        # Exactly two records are expected; unpacking raises otherwise.
        # NOTE(review): the parser is handed the whole output as a single
        # string -- confirm MinimalFastaParser accepts that rather than
        # requiring an iterable of lines.
        seq1_aligned, seq2_aligned = \
            list(MinimalFastaParser(res['MuscleOut'].read()))
    finally:
        # Run the cleanup even when parsing or unpacking raises.
        res.cleanUp()

    # Take element [1] of each parsed record and drop its first character.
    # NOTE(review): presumably [1] is the sequence and the dropped character
    # is a prefix added upstream -- confirm against muscle_seqs.
    seq1_aligned = seq1_aligned[1][1:]
    seq2_aligned = seq2_aligned[1][1:]

    matches = sum(array(seq1_aligned) == array(seq2_aligned))
    # Force true division: under Python 2 classic division an integer count
    # divided by an integer length truncates the fraction to 0 or 1.
    frac_same = float(matches) / min(len(seq1), len(seq2))
    return seq1_aligned, seq2_aligned, frac_same