def muscle_seqs(seqs,
                add_seq_names=False,
                out_filename=None,
                input_handler=None,
                params={},
                WorkingDir=None,
                SuppressStderr=None,
                SuppressStdout=None):
    """Muscle align a collection of sequences.

    seqs: the sequences to align, given as one of
        - a list of sequences as strings or objects (you must set
          add_seq_names=True),
        - a single multiline string, as read() from a fasta file,
        - a list of lines, as readlines() from a fasta file,
        - a fasta sequence filename.

    Decision rules for input handling (applied by guess_input_handler
    unless input_handler is given):
        - If seqs is a list, treat it as lines, unless add_seq_names is
          true (in which case treat it as a list of seqs).
        - If seqs is a string without newlines, assume it is a filename.
        - If seqs is a string with newlines, it can't be a filename, so
          assume it is a multiline string containing sequences.

    Expected guesses, for example:
        'abc.txt'                       -> '_input_as_string'
        '>ab' + newline + 'TCAG'        -> '_input_as_multiline_string'
        ['ACC', 'TGA'] (add_seq_names)  -> '_input_as_seqs'
        ['>a', 'ACC', '>b', 'TGA']      -> '_input_as_lines'

    add_seq_names: boolean. If True, sequence names are inserted in the
        list of sequences. If False, seqs is assumed to be a list of lines
        of some proper format that the program can handle.
    out_filename: if given, it is passed to the application as the "-out"
        parameter; otherwise "-out" is left as supplied in params (or
        unset).
    input_handler: name of the input handler to use. If you want to skip
        the detection and force a specific type of input handler, use
        input_handler='your_favorite_handler'.
    params: dict of application parameters. It is copied before use, so
        neither the caller's dict nor the shared default is modified.
        None is treated as an empty dict.
    WorkingDir, SuppressStderr, SuppressStdout: forwarded unchanged to the
        Muscle application constructor.

    Returns whatever calling the configured Muscle application on seqs
    returns.
    """
    # Work on a private copy: the default ``{}`` is created once and shared
    # by every call, so writing "-out" into it would leak the output
    # filename into later calls (and into the caller's own dict).
    params = dict(params or {})
    if out_filename:
        params["-out"] = out_filename

    ih = input_handler or guess_input_handler(seqs, add_seq_names)
    muscle_app = Muscle(
        params=params,
        InputHandler=ih,
        WorkingDir=WorkingDir,
        SuppressStderr=SuppressStderr,
        SuppressStdout=SuppressStdout)
    return muscle_app(seqs)
def blast_seqs(seqs,
               blast_constructor,
               blast_db=None,
               blast_mat_root=None,
               params={},
               add_seq_names=True,
               out_filename=None,
               WorkingDir=None,
               SuppressStderr=None,
               SuppressStdout=None,
               input_handler=None,
               HALT_EXEC=False
               ):
    """Blast a collection of sequences.

    seqs: either file name or list of sequence objects or list of strings
        or single multiline string containing sequences.

    WARNING: DECISION RULES FOR INPUT HANDLING HAVE CHANGED. Decision rules
    for data are as follows. If it's a list, treat as lines, unless
    add_seq_names is true (in which case treat as list of seqs). If it's a
    string, test whether it has newlines. If it doesn't have newlines,
    assume it's a filename. If it does have newlines, it can't be a
    filename, so assume it's a multiline string containing sequences.

    If you want to skip the detection and force a specific type of input
    handler, use input_handler='your_favorite_handler'.

    blast_constructor: callable that builds the blast application. It is
        called with the keyword arguments params, blast_mat_root,
        InputHandler, WorkingDir, SuppressStderr, SuppressStdout and
        HALT_EXEC, and must return a callable that accepts seqs.
    blast_db: if given, passed to the application as the "-d" parameter.
    out_filename: if given, passed to the application as the "-o"
        parameter.
    params: dict of application parameters. It is copied before use, so
        neither the caller's dict nor the shared default is modified.
        None is treated as an empty dict.
    add_seq_names: boolean. If True, sequence names are inserted in the
        list of sequences. If False, it assumes seqs is a list of lines of
        some proper format that the program can handle.
    blast_mat_root, WorkingDir, SuppressStderr, SuppressStdout, HALT_EXEC:
        forwarded unchanged to blast_constructor.

    Returns whatever calling the constructed application on seqs returns.
    """
    # Work on a private copy: the default ``{}`` is created once and shared
    # by every call, so writing "-d"/"-o" into it would make one call's
    # database and output file silently apply to later calls.
    params = dict(params or {})
    if blast_db:
        params["-d"] = blast_db
    if out_filename:
        params["-o"] = out_filename

    ih = input_handler or guess_input_handler(seqs, add_seq_names)

    blast_app = blast_constructor(
        params=params,
        blast_mat_root=blast_mat_root,
        InputHandler=ih,
        WorkingDir=WorkingDir,
        SuppressStderr=SuppressStderr,
        SuppressStdout=SuppressStdout,
        HALT_EXEC=HALT_EXEC)

    return blast_app(seqs)
def psiblast_n_neighbors(seqs,
                         n=100,
                         blast_db=None,
                         core_threshold=1e-50,
                         extra_threshold=1e-10,
                         lower_threshold=1e-6,
                         step=100,
                         method="two-step",
                         blast_mat_root=None,
                         params={},
                         add_seq_names=False,
                         WorkingDir=None,
                         SuppressStderr=None,
                         SuppressStdout=None,
                         input_handler=None,
                         scorer=3,  # shotgun with 3 hits needed to keep
                         second_db=None
                         ):
    """PsiBlasts sequences, stopping when n neighbors are reached.

    seqs: either file name or list of sequence objects or list of strings
        or single multiline string containing sequences.
    n: number of neighbors to stop at; forwarded to the per-sequence
        search helper.
    blast_db: if given, passed to the application as the "-d" parameter.
    core_threshold: threshold for the core profile (default: 1e-50)
    extra_threshold: threshold for pulling in additional seqs
        (default: 1e-10)
    lower_threshold: threshold for seqs in final round (default: 1e-6)
    step: forwarded to the helper used by method="lower_threshold".
    method: which per-sequence search helper to use; one of "two-step",
        "lower_threshold" or "iterative".
    params: dict of application parameters. It must contain "-j", which is
        read as the maximum number of iterations. The dict is copied
        before use, so neither the caller's dict nor the shared default is
        modified. None is treated as an empty dict.
    add_seq_names: boolean. If True, sequence names are inserted in the
        list of sequences. If False, it assumes seqs is a list of lines of
        some proper format that the program can handle.
    input_handler: if you want to skip the detection and force a specific
        type of input handler, use input_handler='your_favorite_handler'.
    scorer: forwarded to the helper used by method="iterative".
    second_db: forwarded to the helper used by method="two-step".
    blast_mat_root, WorkingDir, SuppressStderr, SuppressStdout: forwarded
        unchanged to the PsiBlast constructor.

    Returns a dict mapping each query id (the first whitespace-delimited
    token of the record's first line, minus its leading character) to the
    value returned by the selected helper for that record.

    Raises KeyError if params has no "-j" entry, and TypeError if method is
    not one of the supported names (checked when a record is processed).
    """
    # Work on a private copy: the function overrides "-j" (and possibly
    # "-d"), and doing that on the shared default or on the caller's dict
    # would leak into later calls -- permanently so if a search helper
    # raised before the old "-j" could be put back.
    params = dict(params or {})
    if blast_db:
        params["-d"] = blast_db

    ih = input_handler or guess_input_handler(seqs, add_seq_names)
    recs = seqs_to_stream(seqs, ih)  # checkpointing can only handle one seq...

    # set up the parameters for the core and additional runs
    if '-j' not in params:
        raise KeyError(
            "params must contain '-j' (the maximum number of iterations)")
    max_iterations = params['-j']
    params['-j'] = 2  # won't checkpoint with single iteration

    app = PsiBlast(params=params,
                   blast_mat_root=blast_mat_root,
                   InputHandler='_input_as_lines',
                   WorkingDir=WorkingDir,
                   SuppressStderr=SuppressStderr,
                   SuppressStdout=SuppressStdout,
                   )
    result = {}
    for seq in recs:
        # First line is taken to be the label line: drop its first
        # character and keep the first whitespace-delimited token.
        query_id = seq[0][1:].split(None, 1)[0]
        if method == "two-step":
            result[query_id] = ids_from_seq_two_step(
                seq, n, max_iterations, app, core_threshold,
                extra_threshold, lower_threshold, second_db)
        elif method == "lower_threshold":
            result[query_id] = ids_from_seq_lower_threshold(
                seq, n, max_iterations, app, core_threshold,
                lower_threshold, step)
        elif method == "iterative":
            # Note: this passes the overridden "-j" value (2), not
            # max_iterations.
            result[query_id] = ids_from_seqs_iterative(
                seq, app, QMEPsiBlast9, scorer, params['-j'], n)
        else:
            raise TypeError("Got unknown method %s" % method)

    # No need to restore "-j": only the private copy was changed.
    return result