示例#1
0
def muscle_seqs(seqs,
                 add_seq_names=False,
                 out_filename=None,
                 input_handler=None,
                 params=None,
                 WorkingDir=None,
                 SuppressStderr=None,
                 SuppressStdout=None):
    """Align sequences with Muscle and return the application's result.

    seqs: one of
        - a list of sequences as strings or objects (requires
          add_seq_names=True),
        - a single multiline string, as read() from a fasta file,
        - a list of lines, as readlines() from a fasta file,
        - a fasta sequence filename.

    Input handling rules (applied by guess_input_handler when input_handler
    is not given): if seqs is a list, it is treated as lines unless
    add_seq_names is true, in which case it is treated as a list of seqs.
    If it is a string without newlines it is assumed to be a filename; a
    string with newlines cannot be a filename, so it is assumed to be a
    multiline string containing sequences.

    add_seq_names: boolean. If True, sequence names are inserted in the list
        of sequences. If False, seqs is assumed to be a list of lines in a
        format the program can handle.
    out_filename: if given, it is passed to the application as the "-out"
        parameter. When omitted, no "-out" parameter is set here.
    input_handler: name of the input handler to use; skips the automatic
        detection, e.g. input_handler='_input_as_lines'.
    params: optional dict of application parameters. It is copied, so
        neither the caller's dict nor any shared default is modified.
    WorkingDir, SuppressStderr, SuppressStdout: passed through unchanged to
        the Muscle constructor.

    Returns whatever calling the Muscle application object on seqs returns.
    """
    # Copy before adding "-out": the original mutated a shared default dict
    # (and the caller's dict), leaking the output filename into later calls.
    params = dict(params) if params else {}

    if out_filename:
        params["-out"] = out_filename

    # An explicit handler wins; detection only runs when none was supplied.
    ih = input_handler or guess_input_handler(seqs, add_seq_names)
    muscle_app = Muscle(
                   params=params,
                   InputHandler=ih,
                   WorkingDir=WorkingDir,
                   SuppressStderr=SuppressStderr,
                   SuppressStdout=SuppressStdout)
    return muscle_app(seqs)
示例#2
0
def muscle_seqs(seqs,
                 add_seq_names=False,
                 out_filename=None,
                 input_handler=None,
                 params=None,
                 WorkingDir=None,
                 SuppressStderr=None,
                 SuppressStdout=None):
    """Run a Muscle alignment on the given sequences.

    seqs: the sequences to align, supplied as any of:
        - a fasta sequence filename (a string without newlines),
        - a multiline string, as returned by read() on a fasta file,
        - a list of lines, as returned by readlines() on a fasta file,
        - a list of sequences as strings or objects; this form requires
          add_seq_names=True.

    Unless input_handler is given, guess_input_handler(seqs, add_seq_names)
    picks the handler: a list is treated as lines unless add_seq_names is
    true (then it is a list of seqs); a string with no newlines is assumed
    to be a filename; a string with newlines is assumed to be a multiline
    string containing sequences.

    add_seq_names: boolean. If True, sequence names are inserted in the list
        of sequences; if False, seqs is assumed to already be lines in a
        format the program can handle.
    out_filename: when truthy, set as the "-out" application parameter.
    input_handler: forces a specific input handler by name and bypasses
        detection.
    params: optional dict of application parameters; a copy is used so the
        caller's dict is left untouched.
    WorkingDir, SuppressStderr, SuppressStdout: forwarded as-is to Muscle.

    Returns the value produced by calling the Muscle application on seqs.
    """
    # Never write into the argument itself: with a dict default the "-out"
    # entry would persist across calls and silently affect unrelated runs.
    app_params = dict(params) if params else {}
    if out_filename:
        app_params["-out"] = out_filename

    # Short-circuit keeps guess_input_handler from running when a handler
    # name was passed explicitly.
    ih = input_handler or guess_input_handler(seqs, add_seq_names)
    muscle_app = Muscle(
                   params=app_params,
                   InputHandler=ih,
                   WorkingDir=WorkingDir,
                   SuppressStderr=SuppressStderr,
                   SuppressStdout=SuppressStdout)
    return muscle_app(seqs)
示例#3
0
def blast_seqs(seqs,
                 blast_constructor,
                 blast_db=None,
                 blast_mat_root=None,
                 params=None,
                 add_seq_names=True,
                 out_filename=None,
                 WorkingDir=None,
                 SuppressStderr=None,
                 SuppressStdout=None,
                 input_handler=None,
                 HALT_EXEC=False
                 ):
    """Blast a collection of sequences and return the application's result.

    seqs: either a file name, a list of sequence objects, a list of strings,
        or a single multiline string containing sequences.
    blast_constructor: callable used to build the application object. It is
        called with the keyword arguments params, blast_mat_root,
        InputHandler, WorkingDir, SuppressStderr, SuppressStdout and
        HALT_EXEC, and its result is then called with seqs.
    blast_db: if given, set as the "-d" parameter.
    blast_mat_root: passed through to blast_constructor.
    params: optional dict of application parameters. It is copied, so
        neither the caller's dict nor any shared default is modified.
    add_seq_names: boolean. If True, sequence names are inserted in the list
        of sequences. If False, seqs is assumed to be a list of lines in a
        format the program can handle.
    out_filename: if given, set as the "-o" parameter.
    WorkingDir, SuppressStderr, SuppressStdout, HALT_EXEC: passed through
        unchanged to blast_constructor.
    input_handler: name of the input handler to use; skips automatic
        detection, e.g. input_handler='your_favorite_handler'.

    Input handling rules (applied by guess_input_handler when input_handler
    is not given): if seqs is a list, it is treated as lines unless
    add_seq_names is true, in which case it is treated as a list of seqs.
    If it is a string without newlines it is assumed to be a filename; a
    string with newlines cannot be a filename, so it is assumed to be a
    multiline string containing sequences.

    Returns whatever calling the constructed application on seqs returns.
    """
    # Copy before adding "-d"/"-o": the original wrote into a shared default
    # dict, so a database or output file from one call leaked into the next.
    params = dict(params) if params else {}

    if blast_db:
        params["-d"] = blast_db

    if out_filename:
        params["-o"] = out_filename

    # An explicit handler wins; detection only runs when none was supplied.
    ih = input_handler or guess_input_handler(seqs, add_seq_names)

    blast_app = blast_constructor(
                   params=params,
                   blast_mat_root=blast_mat_root,
                   InputHandler=ih,
                   WorkingDir=WorkingDir,
                   SuppressStderr=SuppressStderr,
                   SuppressStdout=SuppressStdout,
                   HALT_EXEC=HALT_EXEC)

    return blast_app(seqs)
示例#4
0
def blast_seqs(seqs,
               blast_constructor,
               blast_db=None,
               blast_mat_root=None,
               params=None,
               add_seq_names=True,
               out_filename=None,
               WorkingDir=None,
               SuppressStderr=None,
               SuppressStdout=None,
               input_handler=None,
               HALT_EXEC=False):
    """Run a blast application over the given sequences.

    seqs: a file name, a list of sequence objects, a list of strings, or a
        single multiline string containing sequences.
    blast_constructor: factory for the application object; invoked with
        keyword arguments only (params, blast_mat_root, InputHandler,
        WorkingDir, SuppressStderr, SuppressStdout, HALT_EXEC). The object
        it returns is called with seqs.
    blast_db: when truthy, stored as the "-d" parameter.
    blast_mat_root: forwarded to blast_constructor.
    params: optional dict of application parameters; a copy is used so the
        caller's dict is left untouched.
    add_seq_names: boolean. If True, sequence names are inserted in the list
        of sequences; if False, seqs is assumed to already be lines in a
        format the program can handle.
    out_filename: when truthy, stored as the "-o" parameter.
    WorkingDir, SuppressStderr, SuppressStdout, HALT_EXEC: forwarded as-is
        to blast_constructor.
    input_handler: forces a specific input handler by name and bypasses
        detection.

    Unless input_handler is given, guess_input_handler(seqs, add_seq_names)
    picks the handler: a list is treated as lines unless add_seq_names is
    true (then it is a list of seqs); a string with no newlines is assumed
    to be a filename; a string with newlines is assumed to be a multiline
    string containing sequences.

    Returns the value produced by calling the constructed application on
    seqs.
    """
    # Never write into the argument itself: with a dict default the "-d" and
    # "-o" entries would persist across calls and affect unrelated searches.
    app_params = dict(params) if params else {}
    if blast_db:
        app_params["-d"] = blast_db
    if out_filename:
        app_params["-o"] = out_filename

    # Short-circuit keeps guess_input_handler from running when a handler
    # name was passed explicitly.
    ih = input_handler or guess_input_handler(seqs, add_seq_names)

    blast_app = blast_constructor(params=app_params,
                                  blast_mat_root=blast_mat_root,
                                  InputHandler=ih,
                                  WorkingDir=WorkingDir,
                                  SuppressStderr=SuppressStderr,
                                  SuppressStdout=SuppressStdout,
                                  HALT_EXEC=HALT_EXEC)

    return blast_app(seqs)
示例#5
0
def psiblast_n_neighbors(seqs,
                 n=100,
                 blast_db=None,
                 core_threshold=1e-50,
                 extra_threshold=1e-10,
                 lower_threshold=1e-6,
                 step=100,
                 method="two-step",
                 blast_mat_root=None,
                 params=None,
                 add_seq_names=False,
                 WorkingDir=None,
                 SuppressStderr=None,
                 SuppressStdout=None,
                 input_handler=None,
                 scorer=3,   #shotgun with 3 hits needed to keep
                 second_db=None
                 ):
    """PsiBlasts sequences, stopping when n neighbors are reached.

    seqs: either file name or list of sequence objects or list of strings or
        single multiline string containing sequences.
    n: target number of neighbors, forwarded to the per-sequence helper.
    blast_db: if given, set as the "-d" parameter.
    core_threshold: threshold for the core profile (default: 1e-50)
    extra_threshold: threshold for pulling in additional seqs
        (default: 1e-10)
    lower_threshold: threshold for seqs in final round (default: 1e-6)
    step: forwarded to ids_from_seq_lower_threshold
        (method="lower_threshold" only).
    method: "two-step", "lower_threshold" or "iterative"; selects which
        helper is run for each query record.
    blast_mat_root: passed through to the PsiBlast constructor.
    params: dict of application parameters. It must contain "-j", which is
        read as the maximum iteration count handed to the helpers. The dict
        is copied, so the caller's dict is never modified.
    add_seq_names: boolean. If True, sequence names are inserted in the list
        of sequences. If False, seqs is assumed to be a list of lines in a
        format the program can handle.
    WorkingDir, SuppressStderr, SuppressStdout: passed through unchanged to
        the PsiBlast constructor.
    input_handler: name of the input handler to use; skips automatic
        detection, e.g. input_handler='your_favorite_handler'.
    scorer: forwarded to ids_from_seqs_iterative (method="iterative" only).
    second_db: forwarded to ids_from_seq_two_step (method="two-step" only).

    Returns a dict mapping each query id to the value returned by the
    selected helper for that query.

    Raises KeyError if params has no "-j" entry, and TypeError if method is
    not one of the supported names (checked per record, so only when at
    least one record is processed).
    """
    # Work on a private copy. The original wrote "-d" and a temporary "-j"
    # into the caller's (or the shared default) dict and only restored "-j"
    # on success, so any exception left the caller's settings corrupted.
    params = dict(params) if params else {}

    if blast_db:
        params["-d"] = blast_db

    ih = input_handler or guess_input_handler(seqs, add_seq_names)
    recs = seqs_to_stream(seqs, ih) #checkpointing can only handle one seq...

    #set up the parameters for the core and additional runs
    max_iterations = params['-j']
    params['-j'] = 2    #won't checkpoint with single iteration

    app = PsiBlast(params=params,
                   blast_mat_root=blast_mat_root,
                   InputHandler='_input_as_lines',
                   WorkingDir=WorkingDir,
                   SuppressStderr=SuppressStderr,
                   SuppressStdout=SuppressStdout,
                   )
    result = {}
    for seq in recs:
        # First line of the record minus its leading character, up to the
        # first whitespace (presumably a FASTA ">id description" header).
        query_id = seq[0][1:].split(None,1)[0]
        if method == "two-step":
            result[query_id] = ids_from_seq_two_step(
                seq, n, max_iterations, app, core_threshold,
                extra_threshold, lower_threshold, second_db)
        elif method == "lower_threshold":
            result[query_id] = ids_from_seq_lower_threshold(
                seq, n, max_iterations, app, core_threshold,
                lower_threshold, step)
        elif method == "iterative":
            # NOTE(review): params['-j'] is 2 here (the checkpointing value),
            # not max_iterations, exactly as in the original - confirm intent.
            result[query_id] = ids_from_seqs_iterative(
                seq, app, QMEPsiBlast9, scorer, params['-j'], n)
        else:
            raise TypeError("Got unknown method %s" % method)

    return result
示例#6
0
def psiblast_n_neighbors(
        seqs,
        n=100,
        blast_db=None,
        core_threshold=1e-50,
        extra_threshold=1e-10,
        lower_threshold=1e-6,
        step=100,
        method="two-step",
        blast_mat_root=None,
        params=None,
        add_seq_names=False,
        WorkingDir=None,
        SuppressStderr=None,
        SuppressStdout=None,
        input_handler=None,
        scorer=3,  #shotgun with 3 hits needed to keep
        second_db=None):
    """PsiBlasts sequences, stopping when n neighbors are reached.

    seqs: either file name or list of sequence objects or list of strings or
        single multiline string containing sequences.
    n: target number of neighbors, handed to the per-sequence helper.
    blast_db: when truthy, stored as the "-d" parameter.
    core_threshold: threshold for the core profile (default: 1e-50)
    extra_threshold: threshold for pulling in additional seqs
        (default: 1e-10)
    lower_threshold: threshold for seqs in final round (default: 1e-6)
    step: handed to ids_from_seq_lower_threshold when
        method="lower_threshold".
    method: one of "two-step", "lower_threshold", "iterative"; chooses the
        helper that is run on every query record.
    blast_mat_root: forwarded to the PsiBlast constructor.
    params: dict of application parameters; must include "-j", whose value
        is used as the maximum iteration count given to the helpers. A copy
        is used, so the caller's dict is left untouched.
    add_seq_names: boolean. If True, sequence names are inserted in the list
        of sequences; if False, seqs is assumed to already be lines in a
        format the program can handle.
    WorkingDir, SuppressStderr, SuppressStdout: forwarded as-is to PsiBlast.
    input_handler: forces a specific input handler by name and bypasses
        detection, e.g. input_handler='your_favorite_handler'.
    scorer: handed to ids_from_seqs_iterative when method="iterative".
    second_db: handed to ids_from_seq_two_step when method="two-step".

    Returns a dict keyed by query id; each value is what the selected helper
    returned for that query.

    Raises KeyError when params lacks "-j", and TypeError for an unknown
    method (raised while processing the first record, so never for empty
    input).
    """
    # Private copy: the original modified the caller's/shared default dict
    # ("-d", temporary "-j") and restored "-j" only when no exception
    # occurred. With a copy no restore step is needed at all.
    run_params = dict(params) if params else {}
    if blast_db:
        run_params["-d"] = blast_db

    ih = input_handler or guess_input_handler(seqs, add_seq_names)
    recs = seqs_to_stream(seqs, ih)  #checkpointing can only handle one seq...

    #set up the parameters for the core and additional runs
    max_iterations = run_params['-j']
    run_params['-j'] = 2  #won't checkpoint with single iteration

    app = PsiBlast(
        params=run_params,
        blast_mat_root=blast_mat_root,
        InputHandler='_input_as_lines',
        WorkingDir=WorkingDir,
        SuppressStderr=SuppressStderr,
        SuppressStdout=SuppressStdout,
    )
    result = {}
    for seq in recs:
        # Drop the first character of the record's first line and keep the
        # first whitespace-delimited token (presumably a FASTA header id).
        query_id = seq[0][1:].split(None, 1)[0]
        if method == "two-step":
            result[query_id] = ids_from_seq_two_step(
                seq, n, max_iterations, app, core_threshold,
                extra_threshold, lower_threshold, second_db)
        elif method == "lower_threshold":
            result[query_id] = ids_from_seq_lower_threshold(
                seq, n, max_iterations, app, core_threshold,
                lower_threshold, step)
        elif method == "iterative":
            # NOTE(review): this passes the checkpointing value of "-j" (2),
            # not max_iterations, exactly as the original did - confirm.
            result[query_id] = ids_from_seqs_iterative(
                seq, app, QMEPsiBlast9, scorer, run_params['-j'], n)
        else:
            # Call form of raise: the old "raise TypeError, msg" statement
            # is a SyntaxError on Python 3; this spelling works on 2 and 3.
            raise TypeError("Got unknown method %s" % method)

    return result