Example #1
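This snippet is Python 2 (the unbuffered text-mode open(..., "a", 0) only works there) and is shown without its imports; a plausible preamble follows, with the QIIME-internal module paths marked as assumptions:

from os import makedirs, rename, rmdir
from os.path import exists
from sys import exit  # 'exit' also exists as a site builtin; importing it is safer in scripts

# Assumed module paths for the non-stdlib helpers (QIIME 1.x denoiser
# codebase) -- verify against the actual source before running:
# from qiime.util import parse_command_line_parameters, get_tmp_filename
# from qiime.denoiser.utils import files_exist
# from qiime.denoiser.preprocess import preprocess
# script_info is a module-level dict defined by the surrounding QIIME script.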
def main(commandline_args=None):
    parser, opts, args = parse_command_line_parameters(**script_info)

    if not opts.sff_fp:
        parser.error('Required option flowgram file path (-i) not specified')
    elif not files_exist(opts.sff_fp):
        parser.error(
            'Flowgram file path does not exist:\n %s \n Pass a valid one via -i.'
            % opts.sff_fp)

    # make tmp and output dir
    tmp_dir = get_tmp_filename(tmp_dir=opts.output_dir + "/", suffix="/")
    try:
        makedirs(tmp_dir)
    except OSError:
        exit("Creating temporary directory failed")
    if not exists(opts.output_dir):
        try:
            makedirs(opts.output_dir)
        except OSError:
            exit("Creating output directory failed")

    # open logger
    log_fh = None
    if opts.verbose:
        # append to the log file of the master process
        log_fh = open(opts.output_dir + "/" + opts.log_fp, "a", 0)
        log_fh.write("SFF file: %s\n" % opts.sff_fp)
        log_fh.write("Fasta file: %s\n" % opts.fasta_fp)
        log_fh.write("Output dir: %s\n" % opts.output_dir)
        log_fh.write("Squeeze Seqs: %s\n" % opts.squeeze)
        log_fh.write("Primer sequence: %s\n" % opts.primer)

    (deprefixed_sff_fp, l, mapping, seqs) = \
        preprocess(opts.sff_fp, log_fh, fasta_fp=opts.fasta_fp,
                   out_fp=tmp_dir,
                   verbose=opts.verbose, squeeze=opts.squeeze,
                   primer=opts.primer)

    # Explicitly close the log file, as it can be shared with the master.
    # Closing it here ensures that all preprocess writes happen before the
    # master writes.
    if log_fh:
        log_fh.close()

    # move files to output dir
    rename(tmp_dir + "/prefix_dereplicated.sff.txt",
           opts.output_dir + "/prefix_dereplicated.sff.txt")
    rename(tmp_dir + "/prefix_dereplicated.fasta",
           opts.output_dir + "/prefix_dereplicated.fasta")
    rename(tmp_dir + "/prefix_mapping.txt",
           opts.output_dir + "/prefix_mapping.txt")
    rmdir(tmp_dir)
Example #2
def main(commandline_args=None):
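    # (imports as in Example #1: os/sys helpers plus the QIIME denoiser functions)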
    parser, opts, args = parse_command_line_parameters(**script_info)

    if not opts.sff_fp:
        parser.error('Required option flowgram file path (-i) not specified')
    elif not files_exist(opts.sff_fp):
        parser.error(
            'Flowgram file path does not exist:\n %s \n Pass a valid one via -i.'
            % opts.sff_fp)

    # make tmp and output dir
    tmp_dir = get_tmp_filename(tmp_dir=opts.output_dir + "/", suffix="/")
    try:
        makedirs(tmp_dir)
    except OSError:
        exit("Creating temporary directory failed")
    if not exists(opts.output_dir):
        try:
            makedirs(opts.output_dir)
        except OSError:
            exit("Creating output directory failed")

    # open logger
    log_fh = None
    if opts.verbose:
        # append to the log file of the master process
        log_fh = open(opts.output_dir + "/" + opts.log_fp, "a", 0)
        log_fh.write("SFF file: %s\n" % opts.sff_fp)
        log_fh.write("Fasta file: %s\n" % opts.fasta_fp)
        log_fh.write("Output dir: %s\n" % opts.output_dir)
        log_fh.write("Squeeze Seqs: %s\n" % opts.squeeze)
        log_fh.write("Primer sequence: %s\n" % opts.primer)

    (deprefixed_sff_fp, l, mapping, seqs) = \
        preprocess(opts.sff_fp, log_fh, fasta_fp=opts.fasta_fp,
                   out_fp=tmp_dir,
                   verbose=opts.verbose, squeeze=opts.squeeze,
                   primer=opts.primer)

    # Explicitly close the log file, as it can be shared with the master.
    # Closing it here ensures that all preprocess writes happen before the
    # master writes.
    if log_fh:
        log_fh.close()

    # move files to output dir
    rename(tmp_dir + "/prefix_dereplicated.sff.txt",
           opts.output_dir + "/prefix_dereplicated.sff.txt")
    rename(tmp_dir + "/prefix_dereplicated.fasta",
           opts.output_dir + "/prefix_dereplicated.fasta")
    rename(tmp_dir + "/prefix_mapping.txt",
           opts.output_dir + "/prefix_mapping.txt")
    rmdir(tmp_dir)
Example #3
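As above, a plausible Python 2 preamble for this variant, which uses the stdlib mkdtemp instead of get_tmp_filename; the QIIME-internal module paths are assumptions:

from os import makedirs, rename, rmdir
from os.path import exists
from tempfile import mkdtemp

# Assumed QIIME 1.x denoiser imports -- verify against the actual source:
# from qiime.util import parse_command_line_parameters
# from qiime.denoiser.preprocess import preprocess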
def main(commandline_args=None):
    parser, opts, args = parse_command_line_parameters(**script_info)

    # make output and tmp dirs; the output dir must exist first, since
    # mkdtemp creates the tmp dir inside it
    if not exists(opts.output_dir):
        try:
            makedirs(opts.output_dir)
        except OSError:
            exit("Creating output directory failed")
    try:
        tmp_dir = mkdtemp(dir=opts.output_dir, suffix="/")
    except OSError:
        exit("Creating temporary directory failed")

    # open logger
    log_fh = None
    if opts.verbose:
        # append to the log file of the master process
        log_fh = open(opts.output_dir + "/" + opts.log_fp, "a", 0)
        log_fh.write("SFF files: %s" % ', '.join(opts.sff_fps))
        log_fh.write("Fasta file: %s\n" % opts.fasta_fp)
        log_fh.write("Output dir: %s\n" % opts.output_dir)
        log_fh.write("Squeeze Seqs: %s\n" % opts.squeeze)
        log_fh.write("Primer sequence: %s\n" % opts.primer)

    (deprefixed_sff_fp, l, mapping, seqs) = \
        preprocess(opts.sff_fps, log_fh, fasta_fp=opts.fasta_fp,
                   out_fp=tmp_dir,
                   verbose=opts.verbose, squeeze=opts.squeeze,
                   primer=opts.primer)

    # Explicitly close the log file, as it can be shared with the master.
    # Closing it here ensures that all preprocess writes happen before the
    # master writes.
    if log_fh:
        log_fh.close()

    # move files to output dir
    rename(tmp_dir + "/prefix_dereplicated.sff.txt",
           opts.output_dir + "/prefix_dereplicated.sff.txt")
    rename(tmp_dir + "/prefix_dereplicated.fasta",
           opts.output_dir + "/prefix_dereplicated.fasta")
    rename(tmp_dir + "/prefix_mapping.txt",
           opts.output_dir + "/prefix_mapping.txt")
    rmdir(tmp_dir)
Example #4
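# (imports as in Example #3: os/tempfile helpers plus the QIIME denoiser functions)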
def main(commandline_args=None):
    parser, opts, args = parse_command_line_parameters(**script_info)

    # make output and tmp dirs; the output dir must exist first, since
    # mkdtemp creates the tmp dir inside it
    if not exists(opts.output_dir):
        try:
            makedirs(opts.output_dir)
        except OSError:
            exit("Creating output directory failed")
    try:
        tmp_dir = mkdtemp(dir=opts.output_dir, suffix="/")
    except OSError:
        exit("Creating temporary directory failed")

    # open logger
    log_fh = None
    if opts.verbose:
        # append to the log file of the master process
        log_fh = open(opts.output_dir + "/" + opts.log_fp, "a", 0)
        log_fh.write("SFF files: %s" % ', '.join(opts.sff_fps))
        log_fh.write("Fasta file: %s\n" % opts.fasta_fp)
        log_fh.write("Output dir: %s\n" % opts.output_dir)
        log_fh.write("Squeeze Seqs: %s\n" % opts.squeeze)
        log_fh.write("Primer sequence: %s\n" % opts.primer)

    (deprefixed_sff_fp, l, mapping, seqs) = \
        preprocess(opts.sff_fps, log_fh, fasta_fp=opts.fasta_fp,
                   out_fp=tmp_dir,
                   verbose=opts.verbose, squeeze=opts.squeeze,
                   primer=opts.primer)

    # Explicitly close the log file, as it can be shared with the master.
    # Closing it here ensures that all preprocess writes happen before the
    # master writes.
    if log_fh:
        log_fh.close()

    # move files to output dir
    rename(tmp_dir + "/prefix_dereplicated.sff.txt",
           opts.output_dir + "/prefix_dereplicated.sff.txt")
    rename(tmp_dir + "/prefix_dereplicated.fasta",
           opts.output_dir + "/prefix_dereplicated.fasta")
    rename(tmp_dir + "/prefix_mapping.txt",
           opts.output_dir + "/prefix_mapping.txt")
    rmdir(tmp_dir)
Example #5
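A plausible preamble for denoise_seqs, again assuming Python 2; only the stdlib import is certain, and the remaining names must come from the surrounding QIIME 1.x denoiser package, whose exact module paths vary by version:

from os import remove

# Denoiser-internal names used below -- module paths assumed, verify against
# the actual source: DENOISER_DATA_DIR, check_flowgram_ali_exe, preprocess,
# preprocess_on_cluster, read_preprocessed_data, greedy_clustering,
# secondary_clustering, store_clusters, store_mapping, make_stats,
# ApplicationError, __version__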
def denoise_seqs(
        sff_fps, fasta_fp, tmpoutdir, preprocess_fp=None, cluster=False,
        num_cpus=1, squeeze=True, percent_id=0.97, bail=1, primer="",
        low_cutoff=3.75, high_cutoff=4.5, log_fp="denoiser.log",
        low_memory=False, verbose=False,
        error_profile=DENOISER_DATA_DIR + 'FLX_error_profile.dat',
        max_num_rounds=None, titanium=False, checkpoint_fp=None):
    """The main routine to denoise flowgrams"""

    # abort if binary is missing
    check_flowgram_ali_exe()

    if verbose:
        # switch off buffering for the log file
        log_fh = open(tmpoutdir + "/" + log_fp, "w", 0)
    else:
        log_fh = None

    # overwrite settings if titanium is set
    # This flag is only used from QIIME; remove it after QIIME integration
    if titanium:
        error_profile = DENOISER_DATA_DIR + "Titanium_error_profile.dat"
        low_cutoff = 4
        high_cutoff = 5

    if verbose:
        log_fh.write("Denoiser version: %s\n" % __version__)
        log_fh.write("SFF files: %s\n" % ', '.join(sff_fps))
        log_fh.write("Fasta file: %s\n" % fasta_fp)
        log_fh.write("Preprocess dir: %s\n" % preprocess_fp)
        if checkpoint_fp:
            log_fh.write("Resuming denoiser from %s\n" % checkpoint_fp)
        log_fh.write("Primer sequence: %s\n" % primer)
        log_fh.write("Running on cluster: %s\n" % cluster)
        log_fh.write("Num CPUs: %d\n" % num_cpus)
        log_fh.write("Squeeze Seqs: %s\n" % squeeze)
        log_fh.write("tmpdir: %s\n" % tmpoutdir)
        log_fh.write("percent_id threshold: %.2f\n" % percent_id)
        log_fh.write("Minimal sequence coverage for first phase: %d\n" % bail)
        log_fh.write("Low cut-off: %.2f\n" % low_cutoff)
        log_fh.write("High cut-off: %.2f\n" % high_cutoff)
        log_fh.write("Error profile: %s\n" % error_profile)
        log_fh.write("Maximal number of iteration: %s\n\n" % max_num_rounds)

    # here we go ...
    # Phase I - clean up and truncate input sff
    if checkpoint_fp:
        if preprocess_fp:
            # we already have preprocessed data, so use it and skip
            # preprocessing
            (deprefixed_sff_fp, l, mapping,
             seqs) = read_preprocessed_data(preprocess_fp)
        else:
            raise ApplicationError(
                "Resuming from checkpoint requires --preprocess option")

    else:
        if preprocess_fp:
            # we already have preprocessed data, so use it
            (deprefixed_sff_fp, l, mapping,
             seqs) = read_preprocessed_data(preprocess_fp)
        elif cluster:
            preprocess_on_cluster(sff_fps, log_fp, fasta_fp=fasta_fp,
                                  out_fp=tmpoutdir, verbose=verbose,
                                  squeeze=squeeze, primer=primer)
            (deprefixed_sff_fp, l, mapping,
             seqs) = read_preprocessed_data(tmpoutdir)
        else:
            (deprefixed_sff_fp, l, mapping, seqs) = \
                preprocess(
                    sff_fps, log_fh, fasta_fp=fasta_fp, out_fp=tmpoutdir,
                    verbose=verbose, squeeze=squeeze, primer=primer)

        # preprocessor writes into same file, so better jump to end of file
        if verbose:
            log_fh.close()
            log_fh = open(tmpoutdir + "/" + log_fp, "a", 0)

    # phase II:
    # use prefix map based clustering as initial centroids and greedily
    # add flowgrams to clusters with a low threshold

    (new_sff_file, bestscores, mapping) = \
        greedy_clustering(deprefixed_sff_fp, seqs, mapping, tmpoutdir, l,
                          log_fh, num_cpus=num_cpus, on_cluster=cluster,
                          bail_out=bail, pair_id_thresh=percent_id,
                          threshold=low_cutoff, verbose=verbose,
                          fast_method=not low_memory,
                          error_profile=error_profile,
                          max_num_rounds=max_num_rounds,
                          checkpoint_fp=checkpoint_fp)

    # phase III:
    # Assign seqs to nearest existing centroid with high threshold
    secondary_clustering(new_sff_file, mapping, bestscores, log_fh,
                         verbose=verbose, threshold=high_cutoff)
    remove(new_sff_file)
    if verbose:
        log_fh.write("Finished clustering\n")
        log_fh.write("Writing Clusters\n")
        log_fh.write(make_stats(mapping) + "\n")
    store_clusters(mapping, deprefixed_sff_fp, tmpoutdir)
    store_mapping(mapping, tmpoutdir, "denoiser")
Example #6
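# (imports as in Example #5: os.remove plus the QIIME denoiser internals)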
def denoise_seqs(sff_fps,
                 fasta_fp,
                 tmpoutdir,
                 preprocess_fp=None,
                 cluster=False,
                 num_cpus=1,
                 squeeze=True,
                 percent_id=0.97,
                 bail=1,
                 primer="",
                 low_cutoff=3.75,
                 high_cutoff=4.5,
                 log_fp="denoiser.log",
                 low_memory=False,
                 verbose=False,
                 error_profile=DENOISER_DATA_DIR + 'FLX_error_profile.dat',
                 max_num_rounds=None,
                 titanium=False,
                 checkpoint_fp=None):
    """The main routine to denoise flowgrams"""

    # abort if binary is missing
    check_flowgram_ali_exe()

    if verbose:
        # switch off buffering for the log file
        log_fh = open(tmpoutdir + "/" + log_fp, "w", 0)
    else:
        log_fh = None

    # overwrite settings if titanium is set
    # This flag is only used from QIIME; remove it after QIIME integration
    if titanium:
        error_profile = DENOISER_DATA_DIR + "Titanium_error_profile.dat"
        low_cutoff = 4
        high_cutoff = 5

    if verbose:
        log_fh.write("Denoiser version: %s\n" % __version__)
        log_fh.write("SFF files: %s\n" % ', '.join(sff_fps))
        log_fh.write("Fasta file: %s\n" % fasta_fp)
        log_fh.write("Preprocess dir: %s\n" % preprocess_fp)
        if checkpoint_fp:
            log_fh.write("Resuming denoiser from %s\n" % checkpoint_fp)
        log_fh.write("Primer sequence: %s\n" % primer)
        log_fh.write("Running on cluster: %s\n" % cluster)
        log_fh.write("Num CPUs: %d\n" % num_cpus)
        log_fh.write("Squeeze Seqs: %s\n" % squeeze)
        log_fh.write("tmpdir: %s\n" % tmpoutdir)
        log_fh.write("percent_id threshold: %.2f\n" % percent_id)
        log_fh.write("Minimal sequence coverage for first phase: %d\n" % bail)
        log_fh.write("Low cut-off: %.2f\n" % low_cutoff)
        log_fh.write("High cut-off: %.2f\n" % high_cutoff)
        log_fh.write("Error profile: %s\n" % error_profile)
        log_fh.write("Maximal number of iteration: %s\n\n" % max_num_rounds)

    # here we go ...
    # Phase I - clean up and truncate input sff
    if checkpoint_fp:
        if preprocess_fp:
            # we already have preprocessed data, so use it and skip
            # preprocessing
            (deprefixed_sff_fp, l, mapping,
             seqs) = read_preprocessed_data(preprocess_fp)
        else:
            raise ApplicationError(
                "Resuming from checkpoint requires --preprocess option")

    else:
        if preprocess_fp:
            # we already have preprocessed data, so use it
            (deprefixed_sff_fp, l, mapping,
             seqs) = read_preprocessed_data(preprocess_fp)
        elif cluster:
            preprocess_on_cluster(sff_fps,
                                  log_fp,
                                  fasta_fp=fasta_fp,
                                  out_fp=tmpoutdir,
                                  verbose=verbose,
                                  squeeze=squeeze,
                                  primer=primer)
            (deprefixed_sff_fp, l, mapping,
             seqs) = read_preprocessed_data(tmpoutdir)
        else:
            (deprefixed_sff_fp, l, mapping, seqs) = \
                preprocess(
                    sff_fps, log_fh, fasta_fp=fasta_fp, out_fp=tmpoutdir,
                    verbose=verbose, squeeze=squeeze, primer=primer)

        # preprocessor writes into same file, so better jump to end of file
        if verbose:
            log_fh.close()
            log_fh = open(tmpoutdir + "/" + log_fp, "a", 0)

    # phase II:
    # use prefix map based clustering as initial centroids and greedily
    # add flowgrams to clusters with a low threshold

    (new_sff_file, bestscores, mapping) = \
        greedy_clustering(deprefixed_sff_fp, seqs, mapping, tmpoutdir, l,
                          log_fh, num_cpus=num_cpus, on_cluster=cluster,
                          bail_out=bail, pair_id_thresh=percent_id,
                          threshold=low_cutoff, verbose=verbose,
                          fast_method=not low_memory,
                          error_profile=error_profile,
                          max_num_rounds=max_num_rounds,
                          checkpoint_fp=checkpoint_fp)

    # phase III:
    # Assign seqs to nearest existing centroid with high threshold
    secondary_clustering(new_sff_file,
                         mapping,
                         bestscores,
                         log_fh,
                         verbose=verbose,
                         threshold=high_cutoff)
    remove(new_sff_file)
    if verbose:
        log_fh.write("Finished clustering\n")
        log_fh.write("Writing Clusters\n")
        log_fh.write(make_stats(mapping) + "\n")
    store_clusters(mapping, deprefixed_sff_fp, tmpoutdir)
    store_mapping(mapping, tmpoutdir, "denoiser")