from os import makedirs, rename, rmdir, remove
from os.path import exists
from sys import exit
from tempfile import mkdtemp

# The remaining names used below (parse_command_line_parameters, script_info,
# files_exist, get_tmp_filename, preprocess, preprocess_on_cluster,
# read_preprocessed_data, greedy_clustering, secondary_clustering,
# check_flowgram_ali_exe, make_stats, store_clusters, store_mapping,
# ApplicationError, DENOISER_DATA_DIR, __version__) come from the denoiser
# package; the exact module paths depend on the surrounding code base.


def main(commandline_args=None):
    parser, opts, args = parse_command_line_parameters(**script_info)

    if not opts.sff_fp:
        parser.error('Required option flowgram file path (-i) not specified')
    elif not files_exist(opts.sff_fp):
        parser.error('Flowgram file path does not exist:\n %s \n'
                     ' Pass a valid one via -i.' % opts.sff_fp)

    # make tmp and output dir
    tmp_dir = get_tmp_filename(tmp_dir=opts.output_dir + "/", suffix="/")
    try:
        makedirs(tmp_dir)
    except OSError:
        exit("Creating temporary directory failed")

    # (makedirs above already creates opts.output_dir if it was missing,
    # so this check is a safety net)
    if not exists(opts.output_dir):
        try:
            makedirs(opts.output_dir)
        except OSError:
            exit("Creating output directory failed")

    # open logger
    log_fh = None
    if opts.verbose:
        # append to the log file of the master process; 0 = unbuffered (Python 2)
        log_fh = open(opts.output_dir + "/" + opts.log_fp, "a", 0)
        log_fh.write("SFF file: %s\n" % opts.sff_fp)
        log_fh.write("Fasta file: %s\n" % opts.fasta_fp)
        log_fh.write("Output dir: %s\n" % opts.output_dir)
        log_fh.write("Squeeze Seqs: %s\n" % opts.squeeze)
        log_fh.write("Primer sequence: %s\n" % opts.primer)

    (deprefixed_sff_fp, l, mapping, seqs) = \
        preprocess(opts.sff_fp, log_fh,
                   fasta_fp=opts.fasta_fp,
                   out_fp=tmp_dir,
                   verbose=opts.verbose,
                   squeeze=opts.squeeze,
                   primer=opts.primer)

    # Explicitly close the log file, as it can be shared with the master
    # process. Closing it here ensures that all preprocess writes happen
    # before the master writes.
    if log_fh:
        log_fh.close()

    # move files to output dir
    rename(tmp_dir + "/prefix_dereplicated.sff.txt",
           opts.output_dir + "/prefix_dereplicated.sff.txt")
    rename(tmp_dir + "/prefix_dereplicated.fasta",
           opts.output_dir + "/prefix_dereplicated.fasta")
    rename(tmp_dir + "/prefix_mapping.txt",
           opts.output_dir + "/prefix_mapping.txt")
    rmdir(tmp_dir)
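# Note: get_tmp_filename() only picks a name; the directory is created in a
# separate makedirs() call, so another process could claim the name in
# between. A minimal sketch of the atomic alternative (the approach the
# variant of main() below adopts), assuming opts.output_dir already exists:
#
#     from tempfile import mkdtemp
#     tmp_dir = mkdtemp(dir=opts.output_dir, suffix="/")  # picks and creates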
def main(commandline_args=None):
    """Variant of main() that takes a list of SFF files (opts.sff_fps) and
    creates the temporary directory atomically with mkdtemp."""
    parser, opts, args = parse_command_line_parameters(**script_info)

    # make output and tmp dir
    # (the output dir must exist before mkdtemp can create the tmp dir
    # inside it, so it is created first)
    if not exists(opts.output_dir):
        try:
            makedirs(opts.output_dir)
        except OSError:
            exit("Creating output directory failed")
    try:
        tmp_dir = mkdtemp(dir=opts.output_dir, suffix="/")
    except OSError:
        exit("Creating temporary directory failed")

    # open logger
    log_fh = None
    if opts.verbose:
        # append to the log file of the master process; 0 = unbuffered (Python 2)
        log_fh = open(opts.output_dir + "/" + opts.log_fp, "a", 0)
        log_fh.write("SFF files: %s\n" % ', '.join(opts.sff_fps))
        log_fh.write("Fasta file: %s\n" % opts.fasta_fp)
        log_fh.write("Output dir: %s\n" % opts.output_dir)
        log_fh.write("Squeeze Seqs: %s\n" % opts.squeeze)
        log_fh.write("Primer sequence: %s\n" % opts.primer)

    (deprefixed_sff_fp, l, mapping, seqs) = \
        preprocess(opts.sff_fps, log_fh,
                   fasta_fp=opts.fasta_fp,
                   out_fp=tmp_dir,
                   verbose=opts.verbose,
                   squeeze=opts.squeeze,
                   primer=opts.primer)

    # Explicitly close the log file, as it can be shared with the master
    # process. Closing it here ensures that all preprocess writes happen
    # before the master writes.
    if log_fh:
        log_fh.close()

    # move files to output dir
    rename(tmp_dir + "/prefix_dereplicated.sff.txt",
           opts.output_dir + "/prefix_dereplicated.sff.txt")
    rename(tmp_dir + "/prefix_dereplicated.fasta",
           opts.output_dir + "/prefix_dereplicated.fasta")
    rename(tmp_dir + "/prefix_mapping.txt",
           opts.output_dir + "/prefix_mapping.txt")
    rmdir(tmp_dir)
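# Both versions of main() leave the same three files in opts.output_dir;
# the short descriptions are inferred from the preprocess step, not stated
# in this module:
#
#     prefix_dereplicated.sff.txt   flowgrams remaining after prefix dereplication
#     prefix_dereplicated.fasta     their corresponding sequences
#     prefix_mapping.txt            prefix-cluster to read mapping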
def denoise_seqs(sff_fps, fasta_fp, tmpoutdir, preprocess_fp=None,
                 cluster=False, num_cpus=1, squeeze=True, percent_id=0.97,
                 bail=1, primer="", low_cutoff=3.75, high_cutoff=4.5,
                 log_fp="denoiser.log", low_memory=False, verbose=False,
                 error_profile=DENOISER_DATA_DIR + 'FLX_error_profile.dat',
                 max_num_rounds=None, titanium=False, checkpoint_fp=None):
    """The main routine to denoise flowgrams"""

    # abort if binary is missing
    check_flowgram_ali_exe()

    if verbose:
        # switch off buffering for the log file (Python 2)
        log_fh = open(tmpoutdir + "/" + log_fp, "w", 0)
    else:
        log_fh = None

    # overwrite settings if titanium is set
    # This flag is only used from qiime. Remove after qiime integration
    if titanium:
        error_profile = DENOISER_DATA_DIR + "Titanium_error_profile.dat"
        low_cutoff = 4
        high_cutoff = 5

    if verbose:
        log_fh.write("Denoiser version: %s\n" % __version__)
        log_fh.write("SFF files: %s\n" % ', '.join(sff_fps))
        log_fh.write("Fasta file: %s\n" % fasta_fp)
        log_fh.write("Preprocess dir: %s\n" % preprocess_fp)
        if checkpoint_fp:
            log_fh.write("Resuming denoiser from %s\n" % checkpoint_fp)
        log_fh.write("Primer sequence: %s\n" % primer)
        log_fh.write("Running on cluster: %s\n" % cluster)
        log_fh.write("Num CPUs: %d\n" % num_cpus)
        log_fh.write("Squeeze Seqs: %s\n" % squeeze)
        log_fh.write("tmpdir: %s\n" % tmpoutdir)
        log_fh.write("percent_id threshold: %.2f\n" % percent_id)
        log_fh.write("Minimal sequence coverage for first phase: %d\n" % bail)
        log_fh.write("Low cut-off: %.2f\n" % low_cutoff)
        log_fh.write("High cut-off: %.2f\n" % high_cutoff)
        log_fh.write("Error profile: %s\n" % error_profile)
        log_fh.write("Maximal number of iterations: %s\n\n" % max_num_rounds)

    # here we go ...
    # Phase I - clean up and truncate input sff
    if checkpoint_fp:
        if preprocess_fp:
            # we already have preprocessed data, so use it
            (deprefixed_sff_fp, l, mapping, seqs) = \
                read_preprocessed_data(preprocess_fp)
        else:
            raise ApplicationError(
                "Resuming from checkpoint requires --preprocess option")
    else:
        if preprocess_fp:
            # we already have preprocessed data, so use it
            (deprefixed_sff_fp, l, mapping, seqs) = \
                read_preprocessed_data(preprocess_fp)
        elif cluster:
            preprocess_on_cluster(sff_fps, log_fp, fasta_fp=fasta_fp,
                                  out_fp=tmpoutdir, verbose=verbose,
                                  squeeze=squeeze, primer=primer)
            (deprefixed_sff_fp, l, mapping, seqs) = \
                read_preprocessed_data(tmpoutdir)
        else:
            (deprefixed_sff_fp, l, mapping, seqs) = \
                preprocess(sff_fps, log_fh, fasta_fp=fasta_fp,
                           out_fp=tmpoutdir, verbose=verbose,
                           squeeze=squeeze, primer=primer)

    # the preprocessor writes into the same log file, so reopen in append
    # mode to jump to the end of the file
    if verbose:
        log_fh.close()
        log_fh = open(tmpoutdir + "/" + log_fp, "a", 0)

    # Phase II:
    # use prefix map based clustering as initial centroids and greedily
    # add flowgrams to clusters with a low threshold
    (new_sff_file, bestscores, mapping) = \
        greedy_clustering(deprefixed_sff_fp, seqs, mapping, tmpoutdir, l,
                          log_fh, num_cpus=num_cpus, on_cluster=cluster,
                          bail_out=bail, pair_id_thresh=percent_id,
                          threshold=low_cutoff, verbose=verbose,
                          fast_method=not low_memory,
                          error_profile=error_profile,
                          max_num_rounds=max_num_rounds,
                          checkpoint_fp=checkpoint_fp)

    # Phase III:
    # Assign seqs to nearest existing centroid with high threshold
    secondary_clustering(new_sff_file, mapping, bestscores, log_fh,
                         verbose=verbose, threshold=high_cutoff)
    remove(new_sff_file)

    if verbose:
        log_fh.write("Finished clustering\n")
        log_fh.write("Writing Clusters\n")
        log_fh.write(make_stats(mapping) + "\n")

    store_clusters(mapping, deprefixed_sff_fp, tmpoutdir)
    store_mapping(mapping, tmpoutdir, "denoiser")
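# Example call (a minimal sketch; the file names below are hypothetical and
# only a subset of the keyword arguments in the signature above is shown):
#
#     denoise_seqs(["run1.sff.txt", "run2.sff.txt"], "seqs.fna",
#                  "/tmp/denoiser_out", num_cpus=4, titanium=True,
#                  verbose=True)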