def main():
    """Run the denoiser on the input flowgram (SFF) files.

    Validates the input paths and options, determines the primer (either
    from -p or from the mapping file's LinkerPrimerSequence column),
    runs fast_denoiser, and writes the cluster mapping plus the denoised
    centroid sequences into the output directory.

    Raises:
        ApplicationError: if neither a primer nor a mapping file is given.
        ValueError: if the mapping file holds more than one primer, or the
            primer ends in a degenerate (non-ACGT) base.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sff_files = opts.sff_fps
    for f in sff_files:
        if not exists(f):
            option_parser.error(('Flowgram file path does not exist:\n %s \n' +
                                 'Pass a valid one via -i.') % f)

    outdir = opts.output_dir
    # fail_on_exist=not force: refuse to clobber previous output unless -f
    create_dir(outdir, fail_on_exist=not opts.force)

    log_fh = None

    if not (opts.primer or opts.map_fname):
        raise ApplicationError("Either mapping file or primer required")

    # Read primer from the metadata mapping file if not set on command line
    if not opts.primer:
        mapping_data, header, comments = \
            parse_mapping_file(open(opts.map_fname, "U"))

        index = header.index("LinkerPrimerSequence")
        all_primers = set(array(mapping_data)[:, index])

        # Only a single primer per run is supported
        if len(all_primers) != 1:
            raise ValueError(
                "Currently only data sets with one primer are allowed.\n" +
                "Make separate mapping files with only one primer, re-run split_libraries and\n" +
                "denoise with each split_library output separately.")
        primer = list(all_primers)[0]
        last_char = primer[-1]
        if last_char not in "ACGT":
            raise ValueError("We currently do not support primer with " +
                             "degenerate bases at it's 3' end.")
    else:
        primer = opts.primer

    centroids, cluster_mapping = fast_denoiser(
        opts.sff_fps, opts.fasta_fp, outdir, opts.num_cpus, primer,
        titanium=opts.titanium)

    # store mapping file and centroids
    result_otu_path = '%s/denoised_clusters.txt' % outdir
    of = open(result_otu_path, 'w')
    # .items() instead of Python-2-only .iteritems(): works on both 2 and 3
    for i, cluster in cluster_mapping.items():
        of.write('%s\t%s\n' % (str(i), '\t'.join(cluster)))
    of.close()

    result_fasta_path = '%s/denoised_seqs.fasta' % outdir
    oh = open(result_fasta_path, 'w')
    write_Fasta_from_name_seq_pairs(centroids, oh)
def main():
    """Run the denoiser on the input flowgram (SFF) files.

    Validates inputs, resolves the primer (from -p or the mapping file's
    LinkerPrimerSequence column), runs fast_denoiser, and writes the cluster
    mapping and denoised centroid sequences to the output directory.

    Raises:
        ApplicationError: if neither a primer nor a mapping file is given.
        ValueError: if the mapping file holds more than one primer, or the
            primer ends in a degenerate (non-ACGT) base.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sff_files = opts.sff_fps
    for f in sff_files:
        if not exists(f):
            option_parser.error(('Flowgram file path does not exist:\n %s \n' +
                                 'Pass a valid one via -i.') % f)

    outdir = opts.output_dir
    # fail_on_exist=not force: refuse to clobber previous output unless -f
    create_dir(outdir, fail_on_exist=not opts.force)

    log_fh = None

    if not (opts.primer or opts.map_fname):
        raise ApplicationError("Either mapping file or primer required")

    # Read primer from the metadata mapping file if not set on command line
    if not opts.primer:
        mapping_data, header, comments = \
            parse_mapping_file(open(opts.map_fname, "U"))

        index = header.index("LinkerPrimerSequence")
        all_primers = set(array(mapping_data)[:, index])

        # Only a single primer per run is supported
        if len(all_primers) != 1:
            raise ValueError("Currently only data sets with one primer are allowed.\n" +
                             "Make separate mapping files with only one primer, re-run split_libraries and\n" +
                             "denoise with each split_library output separately.")
        primer = list(all_primers)[0]
        last_char = primer[-1]
        if last_char not in "ACGT":
            raise ValueError("We currently do not support primer with " +
                             "degenerate bases at it's 3' end.")
    else:
        primer = opts.primer

    centroids, cluster_mapping = fast_denoiser(
        opts.sff_fps, opts.fasta_fp, outdir, opts.num_cpus, primer,
        titanium=opts.titanium)

    # store mapping file and centroids
    result_otu_path = '%s/denoised_clusters.txt' % outdir
    of = open(result_otu_path, 'w')
    # .items() instead of Python-2-only .iteritems(): works on both 2 and 3
    for i, cluster in cluster_mapping.items():
        of.write('%s\t%s\n' % (str(i), '\t'.join(cluster)))
    of.close()

    result_fasta_path = '%s/denoised_seqs.fasta' % outdir
    oh = open(result_fasta_path, 'w')
    write_Fasta_from_name_seq_pairs(centroids, oh)
def test_write_Fasta_from_name_seqs_pairs(self):
    """write_Fasta_from_name_seqs_pairs write proper FASTA string."""
    seqs = [('1', "AAA"), ('2', "CCCCC"), ('3', "GGGG")]

    # Passing None as the file handle must raise a ValueError
    self.assertRaises(ValueError, write_Fasta_from_name_seq_pairs, seqs, None)

    tmp_filename = get_tmp_filename(prefix="test_write_Fasta", suffix=".fna")
    out_fh = open(tmp_filename, "w")
    write_Fasta_from_name_seq_pairs(seqs, out_fh)
    out_fh.close()

    # Round trip: parsing the written file must recover the input pairs
    observed = list(MinimalFastaParser(open(tmp_filename, "U")))
    remove(tmp_filename)
    self.assertEqual(observed, seqs)
def test_write_Fasta_from_name_seqs_pairs(self):
    """write_Fasta_from_name_seqs_pairs write proper FASTA string."""
    name_seq_pairs = [('1', "AAA"), ('2', "CCCCC"), ('3', "GGGG")]

    # A None file handle is rejected with a ValueError
    self.assertRaises(ValueError, write_Fasta_from_name_seq_pairs,
                      name_seq_pairs, None)

    tmp_filename = get_tmp_filename(prefix="test_write_Fasta", suffix=".fna")
    fh = open(tmp_filename, "w")
    write_Fasta_from_name_seq_pairs(name_seq_pairs, fh)
    fh.close()

    # What we parse back must equal what we wrote out
    parsed = list(MinimalFastaParser(open(tmp_filename, "U")))
    remove(tmp_filename)
    self.assertEqual(parsed, name_seq_pairs)
def test_write_Fasta_from_name_seqs_pairs(self):
    """write_Fasta_from_name_seqs_pairs write proper FASTA string."""
    name_seq_pairs = [("1", "AAA"), ("2", "CCCCC"), ("3", "GGGG")]

    # Passing None as the file handle must raise a ValueError
    self.assertRaises(ValueError, write_Fasta_from_name_seq_pairs,
                      name_seq_pairs, None)

    # mkstemp returns an open descriptor we don't need — close it and
    # reopen the path for text writing
    fd, tmp_filename = mkstemp(prefix="test_write_Fasta", suffix=".fna")
    close(fd)
    fh = open(tmp_filename, "w")
    write_Fasta_from_name_seq_pairs(name_seq_pairs, fh)
    fh.close()

    # Round trip: parsing the written file must recover the input pairs
    observed = list(parse_fasta(open(tmp_filename, "U")))
    remove(tmp_filename)
    self.assertEqual(observed, name_seq_pairs)
def main():
    """Run the denoiser on the input flowgram (SFF) files.

    Validates inputs and output directory, resolves the primer (from -p or
    the mapping file's LinkerPrimerSequence column), runs fast_denoiser,
    and writes the cluster mapping plus denoised centroid sequences.

    Raises:
        ApplicationError: if neither a primer nor a mapping file is given.
        ValueError: if the mapping file holds more than one primer, or the
            primer ends in a degenerate (non-ACGT) base.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sff_files = opts.sff_fps
    for f in sff_files:
        if not exists(f):
            option_parser.error(('Flowgram file path does not exist:\n %s \n' +
                                 'Pass a valid one via -i.') % f)

    outdir = opts.output_dir
    ret_val = create_dir(outdir, handle_errors_externally=True)
    if ret_val == 1:  # dir exists
        if opts.force:
            # do nothing, just overwrite content
            pass
        else:
            # Since the analysis can take quite a while, this check helps
            # users avoid overwriting previous output.
            option_parser.error("Output directory already exists. Please choose" +
                                " a different directory, or force overwrite with -f.")
    else:
        handle_error_codes(outdir, error_code=ret_val)

    log_fh = None

    # raise Exc(msg) call syntax instead of the Python-2-only
    # 'raise Exc, msg' statement form — valid on both 2 and 3
    if not (opts.primer or opts.map_fname):
        raise ApplicationError("Either mapping file or primer required")

    # Read primer from the metadata mapping file if not set on command line
    if not opts.primer:
        mapping_data, header, comments = \
            parse_mapping_file(open(opts.map_fname, "U"))

        index = header.index("LinkerPrimerSequence")
        all_primers = set(array(mapping_data)[:, index])

        # Only a single primer per run is supported
        if len(all_primers) != 1:
            raise ValueError("Currently only data sets with one primer are allowed.\n" +
                             "Make separate mapping files with only one primer, re-run split_libraries and\n" +
                             "denoise with each split_library output separately.")
        primer = list(all_primers)[0]
        last_char = primer[-1]
        if last_char not in "ACGT":
            raise ValueError("We currently do not support primer with " +
                             "degenerate bases at it's 3' end.")
    else:
        primer = opts.primer

    centroids, cluster_mapping = fast_denoiser(
        opts.sff_fps, opts.fasta_fp, outdir, opts.num_cpus, primer,
        titanium=opts.titanium)

    # store mapping file and centroids
    result_otu_path = '%s/denoised_clusters.txt' % outdir
    of = open(result_otu_path, 'w')
    # .items() instead of Python-2-only .iteritems(): works on both 2 and 3
    for i, cluster in cluster_mapping.items():
        of.write('%s\t%s\n' % (str(i), '\t'.join(cluster)))
    of.close()

    result_fasta_path = '%s/denoised_seqs.fasta' % outdir
    oh = open(result_fasta_path, 'w')
    write_Fasta_from_name_seq_pairs(centroids, oh)
def denoise_per_sample(sff_fps, fasta_fp, tmpoutdir, cluster=False,
                       num_cpus=1, squeeze=True, percent_id=0.97, bail=1,
                       primer="", low_cutoff=3.75, high_cutoff=4.5,
                       log_fp="denoiser.log", low_memory=False, verbose=False,
                       error_profile=DENOISER_DATA_DIR +
                       'FLX_error_profile.dat',
                       max_num_rounds=None, titanium=False):
    """Denoise each sample separately.

    Splits the input SFF files per sample via split_sff, runs denoise_seqs
    on each sample in its own numbered subdirectory of tmpoutdir, then
    merges the per-sample mappings, centroids (sorted by cluster size) and
    singletons into combined output files.

    Returns:
        tmpoutdir, so tests can locate the combined output.
    """
    # abort early if binary is missing
    check_flowgram_ali_exe()

    log_fh = None
    if log_fp:
        # buffering=0 switches off buffering for the global log file
        # NOTE(review): unbuffered text mode is accepted on Python 2 only
        log_fh = open(tmpoutdir + "/" + log_fp, "w", 0)

    # overwrite settings if titanium is set
    # This flag is only used from qiime. Remove after qiime integration
    if titanium:
        error_profile = DENOISER_DATA_DIR + "Titanium_error_profile.dat"
        low_cutoff = 4
        high_cutoff = 5

    # Also require log_fh: verbose=True combined with a falsy log_fp used
    # to crash with AttributeError on the None handle
    if verbose and log_fh:
        log_fh.write("Denoiser version: %s\n" % __version__)
        log_fh.write("SFF files: %s\n" % ', '.join(sff_fps))
        log_fh.write("Fasta file: %s\n" % fasta_fp)
        log_fh.write("Cluster: %s\n" % cluster)
        log_fh.write("Num CPUs: %d\n" % num_cpus)
        log_fh.write("Squeeze Seqs: %s\n" % squeeze)
        log_fh.write("tmpdir: %s\n\n" % tmpoutdir)
        log_fh.write("percent_id threshold: %.2f\n" % percent_id)
        log_fh.write("Minimal sequence coverage for first phase: %d\n" % bail)
        log_fh.write("Error profile: %s\n" % error_profile)
        log_fh.write("Maximal number of iteration: %s\n\n" % max_num_rounds)

    # here we go ...
    sff_files = split_sff(map(open, sff_fps), open(fasta_fp), tmpoutdir)

    combined_mapping = {}
    result_centroids = []
    result_singletons_files = []

    # denoise each sample separately
    for i, sff_file in enumerate(sff_files):
        if not exists(tmpoutdir + ("/%d" % i)):
            makedirs(tmpoutdir + ("/%d" % i))
        out_fp = tmpoutdir + ("/%d/" % i)
        denoise_seqs([sff_file], fasta_fp, out_fp, None, cluster,
                     num_cpus, squeeze, percent_id, bail, primer,
                     low_cutoff, high_cutoff, log_fp, low_memory,
                     verbose, error_profile, max_num_rounds)

        # collect partial results
        this_rounds_mapping = read_denoiser_mapping(
            open(out_fp + "/denoiser_mapping.txt"))
        combined_mapping.update(this_rounds_mapping)
        result_centroids.append(
            parse_fasta(open(out_fp + "/centroids.fasta")))
        result_singletons_files.append(out_fp + "/singletons.fasta")

    # write the combined files
    store_mapping(combined_mapping, tmpoutdir, "denoiser")
    seqs = chain(*result_centroids)

    fasta_fh = open(tmpoutdir + "/denoised.fasta", "w")
    # write centroids sorted by clustersize
    write_Fasta_from_name_seq_pairs(
        sort_seqs_by_clustersize(seqs, combined_mapping), fasta_fh)
    for singleton_file in result_singletons_files:
        write_Fasta_from_name_seq_pairs(
            parse_fasta(open(singleton_file, "r")), fasta_fh)
    fasta_fh.close()

    # return outdir for tests/test_denoiser
    return tmpoutdir
def denoise_per_sample(sff_fps, fasta_fp, tmpoutdir, cluster=False,
                       num_cpus=1, squeeze=True, percent_id=0.97, bail=1,
                       primer="", low_cutoff=3.75, high_cutoff=4.5,
                       log_fp="denoiser.log", low_memory=False, verbose=False,
                       error_profile=DENOISER_DATA_DIR +
                       'FLX_error_profile.dat',
                       max_num_rounds=None, titanium=False):
    """Denoise each sample separately.

    Splits the input SFF files per sample via split_sff, runs denoise_seqs
    on each sample in its own numbered subdirectory of tmpoutdir, then
    merges the per-sample mappings, centroids (sorted by cluster size) and
    singletons into combined output files.

    Returns:
        tmpoutdir, so tests can locate the combined output.
    """
    # abort early if binary is missing
    check_flowgram_ali_exe()

    log_fh = None
    if log_fp:
        # buffering=0 switches off buffering for the global log file
        # NOTE(review): unbuffered text mode is accepted on Python 2 only
        log_fh = open(tmpoutdir + "/" + log_fp, "w", 0)

    # overwrite settings if titanium is set
    # This flag is only used from qiime. Remove after qiime integration
    if titanium:
        error_profile = DENOISER_DATA_DIR + "Titanium_error_profile.dat"
        low_cutoff = 4
        high_cutoff = 5

    # Also require log_fh: verbose=True combined with a falsy log_fp used
    # to crash with AttributeError on the None handle
    if verbose and log_fh:
        log_fh.write("Denoiser version: %s\n" % __version__)
        log_fh.write("SFF files: %s\n" % ', '.join(sff_fps))
        log_fh.write("Fasta file: %s\n" % fasta_fp)
        log_fh.write("Cluster: %s\n" % cluster)
        log_fh.write("Num CPUs: %d\n" % num_cpus)
        log_fh.write("Squeeze Seqs: %s\n" % squeeze)
        log_fh.write("tmpdir: %s\n\n" % tmpoutdir)
        log_fh.write("percent_id threshold: %.2f\n" % percent_id)
        log_fh.write("Minimal sequence coverage for first phase: %d\n" % bail)
        log_fh.write("Error profile: %s\n" % error_profile)
        log_fh.write("Maximal number of iteration: %s\n\n" % max_num_rounds)

    # here we go ...
    sff_files = split_sff(map(open, sff_fps), open(fasta_fp), tmpoutdir)

    combined_mapping = {}
    result_centroids = []
    result_singletons_files = []

    # denoise each sample separately
    for i, sff_file in enumerate(sff_files):
        if not exists(tmpoutdir + ("/%d" % i)):
            makedirs(tmpoutdir + ("/%d" % i))
        out_fp = tmpoutdir + ("/%d/" % i)
        denoise_seqs([sff_file], fasta_fp, out_fp, None, cluster,
                     num_cpus, squeeze, percent_id, bail, primer,
                     low_cutoff, high_cutoff, log_fp, low_memory,
                     verbose, error_profile, max_num_rounds)

        # collect partial results
        this_rounds_mapping = read_denoiser_mapping(
            open(out_fp + "/denoiser_mapping.txt"))
        combined_mapping.update(this_rounds_mapping)
        result_centroids.append(parse_fasta(open(out_fp + "/centroids.fasta")))
        result_singletons_files.append(out_fp + "/singletons.fasta")

    # write the combined files
    store_mapping(combined_mapping, tmpoutdir, "denoiser")
    seqs = chain(*result_centroids)

    fasta_fh = open(tmpoutdir + "/denoised.fasta", "w")
    # write centroids sorted by clustersize
    write_Fasta_from_name_seq_pairs(
        sort_seqs_by_clustersize(seqs, combined_mapping), fasta_fh)
    for singleton_file in result_singletons_files:
        write_Fasta_from_name_seq_pairs(parse_fasta(open(singleton_file, "r")),
                                        fasta_fh)
    fasta_fh.close()

    # return outdir for tests/test_denoiser
    return tmpoutdir
def main():
    """Run the denoiser on the input flowgram (SFF) files.

    Validates inputs and output directory, resolves the primer (from -p or
    the mapping file's LinkerPrimerSequence column), runs fast_denoiser,
    and writes the cluster mapping plus denoised centroid sequences.

    Raises:
        ApplicationError: if neither a primer nor a mapping file is given.
        ValueError: if the mapping file holds more than one primer, or the
            primer ends in a degenerate (non-ACGT) base.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sff_files = opts.sff_fps
    for f in sff_files:
        if not exists(f):
            option_parser.error(('Flowgram file path does not exist:\n %s \n' +
                                 'Pass a valid one via -i.') % f)

    outdir = opts.output_dir
    ret_val = create_dir(outdir, handle_errors_externally=True)
    if ret_val == 1:  # dir exists
        if opts.force:
            # do nothing, just overwrite content
            pass
        else:
            # Since the analysis can take quite a while, this check helps
            # users avoid overwriting previous output.
            option_parser.error("Output directory already exists. Please choose" +
                                " a different directory, or force overwrite with -f.")
    else:
        handle_error_codes(outdir, error_code=ret_val)

    log_fh = None

    # raise Exc(msg) call syntax instead of the Python-2-only
    # 'raise Exc, msg' statement form — valid on both 2 and 3
    if not (opts.primer or opts.map_fname):
        raise ApplicationError("Either mapping file or primer required")

    # Read primer from the metadata mapping file if not set on command line
    if not opts.primer:
        mapping_data, header, comments = \
            parse_mapping_file(open(opts.map_fname, "U"))

        index = header.index("LinkerPrimerSequence")
        all_primers = set(array(mapping_data)[:, index])

        # Only a single primer per run is supported
        if len(all_primers) != 1:
            raise ValueError("Currently only data sets with one primer are allowed.\n" +
                             "Make separate mapping files with only one primer, re-run split_libraries and\n" +
                             "denoise with each split_library output separately.")
        primer = list(all_primers)[0]
        last_char = primer[-1]
        if last_char not in "ACGT":
            raise ValueError("We currently do not support primer with " +
                             "degenerate bases at it's 3' end.")
    else:
        primer = opts.primer

    centroids, cluster_mapping = fast_denoiser(
        opts.sff_fps, opts.fasta_fp, outdir, opts.num_cpus, primer,
        titanium=opts.titanium)

    # store mapping file and centroids
    result_otu_path = '%s/denoised_clusters.txt' % outdir
    of = open(result_otu_path, 'w')
    # .items() instead of Python-2-only .iteritems(): works on both 2 and 3
    for i, cluster in cluster_mapping.items():
        of.write('%s\t%s\n' % (str(i), '\t'.join(cluster)))
    of.close()

    result_fasta_path = '%s/denoised_seqs.fasta' % outdir
    oh = open(result_fasta_path, 'w')
    write_Fasta_from_name_seq_pairs(centroids, oh)