def test_sort_seqs_by_clustersize(self):
    """sort_seqs_by_clustersize works"""
    seqs = {
        "0": "AAA",
        "1": "AAT",
        "2": "ATT",
        "3": "TTT",
        "4": "TAA",
        "5": "TTA",
        "6": "CCC",
        "7": "GGG",
        "8": "GCG",
    }
    mapping = {"8": ["7", "6"], "1": ["0", "2", "5"], "4": ["3"]}

    observed = list(sort_seqs_by_clustersize(seqs.iteritems(), mapping))
    expected = [
        ("1", "AAT"),
        ("8", "GCG"),
        ("4", "TAA"),
        ("7", "GGG"),
        ("6", "CCC"),
        ("5", "TTA"),
        ("3", "TTT"),
        ("2", "ATT"),
        ("0", "AAA"),
    ]
    self.assertEqual(observed, expected)
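
# For reference, a minimal sketch of the behavior the test above expects
# from sort_seqs_by_clustersize; the helper name is hypothetical and the
# real implementation in the denoiser's util module may differ.  Clusters
# are ordered by descending member count, with ties broken by descending
# sequence label, which yields exactly the `expected` list in the test.
def _sort_seqs_by_clustersize_sketch(seqs, mapping):
    def cluster_size(pair):
        name, _ = pair
        # number of reads mapped onto this centroid; sequences absent
        # from the mapping are singletons and count as 0
        return len(mapping.get(name, []))
    return sorted(seqs, key=lambda p: (cluster_size(p), p[0]), reverse=True)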
def denoise_per_sample(sff_fps, fasta_fp, tmpoutdir, cluster=False,
                       num_cpus=1, squeeze=True, percent_id=0.97, bail=1,
                       primer="", low_cutoff=3.75, high_cutoff=4.5,
                       log_fp="denoiser.log", low_memory=False, verbose=False,
                       error_profile=DENOISER_DATA_DIR +
                       'FLX_error_profile.dat',
                       max_num_rounds=None, titanium=False):
    """Denoise each sample separately"""

    # abort early if binary is missing
    check_flowgram_ali_exe()

    log_fh = None
    if log_fp:
        # switch off buffering for the global log file
        log_fh = open(tmpoutdir + "/" + log_fp, "w", 0)

    # override settings if titanium is set
    # This flag is only used from qiime. Remove after qiime integration
    if titanium:
        error_profile = DENOISER_DATA_DIR + "Titanium_error_profile.dat"
        low_cutoff = 4
        high_cutoff = 5

    # log_fh may be None when log_fp is empty, so guard before writing
    if verbose and log_fh:
        log_fh.write("Denoiser version: %s\n" % __version__)
        log_fh.write("SFF files: %s\n" % ', '.join(sff_fps))
        log_fh.write("Fasta file: %s\n" % fasta_fp)
        log_fh.write("Cluster: %s\n" % cluster)
        log_fh.write("Num CPUs: %d\n" % num_cpus)
        log_fh.write("Squeeze Seqs: %s\n" % squeeze)
        log_fh.write("tmpdir: %s\n\n" % tmpoutdir)
        log_fh.write("percent_id threshold: %.2f\n" % percent_id)
        log_fh.write("Minimal sequence coverage for first phase: %d\n" % bail)
        log_fh.write("Error profile: %s\n" % error_profile)
        log_fh.write("Maximal number of iterations: %s\n\n" % max_num_rounds)

    # here we go ...
    sff_files = split_sff(map(open, sff_fps), open(fasta_fp), tmpoutdir)
    combined_mapping = {}
    result_centroids = []
    result_singletons_files = []

    # denoise each sample separately
    for i, sff_file in enumerate(sff_files):
        if not exists(tmpoutdir + ("/%d" % i)):
            makedirs(tmpoutdir + ("/%d" % i))
        out_fp = tmpoutdir + ("/%d/" % i)
        denoise_seqs([sff_file], fasta_fp, out_fp, None, cluster,
                     num_cpus, squeeze, percent_id, bail, primer,
                     low_cutoff, high_cutoff, log_fp, low_memory,
                     verbose, error_profile, max_num_rounds)

        # collect partial results
        this_rounds_mapping = read_denoiser_mapping(
            open(out_fp + "/denoiser_mapping.txt"))
        combined_mapping.update(this_rounds_mapping)
        result_centroids.append(
            parse_fasta(open(out_fp + "/centroids.fasta")))
        result_singletons_files.append(out_fp + "/singletons.fasta")

    # write the combined files
    store_mapping(combined_mapping, tmpoutdir, "denoiser")
    seqs = chain(*result_centroids)

    fasta_fh = open(tmpoutdir + "/denoised.fasta", "w")
    # write centroids sorted by cluster size
    write_Fasta_from_name_seq_pairs(
        sort_seqs_by_clustersize(seqs, combined_mapping), fasta_fh)
    for singleton_file in result_singletons_files:
        write_Fasta_from_name_seq_pairs(
            parse_fasta(open(singleton_file, "r")), fasta_fh)
    fasta_fh.close()

    # return outdir for tests/test_denoiser
    return tmpoutdir
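
# A hypothetical invocation of denoise_per_sample; the SFF and fasta file
# names below are placeholders, and tmpoutdir must already exist, since the
# function opens its log file there before creating per-sample subdirs.
if __name__ == "__main__":
    out_dir = denoise_per_sample(
        ["sample1.sff.txt", "sample2.sff.txt"],  # placeholder flowgram files
        "seqs.fna",                              # placeholder demultiplexed reads
        "/tmp/denoiser_out",                     # scratch/output dir, must exist
        num_cpus=1,
        verbose=True)
    print "Denoised output written to %s" % out_dir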