def test_split_fasta_equal_num_seqs_per_file(self):
    """split_fasta funcs as expected when equal num seqs go to each file
    """
    fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                  prefix='split_fasta_tests',
                                  suffix='')
    close(fd)
    infile = ['>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA',
              '>seq3', 'CCTT--AA']

    actual = split_fasta(infile, 1, filename_prefix)

    actual_seqs = []
    for fp in actual:
        actual_seqs += list(open(fp))
    remove_files(actual)

    expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(3)]

    self.assertEqual(actual, expected)
    self.assertEqual(
        SequenceCollection.from_fasta_records(parse_fasta(infile), DNA),
        SequenceCollection.from_fasta_records(parse_fasta(actual_seqs), DNA))
def test_split_fasta_diff_num_seqs_per_file_alt(self):
    """split_fasta funcs always catches all seqs
    """
    # start with 59 seqs (b/c it's prime, so should make more
    # confusing splits)
    in_seqs = SequenceCollection.from_fasta_records(
        [('seq%s' % k, 'AACCTTAA') for k in range(59)], DNA)
    infile = in_seqs.to_fasta().split('\n')

    # test seqs_per_file from 1 to 1000
    for i in range(1, 1000):
        fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                      prefix='split_fasta_tests',
                                      suffix='')
        close(fd)

        actual = split_fasta(infile, i, filename_prefix)

        actual_seqs = []
        for fp in actual:
            actual_seqs += list(open(fp))
        # remove the files now, so if the test fails they still get
        # cleaned up
        remove_files(actual)

        # building seq collections from infile and the split files result in
        # equivalent seq collections
        self.assertEqual(
            SequenceCollection.from_fasta_records(parse_fasta(infile), DNA),
            SequenceCollection.from_fasta_records(parse_fasta(actual_seqs),
                                                  DNA))
def fast_denoiser(sff_fps, fasta_fp, tmp_outdir, num_cpus, primer,
                  verbose=True, titanium=False):
    """wrapper function calling methods from the Denoiser package."""
    if num_cpus > 1:
        denoise_seqs(sff_fps, fasta_fp, tmp_outdir, primer=primer,
                     cluster=True, num_cpus=num_cpus, verbose=verbose,
                     titanium=titanium)
    else:
        denoise_seqs(sff_fps, fasta_fp, tmp_outdir, primer=primer,
                     verbose=verbose, titanium=titanium)

    # read centroids and singletons
    centroids = parse_fasta(open(tmp_outdir + "/centroids.fasta"))
    singletons = parse_fasta(open(tmp_outdir + "/singletons.fasta"))

    seqs = chain(centroids, singletons)

    # read mapping
    mapping = {}
    cluster_mapping = open(tmp_outdir + "/denoiser_mapping.txt")
    for i, cluster in enumerate(cluster_mapping):
        cluster, members = cluster.split(':')
        members = members.split()
        clust = [cluster]
        clust.extend(members)
        mapping[i] = clust

    return seqs, mapping
def test_deblur_with_non_default_error_profile(self):
    error_dist = [1, 0.05, 0.000005, 0.000005, 0.000005, 0.000005,
                  0.0000025, 0.0000025, 0.0000025, 0.0000025, 0.0000025,
                  0.0000005, 0.0000005, 0.0000005, 0.0000005]
    seqs_f = StringIO(TEST_SEQS_2)

    obs = deblur(parse_fasta(seqs_f), error_dist=error_dist)
    exp = [
        Sequence("E.Coli-999;size=720;",
                 "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
                 "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
                 "gcaagcttgagtctcgtagaggggggcagaattccag")]

    # Trying with a numpy array
    error_dist = np.array(
        [1, 0.05, 0.000005, 0.000005, 0.000005, 0.000005,
         0.0000025, 0.0000025, 0.0000025, 0.0000025, 0.0000025,
         0.0000005, 0.0000005, 0.0000005, 0.0000005])
    seqs_f = StringIO(TEST_SEQS_2)
    obs = deblur(parse_fasta(seqs_f), error_dist=error_dist)
    self.assertEqual(obs, exp)
def test_split_fasta_diff_num_seqs_per_file(self):
    """split_fasta funcs as expected when diff num seqs go to each file
    """
    fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                  prefix='split_fasta_tests',
                                  suffix='')
    close(fd)
    infile = ['>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA',
              '>seq3', 'CCTT--AA']

    actual = split_fasta(infile, 2, filename_prefix)

    actual_seqs = []
    for fp in actual:
        actual_seqs += list(open(fp))
    remove_files(actual)

    expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(2)]

    # list of file paths is as expected
    self.assertEqual(actual, expected)
    # building seq collections from infile and the split files result in
    # equivalent seq collections
    self.assertEqual(
        SequenceCollection.from_fasta_records(parse_fasta(infile), DNA),
        SequenceCollection.from_fasta_records(parse_fasta(actual_seqs), DNA))
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None):
    # load candidate sequences
    seq_file = open(seq_path, 'U')
    candidate_sequences = parse_fasta(seq_file)

    # load template sequences
    template_alignment = []
    template_alignment_fp = self.Params['template_filepath']
    for seq_id, seq in parse_fasta(open(template_alignment_fp)):
        # replace '.' characters with '-' characters
        template_alignment.append((seq_id, seq.replace('.', '-').upper()))
    template_alignment = Alignment.from_fasta_records(
        template_alignment, DNASequence, validate=True)

    # initialize_logger
    logger = NastLogger(log_path)

    # get function for pairwise alignment method
    pairwise_alignment_f = pairwise_alignment_methods[
        self.Params['pairwise_alignment_method']]

    pynast_aligned, pynast_failed = pynast_seqs(
        candidate_sequences,
        template_alignment,
        min_pct=self.Params['min_pct'],
        min_len=self.Params['min_len'],
        align_unaligned_seqs_f=pairwise_alignment_f,
        logger=logger,
        temp_dir=get_qiime_temp_dir())

    logger.record(str(self))

    for i, seq in enumerate(pynast_failed):
        skb_seq = DNASequence(str(seq), id=seq.Name)
        pynast_failed[i] = skb_seq
    pynast_failed = SequenceCollection(pynast_failed)

    for i, seq in enumerate(pynast_aligned):
        skb_seq = DNASequence(str(seq), id=seq.Name)
        pynast_aligned[i] = skb_seq
    pynast_aligned = Alignment(pynast_aligned)

    if failure_path is not None:
        fail_file = open(failure_path, 'w')
        fail_file.write(pynast_failed.to_fasta())
        fail_file.close()

    if result_path is not None:
        result_file = open(result_path, 'w')
        result_file.write(pynast_aligned.to_fasta())
        result_file.close()
        return None
    else:
        return pynast_aligned
def setUp(self):
    """ """
    self.fasta_lines1 = fasta_lines1.split("\n")
    self.fasta_lines1_mixed_case = fasta_lines1_mixed_case.split("\n")

    self.fasta_lines1_exp = list(parse_fasta(fasta_lines1_exp.split("\n")))
    self.fasta_lines1_mixed_case_exp = list(
        parse_fasta(fasta_lines1_mixed_case_exp.split("\n")))
    self.fasta_lines1_exp_null_desc_mapper = list(
        parse_fasta(fasta_lines1_exp_null_desc_mapper.split("\n")))
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None):
    # load candidate sequences
    seq_file = open(seq_path, 'U')
    candidate_sequences = parse_fasta(seq_file)

    # load template sequences
    template_alignment = []
    template_alignment_fp = self.Params['template_filepath']
    for seq_id, seq in parse_fasta(open(template_alignment_fp)):
        # replace '.' characters with '-' characters
        template_alignment.append((seq_id, seq.replace('.', '-').upper()))
    try:
        template_alignment = LoadSeqs(data=template_alignment, moltype=DNA,
                                      aligned=DenseAlignment)
    except KeyError as e:
        raise KeyError('Only ACGT-. characters can be contained in template '
                       'alignments. The offending character was: %s' % e)

    # initialize_logger
    logger = NastLogger(log_path)

    # get function for pairwise alignment method
    pairwise_alignment_f = pairwise_alignment_methods[
        self.Params['pairwise_alignment_method']]

    pynast_aligned, pynast_failed = pynast_seqs(
        candidate_sequences,
        template_alignment,
        min_pct=self.Params['min_pct'],
        min_len=self.Params['min_len'],
        align_unaligned_seqs_f=pairwise_alignment_f,
        logger=logger,
        temp_dir=get_qiime_temp_dir())

    logger.record(str(self))

    if failure_path is not None:
        fail_file = open(failure_path, 'w')
        for seq in pynast_failed:
            fail_file.write(seq.toFasta())
            fail_file.write('\n')
        fail_file.close()

    if result_path is not None:
        result_file = open(result_path, 'w')
        for seq in pynast_aligned:
            result_file.write(seq.toFasta())
            result_file.write('\n')
        result_file.close()
        return None
    else:
        try:
            return LoadSeqs(data=pynast_aligned, aligned=DenseAlignment)
        except ValueError:
            return {}
def setUp(self):
    fd, self.pynast_test1_input_fp = mkstemp(
        prefix="PyNastAlignerTests_", suffix=".fasta")
    close(fd)
    with open(self.pynast_test1_input_fp, "w") as f:
        f.write(pynast_test1_input_fasta)

    fd, self.pynast_test1_template_fp = mkstemp(
        prefix="PyNastAlignerTests_", suffix="template.fasta")
    close(fd)
    with open(self.pynast_test1_template_fp, "w") as f:
        f.write(pynast_test1_template_fasta)

    fd, self.pynast_test_template_w_dots_fp = mkstemp(
        prefix="PyNastAlignerTests_", suffix="template.fasta")
    close(fd)
    with open(self.pynast_test_template_w_dots_fp, "w") as f:
        f.write(pynast_test1_template_fasta.replace("-", "."))

    fd, self.pynast_test_template_w_u_fp = mkstemp(
        prefix="PyNastAlignerTests_", suffix="template.fasta")
    close(fd)
    with open(self.pynast_test_template_w_u_fp, "w") as f:
        f.write(pynast_test1_template_fasta.replace("T", "U"))

    fd, self.pynast_test_template_w_lower_fp = mkstemp(
        prefix="PyNastAlignerTests_", suffix="template.fasta")
    close(fd)
    with open(self.pynast_test_template_w_lower_fp, "w") as f:
        f.write(pynast_test1_template_fasta.lower())

    # create temp file names (and touch them so we can reliably
    # clean them up)
    fd, self.result_fp = mkstemp(
        prefix="PyNastAlignerTests_", suffix=".fasta")
    close(fd)
    open(self.result_fp, "w").close()

    fd, self.failure_fp = mkstemp(
        prefix="PyNastAlignerTests_", suffix=".fasta")
    close(fd)
    open(self.failure_fp, "w").close()

    fd, self.log_fp = mkstemp(
        prefix="PyNastAlignerTests_", suffix=".log")
    close(fd)
    open(self.log_fp, "w").close()

    self._paths_to_clean_up = [
        self.pynast_test1_input_fp,
        self.result_fp,
        self.failure_fp,
        self.log_fp,
        self.pynast_test1_template_fp,
        self.pynast_test_template_w_dots_fp,
        self.pynast_test_template_w_u_fp,
        self.pynast_test_template_w_lower_fp,
    ]

    self.pynast_test1_aligner = PyNastAligner({
        "template_filepath": self.pynast_test1_template_fp,
        "min_len": 15,
    })

    self.pynast_test1_expected_aln = Alignment.from_fasta_records(
        parse_fasta(pynast_test1_expected_alignment), DNA)
    self.pynast_test1_expected_fail = SequenceCollection.from_fasta_records(
        parse_fasta(pynast_test1_expected_failure), DNA)
def combine_mappings(fasta_fh, mapping_fh, denoised_seqs_fh,
                     otu_picker_otu_map_fh, out_dir):
    """Combine denoiser and OTU picker mapping file, replace flowgram IDs.

    fasta_fh: a fasta file with labels as produced by Qiime's
              split_libraries.py, used to replace flowgram id with the
              unique se_sample_id

    mapping_fh: The cluster mapping from the denoiser.py

    denoised_seqs_fh: the Fasta output files from denoiser.py

    otu_picker_map_fh: cluster map from otu picker on denoised_seqs_fh

    out_dir: output directory
    """
    # read in mapping from split_library file
    labels = imap(lambda a_b: a_b[0], parse_fasta(fasta_fh))
    # mapping from seq_id to sample_id
    sample_id_mapping = extract_read_to_sample_mapping(labels)

    denoiser_mapping = read_denoiser_mapping(mapping_fh)
    # read in cd_hit otu map
    # and write out combined otu_picker+denoiser map
    otu_fh = open(out_dir + "/denoised_otu_map.txt", "w")
    for otu_line in otu_picker_otu_map_fh:
        otu_split = otu_line.split()

        otu = otu_split[0]
        ids = otu_split[1:]

        get_sample_id = sample_id_mapping.get
        # concat lists
        # make sure the biggest one is first for pick_repr
        all_ids = sort_ids(ids, denoiser_mapping)
        all_ids.extend(sum([denoiser_mapping[id] for id in ids], []))
        try:
            otu_fh.write("%s\t" % otu +
                         "\t".join(map(get_sample_id, all_ids)) + "\n")
        except TypeError:
            # get returns None if denoiser_mapping id not present in
            # sample_id_mapping
            print ("Found id in denoiser output, which was not found in "
                   "split_libraries output FASTA file. Wrong file?")
            exit()

    fasta_out_fh = open(out_dir + "/denoised_all.fasta", "w")
    for label, seq in parse_fasta(denoised_seqs_fh):
        id = label.split()[0]
        newlabel = "%s %s" % (sample_id_mapping[id], id)
        fasta_out_fh.write(BiologicalSequence(seq, id=newlabel).to_fasta())
def test_call_write_to_file(self):
    """ReferenceRepSetPicker.__call__ otu map correctly written to file"""
    app = ReferenceRepSetPicker(params={'Algorithm': 'first',
                                        'ChoiceF': first_id})
    app(self.tmp_seq_filepath,
        self.tmp_otu_filepath,
        self.ref_seq_filepath,
        result_path=self.result_filepath)
    with open(self.result_filepath) as f:
        actual = SequenceCollection.from_fasta_records(parse_fasta(f), DNA)
    expected = SequenceCollection.from_fasta_records(
        parse_fasta(rep_seqs_reference_result_file_exp.split('\n')), DNA)
    # we don't care about order in the results
    self.assertEqual(set(actual), set(expected))
def check_fasta_seqs_lens(input_fasta_fp):
    """ Creates bins of sequence lens

    Useful for checking for valid aligned sequences.

    input_fasta_fp:  input fasta filepath
    """
    seq_lens = defaultdict(int)

    input_fasta_f = open(input_fasta_fp, "U")

    for label, seq in parse_fasta(input_fasta_f):
        seq_lens[len(seq)] += 1

    input_fasta_f.close()

    formatted_seq_lens = []

    for curr_key in seq_lens:
        formatted_seq_lens.append((seq_lens[curr_key], curr_key))

    formatted_seq_lens.sort(reverse=True)

    return formatted_seq_lens
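# A minimal usage sketch (the file name below is hypothetical, not part of the
# module): check_fasta_seqs_lens returns (count, length) tuples sorted by
# count, so the first entry is the most common sequence length in the file.
def _example_check_fasta_seqs_lens():
    length_bins = check_fasta_seqs_lens('aligned_seqs.fasta')
    most_common_count, most_common_len = length_bins[0]
    return most_common_len, most_common_count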
def output_test(self, aligned_basename):
    """ Test results of test_load_zip() and test_load_gzip() """
    f_log = open(aligned_basename + ".log", "U")
    f_log_str = f_log.read()
    self.assertTrue("Total reads passing E-value threshold" in f_log_str)
    self.assertTrue("Total reads for de novo clustering" in f_log_str)
    self.assertTrue("Total OTUs" in f_log_str)
    f_log.seek(0)
    for line in f_log:
        if line.startswith(" Total reads passing E-value threshold"):
            num_hits = (re.split(
                'Total reads passing E-value threshold = | \(',
                line)[1]).strip()
        elif line.startswith(" Total reads for de novo clustering"):
            num_failures_log = (re.split(
                'Total reads for de novo clustering = ',
                line)[1]).strip()
        elif line.startswith(" Total OTUs"):
            num_clusters_log = (re.split('Total OTUs = ', line)[1]).strip()
    f_log.close()

    # Correct number of reads mapped
    self.assertEqual("99999", num_hits)
    # Correct number of clusters recorded
    self.assertEqual("272", num_clusters_log)
    # Correct number of clusters in OTU-map
    with open(aligned_basename + "_otus.txt", 'U') as f_otumap:
        num_clusters_file = sum(1 for line in f_otumap)
    self.assertEqual(272, num_clusters_file)

    num_failures_file = 0
    with open(aligned_basename + "_denovo.fasta", 'U') as f_denovo:
        for label, seq in parse_fasta(f_denovo):
            num_failures_file += 1

    # Correct number of reads for de novo clustering
    self.assertEqual(num_failures_log, str(num_failures_file))
def align_two_alignments(aln1, aln2, moltype, params=None):
    """Returns an Alignment object from two existing Alignments.

    aln1, aln2: cogent.core.alignment.Alignment objects, or data that can
        be used to build them.
        - Mafft profile alignment only works with aligned sequences, so an
          Alignment object is used to handle the unaligned sequences.

    params: dict of parameters to pass in to the Mafft app controller.
    """
    # create SequenceCollection object from seqs
    aln1 = Alignment(aln1, MolType=moltype)
    # Create mapping between abbreviated IDs and full IDs
    aln1_int_map, aln1_int_keys = aln1.getIntMap()
    # Create SequenceCollection from int_map.
    aln1_int_map = Alignment(aln1_int_map, MolType=moltype)

    # create Alignment object from aln
    aln2 = Alignment(aln2, MolType=moltype)
    # Create mapping between abbreviated IDs and full IDs
    aln2_int_map, aln2_int_keys = aln2.getIntMap(prefix='seqn_')
    # Create SequenceCollection from int_map.
    aln2_int_map = Alignment(aln2_int_map, MolType=moltype)

    # Update aln1_int_keys with aln2_int_keys
    aln1_int_keys.update(aln2_int_keys)

    # Create Mafft app.
    app = Mafft(InputHandler='_input_as_paths',
                params=params,
                SuppressStderr=False)
    app._command = 'mafft-profile'

    aln1_path = app._tempfile_as_multiline_string(aln1_int_map.toFasta())
    aln2_path = app._tempfile_as_multiline_string(aln2_int_map.toFasta())
    filepaths = [aln1_path, aln2_path]

    # Get results using int_map as input to app
    res = app(filepaths)

    # Get alignment as dict out of results
    alignment = dict(parse_fasta(res['StdOut']))

    # Make new dict mapping original IDs
    new_alignment = {}
    for k, v in alignment.items():
        key = k.replace('_seed_', '')
        new_alignment[aln1_int_keys[key]] = v

    # Create an Alignment object from alignment dict
    new_alignment = Alignment(new_alignment, MolType=moltype)

    # Clean up
    res.cleanUp()
    remove(aln1_path)
    remove(aln2_path)
    remove('pre')
    remove('trace')
    del(aln1, aln1_int_map, aln1_int_keys,
        aln2, aln2_int_map, aln2_int_keys, app, res, alignment)

    return new_alignment
def test_main(self):
    """Denoiser should always give same result on test data"""

    expected = ">FS8APND01D3TW3 | cluster size: 94 \nCTCCCGTAGGAGTCTGGGCCGTATCTCAGTCCCAATGTGGCCGGTCACCCTCTCAGGCCGGCTACCCGTCAAAGCCTTGGTAAGCCACTACCCCACCAACAAGCTGATAAGCCGCGAGTCCATCCCCAACCGCCGAAACTTTCCAACCCCCACCATGCAGCAGGAGCTCCTATCCGGTATTAGCCCCAGTTTCCTGAAGTTATCCCAAAGTCAAGGGCAGGTTACTCACGTGTTACTCACCCGTTCGCC\n"

    expected_map = """FS8APND01EWRS4:
FS8APND01DXG45:
FS8APND01D3TW3:\tFS8APND01CSXFN\tFS8APND01DQ8MX\tFS8APND01DY7QW\tFS8APND01B5QNI\tFS8APND01CQ6OG\tFS8APND01C7IGN\tFS8APND01DHSGH\tFS8APND01DJ17E\tFS8APND01CUXOA\tFS8APND01EUTYG\tFS8APND01EKK7T\tFS8APND01D582W\tFS8APND01B5GWU\tFS8APND01D7N2A\tFS8APND01BJGHZ\tFS8APND01D6DYZ\tFS8APND01C6ZIM\tFS8APND01D2X6Y\tFS8APND01BUYCE\tFS8APND01BNUEY\tFS8APND01DKLOE\tFS8APND01C24PP\tFS8APND01EBWQX\tFS8APND01ELDYW\tFS8APND01B0GCS\tFS8APND01D4QXI\tFS8APND01EMYD9\tFS8APND01EA2SK\tFS8APND01DZOSO\tFS8APND01DHYAZ\tFS8APND01C7UD9\tFS8APND01BTZFV\tFS8APND01CR78R\tFS8APND01B39IE\tFS8APND01ECVC0\tFS8APND01DM3PL\tFS8APND01DELWS\tFS8APND01CIEK8\tFS8APND01D7ZOZ\tFS8APND01CZSAI\tFS8APND01DYOVR\tFS8APND01BX9XY\tFS8APND01DEWJA\tFS8APND01BEKIW\tFS8APND01DCKB9\tFS8APND01EEYIS\tFS8APND01DDKEA\tFS8APND01DSZLO\tFS8APND01C6EBC\tFS8APND01EE15M\tFS8APND01ELO9B\tFS8APND01C58QY\tFS8APND01DONCG\tFS8APND01DVXX2\tFS8APND01BL5YT\tFS8APND01BIL2V\tFS8APND01EBSYQ\tFS8APND01CCX8R\tFS8APND01B2YCJ\tFS8APND01B1JG4\tFS8APND01DJ024\tFS8APND01BIJY0\tFS8APND01CIA4G\tFS8APND01DV74M\tFS8APND01ECAX5\tFS8APND01DC3TZ\tFS8APND01EJVO6\tFS8APND01D4VFG\tFS8APND01DYYYO\tFS8APND01D1EDD\tFS8APND01DQUOT\tFS8APND01A2NSJ\tFS8APND01DDC8I\tFS8APND01BP1T2\tFS8APND01DPY6U\tFS8APND01CIQGV\tFS8APND01BPUT8\tFS8APND01BDNH4\tFS8APND01DOZDN\tFS8APND01DS866\tFS8APND01DGS2J\tFS8APND01EDK32\tFS8APND01EPA0T\tFS8APND01CK3JM\tFS8APND01BKLWW\tFS8APND01DV0BO\tFS8APND01DPNXE\tFS8APND01B7LUA\tFS8APND01BTTE2\tFS8APND01CKO4X\tFS8APND01DGGBY\tFS8APND01C4NHX\tFS8APND01DYPQN
FS8APND01BSTVP:
FS8APND01EFK0W:
FS8APND01DCIOO:
FS8APND01CKOMZ:
"""

    command = " ".join(["denoiser.py",
                        "--force",
                        "-o", self.test_dir,
                        "-i", "%s/qiime/support_files/denoiser/TestData/denoiser_test_set.sff.txt" % PROJECT_HOME])

    result = Popen(command, shell=True, universal_newlines=True,
                   stdout=PIPE, stderr=STDOUT).stdout.read()
    self.result_dir = self.test_dir

    observed = "".join(list(open(self.result_dir + "centroids.fasta")))
    self.assertEqual(observed, expected)

    self.assertEqual(
        len(list(parse_fasta(open(self.result_dir + "singletons.fasta")))),
        6)

    observed = "".join(
        list(open(self.result_dir + "denoiser_mapping.txt")))
    self.assertEqual(observed, expected_map)
def setUp(self):
    # create a list of files to cleanup
    self._paths_to_clean_up = []
    self._dirs_to_clean_up = []

    # load query seqs
    self.seqs = Alignment(parse_fasta(QUERY_SEQS.split()))

    # generate temp filename
    tmp_dir = '/tmp'
    self.outfile = get_tmp_filename(tmp_dir)

    # create and write out reference sequence file
    self.outfasta = splitext(self.outfile)[0] + '.fasta'
    fastaout = open(self.outfasta, 'w')
    fastaout.write(REF_SEQS)
    fastaout.close()
    self._paths_to_clean_up.append(self.outfasta)

    # create and write out starting tree file
    self.outtree = splitext(self.outfile)[0] + '.tree'
    treeout = open(self.outtree, 'w')
    treeout.write(REF_TREE)
    treeout.close()
    self._paths_to_clean_up.append(self.outtree)
def _split_along_prefix(self, input_fp, params,
                        jobs_to_start, job_prefix, output_dir):
    """ Split input sequences into sets with identical prefix"""
    out_files = []
    buffered_handles = {}
    prefix_length = params['prefix_length'] or 1
    for seq_id, seq in parse_fasta(open(input_fp)):
        if len(seq) < prefix_length:
            raise ValueError("Prefix length must be equal or longer than "
                             "sequence.\n Found seq %s with length %d"
                             % (seq_id, len(seq)))
        prefix = seq[:prefix_length]

        if prefix not in buffered_handles:
            # never seen this prefix before
            out_fp = "%s/%s%s" % (output_dir, job_prefix, prefix)
            buffered_handles[prefix] = BufferedWriter(out_fp)
            out_files.append(out_fp)
            self.prefix_counts[prefix] = 0

        self.prefix_counts[prefix] += 1
        buffered_handles[prefix].write('>%s\n%s\n' % (seq_id, seq))

    # make sure all buffers are closed and flushed
    for buf_fh in buffered_handles.itervalues():
        buf_fh.close()

    remove_files = True
    return out_files, remove_files
def _generate_training_files(self):
    """Returns a tuple of file objects suitable for passing to the
    RdpTrainer application controller.
    """
    tmp_dir = get_qiime_temp_dir()
    training_set = RdpTrainingSet()
    reference_seqs_file = open(self.Params['reference_sequences_fp'], 'U')
    id_to_taxonomy_file = open(self.Params['id_to_taxonomy_fp'], 'U')

    for seq_id, seq in parse_fasta(reference_seqs_file):
        training_set.add_sequence(seq_id, seq)

    for line in id_to_taxonomy_file:
        seq_id, lineage_str = map(strip, line.split('\t'))
        training_set.add_lineage(seq_id, lineage_str)

    training_set.dereplicate_taxa()

    rdp_taxonomy_file = NamedTemporaryFile(
        prefix='RdpTaxonAssigner_taxonomy_', suffix='.txt', dir=tmp_dir)
    rdp_taxonomy_file.write(training_set.get_rdp_taxonomy())
    rdp_taxonomy_file.seek(0)

    rdp_training_seqs_file = NamedTemporaryFile(
        prefix='RdpTaxonAssigner_training_seqs_', suffix='.fasta',
        dir=tmp_dir)
    for rdp_id, seq in training_set.get_training_seqs():
        rdp_training_seqs_file.write('>%s\n%s\n' % (rdp_id, seq))
    rdp_training_seqs_file.seek(0)

    self._training_set = training_set

    return rdp_taxonomy_file, rdp_training_seqs_file
def seqs_from_file(ids, file_lines):
    """Extract labels and seqs from file"""

    for label, seq in parse_fasta(file_lines):
        if id_from_fasta_label_line(label) in ids:
            yield label, seq
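# A small sketch with made-up FASTA lines, assuming id_from_fasta_label_line
# returns the first whitespace-delimited field of the label: only records
# whose id appears in `ids` are yielded.
def _example_seqs_from_file():
    fasta = ['>seq1 sample=A', 'ACGT', '>seq2 sample=B', 'GGCC']
    kept = list(seqs_from_file(['seq1'], fasta))
    # kept should hold the single record ('seq1 sample=A', 'ACGT')
    return kept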
def test_dereplicate_seqs_remove_singletons(self):
    """ Test dereplicate_seqs() method functionality with
        removing singletons
    """
    seqs = [("seq1", "TACCGGCAGCTCAAGTGATGACCGCTATTATTGGGCCTAAAGCGTCCG"),
            ("seq2", "TACCGGCAGCTCAAGTGATGACCGCTATTATTGGGCCTAAAGCGTCCG"),
            ("seq3", "TACCGGCAGCTCAAGTGATGACCGCTATTATTGGGCCTAAAGCGTCCG"),
            ("seq4", "TACCGGCAGCTCAAGTGATGACCGCTATTATTGGGCCTAAAGCGTCCT"),
            ("seq5", "TACCAGCCCCTTAAGTGGTAGGGACGATTATTTGGCCTAAAGCGTCCG"),
            ("seq6", "CTGCAAGGCTAGGGGGCGGGAGAGGCGGGTGGTACTTGAGGGGAGAAT"),
            ("seq7", "CTGCAAGGCTAGGGGGCGGGAGAGGCGGGTGGTACTTGAGGGGAGAAT")]
    seqs_fp = join(self.working_dir, "seqs.fasta")
    with open(seqs_fp, 'w') as seqs_f:
        for seq in seqs:
            seqs_f.write(">%s\n%s\n" % seq)

    output_fp = join(self.working_dir, "seqs_derep.fasta")
    log_fp = join(self.working_dir, "seqs_derep.log")

    dereplicate_seqs(seqs_fp=seqs_fp,
                     output_fp=output_fp)
    self.assertTrue(isfile(output_fp))
    self.assertTrue(isfile(log_fp))

    exp = [("seq1;size=3;",
            "TACCGGCAGCTCAAGTGATGACCGCTATTATTGGGCCTAAAGCGTCCG"),
           ("seq6;size=2;",
            "CTGCAAGGCTAGGGGGCGGGAGAGGCGGGTGGTACTTGAGGGGAGAAT")]

    with open(output_fp, 'U') as out_f:
        act = [item for item in parse_fasta(out_f)]

    self.assertEqual(act, exp)
def split_sff(sff_file_handles, map_file_handle, outdir="/tmp/"): """Splits a sff.txt file on barcode/mapping file.""" try: (flowgrams, header) = cat_sff_files(sff_file_handles) except ValueError: # reading in the binary sff usually shows up as ValueError raise FileFormatError('Wrong flogram file format. Make sure you pass the sff.txt format ' + 'produced by sffinfo. The binary .sff will not work here.') (inverse_map, map_count) = build_inverse_barcode_map( parse_fasta(map_file_handle)) filenames = [] # we might have many barcodes and reach python open file limit # therefor we go the slow way and open and close files each time # First set up all files with the headers only for barcode_id in map_count.keys(): fh = open(outdir + barcode_id, "w") write_sff_header(header, fh, map_count[barcode_id]) fh.close() filenames.append(outdir + barcode_id) # Then direct each flowgram into its barcode file for f in flowgrams: if f.Name in inverse_map: barcode_id = inverse_map[f.Name] fh = open(outdir + barcode_id, "a") fh.write(f.createFlowHeader() + "\n") return filenames
def get_seqs_to_keep_lookup_from_sample_ids(fasta_f, sample_ids):
    sample_ids = set(sample_ids)
    seqs_to_keep = set()
    for seq_id, seq in parse_fasta(fasta_f):
        if seq_id.split('_')[0] in sample_ids:
            seqs_to_keep.add(seq_id)
    return {}.fromkeys(seqs_to_keep)
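# A sketch with made-up records, assuming labels follow the QIIME
# '<sample_id>_<count>' convention so that splitting on '_' recovers the
# sample id used for filtering.
def _example_get_seqs_to_keep_lookup_from_sample_ids():
    fasta = ['>S1_0', 'ACGT', '>S1_1', 'AAGG', '>S2_0', 'GGCC']
    lookup = get_seqs_to_keep_lookup_from_sample_ids(fasta, ['S1'])
    # lookup has keys 'S1_0' and 'S1_1'; the values are just None placeholders
    return lookup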
def sort_fasta_by_abundance(fasta_lines, fasta_out_f):
    """ Sort seqs in fasta_line by abundance, write all seqs to fasta_out_f

     Note that all sequences are written out, not just unique ones.

     fasta_lines: input file handle (or similar object)

     fasta_out_f: output file handle (or similar object)

    ** The current implementation works well for fairly large data sets,
       (e.g., several combined 454 runs) but we may want to revisit if it
       chokes on very large (e.g., Illumina) files. --Greg **

    """
    seq_index = {}
    count = 0
    for seq_id, seq in parse_fasta(fasta_lines):
        count += 1
        try:
            seq_index[seq].append(seq_id)
        except KeyError:
            seq_index[seq] = [seq_id]

    seqs = []
    for k, v in seq_index.items():
        seqs.append((len(v), k, v))
        del seq_index[k]
    seqs.sort()
    for count, seq, seq_ids in seqs[::-1]:
        for seq_id in seq_ids:
            fasta_out_f.write('>%s\n%s\n' % (seq_id, seq))
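# An in-memory sketch (made-up records): identical sequences are grouped and
# the most abundant group is written first; every input record is written out.
def _example_sort_fasta_by_abundance():
    from StringIO import StringIO
    lines = ['>a', 'ACGT', '>b', 'ACGT', '>c', 'TTTT']
    out_f = StringIO()
    sort_fasta_by_abundance(lines, out_f)
    # out_f.getvalue() lists a and b (the ACGT pair) before c
    return out_f.getvalue()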
def filter_fasta(input_seqs_f, output_seqs_f, seqs_to_keep, negate=False,
                 seqid_f=None):
    """ Write filtered input_seqs to output_seqs_f which contains only seqs_to_keep

        input_seqs can be the output of parse_fasta or parse_fastq
    """
    if seqid_f is None:
        seqs_to_keep_lookup = {}.fromkeys([seq_id.split()[0]
                                           for seq_id in seqs_to_keep])

        # Define a function based on the value of negate
        if not negate:
            def keep_seq(seq_id):
                return seq_id.split()[0] in seqs_to_keep_lookup
        else:
            def keep_seq(seq_id):
                return seq_id.split()[0] not in seqs_to_keep_lookup
    else:
        if not negate:
            keep_seq = seqid_f
        else:
            keep_seq = lambda x: not seqid_f(x)

    for seq_id, seq in parse_fasta(input_seqs_f):
        if keep_seq(seq_id):
            output_seqs_f.write('>%s\n%s\n' % (seq_id, seq))
    output_seqs_f.close()
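# A usage sketch for filter_fasta (the file names are hypothetical): only
# records whose first label word is in seqs_to_keep are written, and the
# function closes output_seqs_f itself, so no explicit close is needed here.
def _example_filter_fasta():
    in_f = open('seqs.fasta', 'U')
    out_f = open('filtered_seqs.fasta', 'w')
    filter_fasta(in_f, out_f, seqs_to_keep=['seq1', 'seq3'])
    in_f.close()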
def remove_outliers(seqs, num_stds, fraction_seqs_for_stats=.95):
    """ remove sequences very different from the majority consensus

    given aligned sequences, will:
     1. calculate a majority consensus (most common symbol at each position
        of the alignment);
     2. compute the mean/std edit distance of each seq to the consensus;
     3. discard sequences whose edit dist is greater than the cutoff, which is
        defined as being `num_stds` greater than the mean.

    """
    # load the alignment and compute the consensus sequence
    aln = Alignment.from_fasta_records(parse_fasta(seqs), DNA)
    consensus_seq = aln.majority_consensus()
    # compute the hamming distance between all sequences in the alignment
    # and the consensus sequence
    dists_to_consensus = [s.distance(consensus_seq) for s in aln]
    # compute the average and standard deviation distance from the consensus
    average_distance = mean(dists_to_consensus)
    std_distance = std(dists_to_consensus)
    # compute the distance cutoff
    dist_cutoff = average_distance + num_stds * std_distance
    # for all sequences, determine if their distance to the consensus
    # is less than or equal to the cutoff distance. if so, add the sequence's
    # identifier to the list of sequence identifiers to keep
    seqs_to_keep = []
    for seq_id, dist_to_consensus in izip(aln.ids(), dists_to_consensus):
        if dist_to_consensus <= dist_cutoff:
            seqs_to_keep.append(seq_id)
    # filter the alignment to only keep the sequences identified in the step
    # above
    filtered_aln = aln.subalignment(seqs_to_keep=seqs_to_keep)
    # and return the filtered alignment
    return filtered_aln
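# A sketch with a tiny made-up alignment: the cutoff is
# mean + num_stds * std of the per-sequence distances to the majority
# consensus, so a clearly divergent sequence is the one at risk of removal.
def _example_remove_outliers():
    aligned = ['>s1', 'ACGT', '>s2', 'ACGT', '>s3', 'ACGT', '>s4', 'TTTT']
    filtered = remove_outliers(aligned, num_stds=1)
    # filtered is an Alignment holding only the ids that fell within the cutoff
    return filtered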
def generate_lane_mask(infile, entropy_threshold, existing_mask=None):
    """ Generates lane mask dynamically by calculating base frequencies

    infile: open file object for aligned fasta file

    entropy_threshold:  float value that designates the percentage of entropic
     positions to be removed, i.e., 0.10 means the 10% most entropic positions
     are removed.

    """
    aln = Alignment.from_fasta_records(parse_fasta(infile), DNA)
    uncertainty = aln.position_entropies(nan_on_non_standard_chars=False)
    uncertainty_sorted = sorted(uncertainty)

    cutoff_index = int(
        round((len(uncertainty_sorted) - 1) * (1 - entropy_threshold)))

    max_uncertainty = uncertainty_sorted[cutoff_index]

    # This correction is for small datasets with a small possible number of
    # uncertainty values.
    highest_certainty = min(uncertainty_sorted)

    lane_mask = ""

    for base in uncertainty:
        if base >= max_uncertainty and base != highest_certainty:
            lane_mask += "0"
        else:
            lane_mask += "1"

    return lane_mask
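# A usage sketch (the aligned FASTA file name is hypothetical): the returned
# mask has one character per alignment column, '0' for the most entropic ~10%
# of positions and '1' for the rest.
def _example_generate_lane_mask():
    with open('aligned_seqs.fasta', 'U') as aln_f:
        mask = generate_lane_mask(aln_f, entropy_threshold=0.10)
    return mask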
def truncate_rev_primers(fasta_f, output_fp, reverse_primers,
                         truncate_option='truncate_only',
                         primer_mismatches=2):
    """ Locally aligns reverse primers, truncates or removes seqs

    fasta_f:  open file of fasta file
    output_fp:  open filepath to write truncated fasta to
    reverse_primers:  dictionary of SampleID:reverse primer sequence
    truncate_option:  either truncate_only, truncate_remove
    primer_mismatches:  number of allowed primer mismatches
    """
    log_data = {
        'sample_id_not_found': 0,
        'reverse_primer_not_found': 0,
        'total_seqs': 0,
        'seqs_written': 0
    }

    for label, seq in parse_fasta(fasta_f):
        curr_label = label.split('_')[0]
        log_data['total_seqs'] += 1

        # Check fasta label for valid SampleID, if not found, just write seq
        try:
            curr_rev_primer = reverse_primers[curr_label]
        except KeyError:
            log_data['sample_id_not_found'] += 1
            output_fp.write('>%s\n%s\n' % (label, seq))
            log_data['seqs_written'] += 1
            continue

        mm_tests = {}
        for rev_primer in curr_rev_primer:
            rev_primer_mm, rev_primer_index =\
                local_align_primer_seq(rev_primer, seq)
            mm_tests[rev_primer_mm] = rev_primer_index

        rev_primer_mm = min(mm_tests.keys())
        rev_primer_index = mm_tests[rev_primer_mm]

        if rev_primer_mm > primer_mismatches:
            if truncate_option == "truncate_remove":
                log_data['reverse_primer_not_found'] += 1
            else:
                log_data['reverse_primer_not_found'] += 1
                log_data['seqs_written'] += 1
                output_fp.write('>%s\n%s\n' % (label, seq))
        else:
            # Check for zero seq length after truncation, will not write seq
            if rev_primer_index > 0:
                log_data['seqs_written'] += 1
                output_fp.write('>%s\n%s\n' % (label,
                                               seq[0:rev_primer_index]))

    return log_data
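# An in-memory sketch (sample id, primer, and read are made up): reads whose
# label starts with a known SampleID are truncated at the best local alignment
# of that sample's reverse primer; the returned dict summarizes what happened.
def _example_truncate_rev_primers():
    from StringIO import StringIO
    reads = ['>S1_0', 'ACGTACGTAAGGCCTT']
    out_f = StringIO()
    log = truncate_rev_primers(reads, out_f,
                               reverse_primers={'S1': ['AAGGCC']})
    # log['total_seqs'], log['seqs_written'], etc. hold the per-category counts
    return log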
def getResult(self, aln_path, *args, **kwargs):
    """Returns alignment from sequences.

    Currently does not allow parameter tuning of program and uses
    default parameters -- this is bad and should be fixed.

    #TODO: allow command-line access to important aln params.
    """
    module = self.Params['Module']
    # standard qiime says we just consider the first word as the unique ID
    # the rest of the defline of the fasta alignment often doesn't match
    # the otu names in the otu table
    with open(aln_path) as aln_f:
        seqs = Alignment.from_fasta_records(
            parse_fasta(aln_f, label_to_name=lambda x: x.split()[0]),
            DNA)
    # This ugly little line of code lets us pass a skbio Alignment when a
    # cogent alignment is expected.
    seqs.getIntMap = seqs.int_map
    result = module.build_tree_from_alignment(seqs, moltype=DNA_cogent)

    try:
        root_method = kwargs['root_method']
        if root_method == 'midpoint':
            result = root_midpt(result)
        elif root_method == 'tree_method_default':
            pass
    except KeyError:
        pass

    return result
def setUp(self):
    fd, self.infernal_test1_input_fp = mkstemp(
        prefix="InfernalAlignerTests_", suffix=".fasta")
    close(fd)
    with open(self.infernal_test1_input_fp, "w") as in_f:
        in_f.write("\n".join(infernal_test1_input_fasta))

    fd, self.infernal_test1_template_fp = mkstemp(
        prefix="InfernalAlignerTests_", suffix="template.sto")
    close(fd)
    with open(self.infernal_test1_template_fp, "w") as in_f:
        in_f.write(infernal_test1_template_stockholm)

    # create temp file names (and touch them so we can reliably
    # clean them up)
    fd, self.result_fp = mkstemp(
        prefix="InfernalAlignerTests_", suffix=".fasta")
    close(fd)
    open(self.result_fp, "w").close()

    fd, self.log_fp = mkstemp(
        prefix="InfernalAlignerTests_", suffix=".log")
    close(fd)
    open(self.log_fp, "w").close()

    self._paths_to_clean_up = [
        self.infernal_test1_input_fp,
        self.result_fp,
        self.log_fp,
        self.infernal_test1_template_fp,
    ]

    self.infernal_test1_aligner = InfernalAligner({
        "template_filepath": self.infernal_test1_template_fp,
    })
    self.infernal_test1_expected_aln = Alignment.from_fasta_records(
        parse_fasta(infernal_test1_expected_alignment), DNA)
def split_sff(sff_file_handles, map_file_handle, outdir="/tmp/"): """Splits a sff.txt file on barcode/mapping file.""" try: (flowgrams, header) = cat_sff_files(sff_file_handles) except ValueError: # reading in the binary sff usually shows up as ValueError raise FileFormatError( 'Wrong flogram file format. Make sure you pass the sff.txt format ' + 'produced by sffinfo. The binary .sff will not work here.') (inverse_map, map_count) = build_inverse_barcode_map(parse_fasta(map_file_handle)) filenames = [] # we might have many barcodes and reach python open file limit # therefor we go the slow way and open and close files each time # First set up all files with the headers only for barcode_id in map_count.keys(): fh = open(outdir + barcode_id, "w") write_sff_header(header, fh, map_count[barcode_id]) fh.close() filenames.append(outdir + barcode_id) # Then direct each flowgram into its barcode file for f in flowgrams: if f.Name in inverse_map: barcode_id = inverse_map[f.Name] fh = open(outdir + barcode_id, "a") fh.write(f.createFlowHeader() + "\n") return filenames
def get_seqs_act_split_sequence_on_sample_ids(self, output_dir):
    """Parse output of split_sequence_file_on_sample_ids_to_files()

    Parameters
    ----------
    output_dir: string
        output directory path storing FASTA files

    Returns
    -------
    seqs_act: dict
        dictionary with keys being sample IDs and values list of
        sequences belonging to sample ID
    """
    seqs_act = {}
    for fn in listdir(output_dir):
        input_fp = join(output_dir, fn)
        sample_file = splitext(fn)[0]
        with open(input_fp, 'U') as input_f:
            for label, seq in parse_fasta(input_f):
                sample = label.split('_')[0]
                self.assertEqual(sample_file, sample)
                if sample not in seqs_act:
                    seqs_act[sample] = [(label, seq)]
                else:
                    seqs_act[sample].append((label, seq))
    return seqs_act
def rc_fasta_lines(fasta_lines, seq_desc_mapper=append_rc):
    """ """
    for seq_id, seq in parse_fasta(fasta_lines):
        seq_id = seq_desc_mapper(seq_id)
        seq = str(DNA(seq.upper()).rc())
        yield seq_id, seq
    return
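# A sketch with one made-up record: the sequence comes back reverse-complemented
# ('AACCGG' -> 'CCGGTT'), and the label is passed through append_rc, which is
# assumed to tag it as a reverse complement.
def _example_rc_fasta_lines():
    records = list(rc_fasta_lines(['>seq1', 'AACCGG']))
    # records holds a single (relabeled id, 'CCGGTT') pair
    return records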
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    split_fasta_on_sample_ids_to_files(
        parse_fasta(open(opts.input_fasta_fp, 'U')),
        opts.output_dir,
        opts.buffer_size)
def test_longest_id(self):
    """longest_id should return id associated with longest seq"""
    ids = "R27DLI_4812 R27DLI_600 R27DLI_727 U1PLI_403 U1PLI_8969".split()
    seqs = dict(
        parse_fasta(dna_seqs.splitlines(), label_to_name=label_to_name))
    self.assertEqual(longest_id(ids, seqs), 'U1PLI_403')
def test_store_cluster(self):
    """store_clusters stores the centroid seqs for each cluster."""

    self.tmpdir = get_tmp_filename(tmp_dir="./", suffix="_store_clusters/")
    create_dir(self.tmpdir)

    self.files_to_remove.append(self.tmpdir + "singletons.fasta")
    self.files_to_remove.append(self.tmpdir + "centroids.fasta")

    # empty map results in empty files
    store_clusters({}, self.tiny_test, self.tmpdir)
    actual_centroids = list(
        parse_fasta(open(self.tmpdir + "centroids.fasta")))
    self.assertEqual(actual_centroids, [])
    actual_singletons = list(
        parse_fasta(open(self.tmpdir + "singletons.fasta")))
    self.assertEqual(actual_singletons, [])

    # non-empty map creates non-empty files, centroids sorted by size
    mapping = {'FZTHQMS01B8T1H': [],
               'FZTHQMS01DE1KN': ['FZTHQMS01EHAJG'],
               'FZTHQMS01EHAJG': [1, 2, 3]}  # content doesn't really matter

    centroids = [
        ('FZTHQMS01EHAJG | cluster size: 4',
         'CATGCTGCCTCCCGTAGGAGTTTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTATCCATCGAAGGTTTGGTGAGCCGTTACCTCACCAACTGCCTAATGGAACGCATCCCCATCGATAACCGAAATTCTTTAATAACAAGACCATGCGGTCTGATTATACCATCGGGTATTAATCTTTCTTTCGAAAGGCTATCCCCGAGTTATCGGCAGGTTGGATACGTGTTACTCACCCGTGCGCCGGTCGCCA'),
        ('FZTHQMS01DE1KN | cluster size: 2',
         'CATGCTGCCTCCCGTAGGAGTTTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTATCCATCGAAGGTTTGGTGAGCCGTTACCTCACCAACTGCCTAATGGAACGCATCCCCATCGATAACCGAAATTCTTTAATAACAAGACCATGCGGTCTGATTATACCATCGGGTATTAATCTTTCTTTCGAAAGGCTATCCCCGAGTTATCGGCAGGTTGGATACGTGTTACTCACCCGTGCGCCGGTCGCCA')]

    singletons = [
        ('FZTHQMS01B8T1H',
         'CATGCTGCCTCCCGTAGGAGTTTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTATCCATCGAAGGTTTGGTGAGCCGTTACCTCACCAACTGCCTAATGGAACGCATCCCCATCGATAACCGAAATTCTTTAATAATTAAACCATGCGGTTTTATTATACCATCGGGTATTAATCTTTCTTTCGAAAGGCTATCCCCGAGTTATCGGCAGGTTGGATACGTGTTACTCACCCGTGCGCCGGTCGCCATCACTTA')]

    store_clusters(mapping, self.tiny_test, self.tmpdir)

    actual_centroids = list(
        parse_fasta(open(self.tmpdir + "centroids.fasta")))
    self.assertEqual(actual_centroids, centroids)
    actual_singletons = list(
        parse_fasta(open(self.tmpdir + "singletons.fasta")))
    self.assertEqual(actual_singletons, singletons)