def test_build_index_sortmerna_fail(self):
    """build_index_sortmerna() must raise RuntimeError for ref_fp='foo'.

    NOTE(review): 'foo' is presumably a path that does not exist, and it
    is passed as a bare string rather than the tuple of paths the other
    tests use -- confirm which of the two triggers the error.
    """
    self.assertRaises(RuntimeError,
                      build_index_sortmerna,
                      ref_fp='foo',
                      working_dir=self.working_dir)
def test_launch_workflow_incorrect_trim(self):
    """launch_workflow() must emit a UserWarning when the trim length
    is too long.
    """
    # index the 70% rep. set database
    ref_fp = join(self.test_data_dir, '70_otus.fasta')
    ref_db_fp = build_index_sortmerna(ref_fp=(ref_fp, ),
                                      working_dir=self.working_dir)

    # Every argument is assembled up front so that only the workflow call
    # itself executes inside the assertWarns() context.
    workflow_args = {
        'seqs_fp': self.seqs_s1_fp,
        'working_dir': self.working_dir,
        'mean_error': 0.005,
        'error_dist': get_default_error_profile(),
        'indel_prob': 0.01,
        'indel_max': 3,
        # trim length longer than sequences
        'trim_length': 151,
        'left_trim_length': 0,
        'min_size': 2,
        'ref_fp': (ref_fp, ),
        'ref_db_fp': ref_db_fp,
        'threads_per_sample': 1,
    }

    with self.assertWarns(UserWarning):
        launch_workflow(**workflow_args)
def test_launch_workflow_skip_trim(self):
    """Run launch_workflow() with trim_length=-1 and compare its output
    with the stored no-trim result (self.no_trim_res).

    NOTE(review): judging by the test name, trim_length=-1 presumably
    tells the workflow to skip trimming -- confirm against
    launch_workflow().
    """
    # index the 70% rep. set database
    ref_fp = join(self.test_data_dir, '70_otus.fasta')
    ref_db_fp = build_index_sortmerna(ref_fp=(ref_fp, ),
                                      working_dir=self.working_dir)
    error_profile = get_default_error_profile()

    # launch_workflow() returns the path of the fasta file it produced
    result_fp = launch_workflow(seqs_fp=self.seqs_s1_fp,
                                working_dir=self.working_dir,
                                mean_error=0.005,
                                error_dist=error_profile,
                                indel_prob=0.01,
                                indel_max=3,
                                trim_length=-1,
                                left_trim_length=0,
                                min_size=2,
                                ref_fp=(ref_fp, ),
                                ref_db_fp=ref_db_fp,
                                threads_per_sample=1)

    expected = Sequence.read(self.no_trim_res, format='fasta')
    observed = Sequence.read(result_fp, format='fasta')
    self.assertEqual(expected, observed)
def test_remove_artifacts_from_biom_table(self):
    """Test remove_artifacts_from_biom_table() for removing non-16S
    sequences from a biom table and its matching fasta file.

    A pre-calculated biom table and fasta file are used, and both output
    tables (only-16S and only-artifacts) are checked.
    The s4 dataset is similar to s2 but with two added non-16S sequences
    (which are not phix/adapter).
    """
    test_dir = self.test_data_dir
    work_dir = self.working_dir

    # create the positive reference database
    pos_ref_fp = join(test_dir, '70_otus.fasta')
    pos_ref_db_fp = build_index_sortmerna(ref_fp=(pos_ref_fp, ),
                                          working_dir=work_dir)

    # remove the artifacts from the s4 biom table
    remove_artifacts_from_biom_table(join(test_dir, 'final.s4.biom'),
                                     join(test_dir, 'final.s4.seqs.fa'),
                                     [pos_ref_fp],
                                     work_dir,
                                     pos_ref_db_fp)

    # the 16S table must hold exactly the s2 originals, trimmed to
    # trim_length and upper-cased
    trim_length = 150
    expected_seqs = {
        record[1][:trim_length].upper()
        for record in sequence_generator(join(test_dir, 'simset.s2.fasta'))
    }
    kept_table = load_table(join(work_dir, 'final.only-16s.biom'))
    self.assertEqual(set(kept_table.ids(axis='observation')),
                     expected_seqs)

    # the artifacts table must hold the two added non-16S sequences
    artifacts_table = load_table(join(work_dir, 'final.only-non16s.biom'))
    self.assertEqual(len(artifacts_table.ids(axis='observation')), 2)
def test_remove_artifacts_seqs(self):
    """Test remove_artifacts_seqs() for removing sequences that do not
    match a reference database using SortMeRNA.

    This test forces a new index construction for the reference
    sequences.
    """
    def write_fasta(path, records):
        # two lines per record: ">label" followed by the sequence
        with open(path, 'w') as handle:
            handle.writelines(">%s\n%s\n" % record for record in records)

    seqs = [("seq1", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCC"),
            ("seq2", "CCTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
            ("seq3", "TCGCTATTATTGAGCCTAAAACGTCCGTAGTCGGCTTTGTAAATCCC"),
            ("seq4", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCC"),
            ("seq5", "CTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAATAGGGTC"),
            ("seq6", "TTGAGCCTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAAT"),
            ("phix1", "TCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGCC"),
            ("phix2", "CTGGCGCTCGCCCTGGTCGTCCGCAGCCGTTGCGAGGTACTAAAGGC"),
            ("phix3", "GCGCATAAATTTGAGCAGATTTGTCGTCACAGGTTGCGCCGCCAAAA")]
    # only the non-phix reads are expected in the output, in input order
    exp_seqs = ["seq1", "seq2", "seq3", "seq4", "seq5", "seq6"]
    seqs_fp = join(self.working_dir, "seqs.fasta")
    write_fasta(seqs_fp, seqs)

    ref = [("ref1", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTA"
                    "GTCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
           ("ref2", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
           ("ref3", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
           ("ref4", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
           ("ref5", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATAGGGT"),
           ("ref6", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT")]
    ref_fp = join(self.working_dir, "ref2.fasta")
    write_fasta(ref_fp, ref)
    self.files_to_remove.append(ref_fp)

    ref_db_fp = build_index_sortmerna(ref_fp=(ref_fp, ),
                                      working_dir=self.working_dir)
    output_fp, num_seqs_left, tmp_files = remove_artifacts_seqs(
        seqs_fp=seqs_fp,
        ref_fp=(ref_fp, ),
        working_dir=self.working_dir,
        ref_db_fp=ref_db_fp,
        negate=False,
        threads=1)

    obs_seqs = [label for label, seq in sequence_generator(output_fp)]
    self.assertEqual(obs_seqs, exp_seqs)
    # validate it creates one tmp file
    self.assertEqual(len(tmp_files), 1)
def test_remove_artifacts_seqs_negate(self):
    """Test remove_artifacts_seqs() with negate=True, i.e. removing the
    sequences that match a reference database using SortMeRNA.

    Nine reads (seq1-6 and phix1-3) are filtered against a six-sequence
    reference; only the labels listed in exp_seqs are expected to remain,
    in input order.
    """
    seqs = [("seq1", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCC"),
            ("seq2", "CCTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
            ("seq3", "TCGCTATTATTGAGCCTAAAACGTCCGTAGTCGGCTTTGTAAATCCC"),
            ("seq4", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCC"),
            ("seq5", "CTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAATAGGGTC"),
            ("seq6", "TTGAGCCTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAAT"),
            ("phix1", "TCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGCC"),
            ("phix2", "CTGGCGCTCGCCCTGGTCGTCCGCAGCCGTTGCGAGGTACTAAAGGC"),
            ("phix3", "GCGCATAAATTTGAGCAGATTTGTCGTCACAGGTTGCGCCGCCAAAA")]
    # seq5 is 80% similar, so should be kept for 0.95 default similarity
    # to artifacts
    exp_seqs = ["seq5", "phix1", "phix2", "phix3"]
    seqs_fp = join(self.working_dir, "seqs.fasta")
    with open(seqs_fp, 'w') as seqs_f:
        for seq in seqs:
            seqs_f.write(">%s\n%s\n" % seq)

    ref = [("ref1", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTA"
                    "GTCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
           ("ref2", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
           ("ref3", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
           ("ref4", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
           ("ref5", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATAGGGT"),
           ("ref6", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT")]
    ref_fp = join(self.working_dir, "ref4.fasta")
    with open(ref_fp, 'w') as ref_f:
        for seq in ref:
            ref_f.write(">%s\n%s\n" % seq)
    self.files_to_remove.append(ref_fp)

    # keyword arguments, matching how the sibling tests call the indexer
    ref_db_fp = build_index_sortmerna(ref_fp=[ref_fp],
                                      working_dir=self.working_dir)

    # The output path comes from remove_artifacts_seqs() itself; the
    # original pre-assigned a "seqs_filtered.fasta" path that was
    # overwritten before ever being read, so that dead store is dropped.
    # The remaining-count and tmp-file results are not checked here.
    output_fp, _, _ = remove_artifacts_seqs(
        seqs_fp=seqs_fp,
        ref_fp=(ref_fp, ),
        working_dir=self.working_dir,
        ref_db_fp=ref_db_fp,
        negate=True,
        threads=1)

    obs_seqs = [label for label, _seq in sequence_generator(output_fp)]
    self.assertEqual(obs_seqs, exp_seqs)
def test_build_index_sortmerna(self):
    """Test build_index_sortmerna() on two reference fasta files and
    check the names of the index files it reports for removal.

    NOTE(review): a second method with this exact name appears later in
    the visible source; if both live in the same class the later
    definition replaces this one and this test never runs -- confirm.
    NOTE(review): this version unpacks a 2-tuple from
    build_index_sortmerna(), whereas most other tests here bind a single
    return value -- confirm which API is current.
    """
    ref1 = [("ref1", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTA"
                     "GTCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
            ("ref2", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                     "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
            ("ref3", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                     "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
            ("ref4", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                     "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
            ("ref5", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                     "TCGGCTTTGTAAATCCCTGGGTAAATAGGGT"),
            ("ref6", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                     "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT")]
    ref2 = [("ref1", "GTCGTAGCTAGCTGCCCACGATCGTAGCTAGCTAGCTACGTAGCTCATCAC"
                     "TCGCCGACCCACGTCCCACTGATGCTGTGGG"),
            ("ref2", "GCGGCGCCCAAAAATGTCGTGTAAAATTTTCTCGTACCCACTTGCTACCCA"
                     "TGGCCGCCATGCTGCTAACGCAATATATATA"),
            ("ref3", "TGTGAAAGCGCGCGAGAGAGTCGTATATATGGGCGCGGCGCGATGCTGCCC"
                     "GTCGATGCTGATCCCCCACGTACGTAGCCCC"),
            ("ref4", "GTGTGCTCGCGTAGCTAGCTTATATATCGGCGCGTAGTGCTAGCCCCAAAA"
                     "GTGTCCCCCCCCTCCTTTTTTATATATGCAA"),
            ("ref5", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                     "TCGGCTTTGTAAATCCCTGGGTAAATAGGGT"),
            ("ref6", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                     "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT")]

    # write each reference set to its own fasta file in the working dir
    written_paths = []
    for file_name, records in (("ref1.fasta", ref1), ("ref2.fasta", ref2)):
        fasta_path = join(self.working_dir, file_name)
        with open(fasta_path, 'w') as ref_f:
            for record in records:
                ref_f.write(">%s\n%s\n" % record)
        written_paths.append(fasta_path)
    ref_fps = tuple(written_paths)

    ref_db_fp, files_to_remove = build_index_sortmerna(
        ref_fp=ref_fps,
        working_dir=self.working_dir)

    files_to_remove_exp = [
        'ref1.stats', 'ref1.pos_0.dat', 'ref1.kmer_0.dat',
        'ref1.bursttrie_0.dat',
        'ref2.stats', 'ref2.pos_0.dat', 'ref2.kmer_0.dat',
        'ref2.bursttrie_0.dat',
    ]
    # only file names are compared, not their directories
    files_to_remove_act = list(map(basename, files_to_remove))
    self.assertListEqual(files_to_remove_exp, files_to_remove_act)
def test_launch_workflow(self):
    """Test launching complete workflow using 3 simulated sequence files.

    seqs1 - 100 reads using art, original sequences are >0.5 identical.
    seqs2 - 200 reads using grinder, original sequences are >0.9
    identical, 0.1 chimeras, 35 phix reads
    seqs3 - simple - 15 reads from seqs1 (10 reads for 1001203, 5 reads
    for 694276) for manual test validation
    """
    # index the 70% rep. set database
    ref_fp = join(self.test_data_dir, '70_otus.fasta')
    # NOTE(review): a 2-tuple is unpacked here, while several other tests
    # in this source bind a single return value from
    # build_index_sortmerna() -- confirm which API is current. The second
    # element is not used by this test.
    ref_db_fp, _index_files = build_index_sortmerna(
        ref_fp=(ref_fp, ),
        working_dir=self.working_dir)

    # (simulated reads, original sequences) for each dataset, run in order
    datasets = (
        (self.seqs_s1_fp, self.orig_s1_fp),
        (self.seqs_s2_fp, self.orig_s2_fp),
        (self.seqs_s3_fp, self.orig_s3_fp),
    )
    for simulated_fp, original_fp in datasets:
        self.run_workflow_try(simulated_fp, original_fp, ref_fp, ref_db_fp)
def test_parallel_deblur(self):
    """Test parallel deblur using 3 simulated sequence files.

    seqs1 - 100 reads using art, original sequences are >0.5 identical.
    seqs2 - 200 reads using grinder, original sequences are >0.9
    identical, 0.1 chimeras, 35 phix reads
    seqs3 - simple - 15 reads from seqs1 (10 reads for 1001203, 5 reads
    for 694276) for manual test validation
    """
    work_dir = self.working_dir

    # index the 70% rep. set database
    ref_fp = join(self.test_data_dir, '70_otus.fasta')
    ref_db_fp = build_index_sortmerna(ref_fp=(ref_fp, ),
                                      working_dir=work_dir)

    trim_length = 100
    # NOTE(review): the '--seqs-fp' value 'ignorethis' looks like a
    # placeholder that parallel_deblur() replaces per input file --
    # confirm against parallel_deblur().
    params = [
        'deblur', 'workflow',
        '--seqs-fp', 'ignorethis',
        '--output-dir', work_dir,
        '--pos-ref-fp', ref_fp,
        '-d', '1,0.06,0.02,0.02,0.01,0.005,0.005,0.005,0.001'
              ',0.001,0.001,0.0005',
        '-t', str(trim_length),
    ]
    input_fps = [self.seqs_s1_fp, self.seqs_s2_fp, self.seqs_s3_fp]
    parallel_deblur(input_fps, params, ref_db_fp, None, jobs_to_start=2)

    # each input is expected to yield one result file in this sub-directory
    deblur_working_dir = join(work_dir, "deblur_working_dir")
    expected_outputs = (
        ('seqs_s1.fasta.trim.derep.no_artifacts'
         '.msa.deblur.no_chimeras', self.orig_s1_fp),
        ('seqs_s2.fasta.trim.derep.no_artifacts'
         '.msa.deblur.no_chimeras', self.orig_s2_fp),
        ('seqs_s3.fasta.trim.derep.no_artifacts'
         '.msa.deblur.no_chimeras', self.orig_s3_fp),
    )
    for result_name, original_fp in expected_outputs:
        result_fp = join(deblur_working_dir, result_name)
        self.compare_result(result_fp, original_fp,
                            trim_length=trim_length)
def test_build_index_sortmerna(self):
    """Test build_index_sortmerna() on two reference fasta files.

    The value returned by build_index_sortmerna() must have as many
    entries as reference files were passed in.

    NOTE(review): another method with this exact name appears earlier in
    the visible source; if both live in the same class this definition
    is the one that takes effect -- confirm the duplicate is intended.
    """
    def dump_fasta(file_name, records):
        # writes the records into the working dir, returns the file path
        fasta_path = join(self.working_dir, file_name)
        with open(fasta_path, 'w') as ref_f:
            ref_f.writelines(">%s\n%s\n" % record for record in records)
        return fasta_path

    ref1 = [("ref1", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTA"
                     "GTCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
            ("ref2", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                     "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
            ("ref3", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                     "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
            ("ref4", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                     "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
            ("ref5", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                     "TCGGCTTTGTAAATCCCTGGGTAAATAGGGT"),
            ("ref6", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                     "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT")]
    ref2 = [("ref1", "GTCGTAGCTAGCTGCCCACGATCGTAGCTAGCTAGCTACGTAGCTCATCAC"
                     "TCGCCGACCCACGTCCCACTGATGCTGTGGG"),
            ("ref2", "GCGGCGCCCAAAAATGTCGTGTAAAATTTTCTCGTACCCACTTGCTACCCA"
                     "TGGCCGCCATGCTGCTAACGCAATATATATA"),
            ("ref3", "TGTGAAAGCGCGCGAGAGAGTCGTATATATGGGCGCGGCGCGATGCTGCCC"
                     "GTCGATGCTGATCCCCCACGTACGTAGCCCC"),
            ("ref4", "GTGTGCTCGCGTAGCTAGCTTATATATCGGCGCGTAGTGCTAGCCCCAAAA"
                     "GTGTCCCCCCCCTCCTTTTTTATATATGCAA"),
            ("ref5", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                     "TCGGCTTTGTAAATCCCTGGGTAAATAGGGT"),
            ("ref6", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                     "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT")]

    ref1_fp = dump_fasta("ref1.fasta", ref1)
    ref2_fp = dump_fasta("ref2.fasta", ref2)
    ref_fps = (ref1_fp, ref2_fp)

    ref_db_fp = build_index_sortmerna(ref_fp=ref_fps,
                                      working_dir=self.working_dir)
    self.assertEqual(len(ref_fps), len(ref_db_fp))