示例#1
0
 def test_build_index_sortmerna_fail(self):
     """build_index_sortmerna() must raise RuntimeError for ref_fp='foo'.

     NOTE(review): 'foo' is presumably meant as a path to a reference
     file that does not exist -- confirm against build_index_sortmerna().
     """
     self.assertRaises(RuntimeError,
                       build_index_sortmerna,
                       ref_fp='foo',
                       working_dir=self.working_dir)
示例#2
0
    def test_launch_workflow_incorrect_trim(self):
        """
        launch_workflow() is expected to emit a UserWarning when the
        requested trim length is too long for the input reads
        """
        # index the 70% rep. set database
        reference = join(self.test_data_dir, '70_otus.fasta')
        reference_index = build_index_sortmerna(
            ref_fp=(reference, ), working_dir=self.working_dir)

        # all workflow arguments gathered in one place
        workflow_kwargs = dict(
            seqs_fp=self.seqs_s1_fp,
            working_dir=self.working_dir,
            mean_error=0.005,
            error_dist=get_default_error_profile(),
            indel_prob=0.01,
            indel_max=3,
            # trim length longer than sequences
            trim_length=151,
            left_trim_length=0,
            min_size=2,
            ref_fp=(reference, ),
            ref_db_fp=reference_index,
            threads_per_sample=1)

        with self.assertWarns(UserWarning):
            launch_workflow(**workflow_kwargs)
示例#3
0
    def test_launch_workflow_skip_trim(self):
        """
        run launch_workflow() with trim_length=-1 and compare the fasta
        file it returns to the pre-computed `no_trim_res` fasta file
        """
        # index the 70% rep. set database
        reference = join(self.test_data_dir, '70_otus.fasta')
        reference_index = build_index_sortmerna(
            ref_fp=(reference, ), working_dir=self.working_dir)

        # all workflow arguments gathered in one place
        workflow_kwargs = dict(
            seqs_fp=self.seqs_s1_fp,
            working_dir=self.working_dir,
            mean_error=0.005,
            error_dist=get_default_error_profile(),
            indel_prob=0.01,
            indel_max=3,
            # NOTE(review): -1 presumably tells the workflow to skip the
            # trimming step (going by the test name) -- confirm in
            # launch_workflow()
            trim_length=-1,
            left_trim_length=0,
            min_size=2,
            ref_fp=(reference, ),
            ref_db_fp=reference_index,
            threads_per_sample=1)
        result_fp = launch_workflow(**workflow_kwargs)

        expected = Sequence.read(self.no_trim_res, format='fasta')
        observed = Sequence.read(result_fp, format='fasta')
        self.assertEqual(expected, observed)
示例#4
0
    def test_remove_artifacts_from_biom_table(self):
        """ Exercise remove_artifacts_from_biom_table() on a pre-calculated
        biom table and its matching fasta file, then inspect the two
        tables it writes: 'final.only-16s.biom' and
        'final.only-non16s.biom'.
        The s4 dataset is similar to s2 but with two added non-16s
        sequences (which are not phix/adapter), so the 16s table must
        hold exactly the (trimmed, upper-cased) s2 sequences and the
        non-16s table must hold two observations.
        """
        # create the positive reference databases
        positive_ref = join(self.test_data_dir, '70_otus.fasta')
        positive_ref_index = build_index_sortmerna(
            ref_fp=(positive_ref, ), working_dir=self.working_dir)

        # remove the artifacts from the s4 biom table
        s4_biom = join(self.test_data_dir, 'final.s4.biom')
        s4_fasta = join(self.test_data_dir, 'final.s4.seqs.fa')
        remove_artifacts_from_biom_table(s4_biom, s4_fasta,
                                         [positive_ref], self.working_dir,
                                         positive_ref_index)

        # expected 16s sequences: the s2 originals, trimmed and upper-cased
        s2_fasta = join(self.test_data_dir, 'simset.s2.fasta')
        trim_length = 150
        expected_seqs = {record[1][:trim_length].upper()
                         for record in sequence_generator(s2_fasta)}

        only_16s = load_table(join(self.working_dir, 'final.only-16s.biom'))
        self.assertEqual(set(only_16s.ids(axis='observation')),
                         expected_seqs)

        only_non16s = load_table(
            join(self.working_dir, 'final.only-non16s.biom'))
        self.assertEqual(len(only_non16s.ids(axis='observation')), 2)
示例#5
0
 def test_remove_artifacts_seqs(self):
     """ Check remove_artifacts_seqs() with negate=False: reads that
         do not match the reference database (using SortMeRNA) are
         removed. The reference index is built inside this test.
         seq1..seq6 are expected in the output, in that order, and
         exactly one temporary file must be reported.
     """
     reads = [
         ("seq1", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCC"),
         ("seq2", "CCTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
         ("seq3", "TCGCTATTATTGAGCCTAAAACGTCCGTAGTCGGCTTTGTAAATCCC"),
         ("seq4", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCC"),
         ("seq5", "CTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAATAGGGTC"),
         ("seq6", "TTGAGCCTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAAT"),
         ("phix1", "TCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGCC"),
         ("phix2", "CTGGCGCTCGCCCTGGTCGTCCGCAGCCGTTGCGAGGTACTAAAGGC"),
         ("phix3", "GCGCATAAATTTGAGCAGATTTGTCGTCACAGGTTGCGCCGCCAAAA")]
     expected_labels = ["seq1", "seq2", "seq3", "seq4", "seq5", "seq6"]
     reads_fp = join(self.working_dir, "seqs.fasta")
     with open(reads_fp, 'w') as handle:
         handle.writelines(">%s\n%s\n" % read for read in reads)
     reference_records = [
         ("ref1", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTA"
          "GTCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
         ("ref2", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
          "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
         ("ref3", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
          "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
         ("ref4", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
          "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
         ("ref5", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
          "TCGGCTTTGTAAATCCCTGGGTAAATAGGGT"),
         ("ref6", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
          "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT")]
     reference_fp = join(self.working_dir, "ref2.fasta")
     with open(reference_fp, 'w') as handle:
         handle.writelines(
             ">%s\n%s\n" % record for record in reference_records)
     # register the reference file on the test's removal list
     self.files_to_remove.append(reference_fp)
     reference_index = build_index_sortmerna(
         ref_fp=(reference_fp, ), working_dir=self.working_dir)
     filtered_fp, _, tmp_files = remove_artifacts_seqs(
         seqs_fp=reads_fp,
         ref_fp=(reference_fp, ),
         working_dir=self.working_dir,
         ref_db_fp=reference_index,
         negate=False,
         threads=1)
     observed_labels = [label
                        for label, _ in sequence_generator(filtered_fp)]
     self.assertEqual(observed_labels, expected_labels)
     # validate it creates one tmp file
     self.assertEqual(len(tmp_files), 1)
示例#6
0
 def test_remove_artifacts_seqs_negate(self):
     """ Test remove_artifacts_seqs() function with negate=True for
         removing sequences matching to a reference database
         using SortMeRNA. Only seq5 and the three phix reads are
         expected to remain in the output, in that order.
     """
     seqs = [("seq1", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCC"),
             ("seq2", "CCTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
             ("seq3", "TCGCTATTATTGAGCCTAAAACGTCCGTAGTCGGCTTTGTAAATCCC"),
             ("seq4", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCC"),
             ("seq5", "CTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAATAGGGTC"),
             ("seq6", "TTGAGCCTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAAT"),
             ("phix1", "TCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGCC"),
             ("phix2", "CTGGCGCTCGCCCTGGTCGTCCGCAGCCGTTGCGAGGTACTAAAGGC"),
             ("phix3", "GCGCATAAATTTGAGCAGATTTGTCGTCACAGGTTGCGCCGCCAAAA")]
     # seq5 is 80% similar, so should be kept for 0.95 default similarity
     # to artifacts
     exp_seqs = ["seq5", "phix1", "phix2", "phix3"]
     seqs_fp = join(self.working_dir, "seqs.fasta")
     with open(seqs_fp, 'w') as seqs_f:
         for seq in seqs:
             seqs_f.write(">%s\n%s\n" % seq)
     ref = [("ref1", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTA"
             "GTCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
            ("ref2", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
             "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
            ("ref3", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
             "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
            ("ref4", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
             "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
            ("ref5", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
             "TCGGCTTTGTAAATCCCTGGGTAAATAGGGT"),
            ("ref6", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
             "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT")]
     ref_fp = join(self.working_dir, "ref4.fasta")
     with open(ref_fp, 'w') as ref_f:
         for seq in ref:
             ref_f.write(">%s\n%s\n" % seq)
     # register the reference file on the test's removal list
     self.files_to_remove.append(ref_fp)
     ref_db_fp = build_index_sortmerna([ref_fp], self.working_dir)
     # the output path is whatever remove_artifacts_seqs() returns; the
     # second and third returned values are not needed by this test
     output_fp, _, _ = remove_artifacts_seqs(
         seqs_fp=seqs_fp,
         ref_fp=(ref_fp, ),
         working_dir=self.working_dir,
         ref_db_fp=ref_db_fp,
         negate=True,
         threads=1)
     obs_seqs = [label for label, _ in sequence_generator(output_fp)]
     self.assertEqual(obs_seqs, exp_seqs)
示例#7
0
 def test_build_index_sortmerna(self):
     """Index two reference fasta files with build_index_sortmerna()
     and check the base names of the files it reports for removal.
     """
     first_records = [
         ("ref1", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTA"
          "GTCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
         ("ref2", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
          "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
         ("ref3", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
          "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
         ("ref4", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
          "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
         ("ref5", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
          "TCGGCTTTGTAAATCCCTGGGTAAATAGGGT"),
         ("ref6", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
          "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT")]
     second_records = [
         ("ref1", "GTCGTAGCTAGCTGCCCACGATCGTAGCTAGCTAGCTACGTAGCTCATCAC"
          "TCGCCGACCCACGTCCCACTGATGCTGTGGG"),
         ("ref2", "GCGGCGCCCAAAAATGTCGTGTAAAATTTTCTCGTACCCACTTGCTACCCA"
          "TGGCCGCCATGCTGCTAACGCAATATATATA"),
         ("ref3", "TGTGAAAGCGCGCGAGAGAGTCGTATATATGGGCGCGGCGCGATGCTGCCC"
          "GTCGATGCTGATCCCCCACGTACGTAGCCCC"),
         ("ref4", "GTGTGCTCGCGTAGCTAGCTTATATATCGGCGCGTAGTGCTAGCCCCAAAA"
          "GTGTCCCCCCCCTCCTTTTTTATATATGCAA"),
         ("ref5", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
          "TCGGCTTTGTAAATCCCTGGGTAAATAGGGT"),
         ("ref6", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
          "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT")]
     first_fp = join(self.working_dir, "ref1.fasta")
     second_fp = join(self.working_dir, "ref2.fasta")
     # write both reference sets to disk as fasta, first file first
     for fasta_fp, records in ((first_fp, first_records),
                               (second_fp, second_records)):
         with open(fasta_fp, 'w') as handle:
             handle.writelines(
                 ">%s\n%s\n" % record for record in records)
     reference_fps = (first_fp, second_fp)
     ref_db_fp, removable = build_index_sortmerna(
         ref_fp=reference_fps, working_dir=self.working_dir)
     expected_names = [
         'ref1.stats', 'ref1.pos_0.dat', 'ref1.kmer_0.dat',
         'ref1.bursttrie_0.dat', 'ref2.stats', 'ref2.pos_0.dat',
         'ref2.kmer_0.dat', 'ref2.bursttrie_0.dat'
     ]
     observed_names = list(map(basename, removable))
     self.assertListEqual(expected_names, observed_names)
示例#8
0
    def test_launch_workflow(self):
        """Run the complete workflow on 3 simulated sequence files.
        seqs1 - 100 reads using art, original sequences are >0.5 identical.
        seqs2 - 200 reads using grinder, original sequences are >0.9 identical,
        0.1 chimeras, 35 phix reads
        seqs3 - simple - 15 reads from seqs1 (10 reads for 1001203,
        5 reads for 694276) for manual test validation
        """
        # index the 70% rep. set database
        ref_fp = join(self.test_data_dir, '70_otus.fasta')
        # second returned value (files to remove) is not used by this test
        ref_db_fp, _ = build_index_sortmerna(
            ref_fp=(ref_fp, ), working_dir=self.working_dir)

        # (simulated reads, original sequences) for each dataset
        sample_pairs = (
            (self.seqs_s1_fp, self.orig_s1_fp),
            (self.seqs_s2_fp, self.orig_s2_fp),
            (self.seqs_s3_fp, self.orig_s3_fp),
        )
        for simulated_fp, original_fp in sample_pairs:
            self.run_workflow_try(simulated_fp, original_fp, ref_fp,
                                  ref_db_fp)
示例#9
0
    def test_parallel_deblur(self):
        """Run parallel_deblur() on 3 simulated sequence files and compare
        each per-sample result with the original sequences.
        seqs1 - 100 reads using art, original sequences are >0.5 identical.
        seqs2 - 200 reads using grinder, original sequences are >0.9 identical,
        0.1 chimeras, 35 phix reads
        seqs3 - simple - 15 reads from seqs1 (10 reads for 1001203,
        5 reads for 694276) for manual test validation
        """
        # index the 70% rep. set database
        reference = join(self.test_data_dir, '70_otus.fasta')
        reference_index = build_index_sortmerna(
            ref_fp=(reference, ), working_dir=self.working_dir)

        trim_length = 100
        error_dist = ('1,0.06,0.02,0.02,0.01,0.005,0.005,0.005,0.001'
                      ',0.001,0.001,0.0005')
        cli_params = [
            'deblur', 'workflow',
            '--seqs-fp', 'ignorethis',
            '--output-dir', self.working_dir,
            '--pos-ref-fp', reference,
            '-d', error_dist,
            '-t', str(trim_length),
        ]
        input_fps = [self.seqs_s1_fp, self.seqs_s2_fp, self.seqs_s3_fp]
        parallel_deblur(input_fps,
                        cli_params,
                        reference_index,
                        None,
                        jobs_to_start=2)

        results_dir = join(self.working_dir, "deblur_working_dir")

        # (result file name, original sequences) for each dataset
        expectations = (
            ('seqs_s1.fasta.trim.derep.no_artifacts'
             '.msa.deblur.no_chimeras', self.orig_s1_fp),
            ('seqs_s2.fasta.trim.derep.no_artifacts'
             '.msa.deblur.no_chimeras', self.orig_s2_fp),
            ('seqs_s3.fasta.trim.derep.no_artifacts'
             '.msa.deblur.no_chimeras', self.orig_s3_fp),
        )
        for result_name, original_fp in expectations:
            self.compare_result(join(results_dir, result_name),
                                original_fp,
                                trim_length=trim_length)
示例#10
0
 def test_build_index_sortmerna(self):
     """Index two reference fasta files with build_index_sortmerna()
     and check that its result has one entry per reference file.
     """
     first_records = [
         ("ref1", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTA"
          "GTCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
         ("ref2", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
          "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
         ("ref3", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
          "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
         ("ref4", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
          "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
         ("ref5", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
          "TCGGCTTTGTAAATCCCTGGGTAAATAGGGT"),
         ("ref6", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
          "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT")]
     second_records = [
         ("ref1", "GTCGTAGCTAGCTGCCCACGATCGTAGCTAGCTAGCTACGTAGCTCATCAC"
          "TCGCCGACCCACGTCCCACTGATGCTGTGGG"),
         ("ref2", "GCGGCGCCCAAAAATGTCGTGTAAAATTTTCTCGTACCCACTTGCTACCCA"
          "TGGCCGCCATGCTGCTAACGCAATATATATA"),
         ("ref3", "TGTGAAAGCGCGCGAGAGAGTCGTATATATGGGCGCGGCGCGATGCTGCCC"
          "GTCGATGCTGATCCCCCACGTACGTAGCCCC"),
         ("ref4", "GTGTGCTCGCGTAGCTAGCTTATATATCGGCGCGTAGTGCTAGCCCCAAAA"
          "GTGTCCCCCCCCTCCTTTTTTATATATGCAA"),
         ("ref5", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
          "TCGGCTTTGTAAATCCCTGGGTAAATAGGGT"),
         ("ref6", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
          "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT")]
     first_fp = join(self.working_dir, "ref1.fasta")
     second_fp = join(self.working_dir, "ref2.fasta")
     # write both reference sets to disk as fasta, first file first
     for fasta_fp, records in ((first_fp, first_records),
                               (second_fp, second_records)):
         with open(fasta_fp, 'w') as handle:
             handle.writelines(
                 ">%s\n%s\n" % record for record in records)
     reference_fps = (first_fp, second_fp)
     index_result = build_index_sortmerna(
         ref_fp=reference_fps,
         working_dir=self.working_dir)
     self.assertEqual(len(reference_fps), len(index_result))