Exemplo n.º 1
0
    def test_get_clusters_from_fasta_filepath_suppress_sort(self):
        """Test OTUs from filepath functions with suppress sort.

        Clusters the unsorted fasta file with ``suppress_sort=True`` and
        compares the returned (clusters, failures, new seeds) triple with
        the ``expected_*`` lists, which are defined outside this method.
        """
        # NOTE: a local ``expected`` list used to be built here but was never
        # read by the assertion, so it has been dropped as dead code.
        clusters_res = get_clusters_from_fasta_filepath(
            self.tmp_unsorted_fasta_filepath,
            original_fasta_path=None,
            percent_ID=0.90,
            suppress_sort=True,
            save_uc_files=False,
        )
        # Sort both sides in place so the comparison does not depend on the
        # order of the top-level entries.
        expected_cluster_list.sort()
        expected_failure_list.sort()
        expected_new_seed_list.sort()
        clusters_res[0].sort()
        clusters_res[1].sort()
        clusters_res[2].sort()

        self.assertEqual(clusters_res, (expected_cluster_list, expected_failure_list, expected_new_seed_list))
Exemplo n.º 2
0
    def test_get_clusters_from_fasta_filepath_rev_strand_match(self):
        """Check clustering of a sequence and its reverse complement.

        The same input file is clustered twice, first with reverse strand
        matching disabled and then enabled, and each result is compared
        with the expected (clusters, failures, new seeds) triple.
        """
        # With enable_rev_strand_matching=False the seq and its rc are
        # expected to end up in separate clusters.
        wanted = (
            [["uclust_test_seqs_0"], ["uclust_test_seqs_0_rc"]],
            [],
            ["uclust_test_seqs_0", "uclust_test_seqs_0_rc"],
        )
        observed = get_clusters_from_fasta_filepath(
            self.tmp_raw_dna_seqs_rc_filepath,
            original_fasta_path=None,
            save_uc_files=False,
            percent_ID=0.90,
            enable_rev_strand_matching=False,
        )
        # In-place sorts on both sides make the comparison order-insensitive.
        for reference in wanted:
            reference.sort()
        for position in range(3):
            observed[position].sort()
        self.assertEqual(observed, wanted)

        # With enable_rev_strand_matching=True the seq and its rc are
        # expected to share a single cluster.
        wanted = (
            [["uclust_test_seqs_0", "uclust_test_seqs_0_rc"]],
            [],
            ["uclust_test_seqs_0"],
        )
        observed = get_clusters_from_fasta_filepath(
            self.tmp_raw_dna_seqs_rc_filepath,
            original_fasta_path=None,
            save_uc_files=False,
            percent_ID=0.90,
            enable_rev_strand_matching=True,
        )
        for reference in wanted:
            reference.sort()
        for position in range(3):
            observed[position].sort()
        self.assertEqual(observed, wanted)
Exemplo n.º 3
0
 def test_get_clusters_from_fasta_filepath_rev_strand_match(self):
     """ Exercise the filepath clustering with and without rev strand match
     """
     # Each scenario pairs the enable_rev_strand_matching flag with the
     # clusters, failures and new seeds expected for it:
     #  * False -> seq and its rc stay in separate clusters
     #  * True  -> seq and its rc fall into the same cluster
     scenarios = (
         (False,
          [['uclust_test_seqs_0'], ['uclust_test_seqs_0_rc']],
          [],
          ['uclust_test_seqs_0', 'uclust_test_seqs_0_rc']),
         (True,
          [['uclust_test_seqs_0', 'uclust_test_seqs_0_rc']],
          [],
          ['uclust_test_seqs_0']),
     )

     for rev_match, want_clusters, want_failures, want_seeds in scenarios:
         found = get_clusters_from_fasta_filepath(
             self.tmp_raw_dna_seqs_rc_filepath,
             original_fasta_path=None,
             save_uc_files=False,
             percent_ID=0.90,
             enable_rev_strand_matching=rev_match)

         # sort expectation and observation in place before comparing
         want_clusters.sort()
         want_failures.sort()
         want_seeds.sort()
         found[0].sort()
         found[1].sort()
         found[2].sort()
         self.assertEqual(found,
                          (want_clusters, want_failures, want_seeds))
Exemplo n.º 4
0
    def test_get_clusters_from_fasta_filepath(self):
        """Check the OTU lists returned for a given fasta filepath."""
        observed = get_clusters_from_fasta_filepath(
            self.tmp_unsorted_fasta_filepath,
            original_fasta_path=None,
            percent_ID=0.90,
            save_uc_files=False,
        )
        # The expected_* lists come from outside this method; they are sorted
        # in place, exactly like the observed lists, before the comparison.
        wanted = (expected_cluster_list, expected_failure_list, expected_new_seed_list)
        for reference in wanted:
            reference.sort()
        for position in range(3):
            observed[position].sort()
        self.assertEqual(observed, wanted)
Exemplo n.º 5
0
 def test_get_clusters_from_fasta_filepath(self):
     """ Lists of OTUs are returned as expected for a fasta filepath """
     found = get_clusters_from_fasta_filepath(
         self.tmp_unsorted_fasta_filepath,
         original_fasta_path=None,
         percent_ID=0.90,
         save_uc_files=False)

     # expected_* lists are defined outside this method; sort them and the
     # three observed lists in place so ordering cannot cause a mismatch
     for reference in (expected_cluster_list,
                       expected_failure_list,
                       expected_new_seed_list):
         reference.sort()
     found_clusters, found_failures, found_seeds = \
         found[0], found[1], found[2]
     found_clusters.sort()
     found_failures.sort()
     found_seeds.sort()

     wanted = (expected_cluster_list,
               expected_failure_list,
               expected_new_seed_list)
     self.assertEqual(found, wanted)
Exemplo n.º 6
0
 def test_get_clusters_from_fasta_filepath_optimal(self):
     """ Run the filepath clustering with the optimal flag switched on
     """
     # TODO: build a small input on which optimal actually has an effect.
     # For now this only checks that turning optimal on does not cause a
     # failure and still yields the usual expected lists.
     observed = get_clusters_from_fasta_filepath(
         self.tmp_unsorted_fasta_filepath,
         original_fasta_path=None,
         save_uc_files=False,
         percent_ID=0.90,
         optimal=True)

     # expected_* lists live outside this method; sort everything in place
     wanted = (expected_cluster_list,
               expected_failure_list,
               expected_new_seed_list)
     for reference in wanted:
         reference.sort()
     for position in range(3):
         observed[position].sort()

     self.assertEqual(observed, wanted)
Exemplo n.º 7
0
 def test_get_clusters_from_fasta_filepath_suppress_sort(self):
     """ Test OTUs from filepath functions with suppress sort

     Clusters the unsorted fasta file with suppress_sort = True and
     compares the returned (clusters, failures, new seeds) triple with
     the expected_* lists, which are defined outside this method.
     """
     # NOTE: a local ``expected`` list used to be built here but was never
     # read by the assertion, so it has been dropped as dead code.
     clusters_res = get_clusters_from_fasta_filepath(
         self.tmp_unsorted_fasta_filepath,
         original_fasta_path=None,
         percent_ID=0.90,
         suppress_sort=True,
         save_uc_files=False)

     # sort both sides in place so the comparison does not depend on the
     # order of the top-level entries
     expected_cluster_list.sort()
     expected_failure_list.sort()
     expected_new_seed_list.sort()
     clusters_res[0].sort()
     clusters_res[1].sort()
     clusters_res[2].sort()

     self.assertEqual(clusters_res, (expected_cluster_list,
                                     expected_failure_list,
                                     expected_new_seed_list))
Exemplo n.º 8
0
 def test_get_clusters_from_fasta_filepath_extending_reference_db(self):
     """ Clustering against a db while also adding new clusters is correct
     """
     observed = get_clusters_from_fasta_filepath(
         self.tmp_unsorted_fasta_filepath,
         original_fasta_path=None,
         max_accepts=7,
         max_rejects=12,
         percent_ID=0.90,
         subject_fasta_filepath=self.ref_dna_seqs_fp,
         suppress_new_clusters=False,
         enable_rev_strand_matching=True,
         HALT_EXEC=False,
         save_uc_files=False)

     # the reference fixtures on self are sorted in place, as are the three
     # observed lists, so the comparison ignores top-level ordering
     wanted = (self.ref_test_clusters2,
               self.ref_test_failures2,
               self.ref_test_new_seeds2)
     for reference in wanted:
         reference.sort()
     for position in range(3):
         observed[position].sort()

     self.assertEqual(observed, wanted)
Exemplo n.º 9
0
 def test_get_clusters_from_fasta_filepath_reference_db_only(self):
     """ Clustering against a database only gives the correct clusters
     """
     found = get_clusters_from_fasta_filepath(
         self.tmp_unsorted_fasta_filepath,
         original_fasta_path=None,
         save_uc_files=False,
         max_accepts=7,
         max_rejects=12,
         percent_ID=0.90,
         subject_fasta_filepath=self.ref_dna_seqs_fp,
         suppress_new_clusters=True,
         HALT_EXEC=False)

     # sort the reference fixtures held on self, in place
     for reference in (self.ref_test_clusters1,
                       self.ref_test_failures1,
                       self.ref_test_new_seeds1):
         reference.sort()

     # sort the three observed lists, in place
     found_clusters, found_failures, found_seeds = \
         found[0], found[1], found[2]
     found_clusters.sort()
     found_failures.sort()
     found_seeds.sort()

     self.assertEqual(found, (self.ref_test_clusters1,
                              self.ref_test_failures1,
                              self.ref_test_new_seeds1))
Exemplo n.º 10
0
    def __call__(self,
                 seq_fp,
                 refseqs_fp,
                 next_new_cluster_number=None,
                 new_cluster_identifier=None,
                 result_path=None,
                 log_path=None,
                 failure_path=None,
                 HALT_EXEC=False):
        """Cluster the sequences in seq_fp against the refseqs_fp sequences.

        Parameters:
        seq_fp: path to the file of sequences to cluster.
        refseqs_fp: path handed to the clustering call as its
         subject_fasta_filepath.
        next_new_cluster_number: when not None, stored in
         self.Params['next_new_cluster_number'] before clustering.
        new_cluster_identifier: when truthy, stored in
         self.Params['new_cluster_identifier'] before clustering.
        result_path: forwarded to self._prepare_results.
        log_path: when given, the log lines are written to this path.
        failure_path: when given, the failures are written to this path.
        HALT_EXEC: forwarded unchanged to the clustering call.

        Returns the value of self._prepare_results (note this is None if
        the data was written to file).
        """
        original_fasta_path = seq_fp

        if new_cluster_identifier:
            self.Params['new_cluster_identifier'] = new_cluster_identifier
        if next_new_cluster_number is not None:
            self.Params['next_new_cluster_number'] = next_new_cluster_number

        # files created here that must be cleaned up after the run
        files_to_remove = []
        if self.Params['presort_by_abundance']:
            # seq path becomes the temporary sorted sequences filepath
            seq_fp = self._presort_by_abundance(seq_fp)
            files_to_remove.append(seq_fp)

        try:
            # perform the clustering
            cluster_map, failures, new_seeds = \
                get_clusters_from_fasta_filepath(
                    seq_fp,
                    original_fasta_path,
                    subject_fasta_filepath=refseqs_fp,
                    percent_ID=self.Params['Similarity'],
                    enable_rev_strand_matching=self.Params[
                        'enable_rev_strand_matching'],
                    max_accepts=self.Params['max_accepts'],
                    max_rejects=self.Params['max_rejects'],
                    suppress_new_clusters=self.Params[
                        'suppress_new_clusters'],
                    optimal=self.Params['optimal'],
                    exact=self.Params['exact'],
                    suppress_sort=self.Params['suppress_sort'],
                    return_cluster_maps=True,
                    stable_sort=self.Params['stable_sort'],
                    save_uc_files=self.Params['save_uc_files'],
                    output_dir=self.Params['output_dir'],
                    HALT_EXEC=HALT_EXEC)

            self._rename_clusters(cluster_map, new_seeds)
        finally:
            # clean up any temp files that were created, even when the
            # clustering or renaming step raised
            remove_files(files_to_remove)

        log_lines = [
            'Reference seqs:%s' % refseqs_fp,
            'Num OTUs:%d' % len(cluster_map),
            'Num new OTUs:%d' % len(new_seeds),
            'Num failures:%d' % len(failures),
        ]

        cluster_map = cluster_map.items()
        result = self._prepare_results(result_path, cluster_map, log_lines)

        if log_path:
            self._write_log(log_path, log_lines)

        if failure_path:
            self._write_failures(failure_path, failures)

        # return the result (note this is None if the data was
        # written to file)
        return result
Exemplo n.º 11
0
    def __call__(self,
                 seq_path,
                 result_path=None,
                 log_path=None,
                 HALT_EXEC=False):
        """Returns dict mapping {otu_id:[seq_ids]} for each otu.

        Parameters:
        seq_path: path to file of sequences
        result_path: path to file of results. If specified,
        dumps the result to the desired path instead of returning it.
        log_path: path to log, which includes dump of params.
        HALT_EXEC: forwarded unchanged to the clustering call.

        OTU ids are the positions of the clusters in the clustering
        output; when self.Params['new_cluster_identifier'] is not None,
        that value is prepended to each position.
        """

        original_fasta_path = seq_path

        # files created here that must be cleaned up after the run
        files_to_remove = []
        if self.Params['presort_by_abundance']:
            # seq path becomes the temporary sorted sequences filepath
            seq_path = self._presort_by_abundance(seq_path)
            files_to_remove.append(seq_path)

        try:
            # perform the clustering
            clusters, failures, seeds = get_clusters_from_fasta_filepath(
                seq_path,
                original_fasta_path,
                percent_ID=self.Params['Similarity'],
                optimal=self.Params['optimal'],
                exact=self.Params['exact'],
                suppress_sort=self.Params['suppress_sort'],
                enable_rev_strand_matching=self.Params[
                    'enable_rev_strand_matching'],
                max_accepts=self.Params['max_accepts'],
                max_rejects=self.Params['max_rejects'],
                stable_sort=self.Params['stable_sort'],
                save_uc_files=self.Params['save_uc_files'],
                output_dir=self.Params['output_dir'],
                HALT_EXEC=HALT_EXEC)
        finally:
            # clean up any temp files that were created, even when the
            # clustering step raised
            remove_files(files_to_remove)

        log_lines = ['Num OTUs:%d' % len(clusters)]

        otu_id_prefix = self.Params['new_cluster_identifier']
        if otu_id_prefix is None:
            clusters = enumerate(clusters)
        else:
            clusters = [('%s%d' % (otu_id_prefix, i), c)
                        for i, c in enumerate(clusters)]
        result = self._prepare_results(result_path, clusters, log_lines)

        if log_path:
            self._write_log(log_path, log_lines)

        # return the result (note this is None if the data was
        # written to file)
        return result