예제 #1
0
    def test_bioenv_nonnumeric_columns(self):
        # Replacing the value 2400 with a string puts non-numeric data into
        # the frame, which bioenv is expected to reject with a TypeError.
        string_tainted = self.df.replace(2400, 'no cog yay')
        self.assertRaises(TypeError, bioenv, self.dm, string_tainted)

        # The same error is expected for the extra-column fixture when no
        # column subset is requested.
        self.assertRaises(TypeError, bioenv, self.dm, self.df_extra_column)
예제 #2
0
    def test_bioenv_nonnumeric_columns(self):
        # Replacing the value 2400 with a string puts non-numeric data into
        # the frame, which bioenv is expected to reject with a TypeError.
        string_tainted = self.df.replace(2400, 'no cog yay')
        self.assertRaises(TypeError, bioenv, self.dm, string_tainted)

        # The same error is expected for the extra-column fixture when no
        # column subset is requested.
        self.assertRaises(TypeError, bioenv, self.dm, self.df_extra_column)
예제 #3
0
    def test_bioenv_all_columns_implicit(self):
        # Omit `columns` so that all data frame columns are used implicitly.
        assert_frame_equal(bioenv(self.dm, self.df), self.exp_results)

        # Changing the order of rows/cols in the distance matrix must not
        # affect the outcome.
        assert_frame_equal(bioenv(self.dm_reordered, self.df),
                           self.exp_results)
예제 #4
0
    def test_bioenv_all_columns_implicit(self):
        # Omit `columns` so that all data frame columns are used implicitly.
        assert_frame_equal(bioenv(self.dm, self.df), self.exp_results)

        # Changing the order of rows/cols in the distance matrix must not
        # affect the outcome.
        assert_frame_equal(bioenv(self.dm_reordered, self.df),
                           self.exp_results)
예제 #5
0
    def test_bioenv_all_columns_explicit(self):
        # Name every column explicitly.
        assert_frame_equal(bioenv(self.dm, self.df, columns=self.cols),
                           self.exp_results)

        # This data frame carries an additional non-numeric column and has
        # some rows and columns reordered; because the same columns are
        # requested in the same order, the results must be unchanged.
        assert_frame_equal(
            bioenv(self.dm, self.df_extra_column, columns=self.cols),
            self.exp_results)
예제 #6
0
    def test_bioenv_all_columns_explicit(self):
        # Name every column explicitly.
        assert_frame_equal(bioenv(self.dm, self.df, columns=self.cols),
                           self.exp_results)

        # This data frame carries an additional non-numeric column and has
        # some rows and columns reordered; because the same columns are
        # requested in the same order, the results must be unchanged.
        assert_frame_equal(
            bioenv(self.dm, self.df_extra_column, columns=self.cols),
            self.exp_results)
예제 #7
0
    def test_bioenv_no_side_effects(self):
        # Snapshot both primary inputs before the call (the data frame copy
        # is explicitly deep).
        dm_before = self.dm.copy()
        df_before = self.df.copy(deep=True)

        bioenv(self.dm, self.df)

        # Neither input may have been altered by the call (e.g., through
        # scaling or type conversions).
        self.assertEqual(self.dm, dm_before)
        assert_frame_equal(self.df, df_before)
예제 #8
0
    def test_bioenv_no_side_effects(self):
        # Snapshot both primary inputs before the call (the data frame copy
        # is explicitly deep).
        dm_before = self.dm.copy()
        df_before = self.df.copy(deep=True)

        bioenv(self.dm, self.df)

        # Neither input may have been altered by the call (e.g., through
        # scaling or type conversions).
        self.assertEqual(self.dm, dm_before)
        assert_frame_equal(self.df, df_before)
예제 #9
0
 def test_bioenv_vegan_example(self):
     # Note on the expected values: the correlation coefficient in the first
     # row of the results (rho=0.2516) does not match the one computed by
     # vegan (rho=0.2513). The apparent cause is numerical precision in the
     # Euclidean distance calculation, which changes how Spearman ranks
     # ties: vegan treats some distances as tied where our implementation
     # sees distinct values, so the ranked distances differ slightly and so
     # does rho. Using Pearson correlation instead of Spearman on the same
     # distances was verified to give *very* similar results, which points
     # to rank/tie computation as the source of the discrepancy.
     assert_frame_equal(bioenv(self.dm_vegan, self.df_vegan),
                        self.exp_results_vegan)
예제 #10
0
 def test_bioenv_vegan_example(self):
     # Note on the expected values: the correlation coefficient in the first
     # row of the results (rho=0.2516) does not match the one computed by
     # vegan (rho=0.2513). The apparent cause is numerical precision in the
     # Euclidean distance calculation, which changes how Spearman ranks
     # ties: vegan treats some distances as tied where our implementation
     # sees distinct values, so the ranked distances differ slightly and so
     # does rho. Using Pearson correlation instead of Spearman on the same
     # distances was verified to give *very* similar results, which points
     # to rank/tie computation as the source of the discrepancy.
     assert_frame_equal(bioenv(self.dm_vegan, self.df_vegan),
                        self.exp_results_vegan)
예제 #11
0
 def test_bioenv_no_distance_matrix(self):
     # A plain string in place of the distance matrix must raise TypeError.
     self.assertRaises(TypeError, bioenv, 'breh', self.df)
예제 #12
0
 def test_bioenv_no_data_frame(self):
     # None in place of the data frame must raise TypeError.
     self.assertRaises(TypeError, bioenv, self.dm, None)
예제 #13
0
 def test_bioenv_different_column_order(self):
     # Requesting the columns in reverse reorders the column subsets, so the
     # row labels of the results data frame change; the computed values
     # themselves (e.g., correlation coefficients) should stay the same.
     reversed_cols = self.cols[::-1]
     assert_frame_equal(bioenv(self.dm, self.df, columns=reversed_cols),
                        self.exp_results_different_column_order)
예제 #14
0
 def test_bioenv_missing_distance_matrix_ids(self):
     # Skip the first entry of the data frame; bioenv is expected to raise
     # ValueError when handed this shortened frame.
     shortened = self.df[1:]
     self.assertRaises(ValueError, bioenv, self.dm, shortened)
예제 #15
0
 def test_bioenv_no_columns(self):
     # An empty column list must raise ValueError.
     self.assertRaises(ValueError, bioenv, self.dm, self.df, columns=[])
예제 #16
0
 def test_bioenv_nans(self):
     # Substitute NaN for the value 53.9; bioenv is expected to raise
     # ValueError for the resulting frame.
     with_nan = self.df.replace(53.9, np.nan)
     self.assertRaises(ValueError, bioenv, self.dm, with_nan)
예제 #17
0
 def test_bioenv_missing_distance_matrix_ids(self):
     # Skip the first entry of the data frame; bioenv is expected to raise
     # ValueError when handed this shortened frame.
     shortened = self.df[1:]
     self.assertRaises(ValueError, bioenv, self.dm, shortened)
예제 #18
0
 def test_bioenv_missing_columns(self):
     # Request one extra column name ('brofist') on top of self.cols and
     # expect ValueError.
     with self.assertRaises(ValueError):
         requested = self.cols + ['brofist']
         bioenv(self.dm, self.df, columns=requested)
예제 #19
0
 def test_bioenv_no_columns(self):
     # An empty column list must raise ValueError.
     self.assertRaises(ValueError, bioenv, self.dm, self.df, columns=[])
예제 #20
0
 def test_bioenv_duplicate_columns(self):
     # Append 'PH' to self.cols (presumably already listed there, making it
     # a duplicate) and expect ValueError.
     with self.assertRaises(ValueError):
         requested = self.cols + ['PH']
         bioenv(self.dm, self.df, columns=requested)
예제 #21
0
 def test_bioenv_no_data_frame(self):
     # None in place of the data frame must raise TypeError.
     self.assertRaises(TypeError, bioenv, self.dm, None)
예제 #22
0
 def test_bioenv_duplicate_columns(self):
     # Append 'PH' to self.cols (presumably already listed there, making it
     # a duplicate) and expect ValueError.
     with self.assertRaises(ValueError):
         requested = self.cols + ['PH']
         bioenv(self.dm, self.df, columns=requested)
예제 #23
0
 def test_bioenv_single_column(self):
     # Restrict the analysis to the single column 'PH'.
     assert_frame_equal(bioenv(self.dm, self.df, columns=['PH']),
                        self.exp_results_single_column)
예제 #24
0
 def test_bioenv_missing_columns(self):
     # Request one extra column name ('brofist') on top of self.cols and
     # expect ValueError.
     with self.assertRaises(ValueError):
         requested = self.cols + ['brofist']
         bioenv(self.dm, self.df, columns=requested)
예제 #25
0
 def test_bioenv_single_column(self):
     # Restrict the analysis to the single column 'PH'.
     assert_frame_equal(bioenv(self.dm, self.df, columns=['PH']),
                        self.exp_results_single_column)
예제 #26
0
 def test_bioenv_nans(self):
     # Substitute NaN for the value 53.9; bioenv is expected to raise
     # ValueError for the resulting frame.
     with_nan = self.df.replace(53.9, np.nan)
     self.assertRaises(ValueError, bioenv, self.dm, with_nan)
예제 #27
0
 def test_bioenv_different_column_order(self):
     # Requesting the columns in reverse reorders the column subsets, so the
     # row labels of the results data frame change; the computed values
     # themselves (e.g., correlation coefficients) should stay the same.
     reversed_cols = self.cols[::-1]
     assert_frame_equal(bioenv(self.dm, self.df, columns=reversed_cols),
                        self.exp_results_different_column_order)
예제 #28
0
def compare_categories(dm_fp, map_fp, method, categories, num_perms, out_dir):
    """Runs the specified statistical method using the category of interest.

    This method does not return anything; all output is written to results
    files in out_dir.

    Arguments:
        dm_fp - filepath to the input distance matrix
        map_fp - filepath to the input metadata mapping file
        method - name of the statistical method to run. 'anosim', 'permanova'
            and 'bioenv' are computed in Python and written to
            '<method>_results.txt' in out_dir; 'adonis', 'morans_i', 'mrpp',
            'permdisp' and 'dbrda' are delegated to the R script
            '<method>.r'
        categories - list of categories in the metadata mapping file to
            consider in the statistical test. Multiple categories will only be
            considered if method is 'bioenv', otherwise only the first category
            will be considered
        num_perms - the number of permutations to use when calculating the
            p-value. If method is 'bioenv' or 'morans_i', this parameter will
            be ignored as they are not permutation-based methods
        out_dir - path to the output directory where results files will be
            written. It is assumed that this directory already exists and we
            have write permissions to it

    Raises:
        TypeError - if categories is not a list, or if method is 'morans_i'
            and one of the categories is not numeric
        ValueError - if 'SampleID' is one of the categories or method is not
            recognized. For the R-based methods, also if a category is not in
            the mapping file or has a single value for all samples; for the
            R-based methods other than 'morans_i', also if a category has
            only unique values or num_perms is negative
    """
    # Make sure we were passed a list of categories, not a single string.
    if not isinstance(categories, list):
        raise TypeError("The supplied categories must be a list of "
                        "strings.")

    # Special case: we do not allow SampleID as it is not a category, neither
    # in data structure representation nor in terms of a statistical test (no
    # groups are formed since all entries are unique IDs).
    if 'SampleID' in categories:
        raise ValueError("Cannot use SampleID as a category because it is a "
                         "unique identifier for each sample, and thus does "
                         "not create groups of samples (nor can it be used as "
                         "a numeric category in Moran's I or BIO-ENV "
                         "analyses). Please choose a different metadata "
                         "column to perform statistical tests on.")

    dm = DistanceMatrix.from_file(dm_fp)

    # NOTE(review): the 'U' (universal newlines) open mode used below was
    # removed in Python 3.11; it is kept here to preserve existing behavior.
    if method in ('anosim', 'permanova', 'bioenv'):
        # These methods are computed in Python from a pandas data frame built
        # from the mapping file.
        with open(map_fp, 'U') as map_f:
            md_dict = parse_mapping_file_to_dict(map_f)[0]
        df = pd.DataFrame.from_dict(md_dict, orient='index')

        out_fp = join(out_dir, '%s_results.txt' % method)

        if method == 'bioenv':
            # BIO-ENV is the only method that uses every supplied category.
            results = bioenv(dm, df, columns=categories)
            results.to_csv(out_fp, sep='\t')
        else:
            method_cls = ANOSIM if method == 'anosim' else PERMANOVA
            method_inst = method_cls(dm, df, column=categories[0])
            results = method_inst(num_perms)

            with open(out_fp, 'w') as out_f:
                out_f.write(results.summary())
        return

    # Remove any samples from the mapping file that aren't in the distance
    # matrix (important for validation checks). Use strict=True so that an
    # error is raised if the distance matrix contains any samples that
    # aren't in the mapping file.
    with open(map_fp, 'U') as map_f:
        md_map = MetadataMap.parseMetadataMap(map_f)
    md_map.filterSamples(dm.ids, strict=True)

    if method not in ('adonis', 'morans_i', 'mrpp', 'permdisp', 'dbrda'):
        # `methods` is not defined in this function; presumably a
        # module-level collection of valid method names.
        raise ValueError("Unrecognized method '%s'. Valid methods: %r"
                         % (method, methods))

    # The remaining methods are run in R. Input validation must be done here
    # before running the R commands.

    # Check to make sure all categories passed in are in mapping file and are
    # not all the same value.
    for category in categories:
        if category not in md_map.CategoryNames:
            raise ValueError("Category '%s' not found in mapping file "
                             "columns." % category)

        if md_map.hasSingleCategoryValue(category):
            raise ValueError("All values in category '%s' are the "
                             "same. The statistical method '%s' "
                             "cannot operate on a category that "
                             "creates only a single group of samples "
                             "(e.g. there are no 'between' distances "
                             "because there is only a single group)."
                             % (category, method))

    # Build the command arguments string. Only the first category is passed
    # on to the R script.
    command_args = ['-d %s -m %s -c %s -o %s'
                    % (dm_fp, map_fp, categories[0], out_dir)]

    if method == 'morans_i':
        # Moran's I requires only numeric categories.
        for category in categories:
            if not md_map.isNumericCategory(category):
                raise TypeError("The category '%s' is not numeric. "
                                "Not all values could be converted to "
                                "numbers." % category)
    else:
        # The rest require groups of samples, so the category values cannot
        # all be unique.
        for category in categories:
            if md_map.hasUniqueCategoryValues(category):
                raise ValueError("All values in category '%s' are "
                                 "unique. This statistical method "
                                 "cannot operate on a category with "
                                 "unique values (e.g. there are no "
                                 "'within' distances because each "
                                 "group of samples contains only a "
                                 "single sample)." % category)

        # Only Moran's I doesn't accept a number of permutations.
        if num_perms < 0:
            raise ValueError("The number of permutations must be "
                             "greater than or equal to zero.")

        command_args[0] += ' -n %d' % num_perms

    rex = RExecutor(TmpDir=get_qiime_temp_dir())
    rex(command_args, '%s.r' % method, output_dir=out_dir)
예제 #29
0
 def test_bioenv_no_distance_matrix(self):
     # A plain string in place of the distance matrix must raise TypeError.
     self.assertRaises(TypeError, bioenv, 'breh', self.df)