Пример #1
0
    def test_DataCleaner_feature_na_method(self):
        """Exercise the ``feature_na_method`` options of DataCleaner.

        Three modes are checked with ``max_na_frac=0``:

        * ``"drop"``: a feature containing NaN is absent after
          ``fit_transform`` and also absent after ``transform``.
        * ``"fill"``: the NaN injected at position 40 ends up equal to the
          value at position 39.
        * ``"mean"``: the NaN injected at position 99 ends up equal to the
          mean of the column's non-null values in the fitting frame.
        """
        df = self.test_df
        lumo_col = df.columns.get_loc("LUMO_energy")
        # Assign through a single indexer on the frame. The chained form
        # ``df[col].iloc[i] = value`` may write to a temporary object and is
        # a no-op when pandas copy-on-write is enabled.
        df.iloc[40, lumo_col] = np.nan
        df.iloc[110, lumo_col] = np.nan

        # Test normal dropping with transformation
        dc = DataCleaner(max_na_frac=0, feature_na_method="drop")
        dffit = df.iloc[:100]
        fitted = dc.fit_transform(dffit, target=self.target)
        self.assertNotIn("LUMO_energy", fitted.columns)
        dftrans = df.iloc[100:]
        tranz = dc.transform(dftrans, target=self.target)
        self.assertNotIn("LUMO_energy", tranz.columns)

        # Test filling
        dc2 = DataCleaner(max_na_frac=0, feature_na_method="fill")
        fitted = dc2.fit_transform(dffit, target=self.target)
        true = fitted["LUMO_energy"].iloc[39]
        filled = fitted["LUMO_energy"].iloc[40]
        self.assertAlmostEqual(true, filled, places=10)
        self.assertTupleEqual((100, 417), fitted.shape)

        # Test mean
        dcmean = DataCleaner(max_na_frac=0, feature_na_method="mean")
        df.iloc[99, df.columns.get_loc("minimum X")] = np.nan
        # Re-slice so the fitting frame is guaranteed to contain the NaN just
        # written, instead of relying on the earlier slice sharing memory
        # with ``df``.
        dffit = df.iloc[:100]
        minimum_x = dffit["minimum X"]
        mean_min_x = minimum_x.dropna().mean()
        fitted = dcmean.fit_transform(dffit, target=self.target)
        self.assertAlmostEqual(
            fitted["minimum X"].iloc[99], mean_min_x, places=10
        )
Пример #2
0
    def test_DataCleaner_sample_na_method(self):
        """Exercise ``na_method_fit`` / ``na_method_transform`` of DataCleaner.

        With ``na_method_fit="drop"`` the fitted frame is expected to have
        one row fewer than the input (the row holding the injected NaN).
        With ``na_method_transform`` set to ``"fill"`` or ``"mean"`` the
        transformed frame is expected to keep its shape; for ``"mean"`` the
        NaN at label 110 must equal the mean of the remaining rows.
        """
        df = self.test_df
        # Single-indexer assignment; the chained ``df[col].loc[i] = value``
        # form may write to a temporary object and is a no-op when pandas
        # copy-on-write is enabled.
        df.loc[40, "HOMO_energy"] = np.nan
        df.loc[110, "HOMO_energy"] = np.nan

        # Test when transform method is fill
        dc = DataCleaner(
            max_na_frac=0.9,
            feature_na_method="drop",
            na_method_fit="drop",
            na_method_transform="fill",
        )
        # Label-based slicing: the end label (100) is included.
        dffit = df.loc[:100]
        fitted = dc.fit_transform(dffit, target=self.target)
        expected_shape = (dffit.shape[0] - 1, dffit.shape[1])
        self.assertTupleEqual(fitted.shape, expected_shape)  # minus one sample

        dftrans = df.iloc[100:]
        tranz = dc.transform(dftrans, target=self.target)
        self.assertTupleEqual(tranz.shape, dftrans.shape)

        # Test when transform method is mean
        dc2 = DataCleaner(
            max_na_frac=0.9,
            feature_na_method="drop",
            na_method_fit="drop",
            na_method_transform="mean",
        )
        fitted = dc2.fit_transform(dffit, target=self.target)
        expected_shape = (dffit.shape[0] - 1, dffit.shape[1])
        self.assertTupleEqual(fitted.shape, expected_shape)  # minus one sample

        dftrans = df.loc[100:]
        tranz = dc2.transform(dftrans, target=self.target)
        self.assertTupleEqual(tranz.shape, dftrans.shape)
        # Reference mean is computed without the row that holds the NaN.
        mean = dftrans.drop(110)["HOMO_energy"].mean()
        self.assertAlmostEqual(tranz["HOMO_energy"].loc[110], mean)
Пример #3
0
    def test_DataCleaner_na_method_feature_sample_interaction(self):
        """Check feature-level and sample-level NaN handling together.

        Three columns are prepared and then cleaned in one pass:

        * ``"maximum X"`` is entirely NaN and must be absent afterwards.
        * ``"range X"`` has NaN in its first 100 rows, which must be replaced
          by the mean of the column's remaining values.
        * ``"minimum X"`` has a single NaN at position 40, which must end up
          equal to the value at position 39.
        """
        dc = DataCleaner(
            max_na_frac=0.01,
            feature_na_method="mean",
            na_method_transform="fill",
            na_method_fit="fill",
        )
        df = self.test_df
        # Should be dropped
        df["maximum X"] = [np.nan] * len(df)
        # Should be filled via mean
        df["range X"] = [np.nan] * 100 + df["range X"].iloc[100:].tolist()
        # Should be filled by 39. Single-indexer assignment is used because
        # the chained ``df[col].iloc[i] = value`` form may write to a
        # temporary object and is a no-op under pandas copy-on-write.
        df.iloc[40, df.columns.get_loc("minimum X")] = np.nan

        # Reference mean must be taken before cleaning replaces the NaNs.
        mean = df["range X"].dropna().mean()
        cleaned = dc.fit_transform(df, self.target)
        self.assertNotIn("maximum X", cleaned.columns)
        self.assertIn("range X", cleaned.columns)

        for r in cleaned["range X"].iloc[:100]:
            self.assertAlmostEqual(r, mean, places=5)

        self.assertIn("minimum X", cleaned.columns)
        self.assertEqual(
            cleaned["minimum X"].iloc[40], cleaned["minimum X"].iloc[39]
        )
Пример #4
0
    def test_DataCleaner(self):
        """Basic end-to-end checks of DataCleaner with default settings.

        Verifies, in order, that:

        * a target stored as strings comes back numeric,
        * a NaN in the target removes one sample,
        * a NaN in a feature removes one further sample,
        * a feature that is almost entirely NaN is removed as a column,
        * ``transform`` on a frame without the target yields columns that
          match the fitted frame (ignoring the target) and does not
          introduce the target column.
        """
        df = self.test_df
        target = 'gap expt'
        dc = DataCleaner()

        # Test the case of numbers as strings
        df[target] = df[target].astype(str)
        df = dc.fit_transform(df, target)
        self.assertAlmostEqual(df[target].iloc[0], 0.35)

        # Test if there is an nan in target.
        # NaNs are written through a single indexer on the frame; the chained
        # ``df[col].iloc[i] = value`` form may write to a temporary object
        # and is a no-op when pandas copy-on-write is enabled.
        df.iloc[8, df.columns.get_loc(target)] = np.nan
        df = dc.fit_transform(df, target)
        self.assertEqual(df.shape[0], self.test_df.shape[0] - 1)

        # Test if there is an nan in feature
        df.iloc[40, df.columns.get_loc('HOMO_energy')] = np.nan
        df = dc.fit_transform(df, target)
        self.assertEqual(df.shape[0], self.test_df.shape[0] - 2)

        # Test if nan threshold is exceeded for a feature
        # (every row except the last two becomes NaN)
        df.iloc[:-2, df.columns.get_loc("LUMO_energy")] = np.nan
        df = dc.fit_transform(df, target)
        self.assertEqual(df.shape[1], self.test_df.shape[1] - 1)

        # test transferability
        df2 = self.test_df
        df2 = df2.drop(columns=[target])
        df2 = dc.transform(df2, target)
        self.assertFalse(compare_columns(df, df2, ignore=target)["mismatch"])
        self.assertNotIn(target, df2.columns)