def test_disable_repeats_when_disabled_shuffle(self):
    """Check that the repeat count collapses to 1 when shuffling is off.

    The validator is configured with ``repeats=10`` but ``shuffle=False``;
    the test expects ``get_repeats()`` to report 1 while the number of
    splits still equals ``k_folds``.
    """
    features = pd.DataFrame(
        np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), columns=["a", "b"]
    )
    target = pd.DataFrame(np.array([0, 0, 1, 1]), columns=["target"])

    # Persist both frames under the results directory; the validator is
    # handed the paths rather than the frames themselves.
    X_path = os.path.join(self._results_path, "X.data")
    y_path = os.path.join(self._results_path, "y.data")
    dump_data(X_path, features)
    dump_data(y_path, target)

    params = {
        "shuffle": False,
        "stratify": False,
        "k_folds": 2,
        "repeats": 10,
        "results_path": self._results_path,
        "X_path": X_path,
        "y_path": y_path,
        "random_seed": 1,
    }
    validator = KFoldValidator(params)

    self.assertEqual(params["k_folds"], validator.get_n_splits())
    self.assertEqual(1, validator.get_repeats())
示例#2
0
class ValidationStep:
    """Facade that selects a validator from ``params`` and forwards to it.

    Only the ``"kfold"`` validation type is wired up; any other value is
    rejected when the object is constructed.
    """

    def __init__(self, params):
        """Create the validator selected by ``params["validation_type"]``.

        Args:
            params: Mapping of validation settings. ``validation_type``
                defaults to ``"kfold"`` when absent; the whole mapping is
                passed on to ``KFoldValidator``.

        Raises:
            Exception: If ``validation_type`` is anything other than
                ``"kfold"``. The message names the rejected value.
        """
        # kfold is the default validation technique
        self.validation_type = params.get("validation_type", "kfold")

        if self.validation_type != "kfold":
            # Keep the historical message text as a prefix and append the
            # offending value so the failure is diagnosable from the log.
            raise Exception(
                "Other validation types are not implemented yet!"
                f" Got: {self.validation_type!r}"
            )

        self.validator = KFoldValidator(params)

    def get_split(self, k):
        """Return whatever the underlying validator yields for split ``k``."""
        return self.validator.get_split(k)

    def split(self):
        """Forward to the underlying validator's ``split()``."""
        return self.validator.split()

    def get_n_splits(self):
        """Forward to the underlying validator's ``get_n_splits()``."""
        return self.validator.get_n_splits()
示例#3
0
    def __init__(self, params):
        """Pick the validator named by ``params["validation_type"]``.

        The type defaults to ``"kfold"``, which is the only one handled
        here; any other value raises ``Exception``.
        """
        requested = params.get("validation_type", "kfold")
        self.validation_type = requested

        # Guard clause: reject everything except the one supported type.
        if requested != "kfold":
            raise Exception("Other validation types are not implemented yet!")

        self.validator = KFoldValidator(params)
        """
    def test_missing_data(self):
        """Expect ``BaseValidatorException`` mentioning "Missing".

        The input dictionary holds only an ``X`` entry under ``train``
        (there is no ``y`` entry).
        """
        data = {"train": {"X": np.array([[0, 0], [0, 1], [1, 0], [1, 1]])}}
        params = {"shuffle": True, "stratify": True, "k_folds": 2}

        with self.assertRaises(BaseValidatorException) as context:
            # NOTE(review): arguments are passed as (data, params) here,
            # while other tests call KFoldValidator(params, data) - confirm
            # which order the constructor expects.
            KFoldValidator(data, params)

        message = str(context.exception)
        self.assertTrue("Missing" in message)
 def test_create_with_target_as_labels(self):
     """Split four rows with string labels into two stratified folds.

     Each fold is expected to expose 2 training rows and 2 validation rows
     for both ``X`` and ``y``.
     """
     features = pd.DataFrame(np.array([[0, 0], [0, 1], [1, 0], [1, 1]]))
     labels = pd.DataFrame(np.array(["a", "b", "a", "b"]))
     data = {"train": {"X": features, "y": labels}}
     params = {"shuffle": True, "stratify": True, "k_folds": 2}

     validator = KFoldValidator(params, data)
     self.assertEqual(params["k_folds"], validator.get_n_splits())

     for train, validation in validator.split():
         # Fetch all four pieces first, then check their row counts.
         pieces = (
             train.get("X"),
             train.get("y"),
             validation.get("X"),
             validation.get("y"),
         )
         for piece in pieces:
             self.assertEqual(piece.shape[0], 2)
示例#6
0
    def test_missing_target_values(self):
        """Run a shuffled, stratified 2-fold split on a target with gaps.

        Six rows are written to parquet; two of the target entries are
        given as ``np.nan``. Every fold is expected to expose 2 training
        rows and 2 validation rows for both ``X`` and ``y``.
        """
        features = pd.DataFrame(
            np.array([[1, 0], [2, 1], [3, 0], [4, 1], [5, 1], [6, 1]]),
            columns=["a", "b"],
        )
        target = pd.DataFrame(
            np.array(["a", "b", "a", "b", np.nan, np.nan]), columns=["target"]
        )

        # The validator receives file paths, so store both frames first.
        X_train_path = os.path.join(self._results_path, "X_train.parquet")
        y_train_path = os.path.join(self._results_path, "y_train.parquet")
        features.to_parquet(X_train_path, index=False)
        target.to_parquet(y_train_path, index=False)

        params = {
            "shuffle": True,
            "stratify": True,
            "k_folds": 2,
            "results_path": self._results_path,
            "X_train_path": X_train_path,
            "y_train_path": y_train_path,
        }
        validator = KFoldValidator(params)

        self.assertEqual(params["k_folds"], validator.get_n_splits())

        for fold_index in range(validator.get_n_splits()):
            train, validation = validator.get_split(fold_index)
            pieces = (
                train.get("X"),
                train.get("y"),
                validation.get("X"),
                validation.get("y"),
            )
            for piece in pieces:
                self.assertEqual(piece.shape[0], 2)
 def test_missing_target_values(self):
     """Check fold sizes when some target entries are given as ``np.nan``.

     Rows with a missing target are expected to be distributed equally
     among the folds: with six rows and two folds, every fold should
     expose 3 training rows and 3 validation rows.
     """
     features = pd.DataFrame(
         np.array([[1, 0], [2, 1], [3, 0], [4, 1], [5, 1], [6, 1]])
     )
     target = pd.DataFrame(np.array(["a", "b", "a", "b", np.nan, np.nan]))
     data = {"train": {"X": features, "y": target}}
     params = {"shuffle": True, "stratify": True, "k_folds": 2}

     validator = KFoldValidator(params, data)
     self.assertEqual(params["k_folds"], validator.get_n_splits())

     for train, validation in validator.split():
         pieces = (
             train.get("X"),
             train.get("y"),
             validation.get("X"),
             validation.get("y"),
         )
         for piece in pieces:
             self.assertEqual(piece.shape[0], 3)
示例#8
0
    def test_create(self):
        """Build an unshuffled, unstratified 2-fold validator from parquet.

        Four rows are stored on disk; each fold fetched through
        ``get_split`` is expected to hold 2 training rows and 2 validation
        rows for both ``X`` and ``y``.
        """
        features = pd.DataFrame(
            np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), columns=["a", "b"]
        )
        target = pd.DataFrame(np.array([0, 0, 1, 1]), columns=["target"])

        X_train_path = os.path.join(self._results_path, "X_train.parquet")
        y_train_path = os.path.join(self._results_path, "y_train.parquet")
        features.to_parquet(X_train_path, index=False)
        target.to_parquet(y_train_path, index=False)

        params = {
            "shuffle": False,
            "stratify": False,
            "k_folds": 2,
            "results_path": self._results_path,
            "X_train_path": X_train_path,
            "y_train_path": y_train_path,
        }
        validator = KFoldValidator(params)

        self.assertEqual(params["k_folds"], validator.get_n_splits())

        for fold_index in range(validator.get_n_splits()):
            train, validation = validator.get_split(fold_index)
            pieces = (
                train.get("X"),
                train.get("y"),
                validation.get("X"),
                validation.get("y"),
            )
            for piece in pieces:
                self.assertEqual(piece.shape[0], 2)
    def test_create_with_target_as_labels(self):
        """Build a shuffled, stratified 2-fold validator on string labels.

        The four-row frames are stored with ``dump_data`` and referenced by
        path; each fold is expected to hold 2 training rows and 2
        validation rows for both ``X`` and ``y``.
        """
        features = pd.DataFrame(
            np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), columns=["a", "b"]
        )
        labels = pd.DataFrame(
            np.array(["a", "b", "a", "b"]), columns=["target"]
        )

        X_path = os.path.join(self._results_path, "X.data")
        y_path = os.path.join(self._results_path, "y.data")
        dump_data(X_path, features)
        dump_data(y_path, labels)

        params = {
            "shuffle": True,
            "stratify": True,
            "k_folds": 2,
            "results_path": self._results_path,
            "X_path": X_path,
            "y_path": y_path,
        }
        validator = KFoldValidator(params)

        self.assertEqual(params["k_folds"], validator.get_n_splits())

        for fold_index in range(validator.get_n_splits()):
            train, validation = validator.get_split(fold_index)
            pieces = (
                train.get("X"),
                train.get("y"),
                validation.get("X"),
                validation.get("y"),
            )
            for piece in pieces:
                self.assertEqual(piece.shape[0], 2)
    def __init__(self, params):
        """Pick the validator named by ``params["validation_type"]``.

        ``"kfold"`` (the default) selects ``KFoldValidator`` and ``"split"``
        selects ``SplitValidator``; both receive ``params``. Any other
        value raises ``AutoMLException``.
        """
        requested = params.get("validation_type", "kfold")
        self.validation_type = requested

        # Guard clause: reject unsupported types before building anything.
        if requested not in ("kfold", "split"):
            raise AutoMLException(
                f"The validation type ({self.validation_type}) is not implemented."
            )

        if requested == "kfold":
            self.validator = KFoldValidator(params)
        else:
            self.validator = SplitValidator(params)
        """