예제 #1
0
    def test_disable_repeats_when_disabled_shuffle(self):
        # Ask for 3 repeats while shuffling is switched off and check that the
        # validator reports a single split and a single repeat.

        # Eight rows: two binary feature columns and one binary target column.
        features = pd.DataFrame(
            np.array(
                [[0, 0], [0, 1], [1, 0], [0, 1], [1, 0], [0, 1], [1, 0], [1, 1]]
            ),
            columns=["a", "b"],
        )
        target = pd.DataFrame(
            np.array([0, 0, 1, 0, 1, 0, 1, 1]), columns=["target"]
        )

        # The validator is pointed at files on disk, so persist both frames.
        X_path = os.path.join(self._results_path, "X.parquet")
        y_path = os.path.join(self._results_path, "y.parquet")
        features.to_parquet(X_path, index=False)
        target.to_parquet(y_path, index=False)

        validator = SplitValidator(
            {
                "shuffle": False,
                "stratify": False,
                "train_ratio": 0.5,
                "results_path": self._results_path,
                "X_path": X_path,
                "y_path": y_path,
                "repeats": 3,
            }
        )

        self.assertEqual(1, validator.get_n_splits())
        # "repeats": 3 was requested, yet only one repeat is expected here.
        self.assertEqual(1, validator.get_repeats())
예제 #2
0
class ValidationStep:
    """Select a validator from ``params`` and forward split queries to it.

    ``params["validation_type"]`` decides which validator is built:
    ``"kfold"`` (also used when the key is absent) creates a
    ``KFoldValidator`` and ``"split"`` creates a ``SplitValidator``.
    """

    def __init__(self, params):
        """Create the validator requested by ``params``.

        Args:
            params: Settings object supporting ``.get``. Only the
                ``validation_type`` entry is read here; the whole object is
                handed unchanged to the chosen validator's constructor.

        Raises:
            AutoMLException: If ``validation_type`` is neither ``"kfold"``
                nor ``"split"``.
        """
        # kfold is the default validation technique
        self.validation_type = params.get("validation_type", "kfold")

        if self.validation_type == "kfold":
            self.validator = KFoldValidator(params)
        elif self.validation_type == "split":
            self.validator = SplitValidator(params)
        else:
            raise AutoMLException(
                f"The validation type ({self.validation_type}) is not implemented."
            )

    def get_split(self, k, repeat=0):
        """Return the wrapped validator's ``get_split(k, repeat)`` result."""
        return self.validator.get_split(k, repeat)

    def split(self):
        """Return the wrapped validator's ``split()`` result."""
        return self.validator.split()

    def get_n_splits(self):
        """Return the wrapped validator's ``get_n_splits()`` result."""
        return self.validator.get_n_splits()

    def get_repeats(self):
        """Return the wrapped validator's ``get_repeats()`` result."""
        return self.validator.get_repeats()
예제 #3
0
    def __init__(self, params):
        # Pick the validator implementation named by params["validation_type"];
        # kfold is the default validation technique when the key is absent.
        self.validation_type = params.get("validation_type", "kfold")

        if self.validation_type == "kfold":
            validator_cls = KFoldValidator
        elif self.validation_type == "split":
            validator_cls = SplitValidator
        else:
            # Every other value is rejected.
            raise AutoMLException(
                f"The validation type ({self.validation_type}) is not implemented."
            )

        # The full params object is forwarded to the selected validator.
        self.validator = validator_cls(params)
        """
    def test_missing_target_values(self):
        # Split six rows whose target column was built from labels mixed with
        # np.nan and check that each half of the split receives three rows.
        features = pd.DataFrame(
            np.array([[1, 0], [2, 1], [3, 0], [4, 1], [5, 1], [6, 1]]),
            columns=["a", "b"],
        )
        # NOTE(review): np.array over a list mixing str and np.nan produces a
        # string array, so the nan entries are stored as the text "nan" rather
        # than as real missing values -- confirm this is what the test intends.
        target = pd.DataFrame(
            np.array(["a", "b", np.nan, "a", "b", np.nan]), columns=["target"]
        )

        X_train_path = os.path.join(self._results_path, "X_train.parquet")
        y_train_path = os.path.join(self._results_path, "y_train.parquet")
        features.to_parquet(X_train_path, index=False)
        target.to_parquet(y_train_path, index=False)

        validator = SplitValidator(
            {
                "shuffle": False,
                "stratify": False,
                "train_ratio": 0.5,
                "results_path": self._results_path,
                "X_train_path": X_train_path,
                "y_train_path": y_train_path,
            }
        )

        self.assertEqual(1, validator.get_n_splits())

        for fold in range(validator.get_n_splits()):
            train, validation = validator.get_split(fold)
            parts = (
                train.get("X"),
                train.get("y"),
                validation.get("X"),
                validation.get("y"),
            )
            # A 0.5 train ratio over six rows: three rows on each side.
            for part in parts:
                self.assertEqual(part.shape[0], 3)
    def test_create(self):
        # Build an unshuffled, unstratified 50/50 split over eight rows and
        # check that train and validation each receive four rows.
        features = pd.DataFrame(
            np.array(
                [[0, 0], [0, 1], [1, 0], [0, 1], [1, 0], [0, 1], [1, 0], [1, 1]]
            ),
            columns=["a", "b"],
        )
        target = pd.DataFrame(
            np.array([0, 0, 1, 0, 1, 0, 1, 1]), columns=["target"]
        )

        X_train_path = os.path.join(self._results_path, "X_train.parquet")
        y_train_path = os.path.join(self._results_path, "y_train.parquet")
        features.to_parquet(X_train_path, index=False)
        target.to_parquet(y_train_path, index=False)

        validator = SplitValidator(
            {
                "shuffle": False,
                "stratify": False,
                "train_ratio": 0.5,
                "results_path": self._results_path,
                "X_train_path": X_train_path,
                "y_train_path": y_train_path,
            }
        )

        self.assertEqual(1, validator.get_n_splits())

        for fold in range(validator.get_n_splits()):
            train, validation = validator.get_split(fold)
            parts = (
                train.get("X"),
                train.get("y"),
                validation.get("X"),
                validation.get("y"),
            )
            for part in parts:
                self.assertEqual(part.shape[0], 4)
    def test_create_with_target_as_labels(self):
        # Shuffled, stratified 50/50 split over four rows with string labels
        # as the target; each side is expected to hold two rows.
        features = pd.DataFrame(
            np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), columns=["a", "b"]
        )
        target = pd.DataFrame(
            np.array(["a", "b", "a", "b"]), columns=["target"]
        )

        X_path = os.path.join(self._results_path, "X.data")
        y_path = os.path.join(self._results_path, "y.data")
        dump_data(X_path, features)
        dump_data(y_path, target)

        validator = SplitValidator(
            {
                "shuffle": True,
                "stratify": True,
                "train_ratio": 0.5,
                "results_path": self._results_path,
                "X_path": X_path,
                "y_path": y_path,
            }
        )

        self.assertEqual(1, validator.get_n_splits())

        for fold in range(validator.get_n_splits()):
            train, validation = validator.get_split(fold)
            parts = (
                train.get("X"),
                train.get("y"),
                validation.get("X"),
                validation.get("y"),
            )
            for part in parts:
                self.assertEqual(part.shape[0], 2)
    def test_repeats(self):
        # With shuffling enabled and "repeats": 3, the validator should expose
        # one split per repeat, three repeats, and 4/4 row halves every time.
        features = pd.DataFrame(
            np.array(
                [[0, 0], [0, 1], [1, 0], [0, 1], [1, 0], [0, 1], [1, 0], [1, 1]]
            ),
            columns=["a", "b"],
        )
        target = pd.DataFrame(
            np.array([0, 0, 1, 0, 1, 0, 1, 1]), columns=["target"]
        )

        X_path = os.path.join(self._results_path, "X.data")
        y_path = os.path.join(self._results_path, "y.data")
        dump_data(X_path, features)
        dump_data(y_path, target)

        validator = SplitValidator(
            {
                "shuffle": True,
                "stratify": False,
                "train_ratio": 0.5,
                "results_path": self._results_path,
                "X_path": X_path,
                "y_path": y_path,
                "repeats": 3,
            }
        )

        self.assertEqual(1, validator.get_n_splits())
        self.assertEqual(3, validator.get_repeats())

        # Count every (repeat, fold) pair that was actually visited.
        visited = 0
        for repeat in range(validator.get_repeats()):
            for fold in range(validator.get_n_splits()):
                train, validation = validator.get_split(fold, repeat)
                parts = (
                    train.get("X"),
                    train.get("y"),
                    validation.get("X"),
                    validation.get("y"),
                )
                for part in parts:
                    self.assertEqual(part.shape[0], 4)
                visited += 1

        # 3 repeats x 1 split each.
        self.assertEqual(visited, 3)