def test_disable_repeats_when_disabled_shuffle(self):
    """Check that repeats fall back to 1 when shuffling is turned off.

    The validator is configured with ``repeats=10`` but ``shuffle=False``;
    the test expects ``get_repeats()`` to report a single repeat.
    """
    features = pd.DataFrame(
        np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), columns=["a", "b"]
    )
    target = pd.DataFrame(np.array([0, 0, 1, 1]), columns=["target"])

    # Persist both frames so the validator can be pointed at them by path.
    X_path = os.path.join(self._results_path, "X.data")
    y_path = os.path.join(self._results_path, "y.data")
    dump_data(X_path, features)
    dump_data(y_path, target)

    params = {
        "shuffle": False,
        "stratify": False,
        "k_folds": 2,
        "repeats": 10,
        "results_path": self._results_path,
        "X_path": X_path,
        "y_path": y_path,
        "random_seed": 1,
    }
    validator = KFoldValidator(params)

    self.assertEqual(params["k_folds"], validator.get_n_splits())
    self.assertEqual(1, validator.get_repeats())
class ValidationStep:
    """Select a validator from ``params`` and forward split queries to it.

    Only the ``"kfold"`` validation type is handled; any other value makes
    the constructor raise.
    """

    def __init__(self, params):
        """Create the validator named by ``params["validation_type"]``.

        Args:
            params: Mapping of validation settings. ``validation_type`` is
                read from it (defaulting to ``"kfold"``) and the whole
                mapping is handed to the validator.

        Raises:
            Exception: If ``validation_type`` is anything other than
                ``"kfold"``.
        """
        # kfold is the default validation technique
        self.validation_type = params.get("validation_type", "kfold")
        if self.validation_type != "kfold":
            raise Exception("Other validation types are not implemented yet!")
        self.validator = KFoldValidator(params)

    def get_split(self, k):
        """Return whatever the wrapped validator returns for split ``k``."""
        return self.validator.get_split(k)

    def split(self):
        """Return the result of the wrapped validator's ``split()``."""
        return self.validator.split()

    def get_n_splits(self):
        """Return the number of splits reported by the wrapped validator."""
        return self.validator.get_n_splits()
def __init__(self, params): # kfold is default validation technique self.validation_type = params.get("validation_type", "kfold") if self.validation_type == "kfold": self.validator = KFoldValidator(params) else: raise Exception("Other validation types are not implemented yet!") """
def test_missing_data(self):
    """Expect a "Missing" error when the validator is built without targets.

    The input only carries features under ``data["train"]["X"]``; there is
    no ``"y"`` entry.
    """
    data = {"train": {"X": np.array([[0, 0], [0, 1], [1, 0], [1, 1]])}}
    params = {"shuffle": True, "stratify": True, "k_folds": 2}

    # Only the call that is expected to fail lives inside the context
    # manager; anything placed after it in the block would never run.
    # NOTE(review): arguments are passed as (data, params) here, while other
    # tests call KFoldValidator(params, data) - confirm the intended order.
    with self.assertRaises(BaseValidatorException) as context:
        KFoldValidator(data, params)

    # Checked after the block so the message assertion actually executes.
    self.assertIn("Missing", str(context.exception))
def test_create_with_target_as_labels(self):
    """Build a stratified 2-fold validator over string labels.

    Four rows are split into two folds, so every train/validation part is
    expected to hold exactly two rows.
    """
    features = pd.DataFrame(np.array([[0, 0], [0, 1], [1, 0], [1, 1]]))
    labels = pd.DataFrame(np.array(["a", "b", "a", "b"]))
    data = {"train": {"X": features, "y": labels}}
    params = {"shuffle": True, "stratify": True, "k_folds": 2}

    validator = KFoldValidator(params, data)
    self.assertEqual(params["k_folds"], validator.get_n_splits())

    for train, validation in validator.split():
        parts = (
            train.get("X"),
            train.get("y"),
            validation.get("X"),
            validation.get("y"),
        )
        for part in parts:
            self.assertEqual(part.shape[0], 2)
def test_missing_target_values(self):
    """Split parquet-backed data whose target has two NaN entries.

    The test expects two rows in every train/validation part of each fold.
    """
    features = pd.DataFrame(
        np.array([[1, 0], [2, 1], [3, 0], [4, 1], [5, 1], [6, 1]]),
        columns=["a", "b"],
    )
    # NOTE(review): np.array over mixed str/NaN input produces a string
    # array, so the NaN entries likely end up as the text "nan" - confirm
    # this is the intended representation of a missing target.
    target = pd.DataFrame(
        np.array(["a", "b", "a", "b", np.nan, np.nan]), columns=["target"]
    )

    # The validator is given file paths, so write both frames to disk first.
    X_train_path = os.path.join(self._results_path, "X_train.parquet")
    y_train_path = os.path.join(self._results_path, "y_train.parquet")
    features.to_parquet(X_train_path, index=False)
    target.to_parquet(y_train_path, index=False)

    params = {
        "shuffle": True,
        "stratify": True,
        "k_folds": 2,
        "results_path": self._results_path,
        "X_train_path": X_train_path,
        "y_train_path": y_train_path,
    }
    validator = KFoldValidator(params)
    self.assertEqual(params["k_folds"], validator.get_n_splits())

    for fold in range(validator.get_n_splits()):
        train, validation = validator.get_split(fold)
        parts = (
            train.get("X"),
            train.get("y"),
            validation.get("X"),
            validation.get("y"),
        )
        for part in parts:
            self.assertEqual(part.shape[0], 2)
def test_missing_target_values(self):
    """Split in-memory data whose target has two NaN entries into 2 folds.

    Every train/validation part is expected to hold three of the six rows.
    """
    # rows with missing target will be distributed equally among folds
    features = pd.DataFrame(
        np.array([[1, 0], [2, 1], [3, 0], [4, 1], [5, 1], [6, 1]])
    )
    target = pd.DataFrame(np.array(["a", "b", "a", "b", np.nan, np.nan]))
    data = {"train": {"X": features, "y": target}}
    params = {"shuffle": True, "stratify": True, "k_folds": 2}

    validator = KFoldValidator(params, data)
    self.assertEqual(params["k_folds"], validator.get_n_splits())

    for train, validation in validator.split():
        parts = (
            train.get("X"),
            train.get("y"),
            validation.get("X"),
            validation.get("y"),
        )
        for part in parts:
            self.assertEqual(part.shape[0], 3)
def test_create(self):
    """Build an unshuffled, unstratified 2-fold validator from parquet files.

    Four rows are split into two folds, so every train/validation part is
    expected to hold exactly two rows.
    """
    features = pd.DataFrame(
        np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), columns=["a", "b"]
    )
    target = pd.DataFrame(np.array([0, 0, 1, 1]), columns=["target"])

    # The validator is given file paths, so write both frames to disk first.
    X_train_path = os.path.join(self._results_path, "X_train.parquet")
    y_train_path = os.path.join(self._results_path, "y_train.parquet")
    features.to_parquet(X_train_path, index=False)
    target.to_parquet(y_train_path, index=False)

    params = {
        "shuffle": False,
        "stratify": False,
        "k_folds": 2,
        "results_path": self._results_path,
        "X_train_path": X_train_path,
        "y_train_path": y_train_path,
    }
    validator = KFoldValidator(params)
    self.assertEqual(params["k_folds"], validator.get_n_splits())

    for fold in range(validator.get_n_splits()):
        train, validation = validator.get_split(fold)
        parts = (
            train.get("X"),
            train.get("y"),
            validation.get("X"),
            validation.get("y"),
        )
        for part in parts:
            self.assertEqual(part.shape[0], 2)
def test_create_with_target_as_labels(self):
    """Build a stratified 2-fold validator over dumped string labels.

    Four rows are split into two folds, so every train/validation part is
    expected to hold exactly two rows.
    """
    features = pd.DataFrame(
        np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), columns=["a", "b"]
    )
    labels = pd.DataFrame(np.array(["a", "b", "a", "b"]), columns=["target"])

    # Persist both frames so the validator can be pointed at them by path.
    X_path = os.path.join(self._results_path, "X.data")
    y_path = os.path.join(self._results_path, "y.data")
    dump_data(X_path, features)
    dump_data(y_path, labels)

    params = {
        "shuffle": True,
        "stratify": True,
        "k_folds": 2,
        "results_path": self._results_path,
        "X_path": X_path,
        "y_path": y_path,
    }
    validator = KFoldValidator(params)
    self.assertEqual(params["k_folds"], validator.get_n_splits())

    for fold in range(validator.get_n_splits()):
        train, validation = validator.get_split(fold)
        parts = (
            train.get("X"),
            train.get("y"),
            validation.get("X"),
            validation.get("y"),
        )
        for part in parts:
            self.assertEqual(part.shape[0], 2)
def __init__(self, params): # kfold is default validation technique self.validation_type = params.get("validation_type", "kfold") if self.validation_type == "kfold": self.validator = KFoldValidator(params) elif self.validation_type == "split": self.validator = SplitValidator(params) else: raise AutoMLException( f"The validation type ({self.validation_type}) is not implemented." ) """