def test_disable_repeats_when_disabled_shuffle(self):
        """Check that KFoldValidator reports a single repeat when shuffle is off.

        Ten repeats are requested together with ``shuffle=False``; the test
        expects ``get_repeats()`` to return 1 while ``get_n_splits()`` still
        equals the requested ``k_folds``.
        """
        features = pd.DataFrame(
            np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), columns=["a", "b"]
        )
        target = pd.DataFrame(np.array([0, 0, 1, 1]), columns=["target"])

        # Write both frames under the results directory; the validator is
        # given these paths through its parameters.
        features_path = os.path.join(self._results_path, "X.data")
        target_path = os.path.join(self._results_path, "y.data")
        dump_data(features_path, features)
        dump_data(target_path, target)

        config = {
            "shuffle": False,
            "stratify": False,
            "k_folds": 2,
            "repeats": 10,
            "results_path": self._results_path,
            "X_path": features_path,
            "y_path": target_path,
            "random_seed": 1,
        }
        validator = KFoldValidator(config)

        self.assertEqual(config["k_folds"], validator.get_n_splits())
        # Requested 10 repeats, but without shuffling only one is expected.
        self.assertEqual(1, validator.get_repeats())
    def test_repeats(self):
        """Check that SplitValidator yields one 4/4 split for each of 3 repeats.

        Eight rows are split with ``train_ratio=0.5``; every (repeat, fold)
        pair must give 4 training rows and 4 validation rows, and exactly
        three such pairs must be visited.
        """
        features = pd.DataFrame(
            np.array(
                [[0, 0], [0, 1], [1, 0], [0, 1], [1, 0], [0, 1], [1, 0], [1, 1]]
            ),
            columns=["a", "b"],
        )
        target = pd.DataFrame(
            np.array([0, 0, 1, 0, 1, 0, 1, 1]), columns=["target"]
        )

        # Write both frames under the results directory; the validator is
        # given these paths through its parameters.
        features_path = os.path.join(self._results_path, "X.data")
        target_path = os.path.join(self._results_path, "y.data")
        dump_data(features_path, features)
        dump_data(target_path, target)

        config = {
            "shuffle": True,
            "stratify": False,
            "train_ratio": 0.5,
            "results_path": self._results_path,
            "X_path": features_path,
            "y_path": target_path,
            "repeats": 3,
        }
        validator = SplitValidator(config)

        self.assertEqual(1, validator.get_n_splits())
        self.assertEqual(3, validator.get_repeats())

        splits_seen = 0
        for repeat in range(validator.get_repeats()):
            for fold in range(validator.get_n_splits()):
                train, validation = validator.get_split(fold, repeat)

                X_train, y_train = train.get("X"), train.get("y")
                X_validation, y_validation = (
                    validation.get("X"),
                    validation.get("y"),
                )

                # Half of the eight rows on each side of the split.
                for part in (X_train, y_train, X_validation, y_validation):
                    self.assertEqual(part.shape[0], 4)
                splits_seen += 1

        self.assertEqual(splits_seen, 3)
    def test_repeats(self):
        """Check that KFoldValidator honours ``repeats`` when shuffle is on.

        Four rows are split into 2 folds, repeated 10 times; every
        (repeat, fold) pair must give 2 training rows and 2 validation rows.
        """
        features = pd.DataFrame(
            np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), columns=["a", "b"]
        )
        target = pd.DataFrame(np.array([0, 0, 1, 1]), columns=["target"])

        # Write both frames under the results directory; the validator is
        # given these paths through its parameters.
        features_path = os.path.join(self._results_path, "X.data")
        target_path = os.path.join(self._results_path, "y.data")
        dump_data(features_path, features)
        dump_data(target_path, target)

        config = {
            "shuffle": True,
            "stratify": False,
            "k_folds": 2,
            "repeats": 10,
            "results_path": self._results_path,
            "X_path": features_path,
            "y_path": target_path,
            "random_seed": 1,
        }
        validator = KFoldValidator(config)

        self.assertEqual(config["k_folds"], validator.get_n_splits())
        self.assertEqual(config["repeats"], validator.get_repeats())

        for repeat in range(validator.get_repeats()):
            for fold in range(validator.get_n_splits()):
                train, validation = validator.get_split(fold, repeat)

                X_train, y_train = train.get("X"), train.get("y")
                X_validation, y_validation = (
                    validation.get("X"),
                    validation.get("y"),
                )

                # Two folds over four rows: two rows on each side.
                for part in (X_train, y_train, X_validation, y_validation):
                    self.assertEqual(part.shape[0], 2)
    def test_missing_target_values(self):
        """Check stratified 2-fold splitting when the target holds NaN entries.

        Six rows are used, with a target built from two "a", two "b" and two
        ``np.nan`` values; each fold must give 3 training and 3 validation rows.
        """
        features = pd.DataFrame(
            np.array([[1, 0], [2, 1], [3, 0], [4, 1], [5, 1], [6, 1]]),
            columns=["a", "b"],
        )
        # NOTE(review): np.array() on a mix of str and np.nan produces a
        # string array, so the two NaNs are stored as the text "nan" --
        # confirm this is the intended way to model missing targets.
        target = pd.DataFrame(
            np.array(["a", "b", "a", "b", np.nan, np.nan]), columns=["target"]
        )

        # Write both frames under the results directory; the validator is
        # given these paths through its parameters.
        features_path = os.path.join(self._results_path, "X.data")
        target_path = os.path.join(self._results_path, "y.data")
        dump_data(features_path, features)
        dump_data(target_path, target)

        config = {
            "shuffle": False,
            "stratify": True,
            "k_folds": 2,
            "results_path": self._results_path,
            "X_path": features_path,
            "y_path": target_path,
        }
        validator = KFoldValidator(config)

        self.assertEqual(config["k_folds"], validator.get_n_splits())

        for fold in range(validator.get_n_splits()):
            train, validation = validator.get_split(fold)
            X_train, y_train = train.get("X"), train.get("y")
            X_validation, y_validation = (
                validation.get("X"),
                validation.get("y"),
            )

            # All six rows are expected to take part: three per side.
            for part in (X_train, y_train, X_validation, y_validation):
                self.assertEqual(part.shape[0], 3)
    def test_create_with_target_as_labels(self):
        """Check that SplitValidator handles a string-labelled target.

        Four rows with labels "a"/"b" are split with ``train_ratio=0.5`` and
        stratification on; the single split must give 2 training rows and
        2 validation rows.
        """
        features = pd.DataFrame(
            np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), columns=["a", "b"]
        )
        target = pd.DataFrame(
            np.array(["a", "b", "a", "b"]), columns=["target"]
        )

        # Write both frames under the results directory; the validator is
        # given these paths through its parameters.
        features_path = os.path.join(self._results_path, "X.data")
        target_path = os.path.join(self._results_path, "y.data")
        dump_data(features_path, features)
        dump_data(target_path, target)

        config = {
            "shuffle": True,
            "stratify": True,
            "train_ratio": 0.5,
            "results_path": self._results_path,
            "X_path": features_path,
            "y_path": target_path,
        }
        validator = SplitValidator(config)

        self.assertEqual(1, validator.get_n_splits())

        for fold in range(validator.get_n_splits()):
            train, validation = validator.get_split(fold)
            X_train, y_train = train.get("X"), train.get("y")
            X_validation, y_validation = (
                validation.get("X"),
                validation.get("y"),
            )

            # Half of the four rows on each side of the split.
            for part in (X_train, y_train, X_validation, y_validation):
                self.assertEqual(part.shape[0], 2)
示例#6
0
    def boost_params(self, current_models, results_path, total_time_limit):
        """Build parameters for a "boost on errors" copy of the first eligible model.

        The first row of the frame returned by ``self.df_models_algorithms``
        that is not an Ensemble, Neural Network or Nearest Neighbors model and
        whose name does not contain "RandomFeature" is selected (presumably
        the frame is ordered best-first -- confirm in ``df_models_algorithms``).
        Its out-of-fold absolute residuals are ranked and mapped to sample
        weights in [0.9, 1.1): the smallest residual gets 0.9 and larger
        residuals get proportionally larger weights. The weights are written
        with ``dump_data`` and referenced from the returned parameters.

        Args:
            current_models: Forwarded unchanged to ``self.df_models_algorithms``.
            results_path: Directory in which the
                ``<model name>_sample_weight.data`` file is written.
            total_time_limit: Numeric time budget; 10% of it is passed as
                ``time_limit`` to ``self.df_models_algorithms``.

        Returns:
            list: ``[]`` when no eligible model exists or when the new
            parameters' key is already in ``self._unique_params_keys``;
            otherwise a one-element list holding the new parameter dict.
        """
        df_models, _ = self.df_models_algorithms(
            current_models, time_limit=0.1 * total_time_limit
        )

        skipped_types = ("Ensemble", "Neural Network", "Nearest Neighbors")
        best_model = None
        for model_type, model in zip(df_models["model_type"], df_models["model"]):
            if model_type in skipped_types:
                continue
            if "RandomFeature" in model.get_name():
                continue
            best_model = model
            break
        if best_model is None:
            return []

        # load predictions
        oof = best_model.get_out_of_folds()

        predictions = oof[[c for c in oof.columns if c.startswith("prediction")]]
        y = oof["target"]

        if self._ml_task == MULTICLASS_CLASSIFICATION:
            # scikit-learn renamed ``sparse`` to ``sparse_output`` in 1.2 and
            # removed the old keyword in 1.4; try the new name first and fall
            # back so both old and new releases work.
            try:
                oh = OneHotEncoder(sparse_output=False)
            except TypeError:
                oh = OneHotEncoder(sparse=False)
            y_encoded = oh.fit_transform(np.array(y).reshape(-1, 1))
            # Per-row residual: summed absolute error over all class columns.
            residua = np.sum(
                np.abs(np.array(y_encoded) - np.array(predictions)), axis=1
            )
        else:
            residua = np.abs(np.array(y) - np.array(predictions).ravel())

        n_rows = residua.shape[0]
        df_preds = pd.DataFrame(
            {"res": residua, "lp": range(n_rows), "target": np.array(y)}
        )

        # Rank rows by residual, turn the rank into a weight in [0.9, 1.1),
        # then restore the original row order via the "lp" counter.
        df_preds = df_preds.sort_values(by="res", ascending=True)
        df_preds["order"] = range(n_rows)
        df_preds["order"] = (df_preds["order"]) / n_rows / 5.0 + 0.9
        df_preds = df_preds.sort_values(by="lp", ascending=True)

        sample_weight_path = os.path.join(
            results_path, best_model.get_name() + "_sample_weight.data"
        )
        dump_data(
            sample_weight_path, pd.DataFrame({"sample_weight": df_preds["order"]})
        )

        # Work on a deep copy so the selected model's own params stay intact.
        params = copy.deepcopy(best_model.params)

        params["validation_strategy"]["sample_weight_path"] = sample_weight_path
        params["injected_sample_weight"] = True
        params["name"] += "_BoostOnErrors"
        # Reset the training-result fields copied from the source model.
        params["status"] = "initialized"
        params["final_loss"] = None
        params["train_time"] = None
        params["data_type"] = "boost_on_error"
        if "model_architecture_json" in params["learner"]:
            del params["learner"]["model_architecture_json"]
        if self._optuna_time_budget is not None:
            params["optuna_time_budget"] = self._optuna_time_budget
            params["optuna_init_params"] = self._optuna_init_params
            params["optuna_verbose"] = self._optuna_verbose

        # Skip parameter sets whose key has already been registered.
        if MljarTuner.get_params_key(params) in self._unique_params_keys:
            return []
        return [params]