def test_disable_repeats_when_disabled_shuffle(self):
    """Check the repeat count reported when shuffling is switched off.

    The validator is configured with ``repeats=10`` and ``shuffle=False``;
    the test expects it to report the requested number of folds but only
    a single repeat.
    """
    features = pd.DataFrame(
        np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), columns=["a", "b"]
    )
    target = pd.DataFrame(np.array([0, 0, 1, 1]), columns=["target"])
    dataset = {"X": features, "y": target}

    # Persist both frames; the validator is given paths, not the frames.
    X_path = os.path.join(self._results_path, "X.data")
    y_path = os.path.join(self._results_path, "y.data")
    dump_data(X_path, dataset["X"])
    dump_data(y_path, dataset["y"])

    validator_params = {
        "shuffle": False,
        "stratify": False,
        "k_folds": 2,
        "repeats": 10,
        "results_path": self._results_path,
        "X_path": X_path,
        "y_path": y_path,
        "random_seed": 1,
    }
    validator = KFoldValidator(validator_params)

    self.assertEqual(validator_params["k_folds"], validator.get_n_splits())
    self.assertEqual(1, validator.get_repeats())
def test_repeats(self):
    """Check that a repeated 50/50 train/validation split yields 3 splits.

    Eight samples are split with ``train_ratio=0.5`` and ``repeats=3``;
    every repeat must give four training rows and four validation rows,
    and exactly three (repeat, fold) combinations must be visited.
    """
    features = pd.DataFrame(
        np.array([[0, 0], [0, 1], [1, 0], [0, 1], [1, 0], [0, 1], [1, 0], [1, 1]]),
        columns=["a", "b"],
    )
    target = pd.DataFrame(
        np.array([0, 0, 1, 0, 1, 0, 1, 1]), columns=["target"]
    )
    dataset = {"X": features, "y": target}

    # Persist both frames; the validator is given paths, not the frames.
    X_path = os.path.join(self._results_path, "X.data")
    y_path = os.path.join(self._results_path, "y.data")
    dump_data(X_path, dataset["X"])
    dump_data(y_path, dataset["y"])

    validator_params = {
        "shuffle": True,
        "stratify": False,
        "train_ratio": 0.5,
        "results_path": self._results_path,
        "X_path": X_path,
        "y_path": y_path,
        "repeats": 3,
    }
    validator = SplitValidator(validator_params)

    self.assertEqual(1, validator.get_n_splits())
    self.assertEqual(3, validator.get_repeats())

    splits_seen = 0
    for repeat in range(validator.get_repeats()):
        for k_fold in range(validator.get_n_splits()):
            train, validation = validator.get_split(k_fold, repeat)
            X_train, y_train = train.get("X"), train.get("y")
            X_validation, y_validation = validation.get("X"), validation.get("y")

            # Half of the eight rows must land on each side of the split.
            for part in (X_train, y_train, X_validation, y_validation):
                self.assertEqual(part.shape[0], 4)
            splits_seen += 1

    self.assertEqual(splits_seen, 3)
def test_repeats(self):
    """Check repeated 2-fold validation on four samples with shuffling.

    With ``shuffle=True`` the validator must report the requested number
    of folds and repeats, and every (repeat, fold) split must contain two
    training rows and two validation rows.
    """
    features = pd.DataFrame(
        np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), columns=["a", "b"]
    )
    target = pd.DataFrame(np.array([0, 0, 1, 1]), columns=["target"])
    dataset = {"X": features, "y": target}

    # Persist both frames; the validator is given paths, not the frames.
    X_path = os.path.join(self._results_path, "X.data")
    y_path = os.path.join(self._results_path, "y.data")
    dump_data(X_path, dataset["X"])
    dump_data(y_path, dataset["y"])

    validator_params = {
        "shuffle": True,
        "stratify": False,
        "k_folds": 2,
        "repeats": 10,
        "results_path": self._results_path,
        "X_path": X_path,
        "y_path": y_path,
        "random_seed": 1,
    }
    validator = KFoldValidator(validator_params)

    self.assertEqual(validator_params["k_folds"], validator.get_n_splits())
    self.assertEqual(validator_params["repeats"], validator.get_repeats())

    for repeat in range(validator.get_repeats()):
        for k_fold in range(validator.get_n_splits()):
            train, validation = validator.get_split(k_fold, repeat)
            X_train, y_train = train.get("X"), train.get("y")
            X_validation, y_validation = validation.get("X"), validation.get("y")

            # Four rows over two folds: two rows on each side.
            for part in (X_train, y_train, X_validation, y_validation):
                self.assertEqual(part.shape[0], 2)
def test_missing_target_values(self):
    """Check stratified 2-fold splitting when the target contains NaN entries.

    Six samples are used, the last two built from ``np.nan`` in the
    target column. Each fold is expected to hold three training rows and
    three validation rows.
    """
    features = pd.DataFrame(
        np.array([[1, 0], [2, 1], [3, 0], [4, 1], [5, 1], [6, 1]]),
        columns=["a", "b"],
    )
    # NOTE(review): mixing str and np.nan in one np.array yields a string
    # array, so the missing values may reach the validator as text —
    # confirm this is the intended fixture.
    target = pd.DataFrame(
        np.array(["a", "b", "a", "b", np.nan, np.nan]), columns=["target"]
    )
    dataset = {"X": features, "y": target}

    # Persist both frames; the validator is given paths, not the frames.
    X_path = os.path.join(self._results_path, "X.data")
    y_path = os.path.join(self._results_path, "y.data")
    dump_data(X_path, dataset["X"])
    dump_data(y_path, dataset["y"])

    validator_params = {
        "shuffle": False,
        "stratify": True,
        "k_folds": 2,
        "results_path": self._results_path,
        "X_path": X_path,
        "y_path": y_path,
    }
    validator = KFoldValidator(validator_params)

    self.assertEqual(validator_params["k_folds"], validator.get_n_splits())

    for k_fold in range(validator.get_n_splits()):
        train, validation = validator.get_split(k_fold)
        X_train, y_train = train.get("X"), train.get("y")
        X_validation, y_validation = validation.get("X"), validation.get("y")

        for part in (X_train, y_train, X_validation, y_validation):
            self.assertEqual(part.shape[0], 3)
def test_create_with_target_as_labels(self):
    """Check a stratified 50/50 split when the target holds string labels.

    Four samples labelled ``"a"``/``"b"`` are split with
    ``train_ratio=0.5``; the single split must contain two training rows
    and two validation rows.
    """
    features = pd.DataFrame(
        np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), columns=["a", "b"]
    )
    target = pd.DataFrame(np.array(["a", "b", "a", "b"]), columns=["target"])
    dataset = {"X": features, "y": target}

    # Persist both frames; the validator is given paths, not the frames.
    X_path = os.path.join(self._results_path, "X.data")
    y_path = os.path.join(self._results_path, "y.data")
    dump_data(X_path, dataset["X"])
    dump_data(y_path, dataset["y"])

    validator_params = {
        "shuffle": True,
        "stratify": True,
        "train_ratio": 0.5,
        "results_path": self._results_path,
        "X_path": X_path,
        "y_path": y_path,
    }
    validator = SplitValidator(validator_params)

    self.assertEqual(1, validator.get_n_splits())

    for k_fold in range(validator.get_n_splits()):
        train, validation = validator.get_split(k_fold)
        X_train, y_train = train.get("X"), train.get("y")
        X_validation, y_validation = validation.get("X"), validation.get("y")

        for part in (X_train, y_train, X_validation, y_validation):
            self.assertEqual(part.shape[0], 2)
def boost_params(self, current_models, results_path, total_time_limit):
    """Build parameters for a "boost on errors" copy of the best model.

    The first eligible model returned by ``self.df_models_algorithms`` is
    taken as the base. Its out-of-fold residuals are ranked and turned
    into per-sample weights in ``[0.9, 1.1)`` (larger residual -> larger
    weight), which are dumped next to the results and referenced from the
    generated parameters.

    Args:
        current_models: Models passed through to
            ``self.df_models_algorithms``.
        results_path: Directory in which the sample-weight file is written.
        total_time_limit: Total time budget; one tenth of it is passed as
            ``time_limit`` to ``self.df_models_algorithms``.

    Returns:
        A list with a single params dict for the new model, or an empty
        list when no eligible base model exists or the generated params
        key is already present in ``self._unique_params_keys``.
    """
    df_models, _ = self.df_models_algorithms(
        current_models, time_limit=0.1 * total_time_limit
    )

    # Take the first row that is neither an excluded model type nor a
    # "RandomFeature" model.
    # NOTE(review): presumably df_models is ordered best-first - confirm.
    skipped_types = ["Ensemble", "Neural Network", "Nearest Neighbors"]
    best_model = None
    for model_type, model in zip(df_models["model_type"], df_models["model"]):
        if model_type in skipped_types:
            continue
        if "RandomFeature" in model.get_name():
            continue
        best_model = model
        break
    if best_model is None:
        return []

    # load predictions
    oof = best_model.get_out_of_folds()
    predictions = oof[[c for c in oof.columns if c.startswith("prediction")]]
    y = oof["target"]

    if self._ml_task == MULTICLASS_CLASSIFICATION:
        # scikit-learn renamed ``sparse`` to ``sparse_output`` in 1.2 and
        # removed the old keyword in 1.4; support both spellings.
        try:
            oh = OneHotEncoder(sparse_output=False)
        except TypeError:
            oh = OneHotEncoder(sparse=False)
        y_encoded = oh.fit_transform(np.array(y).reshape(-1, 1))
        # Per-sample residual: summed absolute error over all classes.
        residua = np.sum(
            np.abs(np.array(y_encoded) - np.array(predictions)), axis=1
        )
    else:
        residua = np.abs(np.array(y) - np.array(predictions).ravel())

    n_samples = residua.shape[0]
    df_preds = pd.DataFrame(
        {"res": residua, "lp": range(n_samples), "target": np.array(y)}
    )
    # Rank samples by residual, map the rank linearly into [0.9, 1.1),
    # then restore the original row order via the "lp" column.
    df_preds = df_preds.sort_values(by="res", ascending=True)
    df_preds["order"] = range(n_samples)
    df_preds["order"] = (df_preds["order"]) / n_samples / 5.0 + 0.9
    df_preds = df_preds.sort_values(by="lp", ascending=True)

    sample_weight_path = os.path.join(
        results_path, best_model.get_name() + "_sample_weight.data"
    )
    dump_data(
        sample_weight_path, pd.DataFrame({"sample_weight": df_preds["order"]})
    )

    # Deep copy so the base model's own params stay untouched.
    params = copy.deepcopy(best_model.params)
    params["validation_strategy"]["sample_weight_path"] = sample_weight_path
    params["injected_sample_weight"] = True
    params["name"] += "_BoostOnErrors"
    params["status"] = "initialized"
    params["final_loss"] = None
    params["train_time"] = None
    params["data_type"] = "boost_on_error"
    if "model_architecture_json" in params["learner"]:
        del params["learner"]["model_architecture_json"]
    if self._optuna_time_budget is not None:
        params["optuna_time_budget"] = self._optuna_time_budget
        params["optuna_init_params"] = self._optuna_init_params
        params["optuna_verbose"] = self._optuna_verbose

    # Only emit the params when an identical configuration is not known yet.
    unique_params_key = MljarTuner.get_params_key(params)
    if unique_params_key in self._unique_params_keys:
        return []
    return [params]