def test_bound_scaler() -> None:
    """Exercise ``utils.BoundScaler`` on an instrumentation mixing several parameter kinds.

    The test first checks that ``p.helpers.list_data`` lists the data parameters
    in the same order as the legacy ``split_as_data_parameters`` helper.  It then
    feeds an all-ones point, followed by an all-zeros point, through
    ``BoundScaler.transform`` (with an identity transform) and asserts the
    resulting parameter values.
    """
    ref = p.Instrumentation(
        p.Array(shape=(1, 2)).set_bounds(-12, 12, method="arctan"),
        p.Array(shape=(2,)).set_bounds(-12, 12, full_range_sampling=False),
        lr=p.Log(lower=0.001, upper=1000),
        stuff=p.Scalar(lower=-1, upper=2),
        unbounded=p.Scalar(lower=-1, init=0.0),
        value=p.Scalar(),
        letter=p.Choice("abc"),
    )

    # The ordering must match the one produced by the legacy split method.
    legacy_order = [entry[1] for entry in split_as_data_parameters(ref)]
    assert p.helpers.list_data(ref) == legacy_order

    # Check the bounds: transform the all-ones point.
    child = ref.spawn_child()
    scaler = utils.BoundScaler(child)
    ones = [1.0] * child.dimension
    transformed = scaler.transform(ones, lambda x: x)
    child.set_standardized_data(transformed)

    positional, named = child.value
    first_array, second_array = positional
    np.testing.assert_array_almost_equal(first_array, [[12, 12]])
    np.testing.assert_array_almost_equal(second_array, [1, 1])
    assert named["stuff"] == 2
    assert named["unbounded"] == 1
    assert named["value"] == 1
    assert named["lr"] == pytest.approx(1000)

    # Again, on the middle point (all zeros).
    zeros = [0] * child.dimension
    transformed = scaler.transform(zeros, lambda x: x)
    child.set_standardized_data(transformed)
    midpoint = child.value[1]
    assert midpoint["lr"] == pytest.approx(1.0)
    assert midpoint["stuff"] == pytest.approx(0.5)
def test_bound_scaler() -> None:
    """Check the values produced by ``utils.BoundScaler`` on a mixed instrumentation.

    An all-ones point and then an all-zeros point are passed through
    ``BoundScaler.transform`` (using an identity transform), written back with
    ``set_standardized_data``, and the resulting parameter values are compared
    with the expected ones.
    """
    ref = p.Instrumentation(
        p.Array(shape=(1, 2)).set_bounds(-12, 12, method="arctan"),
        p.Array(shape=(2,)).set_bounds(-12, 12, full_range_sampling=False),
        lr=p.Log(lower=0.001, upper=1000),
        stuff=p.Scalar(lower=-1, upper=2),
        unbounded=p.Scalar(lower=-1, init=0.0),
        value=p.Scalar(),
        letter=p.Choice("abc"),
    )
    candidate = ref.spawn_child()
    bound_scaler = utils.BoundScaler(candidate)

    # First pass: every coordinate of the input point is 1.0.
    all_ones = [1.0] * candidate.dimension
    candidate.set_standardized_data(bound_scaler.transform(all_ones, lambda x: x))
    arrays, keyword_values = candidate.value
    array1, array2 = arrays
    np.testing.assert_array_almost_equal(array1, [[12, 12]])
    np.testing.assert_array_almost_equal(array2, [1, 1])
    assert keyword_values["stuff"] == 2
    assert keyword_values["unbounded"] == 1
    assert keyword_values["value"] == 1
    np.testing.assert_almost_equal(keyword_values["lr"], 1000)

    # Again, on the middle point: every coordinate of the input point is 0.
    all_zeros = [0] * candidate.dimension
    candidate.set_standardized_data(bound_scaler.transform(all_zeros, lambda x: x))
    keyword_values = candidate.value[1]
    np.testing.assert_almost_equal(keyword_values["lr"], 1.0)
    np.testing.assert_almost_equal(keyword_values["stuff"], 0.5)
def __init__(
    self,
    regressor: str,
    data_dimension: tp.Optional[int] = None,
    dataset: str = "artificial",
    overfitter: bool = False,
) -> None:
    """Set up the hyperparameter-tuning function for the requested regressor.

    Depending on ``regressor``, a parametrization of the tunable
    hyperparameters is built and the remaining arguments of
    ``self._ml_parametrization`` are fixed through ``functools.partial``.

    Parameters
    ----------
    regressor
        One of ``"decision_tree_depth"``, ``"any"``, ``"decision_tree"`` or
        ``"mlp"``.
    data_dimension
        Must be provided if and only if ``"artificial"`` appears in
        ``dataset``.
    dataset
        Name of the dataset.
    overfitter
        The evaluation parameters use ``noise_free = not overfitter``, whereas
        the optimized function always uses ``noise_free=False``.

    Raises
    ------
    AssertionError
        If ``data_dimension`` is inconsistent with ``dataset`` or if
        ``regressor`` is not one of the supported values.
    """
    self.regressor = regressor
    self.data_dimension = data_dimension
    self.dataset = dataset
    self.overfitter = overfitter
    self._descriptors: tp.Dict[str, tp.Any] = {}
    self.add_descriptors(regressor=regressor, data_dimension=data_dimension, dataset=dataset, overfitter=overfitter)
    self.name = regressor + f"Dim{data_dimension}"
    self.num_data = 120  # default for artificial function
    self._cross_val_num = 10  # number of cross validation

    # Dimension does not make sense if we use a real world dataset.
    # Raised explicitly (rather than through ``assert``) so that the check is
    # not stripped under ``python -O``; the exception type is unchanged.
    if ("artificial" in dataset) != (data_dimension is not None):
        raise AssertionError(
            "data_dimension must be specified if and only if the dataset is artificial "
            f"(got dataset={dataset!r}, data_dimension={data_dimension!r})"
        )

    # Variables for storing the training set and the test set.
    self.X: np.ndarray = np.array([])
    self.y: np.ndarray

    # Variables for storing the cross-validation splits.
    self.X_train_cv: tp.List[tp.Any] = []  # This will be the list of training subsets.
    self.X_valid_cv: tp.List[tp.Any] = []  # This will be the list of validation subsets.
    self.y_train_cv: tp.List[tp.Any] = []
    self.y_valid_cv: tp.List[tp.Any] = []
    self.X_train: np.ndarray
    self.y_train: np.ndarray
    self.X_test: np.ndarray
    self.y_test: np.ndarray

    # Evaluation keyword arguments; when a branch leaves this empty, it is
    # filled with a copy of ``params`` below.
    evalparams: tp.Dict[str, tp.Any] = {}
    if regressor == "decision_tree_depth":
        # Only the depth, as an evaluation.
        parametrization = p.Instrumentation(depth=p.Scalar(lower=1, upper=1200).set_integer_casting())
        # We optimize only the depth, so we fix all other parameters than the depth
        params = dict(
            noise_free=False,
            criterion="mse",
            min_samples_split=0.00001,
            regressor="decision_tree",
            alpha=1.0,
            learning_rate="no",
            activation="no",
            solver="no",
        )
    elif regressor == "any":
        # First we define the list of parameters in the optimization
        parametrization = p.Instrumentation(
            depth=p.Scalar(lower=1, upper=1200).set_integer_casting(),  # Depth, in case we use a decision tree.
            criterion=p.Choice(["mse", "friedman_mse", "mae"]),  # Criterion for building the decision tree.
            min_samples_split=p.Log(lower=0.0000001, upper=1),  # Min ratio of samples in a node for splitting.
            regressor=p.Choice(["mlp", "decision_tree"]),  # Type of regressor.
            activation=p.Choice(["identity", "logistic", "tanh", "relu"]),  # Activation function, in case we use a net.
            solver=p.Choice(["lbfgs", "sgd", "adam"]),  # Numerical optimizer.
            learning_rate=p.Choice(["constant", "invscaling", "adaptive"]),  # Learning rate schedule.
            alpha=p.Log(lower=0.0000001, upper=1.),  # Complexity penalization.
        )
        # noise_free is False (meaning that we consider the cross-validation loss) during the optimization.
        params = dict(noise_free=False)
    elif regressor == "decision_tree":
        # We specify below the list of hyperparameters for the decision trees.
        parametrization = p.Instrumentation(
            depth=p.Scalar(lower=1, upper=1200).set_integer_casting(),
            criterion=p.Choice(["mse", "friedman_mse", "mae"]),
            min_samples_split=p.Log(lower=0.0000001, upper=1),
            regressor="decision_tree",
        )
        params = dict(
            noise_free=False,
            alpha=1.0,
            learning_rate="no",
            regressor="decision_tree",
            activation="no",
            solver="no",
        )
        # The evaluation parameters additionally carry criterion and min_samples_split.
        evalparams = dict(params, criterion="mse", min_samples_split=0.00001)
    elif regressor == "mlp":
        # Let us define the parameters of the neural network.
        parametrization = p.Instrumentation(
            activation=p.Choice(["identity", "logistic", "tanh", "relu"]),
            solver=p.Choice(["lbfgs", "sgd", "adam"]),
            regressor="mlp",
            learning_rate=p.Choice(["constant", "invscaling", "adaptive"]),
            alpha=p.Log(lower=0.0000001, upper=1.),
        )
        params = dict(noise_free=False, regressor="mlp", depth=-3, criterion="no", min_samples_split=0.1)
    else:
        # Explicit raise: with a bare ``assert False`` stripped by ``-O``, the
        # code below would fail on the unbound ``params`` instead.
        raise AssertionError(f"Problem type {regressor} undefined!")

    # build eval params if not specified
    if not evalparams:
        evalparams = dict(params)
    # For the evaluation we remove the noise (unless overfitter)
    evalparams["noise_free"] = not overfitter
    super().__init__(partial(self._ml_parametrization, **params), parametrization.set_name(""))
    self._evalparams = evalparams
    self.register_initialization(
        regressor=regressor, data_dimension=data_dimension, dataset=dataset, overfitter=overfitter
    )
def __init__(
    self,
    regressor: str,
    data_dimension: tp.Optional[int] = None,
    dataset: str = "artificial",
    overfitter: bool = False,
) -> None:
    """Set up the hyperparameter-tuning function for the requested regressor.

    Depending on ``regressor``, a parametrization of the tunable
    hyperparameters is built, the parent class is initialized with a
    ``functools.partial`` of ``self._ml_parametrization`` that fixes the other
    arguments, and ``self.evaluation_function`` is set to a second partial
    used for evaluation.

    Parameters
    ----------
    regressor
        One of ``"decision_tree_depth"``, ``"any"``, ``"decision_tree"`` or
        ``"mlp"``.
    data_dimension
        Must be provided if and only if ``"artificial"`` appears in
        ``dataset``.
    dataset
        Name of the dataset.
    overfitter
        The evaluation function uses ``noise_free = not overfitter``, whereas
        the optimized function always uses ``noise_free=False``.

    Raises
    ------
    AssertionError
        If ``data_dimension`` is inconsistent with ``dataset`` or if
        ``regressor`` is not one of the supported values.
    """
    self.regressor = regressor
    self.data_dimension = data_dimension
    self.dataset = dataset
    self.overfitter = overfitter
    self._descriptors: tp.Dict[str, tp.Any] = {}
    self.add_descriptors(regressor=regressor, data_dimension=data_dimension, dataset=dataset, overfitter=overfitter)
    self.name = regressor + f"Dim{data_dimension}"
    self.num_data: int = 0

    # Dimension does not make sense if we use a real world dataset.
    # Raised explicitly (rather than through ``assert``) so that the check is
    # not stripped under ``python -O``; the exception type is unchanged.
    if ("artificial" in dataset) != (data_dimension is not None):
        raise AssertionError(
            "data_dimension must be specified if and only if the dataset is artificial "
            f"(got dataset={dataset!r}, data_dimension={data_dimension!r})"
        )

    # Variables for storing the training set and the test set.
    self.X: np.ndarray = np.array([])
    self.y: np.ndarray

    # Variables for storing the cross-validation splits.
    self.X_train: tp.List[tp.Any] = []  # This will be the list of training subsets.
    self.X_valid: tp.List[tp.Any] = []  # This will be the list of validation subsets.
    self.y_train: tp.List[tp.Any] = []
    self.y_valid: tp.List[tp.Any] = []
    self.X_test: np.ndarray
    self.y_test: np.ndarray

    # Keyword arguments fixed identically for optimization and evaluation;
    # defined once per branch so the two partials cannot drift apart.
    fixed: tp.Dict[str, tp.Any]
    if regressor == "decision_tree_depth":
        # Only the depth, as an evaluation.
        parametrization = p.Instrumentation(depth=p.Scalar(lower=1, upper=1200).set_integer_casting())
        # We optimize only the depth, so we fix all other parameters than the depth, using "partial".
        fixed = dict(
            criterion="mse",
            min_samples_split=0.00001,
            regressor="decision_tree",
            alpha=1.0,
            learning_rate="no",
            activation="no",
            solver="no",
        )
        super().__init__(partial(self._ml_parametrization, noise_free=False, **fixed), parametrization)
        # For the evaluation, we remove the noise (unless overfitter).
        self.evaluation_function = partial(  # type: ignore
            self._ml_parametrization, noise_free=not overfitter, **fixed
        )
    elif regressor == "any":
        # First we define the list of parameters in the optimization
        parametrization = p.Instrumentation(
            depth=p.Scalar(lower=1, upper=1200).set_integer_casting(),  # Depth, in case we use a decision tree.
            criterion=p.Choice(["mse", "friedman_mse", "mae"]),  # Criterion for building the decision tree.
            min_samples_split=p.Log(lower=0.0000001, upper=1),  # Min ratio of samples in a node for splitting.
            regressor=p.Choice(["mlp", "decision_tree"]),  # Type of regressor.
            activation=p.Choice(["identity", "logistic", "tanh", "relu"]),  # Activation function, in case we use a net.
            solver=p.Choice(["lbfgs", "sgd", "adam"]),  # Numerical optimizer.
            learning_rate=p.Choice(["constant", "invscaling", "adaptive"]),  # Learning rate schedule.
            alpha=p.Log(lower=0.0000001, upper=1.),  # Complexity penalization.
        )
        # Every hyperparameter is tuned here, so "partial" only fixes noise_free.
        # noise_free is False (meaning that we consider the cross-validation loss) during the optimization.
        super().__init__(partial(self._ml_parametrization, noise_free=False), parametrization)
        # For the evaluation, noise_free is True unless overfitter is set.
        self.evaluation_function = partial(  # type: ignore
            self._ml_parametrization, noise_free=not overfitter
        )
    elif regressor == "decision_tree":
        # We specify below the list of hyperparameters for the decision trees.
        parametrization = p.Instrumentation(
            depth=p.Scalar(lower=1, upper=1200).set_integer_casting(),
            criterion=p.Choice(["mse", "friedman_mse", "mae"]),
            min_samples_split=p.Log(lower=0.0000001, upper=1),
            regressor="decision_tree",
        )
        # We use "partial" for fixing the parameters of the neural network, given that we work on the
        # decision tree only.
        fixed = dict(alpha=1.0, learning_rate="no", regressor="decision_tree", activation="no", solver="no")
        super().__init__(partial(self._ml_parametrization, noise_free=False, **fixed), parametrization)
        # The evaluation partial switches noise_free and additionally supplies
        # criterion and min_samples_split.
        self.evaluation_function = partial(  # type: ignore
            self._ml_parametrization,
            criterion="mse",
            min_samples_split=0.00001,
            noise_free=not overfitter,
            **fixed,
        )
    elif regressor == "mlp":
        # Let us define the parameters of the neural network.
        parametrization = p.Instrumentation(
            activation=p.Choice(["identity", "logistic", "tanh", "relu"]),
            solver=p.Choice(["lbfgs", "sgd", "adam"]),
            regressor="mlp",
            learning_rate=p.Choice(["constant", "invscaling", "adaptive"]),
            alpha=p.Log(lower=0.0000001, upper=1.),
        )
        # And, using partial, we get rid of the parameters of the decision tree (we work on the neural net,
        # not on the decision tree).
        fixed = dict(regressor="mlp", depth=-3, criterion="no", min_samples_split=0.1)
        super().__init__(partial(self._ml_parametrization, noise_free=False, **fixed), parametrization)
        self.evaluation_function = partial(  # type: ignore
            self._ml_parametrization, noise_free=not overfitter, **fixed
        )
    else:
        # Explicit raise: with a bare ``assert False`` stripped by ``-O``, the
        # parent initializer would be skipped and the call below would run on
        # a partially constructed object.
        raise AssertionError(f"Problem type {regressor} undefined!")

    self.register_initialization(
        regressor=regressor, data_dimension=data_dimension, dataset=dataset, overfitter=overfitter
    )