def test_train_regressor_kwargs(self, space, fspace):
    """Test that keyword arguments given to train_regressor end up on the model"""
    flat_trials = flatten_numpy(to_numpy(data, space), fspace)
    regressor_kwargs = {"max_depth": 2, "max_features": "sqrt"}

    fitted = train_regressor("RandomForestRegressor", flat_trials, **regressor_kwargs)

    # Each keyword argument must be exposed as an attribute with the same value
    for attribute, expected in regressor_kwargs.items():
        assert getattr(fitted, attribute) == expected
def test_to_numpy(space):
    """Test that trials are converted to a numpy array with one column per key"""
    converted = to_numpy(data, space)

    assert converted.shape == (3, 4)

    # Columns are expected in this order: x, y, z, then the objective
    for column, key in enumerate(("x", "y", "z", "objective")):
        numpy.testing.assert_equal(converted[:, column], data[key])
def test_train_regressor(self, space, fspace):
    """Test that each supported model name yields an instance of the matching class"""
    flat_trials = flatten_numpy(to_numpy(data, space), fspace)

    # (name given to train_regressor, class the returned model must be an instance of)
    expected_classes = (
        ("AdaBoostRegressor", AdaBoostRegressor),
        ("BaggingRegressor", BaggingRegressor),
        ("ExtraTreesRegressor", ExtraTreesRegressor),
        ("GradientBoostingRegressor", GradientBoostingRegressor),
        ("RandomForestRegressor", RandomForestRegressor),
    )

    for regressor_name, regressor_class in expected_classes:
        fitted = train_regressor(regressor_name, flat_trials)
        assert isinstance(fitted, regressor_class)
def test_make_grid():
    """Test that the grid built around the best point has the expected layout"""
    observations = to_numpy(data, space)
    regressor = train_regressor("RandomForestRegressor", observations)

    # The anchor is the row with the lowest value in the last (objective) column
    anchor = observations[numpy.argmin(observations[:, -1])]

    grid = make_grid(anchor, space, regressor, 4)
    first_grid = grid[0]
    second_grid = grid[1]

    # The column that is not being varied stays fixed at the anchor value
    numpy.testing.assert_equal(first_grid[:, 1], anchor[1])
    numpy.testing.assert_equal(second_grid[:, 0], anchor[0])

    # The varied column takes the expected grid values
    numpy.testing.assert_equal(first_grid[:, 0], [0, 2, 4, 6])
    numpy.testing.assert_equal(second_grid[:, 1], [0, 1, 2, 3])
def test_make_grid_predictor(monkeypatch):
    """Test that the last column of each grid holds the model's own predictions

    The ``monkeypatch`` fixture is requested but not used in the body.
    """
    observations = to_numpy(data, space)
    regressor = train_regressor("RandomForestRegressor", observations)
    anchor = observations[numpy.argmin(observations[:, -1])]

    # The model must not reproduce the anchor's objective exactly, otherwise
    # the checks below could not tell predictions from the original values
    with numpy.testing.assert_raises(AssertionError):
        anchor_features = anchor[:-1].reshape(1, -1)
        numpy.testing.assert_equal(anchor[-1], regressor.predict(anchor_features))

    grid = make_grid(anchor, space, regressor, 4)
    first_grid = grid[0]
    second_grid = grid[1]

    # The last column of a grid must equal the model's predictions on the
    # other columns of that same grid
    for dim_grid in (first_grid, second_grid):
        numpy.testing.assert_equal(
            dim_grid[:, -1], regressor.predict(dim_grid[:, :-1])
        )

    # Predictions must differ between the two grids
    with numpy.testing.assert_raises(AssertionError):
        numpy.testing.assert_equal(first_grid[:, -1], second_grid[:, -1])
def lpi(
    trials,
    space,
    mode="best",
    model="RandomForestRegressor",
    n_points=20,
    n_runs=10,
    **kwargs
):
    """
    Calculates the Local Parameter Importance for a collection of
    :class:`orion.core.worker.trial.Trial`.

    For more information on the metric, see original paper at
    https://ml.informatik.uni-freiburg.de/papers/18-LION12-CAVE.pdf.

    Biedenkapp, André, et al. "Cave: Configuration assessment, visualization and evaluation."
    International Conference on Learning and Intelligent Optimization. Springer, Cham, 2018.

    Parameters
    ----------
    trials: DataFrame or dict
        A dataframe of trials containing, at least, the columns 'objective' and 'id'.
        Or a dict equivalent. Note that the empty-input check reads ``trials.empty``
        and ``trials.shape``, so the object must provide both attributes.
    space: Space object
        A space object from an experiment.
    mode: str
        Mode to compute the LPI. Used as a key into the module-level ``modes`` mapping.

        - ``best``: Take the best trial found as the anchor for the LPI
        - ``linear``: Recompute LPI for all values on a grid
    model: str
        Name of the regression model to use. Can be one of

        - AdaBoostRegressor
        - BaggingRegressor
        - ExtraTreesRegressor
        - GradientBoostingRegressor
        - RandomForestRegressor (Default)
    n_points: int
        Number of points to compute the variances. Default is 20.
    n_runs: int
        Number of runs to compute the standard error of the LPI. Default is 10.
    ``**kwargs``
        Arguments for the regressor model. ``random_state``, if given, is removed
        from these arguments and used to seed the generator that draws one
        ``random_state`` per run.

    Returns
    -------
    DataFrame
        Indexed by the keys of the flattened space. Column ``LPI`` holds the mean over
        the ``n_runs`` runs and column ``STD`` the standard deviation over the runs.
        If ``trials`` is empty, only an ``LPI`` column filled with zeros is returned.

    """
    flattened_space = build_required_space(
        space,
        dist_requirement="linear",
        type_requirement="numerical",
        shape_requirement="flattened",
    )

    # Nothing to fit a model on: report zero importance for every dimension
    if trials.empty or trials.shape[0] == 0:
        return pd.DataFrame(
            data=[0] * len(flattened_space),
            index=flattened_space.keys(),
            columns=["LPI"],
        )

    data = to_numpy(trials, space)
    data = flatten_numpy(data, flattened_space)

    # Anchor point: the row with the lowest value in the last (objective) column
    best_point = data[numpy.argmin(data[:, -1])]

    rng = numpy.random.RandomState(kwargs.pop("random_state", None))
    results = numpy.zeros((n_runs, len(flattened_space)))
    for i in range(n_runs):
        # Request a 64-bit draw explicitly. With the platform default integer
        # type, an upper bound of 2 ** 32 - 1 is rejected wherever that type is
        # 32-bit ("high is out of bounds for int32"). The drawn values are
        # unchanged on platforms where the default is already 64-bit.
        seed = int(rng.randint(2 ** 32 - 1, dtype=numpy.int64))
        trained_model = train_regressor(model, data, random_state=seed, **kwargs)
        results[i] = modes[mode](best_point, flattened_space, trained_model, n_points)

    averages = results.mean(0)
    standard_errors = results.std(0)
    frame = pd.DataFrame(
        data=numpy.array([averages, standard_errors]).T,
        index=flattened_space.keys(),
        columns=["LPI", "STD"],
    )
    return frame
def partial_dependency(
    trials,
    space,
    params=None,
    model="RandomForestRegressor",
    n_grid_points=10,
    n_samples=50,
    **kwargs
):
    """
    Calculates the partial dependency of parameters in a collection of :class:`Trial`.

    Parameters
    ----------
    trials: DataFrame or dict
        A dataframe of trials containing, at least, the columns 'objective' and 'id'.
        Or a dict equivalent.
    space: Space object
        A space object from an experiment.
    params: list of str, optional
        The parameters to include in the computation. All parameters are included by default.
    model: str
        Name of the regression model to use. Can be one of

        - AdaBoostRegressor
        - BaggingRegressor
        - ExtraTreesRegressor
        - GradientBoostingRegressor
        - RandomForestRegressor (Default)
    n_grid_points: int
        Number of points in the grid to compute partial dependency. Default is 10.
    n_samples: int
        Number of samples to randomly generate the grid used to compute the partial dependency.
        Default is 50.
    ``**kwargs``
        Arguments for the regressor model.

    Returns
    -------
    dict
        One entry per parameter, keyed by its name, and one entry per pair of
        parameters, keyed by the tuple ``(name1, name2)``. Each value is a tuple
        ``(grid, averages, stds)``, where ``grid`` has been passed through
        ``reverse(flattened_space, grid)``. An empty dict is returned when
        ``trials`` is empty.

    """
    params = flatten_params(space, params)

    flattened_space = build_required_space(
        space,
        dist_requirement="linear",
        type_requirement="numerical",
        shape_requirement="flattened",
    )

    if trials.empty or trials.shape[0] == 0:
        return {}

    # Fit the regressor on the observed trials
    observations = flatten_numpy(to_numpy(trials, space), flattened_space)
    regressor = train_regressor(model, observations, **kwargs)

    # Points sampled from the flattened space, handed to partial_dependency_grid
    samples = pandas.DataFrame(
        flattened_space.sample(n_samples), columns=flattened_space.keys()
    )

    def _dependency(names):
        """Build the (grid, averages, stds) entry for the given parameter names"""
        grid, averages, stds = partial_dependency_grid(
            flattened_space, regressor, names, samples, n_grid_points
        )
        return reverse(flattened_space, grid), averages, stds

    partial_dependencies = {}
    for position, x_name in enumerate(params):
        partial_dependencies[x_name] = _dependency([x_name])

        # Pair the parameter only with those that come after it, so each
        # unordered pair is computed once
        for y_name in params[position + 1:]:
            partial_dependencies[(x_name, y_name)] = _dependency([x_name, y_name])

    return partial_dependencies
def test_train_regressor_invalid(self, space, fspace):
    """Test that an unknown model name raises a ValueError with the expected message"""
    flat_trials = flatten_numpy(to_numpy(data, space), fspace)
    unknown_name = "IDontExist"

    with pytest.raises(ValueError) as raised:
        train_regressor(unknown_name, flat_trials)

    message_matches = raised.match("IDontExist is not a supported regressor")
    assert message_matches