def _train(self, X: np.ndarray, y: np.ndarray):
    """Fit a fresh random forest to the given training data.

    Parameters
    ----------
    X : np.ndarray [n_samples, n_features (config + instance features)]
        Input data points.
    y : np.ndarray [n_samples, ]
        The corresponding target values; stored flattened.

    Returns
    -------
    self
    """
    self.X = X
    self.y = y.flatten()

    # A non-positive ``n_points_per_tree`` means every tree sees all samples.
    use_all_points = self.n_points_per_tree <= 0
    self.rf_opts.num_data_points_per_tree = (
        self.X.shape[0] if use_all_points else self.n_points_per_tree
    )

    forest = regression.binary_rss_forest()
    forest.options = self.rf_opts
    self.rf = forest

    self.rf.fit(self._init_data_container(self.X, self.y), rng=self.rng)
    return self
def _train(self, X: np.ndarray, y: np.ndarray) -> 'RandomForestWithInstancesHPO':
    """Tune the forest hyperparameters by cross-validation, then fit on all data.

    When more than three samples are available, ``self.n_iters`` candidate
    configurations are scored (the default configuration first, random
    samples afterwards) with up to ``self.n_splits``-fold cross-validation
    via ``self._eval_rf``; the candidate with the lowest summed loss is
    kept. Otherwise the default configuration is used directly.

    Parameters
    ----------
    X : np.ndarray [n_samples, n_features (config + instance features)]
        Input data points.
    y : np.ndarray [n_samples, ]
        The corresponding target values.

    Returns
    -------
    self
    """
    X = self._impute_inactive(X)
    self.X = X
    self.y = y.flatten()

    cfg = self._get_configuration_space()

    best_error = None
    best_config = None
    if X.shape[0] > 3:
        # The folds depend only on the data, and KFold without shuffling is
        # deterministic, so build them once instead of once per candidate.
        n_splits = min(X.shape[0], self.n_splits)
        folds = list(KFold(n_splits=n_splits).split(X))

        for i in range(self.n_iters):
            # Always score the default configuration first.
            if i == 0:
                configuration = cfg.get_default_configuration()
            else:
                configuration = cfg.sample_configuration()

            error = 0.0
            for train_index, test_index in folds:
                # Index the flattened targets so that the fold loss stays a
                # scalar even if ``y`` was passed as a column vector.
                error += self._eval_rf(
                    c=configuration,
                    X=X[train_index, :],
                    y=self.y[train_index],
                    X_test=X[test_index, :],
                    y_test=self.y[test_index],
                )
            self.logger.debug(error)
            if best_error is None or error < best_error:
                best_config = configuration
                best_error = error

    # Covers both "too few samples to cross-validate" and an empty search
    # (``n_iters <= 0``), which would otherwise leave ``best_config`` unset.
    if best_config is None:
        best_config = cfg.get_default_configuration()

    self.rf_opts = self._set_conf(
        c=best_config,
        n_features=self.X.shape[1],
        num_data_points=X.shape[0],
    )
    self._set_hypers(best_config)

    self.logger.debug("Use %s", self.rf_opts)
    self.rf = regression.binary_rss_forest()
    self.rf.options = self.rf_opts
    data = self._init_data_container(self.X, self.y)
    self.rf.fit(data, rng=self.rng)
    return self
def __init__(self, num_trees=30, do_bootstrapping=True, n_points_per_tree=0, rng=None):
    """
    Wrapper around a random_forest_run regression forest used to model
    the objective function.

    Parameters
    ----------
    num_trees: int
        The number of trees in the random forest.
    do_bootstrapping: bool
        Turns on / off bootstrapping in the random forest.
    n_points_per_tree: int
        Number of data points per tree. 0 is meant to signal "use all
        data points in each tree"; the value is stored as given here.
    rng: np.random.RandomState
        Random number generator; a fresh one is created when omitted.
    """
    self.rng = np.random.RandomState() if rng is None else rng
    # The forest's own random engine is seeded from the numpy generator.
    self.reg_rng = reg.default_random_engine(self.rng.randint(1000))

    self.n_points_per_tree = n_points_per_tree

    self.rf = reg.binary_rss_forest()
    forest_settings = (
        ("num_trees", num_trees),
        ("do_bootstrapping", do_bootstrapping),
        ("num_data_points_per_tree", n_points_per_tree),
    )
    for option_name, option_value in forest_settings:
        setattr(self.rf.options, option_name, option_value)
def test_prediction(self):
    """Configure a bootstrapped forest, fit it and predict one data point."""
    the_forest = reg.binary_rss_forest()

    requested = (
        ("num_trees", 64),
        ("do_bootstrapping", True),
        ("num_data_points_per_tree", 200),
    )
    for option_name, option_value in requested:
        setattr(the_forest.options, option_name, option_value)

    # The options object must report back exactly what was written.
    self.assertEqual(the_forest.options.num_trees, 64)
    self.assertTrue(the_forest.options.do_bootstrapping)
    self.assertEqual(the_forest.options.num_data_points_per_tree, 200)

    the_forest.fit(self.data, self.rng)

    # Only checks that predicting does not raise; the value is not inspected.
    first_point = self.data.retrieve_data_point(0)
    the_forest.predict(first_point)
def __init__(self, X_init: np.ndarray, Y_init: np.ndarray, num_trees: int = 30, do_bootstrapping: bool = True, n_points_per_tree: int = 0, seed: int = None) -> None:
    """
    Random forest surrogate for Bayesian optimization built on the pyrfr
    package; thanks to its random splitting it yields better uncertainty
    estimates than the sklearn random forest.

    Dependencies:
        AutoML rfr (https://github.com/automl/random_forest_run)

    The forest is fitted on the initial data as part of construction.

    :param X_init: Initial input data points to train the model
    :param Y_init: Initial target values
    :param num_trees: Number of trees in the random forest
    :param do_bootstrapping: Whether the individual trees use bootstrapping
    :param n_points_per_tree: Number of points for each individual tree
        (0 means no restriction, i.e. all initial points)
    :param seed: Seed for the forest's random number generator
        (None draws a random seed)
    """
    super().__init__()

    # Seed the forest's random engine, drawing a seed if none was supplied.
    engine_seed = np.random.randint(10000) if seed is None else seed
    self.reg_rng = reg.default_random_engine(engine_seed)

    self.n_points_per_tree = n_points_per_tree

    self.rf = reg.binary_rss_forest()
    forest_settings = (
        ("num_trees", num_trees),
        ("do_bootstrapping", do_bootstrapping),
        ("num_data_points_per_tree", n_points_per_tree),
    )
    for option_name, option_value in forest_settings:
        setattr(self.rf.options, option_name, option_value)

    self._X, self._Y = X_init, Y_init

    # "0" stands for "no restriction": hand every tree all initial points.
    if self.n_points_per_tree == 0:
        self.rf.options.num_data_points_per_tree = X_init.shape[0]

    n_features = self._X.shape[1]
    container = reg.default_data_container(n_features)
    for point, target in zip(X_init, Y_init):
        container.add_data_point(point, target)

    self.rf.fit(container, self.reg_rng)
def setUp(self):
    """Load the CSV test data set and prepare a reference forest and RNG."""
    # The placeholder in the prefix is left as-is here; it is expected to be
    # substituted before the test runs (presumably by CMake -- confirm).
    csv_dir = '${CMAKE_SOURCE_DIR}/test_data_sets/'
    features_csv = csv_dir + 'features13.csv'
    responses_csv = csv_dir + 'responses13.csv'

    self.data = reg.default_data_container(64)
    self.data.import_csv_files(features_csv, responses_csv)

    self.forest = reg.binary_rss_forest()
    requested = (
        ("num_trees", 64),
        ("do_bootstrapping", True),
        ("num_data_points_per_tree", 200),
    )
    for option_name, option_value in requested:
        setattr(self.forest.options, option_name, option_value)

    # Sanity check: the options object reports back what was written.
    self.assertEqual(self.forest.options.num_trees, 64)
    self.assertTrue(self.forest.options.do_bootstrapping)
    self.assertEqual(self.forest.options.num_data_points_per_tree, 200)

    self.rng = reg.default_random_engine(1)
def test_first_nearest_neightbor(self):
    """A single unrestricted tree must reproduce every training response.

    Without bootstrapping, with all data points given to the tree, all
    features considered at every split and all data points unique, a
    single tree recalls the training data perfectly.
    """
    n_points = self.data.num_data_points()

    the_forest = reg.binary_rss_forest()
    the_forest.options.num_trees = 1
    the_forest.options.do_bootstrapping = False
    the_forest.options.num_data_points_per_tree = n_points
    the_forest.options.tree_opts.max_features = self.data.num_features()

    the_forest.fit(self.data, self.rng)
    self.assertEqual(the_forest.num_trees(), 1)

    for idx in range(n_points):
        predicted = the_forest.predict(self.data.retrieve_data_point(idx))
        expected = self.data.response(idx)
        self.assertEqual(predicted, expected)
def _eval_rf(
    self,
    c: Configuration,
    X: np.ndarray,
    y: np.ndarray,
    X_test: np.ndarray,
    y_test: np.ndarray,
) -> float:
    """Score one random forest configuration on a train/validation split.

    A forest configured from ``c`` is fitted on the training data with a
    fixed random engine seed (1). The returned loss is the summed negative
    Gaussian log-likelihood of the validation targets under the forest's
    per-point predicted mean and variance.

    Parameters
    ----------
    c : Configuration
        Random forest configuration to evaluate on the train/test data
    X : np.ndarray [n_samples, n_features (config + instance features)]
        Training features
    y : np.ndarray [n_samples, ]
        Training targets
    X_test : np.ndarray [n_samples, n_features (config + instance features)]
        Validation features
    y_test : np.ndarray [n_samples, ]
        Validation targets

    Returns
    -------
    float
        Summed negative log-likelihood over the validation points.
    """
    forest_opts = self._set_conf(c, n_features=X.shape[1], num_data_points=X.shape[0])
    # Fixed seed so that every configuration is scored under the same engine state.
    engine = regression.default_random_engine(1)

    forest = regression.binary_rss_forest()
    forest.options = forest_opts
    forest.fit(self._init_data_container(X, y), rng=engine)

    def neg_log_likelihood(point, target):
        mean, var = forest.predict_mean_var(point)
        # Floor the standard deviation to keep the Gaussian well defined.
        scale = max(1e-8, np.sqrt(var))
        return -scst.norm.logpdf(target, loc=mean, scale=scale)

    return sum(
        neg_log_likelihood(point, target)
        for point, target in zip(X_test, y_test)
    )
def __init__(self, num_trees=30, do_bootstrapping=True, n_points_per_tree=0, compute_oob_error=False, return_total_variance=True, rng=None):
    """
    Wrapper around a random_forest_run regression forest used to model
    the objective function.

    Parameters
    ----------
    num_trees: int
        The number of trees in the random forest.
    do_bootstrapping: bool
        Turns on / off bootstrapping in the random forest.
    n_points_per_tree: int
        Number of data points per tree. 0 is meant to signal "use all
        data points in each tree"; the value is stored as given here.
    compute_oob_error: bool
        Turns on / off calculation of out-of-bag error. Default: False
    return_total_variance: bool
        If True, return the law of total variance (mean of variances +
        variance of means); if False, the explained variance (variance
        of means). Default: True
    rng: np.random.RandomState
        Random number generator; a fresh one is created when omitted.
    """
    self.rng = np.random.RandomState() if rng is None else rng
    # The forest's own random engine is seeded from the numpy generator.
    self.reg_rng = reg.default_random_engine(self.rng.randint(1000))

    self.n_points_per_tree = n_points_per_tree

    self.rf = reg.binary_rss_forest()
    forest_settings = (
        ("num_trees", num_trees),
        ("do_bootstrapping", do_bootstrapping),
        ("num_data_points_per_tree", n_points_per_tree),
        ("compute_oob_error", compute_oob_error),
        ("compute_law_of_total_variance", return_total_variance),
    )
    for option_name, option_value in forest_settings:
        setattr(self.rf.options, option_name, option_value)
def setUp(self):
    """Create the toy data containers, a reference forest and an RNG."""
    # Every corner of the binary 3-d cube, each listed three times in a row.
    corners = (0., 1.)
    self.X = [
        [a, b, c]
        for a in corners
        for b in corners
        for c in corners
        for _ in range(3)
    ]
    # One response per corner (same order as above), again repeated three times.
    corner_responses = (50, .2, 9, 9.2, 500, 10.2, 109., 100)
    self.y = [[value] for value in corner_responses for _ in range(3)]
    # Two-target variant: log10 of the response alongside the raw response.
    self.y_dual = [[math.log10(row[0]), row[0]] for row in self.y]

    bounds = [(0, float('nan')) for _ in range(3)]

    def build_container(points, targets, feature_bounds):
        container = reg.default_data_container(len(points[0]))
        for idx, (lower, upper) in enumerate(feature_bounds):
            # A NaN upper bound routes the first entry to set_type_of_feature;
            # otherwise the pair is applied as numeric bounds.
            if math.isnan(upper):
                container.set_type_of_feature(idx, lower)
            else:
                container.set_bounds_of_feature(idx, lower, upper)
        for point, target in zip(points, targets):
            container.add_data_point(point, target)
        return container

    self.data = build_container(self.X, self.y, bounds)
    self.data_dual = build_container(self.X, self.y_dual, bounds)

    self.forest = reg.binary_rss_forest()
    requested = (
        ("num_trees", 64),
        ("do_bootstrapping", True),
        ("num_data_points_per_tree", 200),
        ("compute_law_of_total_variance", True),
    )
    for option_name, option_value in requested:
        setattr(self.forest.options, option_name, option_value)

    # Sanity check: the options object reports back what was written.
    self.assertEqual(self.forest.options.num_trees, 64)
    self.assertTrue(self.forest.options.do_bootstrapping)
    self.assertEqual(self.forest.options.num_data_points_per_tree, 200)
    self.assertTrue(self.forest.options.compute_law_of_total_variance)

    self.rng = reg.default_random_engine(1)
def test_pickling(self):
    """A forest restored from a pickle file predicts exactly like the original."""
    the_forest = reg.binary_rss_forest()
    the_forest.options.num_trees = 16
    the_forest.options.do_bootstrapping = True
    the_forest.options.num_data_points_per_tree = self.data.num_data_points()

    self.assertEqual(the_forest.options.num_trees, 16)

    the_forest.fit(self.data, self.rng)

    # delete=False so the file can be reopened by name after it is closed;
    # the try/finally guarantees it is removed even when dumping or loading
    # fails, so a failing test does not leave temp files behind.
    tmp_file = tempfile.NamedTemporaryFile(mode='w+b', delete=False)
    fname = tmp_file.name
    try:
        with tmp_file:
            pickle.dump(the_forest, tmp_file)

        with open(fname, 'rb') as fh:
            a_second_forest = pickle.load(fh)
    finally:
        os.remove(fname)

    for i in range(self.data.num_data_points()):
        d = self.data.retrieve_data_point(i)
        self.assertEqual(the_forest.predict(d), a_second_forest.predict(d))
# Script entry point: builds a four-hyperparameter configuration space,
# derives the model's feature types/bounds from it and prepares a small
# hand-written data set (8 samples x 4 features with 8 targets).
if __name__ == "__main__":
    # Search space: one float and three integer hyperparameters, each with
    # explicit bounds and a default value.
    cs = ConfigurationSpace()
    learning_rate = UniformFloatHyperparameter("learning_rate", 1e-4, 5e-3, default_value=3e-4)
    cs.add_hyperparameter(learning_rate)
    n_layer1 = UniformIntegerHyperparameter("n_layer1", 5, 50, default_value=32)
    cs.add_hyperparameter(n_layer1)
    n_layer2 = UniformIntegerHyperparameter("n_layer2", 30, 80, default_value=64)
    cs.add_hyperparameter(n_layer2)
    batch_size = UniformIntegerHyperparameter("batch_size", 10, 500, default_value=200)
    cs.add_hyperparameter(batch_size)
    # Per-feature type and bound descriptions derived from the space.
    types, bounds = get_types(cs)
    # NOTE(review): ``reg`` and ``rf_opts`` are not used in the visible part
    # of this script -- confirm whether later code relies on them.
    reg = regression.binary_rss_forest()
    rf_opts = regression.forest_opts()
    rf_opts.num_trees = 10
    rf_opts.do_bootstrapping = True
    model = RandomForestWithInstances(types=types, bounds=bounds)
    # Inputs: 8 rows of 4 values in [0, 1] (presumably the hyperparameters
    # above in normalized form -- verify), with one target value per row.
    x = np.array([[0.78105907, 0.33860037, 0.72826097, 0.02941158], [0.81160897, 0.63147998, 0.72826097, 0.04901943], [0.27800406, 0.36616871, 0.16304333, 0.24509794], [0.41242362, 0.37351241, 0.11956505, 0.4607843], [0.70162934, 0.15819312, 0.51086957, 0.10784298], [0.53869654, 0.86662495, 0.27173903, 0.22549009], [0.53665988, 0.68576624, 0.81521753, 0.06862728], [0.72199594, 0.18900731, 0.75000011, 0.36274504]], dtype=np.float64)
    y = np.array([0.544481, 2.34456, 0.654629, 0.576376, 0.603501, 0.506214, 0.416664, 0.483639])
    print(x.dtype)
# Example script: fit a regression forest to a few noisy samples of a
# Gaussian-shaped bump on [-1, 1] and plot its predictions.
num_points = 8
# Training inputs as a column vector of shape (num_points, 1).
features = np.array([np.linspace(-1, 1, num_points)]).transpose()
# Dense grid of 100 query points, also as a column vector.
x2 = np.array([np.linspace(-1, 1, 100)]).transpose()
# Targets: exp(-(x / 0.3)^2) plus Gaussian noise scaled by 0.05, drawn from
# numpy's global random state (so results differ between runs unless seeded).
responses = np.exp(-np.power(features / 0.3, 2)).flatten() + 0.05 * np.random.randn(features.shape[0])
# Data container for a single feature, filled one point at a time.
data = reg.default_data_container(1)
for f, r in zip(features, responses):
    data.add_data_point(f, r)
rng = reg.default_random_engine()
# create an instance of a regression forest using binary splits and the RSS loss
the_forest = reg.binary_rss_forest()
the_forest.options.num_trees = 64
# Each tree gets as many data points as there are training samples.
the_forest.options.num_data_points_per_tree = num_points
the_forest.options.tree_opts.min_samples_in_leaf = 1
the_forest.fit(data, rng)
# Three stacked axes sharing the x axis; only ax1 is used in the visible part.
fig, (ax1, ax2, ax3) = plt.subplots(3, sharex=True)
# One (mean, variance) row per query point, judging by the method name.
predictions = np.array([the_forest.predict_mean_var(x) for x in x2])
# Shaded band: first column +/- second column exactly as returned.
# NOTE(review): if the second column is a variance, the band is not a
# standard-deviation band -- confirm this is intended.
ax1.fill_between(x2[:, 0], predictions[:, 0] - predictions[:, 1], predictions[:, 0] + predictions[:, 1], alpha=0.3)
ax1.plot(x2, predictions[:, 0])
ax1.scatter(features, responses)