def get_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, List]:
    """ Load the train/validation/test splits for the configured OpenML task.

    Returns
    -------
    Tuple
        (x_train, y_train, x_val, y_val, x_test, y_test, variable_types) —
        the six data arrays plus the per-column variable types reported by the
        data manager.

    Raises
    ------
    NotImplementedError
        If no task-id was given and this method was not overwritten.
    """
    # BUG FIX: the previous code used ``assert self.task_id is not None,
    # NotImplementedError(...)``. The exception instance there is only the
    # assert *message* — the code raised AssertionError, never
    # NotImplementedError, and the whole check vanished under ``python -O``.
    # Raise the intended exception explicitly instead.
    if self.task_id is None:
        raise NotImplementedError('No task-id given. Please either specify a task-id or '
                                  'overwrite the get_data method.')

    data_manager = OpenMLHoldoutDataManager(openml_task_id=self.task_id, rng=self.rng)
    x_train, y_train, x_val, y_val, x_test, y_test = data_manager.load()

    return x_train, y_train, x_val, y_val, x_test, y_test, data_manager.variable_types
def __init__(self, task_id: Union[int, None] = None,
             n_threads: int = 1,
             rng: Union[np.random.RandomState, int, None] = None):
    """ Set up the XGBoost benchmark: load the OpenML task data, reorder the
    columns so that categorical features come first, impute NaNs in
    categorical columns and pre-compute label and subsampling information.

    Parameters
    ----------
    task_id : int, None
        OpenML task id forwarded to ``get_data``; may stay None only if a
        subclass overwrites ``get_data``.
    n_threads : int, None
        Number of threads the boosters are run with.
    rng : np.random.RandomState, int, None
        Random number generator / seed forwarded to the parent benchmark.
    """
    super(XGBoostBenchmark, self).__init__(rng=rng)
    self.n_threads = n_threads
    self.task_id = task_id
    self.accuracy_scorer = make_scorer(accuracy_score)

    # ``variable_types`` is one entry per column; 'categorical' marks
    # categorical features (see the flag array built below).
    self.x_train, self.y_train, self.x_valid, self.y_valid, self.x_test, self.y_test, variable_types = \
        self.get_data()

    # Boolean flag per column: True where the column is categorical.
    self.categorical_data = np.array([var_type == 'categorical' for var_type in variable_types])

    # XGB needs sorted data. Data should be (Categorical + numerical) not mixed.
    # argwhere returns (k, 1) index arrays; concatenating them gives all
    # categorical column indices first, then the continuous ones.
    # NOTE(review): ``.squeeze()`` yields a 0-d index if there is exactly one
    # column in total — presumably datasets here always have >= 2 columns.
    categorical_idx = np.argwhere(self.categorical_data)
    continuous_idx = np.argwhere(~self.categorical_data)
    sorting = np.concatenate([categorical_idx, continuous_idx]).squeeze()
    self.categorical_data = self.categorical_data[sorting]
    self.x_train = self.x_train[:, sorting]
    self.x_valid = self.x_valid[:, sorting]
    self.x_test = self.x_test[:, sorting]

    # Columns that are entirely NaN in the train split carry no information:
    # drop their categorical flag before the NaN replacement below (the
    # flag array must align with what replace_nans_in_cat_columns keeps).
    nan_columns = np.all(np.isnan(self.x_train), axis=0)
    self.categorical_data = self.categorical_data[~nan_columns]
    self.x_train, self.x_valid, self.x_test, self.categories = \
        OpenMLHoldoutDataManager.replace_nans_in_cat_columns(self.x_train, self.x_valid, self.x_test,
                                                             is_categorical=self.categorical_data)

    # Determine the number of categories in the labels.
    # In case of binary classification ``self.num_class`` has to be 1 for xgboost.
    self.num_class = len(np.unique(np.concatenate([self.y_train, self.y_test, self.y_valid])))
    self.num_class = 1 if self.num_class == 2 else self.num_class

    # Fixed random permutation of the training rows, used later for
    # reproducible subsampling (size == len(x_train), without replacement).
    self.train_idx = self.rng.choice(a=np.arange(len(self.x_train)),
                                     size=len(self.x_train),
                                     replace=False)

    # Similar to [Fast Bayesian Optimization of Machine Learning Hyperparameters on Large Datasets]
    # (https://arxiv.org/pdf/1605.07079.pdf),
    # use 10 time the number of classes as lower bound for the dataset fraction
    # NOTE(review): kept as a float fraction; can exceed 1.0 for datasets with
    # fewer than 10 * n_classes samples — confirm callers clip it.
    n_classes = np.unique(self.y_train).shape[0]
    self.lower_bound_train_size = (10 * n_classes) / self.x_train.shape[0]
def __init__(self, task_id: Union[int, None] = None,
             n_threads: int = 1,
             rng: Union[np.random.RandomState, int, None] = None):
    """ Prepare the XGBoost benchmark: fetch the task data, move the
    categorical columns to the front, impute NaNs in categorical columns and
    pre-compute label and shuffling information.

    Parameters
    ----------
    task_id : int, None
    n_threads : int, None
    rng : np.random.RandomState, int, None
    """
    super(XGBoostBenchmark, self).__init__(rng=rng)

    self.task_id = task_id
    self.n_threads = n_threads
    self.accuracy_scorer = make_scorer(accuracy_score)

    (self.X_train, self.y_train,
     self.X_valid, self.y_valid,
     self.X_test, self.y_test, var_types) = self.get_data()

    # One boolean per column: True where the feature is categorical.
    self.categorical_data = np.array([t == 'categorical' for t in var_types])

    # XGB needs sorted data. Data should be (Categorical + numerical) not mixed.
    cat_positions = np.argwhere(self.categorical_data)
    num_positions = np.argwhere(~self.categorical_data)
    column_order = np.concatenate((cat_positions, num_positions)).squeeze()

    self.categorical_data = self.categorical_data[column_order]
    self.X_train = self.X_train[:, column_order]
    self.X_valid = self.X_valid[:, column_order]
    self.X_test = self.X_test[:, column_order]

    # Drop the categorical flags of columns that are entirely NaN in the
    # training split, then impute the remaining NaNs in categorical columns.
    all_nan_cols = np.all(np.isnan(self.X_train), axis=0)
    self.categorical_data = self.categorical_data[~all_nan_cols]
    (self.X_train, self.X_valid, self.X_test, self.categories) = \
        OpenMLHoldoutDataManager.replace_nans_in_cat_columns(self.X_train, self.X_valid, self.X_test,
                                                             is_categorical=self.categorical_data)

    # Determine the number of categories in the labels.
    # In case of binary classification ``self.num_class`` has to be 1 for xgboost.
    all_labels = np.concatenate([self.y_train, self.y_test, self.y_valid])
    distinct_labels = len(np.unique(all_labels))
    if distinct_labels == 2:
        self.num_class = 1
    else:
        self.num_class = distinct_labels

    # Fixed random permutation of the training rows for reproducible subsampling.
    self.train_idx = self.rng.choice(a=np.arange(len(self.X_train)),
                                     size=len(self.X_train),
                                     replace=False)
def __init__(self, task_id: Union[int, None] = None,
             rng: Union[np.random.RandomState, int, None] = None):
    """ Set up the SVM benchmark: load the OpenML task data, reorder the
    columns so that categorical features come first, impute NaNs in
    categorical columns and pre-compute subsampling information.

    Parameters
    ----------
    task_id : int, None
        OpenML task id forwarded to ``get_data``; may stay None only if a
        subclass overwrites ``get_data``.
    rng : np.random.RandomState, int, None
        Random number generator / seed forwarded to the parent benchmark.
    """
    super(SupportVectorMachine, self).__init__(rng=rng)

    self.task_id = task_id
    self.cache_size = 200  # Cache for the SVC in MB
    self.accuracy_scorer = make_scorer(accuracy_score)

    self.X_train, self.y_train, self.X_valid, self.y_valid, self.X_test, self.y_test, variable_types = \
        self.get_data()

    # Boolean flag per column: True where the column is categorical.
    self.categorical_data = np.array([var_type == 'categorical' for var_type in variable_types])

    # Sort data (Categorical + numerical) so that categorical and continous are not mixed.
    categorical_idx = np.argwhere(self.categorical_data)
    continuous_idx = np.argwhere(~self.categorical_data)
    sorting = np.concatenate([categorical_idx, continuous_idx]).squeeze()
    self.categorical_data = self.categorical_data[sorting]
    self.X_train = self.X_train[:, sorting]
    self.X_valid = self.X_valid[:, sorting]
    self.X_test = self.X_test[:, sorting]

    # Columns that are entirely NaN in the train split carry no information:
    # drop their categorical flag before the NaN replacement below.
    nan_columns = np.all(np.isnan(self.X_train), axis=0)
    self.categorical_data = self.categorical_data[~nan_columns]
    self.X_train, self.X_valid, self.X_test, self.categories = \
        OpenMLHoldoutDataManager.replace_nans_in_cat_columns(self.X_train, self.X_valid, self.X_test,
                                                             is_categorical=self.categorical_data)

    # Fixed random permutation of the training rows for reproducible subsampling.
    self.train_idx = self.rng.choice(a=np.arange(len(self.X_train)),
                                     size=len(self.X_train),
                                     replace=False)

    # Similar to [Fast Bayesian Optimization of Machine Learning Hyperparameters on Large Datasets]
    # (https://arxiv.org/pdf/1605.07079.pdf),
    # use 10 time the number of classes as lower bound for the dataset fraction
    n_classes = np.unique(self.y_train).shape[0]
    # BUG FIX: the previous version cast this fraction to ``int``. Since
    # 10 * n_classes is almost always smaller than the number of samples, the
    # cast truncated the lower bound to 0, silently disabling it. Keep the
    # float fraction, consistent with the XGBoost benchmark.
    self.lower_bound_train_size = (10 * n_classes) / self.X_train.shape[0]
def test_convert_nan_values_in_cat_columns():
    """NaNs in categorical columns are imputed, NaNs in numerical columns are
    kept, and the collected categories match the expected per-column values."""
    data = np.array([[1, np.nan, 3, 4],
                     [5, 6, 7, 8],
                     [np.nan, 10, 11, np.nan]])
    cat_flags = [True, True, False, False]

    data, _, _, categories = OpenMLHoldoutDataManager.replace_nans_in_cat_columns(data, data, data, cat_flags)

    expected = np.array([[1., 5., 3., 4.],
                         [5., 6., 7., 8.],
                         [0., 10., 11., np.nan]])
    expected_cat = np.array([[1., 5., 0.],
                             [5., 6., 10.]])

    # The first three columns contain no NaN after the replacement and can be
    # compared directly.
    assert np.array_equiv(data[:, :3], expected[:, :3])
    # The NaN in the numerical column needs its own check — np.nan != np.nan.
    assert np.isnan(data[2, 3])

    # Compare the discovered categories order-independently.
    found = np.array(categories).flatten()
    found.sort()
    expected_flat = expected_cat.flatten()
    expected_flat.sort()
    assert np.array_equal(found, expected_flat)