예제 #1
0
    def get_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, List]:
        """
        Load the train / validation / test splits for the configured task.

        Subclasses that obtain their data from another source can override this method.

        Returns
        -------
        Tuple
            ``(x_train, y_train, x_val, y_val, x_test, y_test, variable_types)``. The first six
            entries are what ``OpenMLHoldoutDataManager.load()`` returns; ``variable_types`` is
            the data manager's ``variable_types`` attribute.

        Raises
        ------
        NotImplementedError
            If ``self.task_id`` is ``None``.
        """
        # Raise explicitly instead of asserting: an ``assert`` is stripped under ``python -O``
        # and would otherwise surface as AssertionError instead of the intended exception.
        if self.task_id is None:
            raise NotImplementedError('No task-id given. Please either specify a task-id or '
                                      'overwrite the get_data method.')

        data_manager = OpenMLHoldoutDataManager(openml_task_id=self.task_id, rng=self.rng)
        x_train, y_train, x_val, y_val, x_test, y_test = data_manager.load()

        return x_train, y_train, x_val, y_val, x_test, y_test, data_manager.variable_types
예제 #2
0
    def __init__(self,
                 task_id: Union[int, None] = None,
                 n_threads: int = 1,
                 rng: Union[np.random.RandomState, int, None] = None):
        """
        Load the data via ``get_data()`` and bring it into the layout used by this benchmark.

        Parameters
        ----------
        task_id : int, None
            Stored as ``self.task_id`` before ``get_data()`` is called.
        n_threads  : int, None
            Stored as ``self.n_threads``.
        rng : np.random.RandomState, int, None
            Forwarded to the parent constructor.
        """

        super(XGBoostBenchmark, self).__init__(rng=rng)
        self.n_threads = n_threads
        self.task_id = task_id
        self.accuracy_scorer = make_scorer(accuracy_score)

        self.x_train, self.y_train, self.x_valid, self.y_valid, self.x_test, self.y_test, variable_types = \
            self.get_data()
        # Boolean mask: True for every feature whose variable type is 'categorical'.
        self.categorical_data = np.array(
            [var_type == 'categorical' for var_type in variable_types])

        # XGB needs sorted data. Data should be (Categorical + numerical) not mixed.
        # ``np.flatnonzero`` always returns 1-d index arrays, so the column selection below keeps
        # the feature matrices 2-d even for a single-feature dataset (``argwhere(...).squeeze()``
        # would collapse the index to 0-d in that case).
        categorical_idx = np.flatnonzero(self.categorical_data)
        continuous_idx = np.flatnonzero(~self.categorical_data)
        sorting = np.concatenate([categorical_idx, continuous_idx])
        self.categorical_data = self.categorical_data[sorting]
        self.x_train = self.x_train[:, sorting]
        self.x_valid = self.x_valid[:, sorting]
        self.x_test = self.x_test[:, sorting]

        # Drop mask entries of features that are NaN in every training row.
        # NOTE(review): only the mask is filtered here, the feature matrices keep all columns --
        # confirm that all-NaN columns cannot occur or are handled by the helper below.
        nan_columns = np.all(np.isnan(self.x_train), axis=0)
        self.categorical_data = self.categorical_data[~nan_columns]

        self.x_train, self.x_valid, self.x_test, self.categories = \
            OpenMLHoldoutDataManager.replace_nans_in_cat_columns(self.x_train, self.x_valid, self.x_test,
                                                                 is_categorical=self.categorical_data)

        # Determine the number of categories in the labels.
        # In case of binary classification ``self.num_class`` has to be 1 for xgboost.
        self.num_class = len(
            np.unique(np.concatenate([self.y_train, self.y_test,
                                      self.y_valid])))
        self.num_class = 1 if self.num_class == 2 else self.num_class

        # Random permutation of the training row indices (drawn without replacement).
        # ``self.rng`` is presumably set by the parent constructor -- verify against the base class.
        self.train_idx = self.rng.choice(a=np.arange(len(self.x_train)),
                                         size=len(self.x_train),
                                         replace=False)

        # Similar to [Fast Bayesian Optimization of Machine Learning Hyperparameters on Large Datasets]
        # (https://arxiv.org/pdf/1605.07079.pdf),
        # use 10 times the number of classes as lower bound for the dataset fraction
        n_classes = np.unique(self.y_train).shape[0]
        self.lower_bound_train_size = (10 * n_classes) / self.x_train.shape[0]
예제 #3
0
    def __init__(self,
                 task_id: Union[int, None] = None,
                 n_threads: int = 1,
                 rng: Union[np.random.RandomState, int, None] = None):
        """
        Load the data via ``get_data()`` and bring it into the layout used by this benchmark.

        Parameters
        ----------
        task_id : int, None
            Stored as ``self.task_id`` before ``get_data()`` is called.
        n_threads  : int, None
            Stored as ``self.n_threads``.
        rng : np.random.RandomState, int, None
            Forwarded to the parent constructor.
        """

        super(XGBoostBenchmark, self).__init__(rng=rng)
        self.n_threads = n_threads
        self.task_id = task_id
        self.accuracy_scorer = make_scorer(accuracy_score)

        self.X_train, self.y_train, self.X_valid, self.y_valid, self.X_test, self.y_test, variable_types = \
            self.get_data()
        # Boolean mask: True for every feature whose variable type is 'categorical'.
        self.categorical_data = np.array(
            [var_type == 'categorical' for var_type in variable_types])

        # XGB needs sorted data. Data should be (Categorical + numerical) not mixed.
        # ``np.flatnonzero`` always returns 1-d index arrays, so the column selection below keeps
        # the feature matrices 2-d even for a single-feature dataset (``argwhere(...).squeeze()``
        # would collapse the index to 0-d in that case).
        categorical_idx = np.flatnonzero(self.categorical_data)
        continuous_idx = np.flatnonzero(~self.categorical_data)
        sorting = np.concatenate([categorical_idx, continuous_idx])
        self.categorical_data = self.categorical_data[sorting]
        self.X_train = self.X_train[:, sorting]
        self.X_valid = self.X_valid[:, sorting]
        self.X_test = self.X_test[:, sorting]

        # Drop mask entries of features that are NaN in every training row.
        # NOTE(review): only the mask is filtered here, the feature matrices keep all columns --
        # confirm that all-NaN columns cannot occur or are handled by the helper below.
        nan_columns = np.all(np.isnan(self.X_train), axis=0)
        self.categorical_data = self.categorical_data[~nan_columns]

        self.X_train, self.X_valid, self.X_test, self.categories = \
            OpenMLHoldoutDataManager.replace_nans_in_cat_columns(self.X_train, self.X_valid, self.X_test,
                                                                 is_categorical=self.categorical_data)

        # Determine the number of categories in the labels.
        # In case of binary classification ``self.num_class`` has to be 1 for xgboost.
        self.num_class = len(
            np.unique(np.concatenate([self.y_train, self.y_test,
                                      self.y_valid])))
        self.num_class = 1 if self.num_class == 2 else self.num_class

        # Random permutation of the training row indices (drawn without replacement).
        # ``self.rng`` is presumably set by the parent constructor -- verify against the base class.
        self.train_idx = self.rng.choice(a=np.arange(len(self.X_train)),
                                         size=len(self.X_train),
                                         replace=False)
예제 #4
0
    def __init__(self,
                 task_id: Union[int, None] = None,
                 rng: Union[np.random.RandomState, int, None] = None):
        """
        Load the data via ``get_data()`` and bring it into the layout used by this benchmark.

        Parameters
        ----------
        task_id : int, None
            Stored as ``self.task_id`` before ``get_data()`` is called.
        rng : np.random.RandomState, int, None
            Forwarded to the parent constructor.
        """
        super(SupportVectorMachine, self).__init__(rng=rng)

        self.task_id = task_id
        self.cache_size = 200  # Cache for the SVC in MB
        self.accuracy_scorer = make_scorer(accuracy_score)

        self.X_train, self.y_train, self.X_valid, self.y_valid, self.X_test, self.y_test, variable_types = \
            self.get_data()
        # Boolean mask: True for every feature whose variable type is 'categorical'.
        self.categorical_data = np.array(
            [var_type == 'categorical' for var_type in variable_types])

        # Sort data (Categorical + numerical) so that categorical and continuous are not mixed.
        # ``np.flatnonzero`` always returns 1-d index arrays, so the column selection below keeps
        # the feature matrices 2-d even for a single-feature dataset (``argwhere(...).squeeze()``
        # would collapse the index to 0-d in that case).
        categorical_idx = np.flatnonzero(self.categorical_data)
        continuous_idx = np.flatnonzero(~self.categorical_data)
        sorting = np.concatenate([categorical_idx, continuous_idx])
        self.categorical_data = self.categorical_data[sorting]
        self.X_train = self.X_train[:, sorting]
        self.X_valid = self.X_valid[:, sorting]
        self.X_test = self.X_test[:, sorting]

        # Drop mask entries of features that are NaN in every training row.
        # NOTE(review): only the mask is filtered here, the feature matrices keep all columns --
        # confirm that all-NaN columns cannot occur or are handled by the helper below.
        nan_columns = np.all(np.isnan(self.X_train), axis=0)
        self.categorical_data = self.categorical_data[~nan_columns]
        self.X_train, self.X_valid, self.X_test, self.categories = \
            OpenMLHoldoutDataManager.replace_nans_in_cat_columns(self.X_train, self.X_valid, self.X_test,
                                                                 is_categorical=self.categorical_data)

        # Random permutation of the training row indices (drawn without replacement).
        # ``self.rng`` is presumably set by the parent constructor -- verify against the base class.
        self.train_idx = self.rng.choice(a=np.arange(len(self.X_train)),
                                         size=len(self.X_train),
                                         replace=False)

        # Similar to [Fast Bayesian Optimization of Machine Learning Hyperparameters on Large Datasets]
        # (https://arxiv.org/pdf/1605.07079.pdf),
        # use 10 times the number of classes as lower bound for the dataset fraction.
        # The bound is a fraction in (0, 1]; it must stay a float, because truncating it with
        # ``int()`` would turn it into 0 whenever the dataset has more than 10 * n_classes rows.
        n_classes = np.unique(self.y_train).shape[0]
        self.lower_bound_train_size = (10 * n_classes) / self.X_train.shape[0]
예제 #5
0
def test_convert_nan_values_in_cat_columns():
    """
    Check ``OpenMLHoldoutDataManager.replace_nans_in_cat_columns`` on a small 3x4 matrix.

    The first two columns are flagged as categorical. The expected result has their NaN
    entries replaced (by 0 and 5 here, consistent with "column minimum minus one" -- confirm
    against the helper), while the NaN in the non-categorical last column must remain NaN.
    """
    features = np.array([[1, np.nan, 3, 4], [5, 6, 7, 8], [np.nan, 10, 11, np.nan]])
    categorical_mask = [True, True, False, False]

    # The same matrix is passed as train, valid and test split; only the first output is checked.
    converted, _, _, categories = OpenMLHoldoutDataManager.replace_nans_in_cat_columns(
        features, features, features, categorical_mask)

    expected = np.array([[1., 5., 3., 4.],
                         [5., 6., 7., 8.],
                         [0., 10., 11., np.nan]])
    expected_categories = np.array([[1., 5., 0.],
                                    [5., 6., 10.]])

    # NaN never compares equal to itself, so the last column is excluded from the array
    # comparison and its NaN entry is checked separately.
    assert np.array_equiv(converted[:, :3], expected[:, :3])
    assert np.isnan(converted[2, 3])

    # Compare the reported categories independent of their order.
    found_categories = np.sort(np.array(categories).flatten())
    wanted_categories = np.sort(expected_categories.flatten())
    assert np.array_equal(found_categories, wanted_categories)