Example #1
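Both excerpts assume the enclosing module imports time, multiprocessing as mp, and numpy as np, and provides the project-local helpers linalg, convex_opt, and the Model class; these are inferred from usage and are not shown in the listing itself.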
    def _fit(self, x_train, y_train, rank=None, runtime_limit=None):
        """This is a single round of the doubling process. It fits an AutoLearner object on a new dataset. This will sample the performance of several algorithms on the new dataset, predict performance on the rest, then construct an optimal ensemble model.

        Args:
            x_train (np.ndarray):  Features of the training dataset.
            y_train (np.ndarray):  Labels of the training dataset.
            rank (int):            Rank of error matrix factorization.
            runtime_limit (float): Maximum time to allocate to AutoLearner fitting.
        """
        if self.verbose:
            print("\nSingle round runtime target: {}".format(runtime_limit))

        # set to defaults if not provided
        rank = rank or linalg.approx_rank(self.error_matrix, threshold=0.01)
        runtime_limit = runtime_limit or self.runtime_limit

        if self.verbose:
            print('Fitting AutoLearner with max. runtime {}s'.format(
                runtime_limit))
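        # predicted per-algorithm runtimes on a dataset of this shape,
        # estimated from the offline runtime matrix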
        t_predicted = convex_opt.predict_runtime(
            x_train.shape, runtime_matrix=self.runtime_matrix)

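        # Column selection: decide which algorithms to actually run on the new
        # dataset. 'qr' uses QR pivoting on the error matrix, 'min_variance'
        # solves an experiment-design problem under the runtime budget, and
        # 'random' draws uniformly from algorithms predicted to finish in time.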
        if self.selection_method == 'qr':
            to_sample = linalg.pivot_columns(self.error_matrix)
        elif self.selection_method == 'min_variance':
            # sample only from the subset of algorithms predicted to finish within the allocated time
            valid = np.where(
                t_predicted <= self.n_cores * runtime_limit / 2)[0]
            Y = self.Y[:rank, valid]
            # TODO: check if Y is rank-deficient, i.e. will ED problem fail?
            v_opt = convex_opt.solve(t_predicted[valid], runtime_limit / 4,
                                     self.n_cores, Y, self.scalarization)
            to_sample = valid[np.where(v_opt > 0.9)[0]]
            # fall back to the `rank` fastest algorithms if the solver produced NaNs
            # (the original checked np.isnan(to_sample), but to_sample holds integer
            # indices and can never contain NaN, so that guard could never fire)
            if np.isnan(v_opt).any():
                to_sample = np.argsort(t_predicted)[:rank]

        elif self.selection_method == 'random':
            to_sample = []
            # set of algorithms that are predicted to finish within given budget
            to_sample_candidates = np.where(
                t_predicted <= runtime_limit / 2)[0]
            # remove algorithms that have been sampled already
            to_sample_candidates = list(
                set(to_sample_candidates) - self.sampled_indices)
            # if no unsampled candidates remain, fall back to the single fastest algorithm
            if len(to_sample_candidates) == 0:
                to_sample = np.array([np.argmin(t_predicted)])
            else:
                to_sample = np.random.choice(to_sample_candidates,
                                             min(self.n_cores,
                                                 len(to_sample_candidates)),
                                             replace=False)
        else:
            to_sample = np.arange(0, self.new_row.shape[1])

        if len(to_sample) == 0 and len(self.sampled_indices) == 0:
            # if no columns were selected in the first iteration (log-det instability), sample the n fastest columns
            n = len(
                np.where(
                    np.cumsum(np.sort(t_predicted)) <= runtime_limit / 4)[0])
            if n > 0:
                to_sample = np.argsort(t_predicted)[:n]
            else:
                self.ensemble.fitted = False
                return

        start = time.time()
        if self.selection_method != 'random':
            # only need to compute column entry if it has not been computed already
            to_sample = list(set(to_sample) - self.sampled_indices)
            if self.verbose:
                print('Sampling {} entries of new row...'.format(
                    len(to_sample)))

            p1 = mp.Pool(self.n_cores)
            sample_models = [
                Model(self.p_type, self.column_headings[i]['algorithm'],
                      self.column_headings[i]['hyperparameters'], self.verbose,
                      i) for i in to_sample
            ]
            sample_model_errors = [
                p1.apply_async(Model.kfold_fit_validate,
                               args=[m, x_train, y_train, 5])
                for m in sample_models
            ]
            p1.close()
            p1.join()

            # update sampled indices
            self.sampled_indices = self.sampled_indices.union(set(to_sample))
            for i, error in enumerate(sample_model_errors):
                cv_error, cv_predictions = error.get()
                sample_models[i].cv_error = cv_error.mean()
                sample_models[i].cv_predictions = cv_predictions
                sample_models[i].sampled = True
                self.new_row[:, to_sample[i]] = cv_error.mean()
                self.sampled_models[to_sample[i]] = sample_models[i]
            imputed = linalg.impute(self.error_matrix,
                                    self.new_row,
                                    list(self.sampled_indices),
                                    rank=rank)

            # impute ALL entries
            # unknown = sorted(list(set(range(self.new_row.shape[1])) - self.sampled_indices))
            # self.new_row[:, unknown] = imputed[:, unknown]
            self.new_row = imputed.copy()

            # k-fold fit candidate learners of ensemble
            remaining = (runtime_limit - (time.time() - start)) * self.n_cores
            # add best sampled model to list of candidate learners to avoid empty lists
            sampled = list(self.sampled_indices)
            best_sampled_idx = sampled[int(np.argmin(self.new_row[:, sampled]))]
            assert self.sampled_models[best_sampled_idx] is not None
            candidate_indices = [best_sampled_idx]
            self.ensemble.candidate_learners.append(
                self.sampled_models[best_sampled_idx])
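            # greedily add models in increasing order of imputed error while the
            # sum of predicted runtimes fits the remaining budget; the best
            # already-sampled model stays last in the list as a fallback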
            for i in np.argsort(self.new_row[0]):
                if t_predicted[i] + t_predicted[candidate_indices].sum(
                ) <= remaining:
                    last = candidate_indices.pop()
                    assert last == best_sampled_idx
                    candidate_indices.append(i)
                    candidate_indices.append(last)
                    # if model has already been k-fold fitted, immediately add to candidate learners
                    if i in self.sampled_indices:
                        assert self.sampled_models[i] is not None
                        self.ensemble.candidate_learners.append(
                            self.sampled_models[i])
            # candidate learners that need to be k-fold fitted
            to_fit = list(set(candidate_indices) - self.sampled_indices)
        else:
            remaining = (runtime_limit - (time.time() - start)) * self.n_cores
            to_fit = to_sample.copy()

        p2 = mp.Pool(self.n_cores)
        candidate_models = [
            Model(self.p_type, self.column_headings[i]['algorithm'],
                  self.column_headings[i]['hyperparameters'], self.verbose, i)
            for i in to_fit
        ]
        candidate_model_errors = [
            p2.apply_async(Model.kfold_fit_validate,
                           args=[m, x_train, y_train, 5])
            for m in candidate_models
        ]
        p2.close()
        p2.join()

        # update sampled indices
        self.sampled_indices = self.sampled_indices.union(set(to_fit))
        for i, error in enumerate(candidate_model_errors):
            cv_error, cv_predictions = error.get()
            candidate_models[i].cv_error = cv_error.mean()
            candidate_models[i].cv_predictions = cv_predictions
            candidate_models[i].sampled = True
            self.new_row[:, to_fit[i]] = cv_error.mean()
            self.sampled_models[to_fit[i]] = candidate_models[i]
            self.ensemble.candidate_learners.append(candidate_models[i])
        # self.new_row = linalg.impute(self.error_matrix, self.new_row, list(self.sampled_indices), rank=rank)

        if self.verbose:
            print('\nFitting ensemble of max. size {}...'.format(
                len(self.ensemble.candidate_learners)))
        # ensemble selection and fitting in the remaining time budget
        self.ensemble.fit(x_train, y_train, remaining, self.fitted_models)
        for model in self.ensemble.base_learners:
            assert model.index is not None
            self.fitted_indices.add(model.index)
            self.fitted_models[model.index] = model
        self.ensemble.fitted = True

        if self.verbose:
            print('\nAutoLearner fitting complete.')
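
The linalg.impute call above fills in the unsampled entries of the new row from the low-rank structure of the offline error matrix. Below is a minimal sketch of that idea, assuming a truncated-SVD factorization and a least-squares fit of the new row's latent coefficients on the observed columns; all names and the function body are illustrative, not the project's actual implementation.

import numpy as np

def impute_new_row(error_matrix, new_row, observed, rank):
    """Estimate the unobserved entries of new_row from rank-`rank` structure."""
    # Factor the offline error matrix: E ~= U diag(s) Vt.
    _, _, Vt = np.linalg.svd(error_matrix, full_matrices=False)
    V = Vt[:rank]                       # (rank, n_algorithms) latent factors
    # Fit the new dataset's latent coefficients on the observed columns only:
    # minimize ||coef @ V[:, observed] - new_row[observed]||.
    coef, *_ = np.linalg.lstsq(V[:, observed].T, new_row[observed], rcond=None)
    return coef @ V                     # predicted error for every algorithm

# Example: 20 offline datasets x 8 algorithms, 3 entries of the new row observed.
rng = np.random.default_rng(0)
E = rng.random((20, 8))
row = np.zeros(8)
observed = [0, 2, 5]
row[observed] = rng.random(3)
full_row = impute_new_row(E, row, observed, rank=3)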
Example #2
    def fit(self, x_train, y_train):
        """Fit an AutoLearner object on a new dataset. This will sample the performance of several algorithms on the
        new dataset, predict performance on the rest, then perform Bayesian optimization and construct an optimal
        ensemble model.

        Args:
            x_train (np.ndarray): Features of the training dataset.
            y_train (np.ndarray): Labels of the training dataset.
        """
        print('Data={}'.format(x_train.shape))
        self.new_row = np.zeros((1, self.error_matrix.shape[1]))
        known_indices = linalg.pivot_columns(self.error_matrix)

        print('Sampling {} entries of new row...'.format(len(known_indices)))
        pool1 = mp.Pool(self.n_cores)
        sample_models = [
            Model(self.p_type,
                  self.column_headings[i]['algorithm'],
                  self.column_headings[i]['hyperparameters'],
                  verbose=self.verbose) for i in known_indices
        ]
        sample_model_errors = [
            pool1.apply_async(Model.kfold_fit_validate,
                              args=[m, x_train, y_train, 5])
            for m in sample_models
        ]
        pool1.close()
        pool1.join()
        for i, error in enumerate(sample_model_errors):
            self.new_row[:, known_indices[i]] = error.get()[0].mean()
            # TODO: add predictions to second layer matrix?
        self.new_row = linalg.impute(self.error_matrix, self.new_row,
                                     known_indices)

        # Add new row to error matrix at the end (might be incorrect?)
        # self.error_matrix = np.vstack((self.error_matrix, self.new_row))

        # TODO: Fit ensemble candidates (?)

        if self.verbose:
            print('\nConducting Bayesian optimization...')
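        # run Bayesian optimization on the n_models algorithms with the lowest
        # (imputed) predicted error in the new row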
        n_models = 3
        pool2 = mp.Pool(self.n_cores)
        bayesian_opt_models = [
            Model(self.p_type,
                  self.column_headings[i]['algorithm'],
                  self.column_headings[i]['hyperparameters'],
                  verbose=self.verbose)
            for i in np.argsort(self.new_row.flatten())[:n_models]
        ]
        optimized_hyperparams = pool2.map(Model.bayesian_optimize,
                                          bayesian_opt_models)
        pool2.close()
        pool2.join()
        for i, params in enumerate(optimized_hyperparams):
            bayesian_opt_models[i].hyperparameters = params
            self.ensemble.add_base_learner(bayesian_opt_models[i])
            self.optimized_settings.append({
                'algorithm': bayesian_opt_models[i].algorithm,
                'hyperparameters': bayesian_opt_models[i].hyperparameters,
            })

        if self.verbose:
            print('\nFitting optimized ensemble...')
        self.ensemble.fit(x_train, y_train)
        self.ensemble.fitted = True

        if self.verbose:
            print('\nAutoLearner fitting complete.')
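
Both examples pick the initial columns to sample with linalg.pivot_columns. A plausible sketch of that helper follows, assuming it is backed by SciPy's QR with column pivoting, which orders columns so that each pivot column is the one least well explained by those chosen before it; the body here is an assumption, not the project's code.

import numpy as np
from scipy.linalg import qr

def pivot_columns(matrix, rank=None):
    """Return the indices of the `rank` most linearly independent columns."""
    # qr(..., pivoting=True) returns (Q, R, P); P lists columns in pivot order,
    # i.e. roughly by how much new information each column adds.
    _, _, pivots = qr(matrix, pivoting=True)
    rank = rank or np.linalg.matrix_rank(matrix)
    return pivots[:rank]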