Example #1
 def _predict_runtime(self, x_train):
     # Predict the runtime on the training set of the new dataset.
     if self.verbose:
         print("Predicting pipeline running time...")
     return convex_opt.predict_runtime(x_train.shape,
                                       saved_model='Class',
                                       model=self.runtime_predictor)
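
For context, a minimal standalone sketch of exercising the runtime predictor directly; it assumes only that OBOE's convex_opt module is importable and ships a default runtime matrix (Examples #5 and #6 below call predict_runtime with just the dataset shape):

# Hedged sketch: predict per-pipeline runtimes from the dataset shape alone.
# The random data is purely illustrative.
import numpy as np
import convex_opt

x_train = np.random.rand(500, 20)          # hypothetical 500 x 20 training set
t_predicted = convex_opt.predict_runtime(x_train.shape)
print(t_predicted[:5])                     # predicted runtime (seconds) per pipeline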
Example #2
    def fit(self, x_train, y_train, rank=None, runtime_limit=None):
        """Fit an AutoLearner object on a new dataset. This will sample the performance of several algorithms on the
        new dataset, predict performance on the rest, then construct an optimal ensemble model.

        Args:
            x_train (np.ndarray):  Features of the training dataset.
            y_train (np.ndarray):  Labels of the training dataset.
            rank (int):            Rank of error matrix factorization.
            runtime_limit (float): Maximum time to allocate to AutoLearner fitting.
        """
        # set to defaults if not provided
        rank = rank or linalg.approx_rank(self.error_matrix, threshold=0.01)
        runtime_limit = runtime_limit or self.runtime_limit

        if self.verbose:
            print('Fitting AutoLearner with max. runtime {}s'.format(
                runtime_limit))
        t_predicted = convex_opt.predict_runtime(
            x_train.shape, runtime_matrix=self.runtime_matrix)

        t0 = time.time()
        # spend at most half the budget sampling individual models;
        # the remainder is reserved for fitting the ensemble below
        while time.time() - t0 < runtime_limit / 2:
            # set of algorithms that are predicted to run in given budget
            options = np.where(t_predicted <= runtime_limit / 2 -
                               (time.time() - t0))[0]
            # remove algorithms that have been sampled already
            options = list(set(options) - self.sampled_indices)
            if len(options) == 0:
                if len(self.ensemble.candidate_learners) == 0:
                    to_sample = np.argmin(t_predicted)
                else:
                    break
            else:
                to_sample = np.random.choice(options)
            self.sampled_indices.add(to_sample)
            self.sampled_models[to_sample] = Model(
                self.p_type, self.column_headings[to_sample]['algorithm'],
                self.column_headings[to_sample]['hyperparameters'],
                self.verbose, to_sample)
            self.sampled_models[to_sample].kfold_fit_validate(
                x_train, y_train, 5)
            self.ensemble.candidate_learners.append(
                self.sampled_models[to_sample])

        if self.verbose:
            print('\nFitting ensemble of max. size {}...'.format(
                len(self.ensemble.candidate_learners)))
        remaining = runtime_limit - (time.time() - t0)
        self.ensemble.fit(x_train, y_train, remaining, self.fitted_models)
        for model in self.ensemble.base_learners:
            assert model.index is not None
            self.fitted_indices.add(model.index)
            self.fitted_models[model.index] = model
        self.ensemble.fitted = True

        if self.verbose:
            print('\nAutoLearner fitting complete.')
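
A hedged usage sketch of this entry point, borrowing the constructor call from Example #7; the dataset and budget values are made up. Roughly half the per-call budget goes to the sampling loop above and the rest to ensemble fitting:

# Usage sketch under the assumptions stated above.
from sklearn.datasets import load_iris
from auto_learner import AutoLearner

X, y = load_iris(return_X_y=True)
m = AutoLearner(runtime_limit=60)
m.fit(X, y, runtime_limit=30)    # ~15s sampling models, ~15s fitting the ensemble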
Example #3
    def fit(self, x_train, y_train, verbose=False):
        """Fit an AutoLearner object, iteratively doubling allowed runtime, and terminate when reaching the time limit."""

        num_points, num_features = x_train.shape

        if self.verbose:
            print('\nShape of training dataset: {} data points, {} features'.
                  format(num_points, num_features))

        if num_points > 10000 and num_points / num_features > self.dataset_ratio_threshold:
            num_points_new = int(
                min(5000, num_features * self.dataset_ratio_threshold))
            sampling_ratio = num_points_new / num_points
            if self.verbose:
                print('Sampling ratio: {}'.format(sampling_ratio))
            df_x_train = pd.DataFrame(x_train)
            df_y_train = pd.DataFrame(y_train, columns=['labels'])
            # stratified downsampling: keep the same fraction of each class
            df_resampled = df_x_train.join(df_y_train).groupby('labels').apply(
                pd.DataFrame.sample,
                frac=sampling_ratio).reset_index(drop=True)
            x_train = df_resampled.drop(['labels'], axis=1).values
            y_train = df_resampled['labels'].values
            if self.verbose:
                print(
                    '\nTraining dataset resampled \nShape of resampled training dataset: {} data points, {} features'
                    .format(x_train.shape[0], x_train.shape[1]))

        t_predicted = convex_opt.predict_runtime(
            x_train.shape, model_name=self.runtime_predictor)

        # split data into training and validation sets
        try:
            x_tr, x_va, y_tr, y_va = train_test_split(x_train,
                                                      y_train,
                                                      test_size=0.15,
                                                      stratify=y_train,
                                                      random_state=0)
        except ValueError:
            x_tr, x_va, y_tr, y_va = train_test_split(x_train,
                                                      y_train,
                                                      test_size=0.15,
                                                      random_state=0)

        ranks = [linalg.approx_rank(self.error_matrix, threshold=0.05)]
        if self.build_ensemble:
            t_init = 2**np.floor(
                np.log2(np.sort(t_predicted)[:int(1.1 * ranks[0])].sum()))
            t_init = max(1, t_init)
        else:
            t_init = self.runtime_limit / 2
        times = [t_init]
        losses = [0.5]

        e_hat, actual_times, sampled, ensembles = [], [], [], []

        start = time.time()

        def doubling():
            k, t = ranks[0], times[0]
            counter, self.best = 0, 0
            while time.time() - start < self.runtime_limit - t:
                if verbose:
                    print('Fitting with k={}, t={}'.format(k, t))
                if self.build_ensemble:
                    self.ensemble = Ensemble(self.p_type, self.ensemble_method,
                                             self.stacking_hyperparams)
                else:
                    self.ensemble = Model_collection(self.p_type)
                self._fit(x_tr, y_tr, rank=k, runtime_limit=t)
                if self.build_ensemble and self.ensemble.fitted:
                    if self.verbose:
                        print(
                            "\nGot a new ensemble in the round with runtime target {} seconds"
                            .format(t))
                    loss = util.error(y_va, self.ensemble.predict(x_va),
                                      self.p_type)

                    # TEMPORARY: Record intermediate results

                    e_hat.append(np.copy(self.new_row))
                    actual_times.append(time.time() - start)
                    sampled.append(self.sampled_indices)
                    ensembles.append(self.ensemble)
                    losses.append(loss)

                    if loss == min(losses):
                        ranks.append(k + 1)
                        self.best = counter
                    else:
                        ranks.append(k)

                    counter += 1

                times.append(2 * t)
                k = ranks[-1]
                t = times[-1]

        class TimeoutException(Exception):
            pass

        @contextmanager
        def time_limit(seconds):
            def signal_handler(signum, frame):
                raise TimeoutException("Time limit reached.")

            signal.signal(signal.SIGALRM, signal_handler)
            signal.alarm(int(seconds))  # signal.alarm only accepts whole seconds
            try:
                yield
            finally:
                signal.alarm(0)

        try:
            # set aside 3 seconds for initial and final processing steps
            with time_limit(self.runtime_limit - 3):
                doubling()
        except TimeoutException:
            if verbose:
                print("Time limit reached.")

        if self.build_ensemble:
            # after all iterations, restore best model

            try:
                self.new_row = e_hat[self.best]
                self.ensemble = ensembles[self.best]

                return {
                    'ranks': ranks[:-1],
                    'runtime_limits': times[:-1],
                    'validation_loss': losses,
                    'predicted_new_row': e_hat,
                    'actual_runtimes': actual_times,
                    'sampled_indices': sampled,
                    'models': ensembles
                }
            except IndexError:
                print(
                    "No ensemble built within time limit. Please try increasing the time limit or allocate more computational resources."
                )
        else:
            return
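
To make the doubling schedule concrete, a small standalone illustration (not OBOE code; the predicted runtimes and budget are invented). The initial budget t_init is the largest power of two no greater than the summed predicted runtimes of the ~1.1*rank fastest pipelines, and each round doubles it while the remaining global budget can still cover another round:

# Standalone sketch of the doubling schedule; every number here is hypothetical.
import numpy as np

t_predicted = np.array([0.4, 0.7, 1.2, 2.5, 6.0])   # fake per-pipeline runtimes
rank, runtime_limit = 3, 64
t_init = max(1, 2 ** np.floor(
    np.log2(np.sort(t_predicted)[:int(1.1 * rank)].sum())))
t, elapsed, schedule = t_init, 0.0, []
while elapsed < runtime_limit - t:
    schedule.append(t)
    elapsed += t        # pretend each round consumes its full budget
    t *= 2
print(schedule)         # [2.0, 4.0, 8.0, 16.0, 32.0]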
Example #4
    def _fit(self, x_train, y_train, rank=None, runtime_limit=None):
        """This is a single round of the doubling process. It fits an AutoLearner object on a new dataset. This will sample the performance of several algorithms on the new dataset, predict performance on the rest, then construct an optimal ensemble model.

        Args:
            x_train (np.ndarray):  Features of the training dataset.
            y_train (np.ndarray):  Labels of the training dataset.
            rank (int):            Rank of error matrix factorization.
            runtime_limit (float): Maximum time to allocate to AutoLearner fitting.
        """
        if self.verbose:
            print("\nSingle round runtime target: {}".format(runtime_limit))

        # set to defaults if not provided
        rank = rank or linalg.approx_rank(self.error_matrix, threshold=0.01)
        runtime_limit = runtime_limit or self.runtime_limit

        if self.verbose:
            print('Fitting AutoLearner with max. runtime {}s'.format(
                runtime_limit))
        t_predicted = convex_opt.predict_runtime(
            x_train.shape, runtime_matrix=self.runtime_matrix)

        if self.selection_method == 'qr':
            to_sample = linalg.pivot_columns(self.error_matrix)
        elif self.selection_method == 'min_variance':
            # select algorithms to sample only from subset of algorithms that will run in allocated time
            valid = np.where(
                t_predicted <= self.n_cores * runtime_limit / 2)[0]
            Y = self.Y[:rank, valid]
            # TODO: check if Y is rank-deficient, i.e. will ED problem fail?
            v_opt = convex_opt.solve(t_predicted[valid], runtime_limit / 4,
                                     self.n_cores, Y, self.scalarization)
            to_sample = valid[np.where(v_opt > 0.9)[0]]
            # fall back to the fastest pipelines if the solver returned NaNs
            # (to_sample is an integer index array and can never contain NaN)
            if np.isnan(v_opt).any():
                to_sample = np.argsort(t_predicted)[:rank]

        elif self.selection_method == 'random':
            to_sample = []
            # set of algorithms that are predicted to finish within given budget
            to_sample_candidates = np.where(
                t_predicted <= runtime_limit / 2)[0]
            # remove algorithms that have been sampled already
            to_sample_candidates = list(
                set(to_sample_candidates) - self.sampled_indices)
            # if the remaining time is not sufficient for random sampling
            if len(to_sample_candidates) == 0:
                to_sample = np.array([np.argmin(t_predicted)])
            else:
                to_sample = np.random.choice(to_sample_candidates,
                                             min(self.n_cores,
                                                 len(to_sample_candidates)),
                                             replace=False)
        else:
            to_sample = np.arange(0, self.new_row.shape[1])

        if len(to_sample) == 0 and len(self.sampled_indices) == 0:
            # if no columns are selected in first iteration (log det instability), sample n fastest columns
            n = len(
                np.where(
                    np.cumsum(np.sort(t_predicted)) <= runtime_limit / 4)[0])
            if n > 0:
                to_sample = np.argsort(t_predicted)[:n]
            else:
                self.ensemble.fitted = False
                return

        start = time.time()
        if self.selection_method != 'random':
            # only need to compute column entry if it has not been computed already
            to_sample = list(set(to_sample) - self.sampled_indices)
            if self.verbose:
                print('Sampling {} entries of new row...'.format(
                    len(to_sample)))

            p1 = mp.Pool(self.n_cores)
            sample_models = [
                Model(self.p_type, self.column_headings[i]['algorithm'],
                      self.column_headings[i]['hyperparameters'], self.verbose,
                      i) for i in to_sample
            ]
            sample_model_errors = [
                p1.apply_async(Model.kfold_fit_validate,
                               args=[m, x_train, y_train, 5])
                for m in sample_models
            ]
            p1.close()
            p1.join()

            # update sampled indices
            self.sampled_indices = self.sampled_indices.union(set(to_sample))
            for i, error in enumerate(sample_model_errors):
                cv_error, cv_predictions = error.get()
                sample_models[i].cv_error = cv_error.mean()
                sample_models[i].cv_predictions = cv_predictions
                sample_models[i].sampled = True
                self.new_row[:, to_sample[i]] = cv_error.mean()
                self.sampled_models[to_sample[i]] = sample_models[i]
            imputed = linalg.impute(self.error_matrix,
                                    self.new_row,
                                    list(self.sampled_indices),
                                    rank=rank)

            # impute ALL entries
            # unknown = sorted(list(set(range(self.new_row.shape[1])) - self.sampled_indices))
            # self.new_row[:, unknown] = imputed[:, unknown]
            self.new_row = imputed.copy()

            # k-fold fit candidate learners of ensemble
            remaining = (runtime_limit - (time.time() - start)) * self.n_cores
            # add best sampled model to list of candidate learners to avoid empty lists
            sampled_list = list(self.sampled_indices)
            best_sampled_idx = sampled_list[int(np.argmin(self.new_row[:, sampled_list]))]
            assert self.sampled_models[best_sampled_idx] is not None
            candidate_indices = [best_sampled_idx]
            self.ensemble.candidate_learners.append(
                self.sampled_models[best_sampled_idx])
            for i in np.argsort(self.new_row[0]):
                if t_predicted[i] + t_predicted[candidate_indices].sum() <= remaining:
                    # keep best_sampled_idx as the last element of candidate_indices
                    last = candidate_indices.pop()
                    assert last == best_sampled_idx
                    candidate_indices.append(i)
                    candidate_indices.append(last)
                    # if model has already been k-fold fitted, immediately add to candidate learners
                    if i in self.sampled_indices:
                        assert self.sampled_models[i] is not None
                        self.ensemble.candidate_learners.append(
                            self.sampled_models[i])
            # candidate learners that need to be k-fold fitted
            to_fit = list(set(candidate_indices) - self.sampled_indices)
        else:
            remaining = (runtime_limit - (time.time() - start)) * self.n_cores
            to_fit = to_sample.copy()

        p2 = mp.Pool(self.n_cores)
        candidate_models = [
            Model(self.p_type, self.column_headings[i]['algorithm'],
                  self.column_headings[i]['hyperparameters'], self.verbose, i)
            for i in to_fit
        ]
        candidate_model_errors = [
            p2.apply_async(Model.kfold_fit_validate,
                           args=[m, x_train, y_train, 5])
            for m in candidate_models
        ]
        p2.close()
        p2.join()

        # update sampled indices
        self.sampled_indices = self.sampled_indices.union(set(to_fit))
        for i, error in enumerate(candidate_model_errors):
            cv_error, cv_predictions = error.get()
            candidate_models[i].cv_error = cv_error.mean()
            candidate_models[i].cv_predictions = cv_predictions
            candidate_models[i].sampled = True
            self.new_row[:, to_fit[i]] = cv_error.mean()
            self.sampled_models[to_fit[i]] = candidate_models[i]
            self.ensemble.candidate_learners.append(candidate_models[i])
        # self.new_row = linalg.impute(self.error_matrix, self.new_row, list(self.sampled_indices), rank=rank)

        if self.verbose:
            print('\nFitting ensemble of max. size {}...'.format(
                len(self.ensemble.candidate_learners)))
        # ensemble selection and fitting in the remaining time budget
        self.ensemble.fit(x_train, y_train, remaining, self.fitted_models)
        for model in self.ensemble.base_learners:
            assert model.index is not None
            self.fitted_indices.add(model.index)
            self.fitted_models[model.index] = model
        self.ensemble.fitted = True

        if self.verbose:
            print('\nAutoLearner fitting complete.')
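
The sampling step above fans k-fold validation out over worker processes with multiprocessing.Pool.apply_async. A minimal self-contained sketch of the same pattern, with a stand-in evaluate function in place of Model.kfold_fit_validate:

# Standalone sketch of the apply_async fan-out; `evaluate` is a stand-in.
import multiprocessing as mp
import numpy as np

def evaluate(seed):
    rng = np.random.RandomState(seed)
    return rng.rand(5)                    # pretend: errors of 5 CV folds

if __name__ == '__main__':
    pool = mp.Pool(2)
    results = [pool.apply_async(evaluate, args=[seed]) for seed in range(4)]
    pool.close()
    pool.join()
    cv_errors = [r.get().mean() for r in results]
    print(cv_errors)                      # one mean CV error per worker task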
Example #5
    def fit(self, x_train, y_train, verbose=False):
        """Fit an AutoLearner object, iteratively doubling allowed runtime, and terminate when reaching the time limit."""
        t_predicted = convex_opt.predict_runtime(x_train.shape)

        # split data into training and validation sets
        try:
            x_tr, x_va, y_tr, y_va = train_test_split(x_train,
                                                      y_train,
                                                      test_size=0.15,
                                                      stratify=y_train,
                                                      random_state=0)
        except ValueError:
            x_tr, x_va, y_tr, y_va = train_test_split(x_train,
                                                      y_train,
                                                      test_size=0.15,
                                                      random_state=0)

        ranks = [linalg.approx_rank(self.error_matrix, threshold=0.05)]
        if self.build_ensemble:
            t_init = 2**np.floor(
                np.log2(np.sort(t_predicted)[:int(1.1 * ranks[0])].sum()))
            t_init = max(1, t_init)
        else:
            t_init = self.runtime_limit / 2
        times = [t_init]
        losses = [0.5]

        e_hat, actual_times, sampled, ensembles = [], [], [], []

        start = time.time()

        def doubling():
            k, t = ranks[0], times[0]
            counter, self.best = 0, 0
            while time.time() - start < self.runtime_limit - t:
                if verbose:
                    print('Fitting with k={}, t={}'.format(k, t))
                if self.build_ensemble:
                    self.ensemble = Ensemble(self.p_type, self.ensemble_method,
                                             self.stacking_hyperparams)
                else:
                    self.ensemble = Model_collection(self.p_type)
                self._fit(x_tr, y_tr, rank=k, runtime_limit=t)
                if self.build_ensemble:
                    loss = util.error(y_va, self.ensemble.predict(x_va),
                                      self.p_type)

                    # TEMPORARY: Record intermediate results

                    e_hat.append(np.copy(self.new_row))
                    actual_times.append(time.time() - start)
                    sampled.append(self.sampled_indices)
                    ensembles.append(self.ensemble)
                    losses.append(loss)

                    if loss == min(losses):
                        ranks.append(k + 1)
                        self.best = counter
                    else:
                        ranks.append(k)

                    counter += 1

                # advance the schedule outside the if-block, otherwise the
                # budget never doubles when build_ensemble is False
                times.append(2 * t)
                k = ranks[-1]
                t = times[-1]

        class TimeoutException(Exception):
            pass

        @contextmanager
        def time_limit(seconds):
            def signal_handler(signum, frame):
                raise TimeoutException("Time limit reached.")

            signal.signal(signal.SIGALRM, signal_handler)
            signal.alarm(int(seconds))  # signal.alarm only accepts whole seconds
            try:
                yield
            finally:
                signal.alarm(0)

        try:
            with time_limit(self.runtime_limit):
                doubling()
        except TimeoutException:
            if verbose:
                print("Time limit reached.")

        if self.build_ensemble:
            # after all iterations, restore best model
            self.new_row = e_hat[self.best]
            self.ensemble = ensembles[self.best]

            return {
                'ranks': ranks[:-1],
                'runtime_limits': times[:-1],
                'validation_loss': losses,
                'predicted_new_row': e_hat,
                'actual_runtimes': actual_times,
                'sampled_indices': sampled,
                'models': ensembles
            }
        else:
            return
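
The SIGALRM-based time_limit context manager above is self-contained enough to lift out; a minimal sketch of it in isolation (Unix-only, since signal.alarm is unavailable on Windows, and the alarm takes whole seconds):

# Standalone sketch of the time_limit context manager used by both fit variants.
import signal
import time
from contextlib import contextmanager

class TimeoutException(Exception):
    pass

@contextmanager
def time_limit(seconds):
    def signal_handler(signum, frame):
        raise TimeoutException("Time limit reached.")
    signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(int(seconds))            # whole seconds only
    try:
        yield
    finally:
        signal.alarm(0)                   # cancel the pending alarm

try:
    with time_limit(1):
        time.sleep(2)                     # interrupted after ~1 second
except TimeoutException:
    print("Time limit reached.")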
Example #6
    def fit_doubling(self, x_train, y_train, verbose=False):
        """Fit an AutoLearner object, iteratively doubling allowed runtime."""
        t_predicted = convex_opt.predict_runtime(x_train.shape)

        # split data into training and validation sets
        try:
            x_tr, x_va, y_tr, y_va = train_test_split(x_train,
                                                      y_train,
                                                      test_size=0.15,
                                                      stratify=y_train,
                                                      random_state=0)
        except ValueError:
            x_tr, x_va, y_tr, y_va = train_test_split(x_train,
                                                      y_train,
                                                      test_size=0.15,
                                                      random_state=0)

        ranks = [linalg.approx_rank(self.error_matrix, threshold=0.05)]
        t_init = 2**np.floor(
            np.log2(np.sort(t_predicted)[:int(1.1 * ranks[0])].sum()))
        t_init = max(1, t_init)
        times = [t_init]
        losses = [1.0]

        e_hat, actual_times, sampled, ensembles = [], [], [], []
        k, t = ranks[0], times[0]

        start = time.time()
        counter, best = 0, 0
        while time.time() - start < self.runtime_limit - t:
            if verbose:
                print('Fitting with k={}, t={}'.format(k, t))
            self.ensemble = Ensemble(self.p_type, self.stacking_alg,
                                     self.stacking_hyperparams)
            self.fit(x_tr, y_tr, rank=k, runtime_limit=t)
            loss = util.error(y_va, self.ensemble.predict(x_va), self.p_type)

            # TEMPORARY: Record intermediate results
            e_hat.append(np.copy(self.new_row))
            actual_times.append(time.time() - start)
            sampled.append(self.sampled_indices)
            ensembles.append(self.ensemble)
            losses.append(loss)

            if loss == min(losses):
                ranks.append(k + 1)
                best = counter
            else:
                ranks.append(k)

            times.append(2 * t)
            k = ranks[-1]
            t = times[-1]
            counter += 1

        # after all iterations, restore best model
        self.new_row = e_hat[best]
        self.ensemble = ensembles[best]
        return {
            'ranks': ranks[:-1],
            'runtime_limits': times[:-1],
            'validation_loss': losses,
            'predicted_new_row': e_hat,
            'actual_runtimes': actual_times,
            'sampled_indices': sampled,
            'models': ensembles
        }
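
The returned dictionary records one entry per doubling round, so the loss trajectory can be inspected afterwards. A hedged sketch (the key names are taken from the return statement above; the AutoLearner m and the data X, y are assumed to exist, e.g. from Example #7):

# Sketch: inspect how validation loss evolved across doubling rounds.
history = m.fit_doubling(X, y, verbose=True)
# losses[0] is the 1.0 placeholder, so skip it to align with the rounds
for t, loss in zip(history['runtime_limits'], history['validation_loss'][1:]):
    print('budget {:>6.1f}s -> validation loss {:.4f}'.format(t, loss))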
Example #7
""" This is a script which calls uses the runtime prediction model of OBOE.
    OBOE: https://github.com/udellgroup/oboe
"""
from sklearn.datasets import load_iris
from auto_learner import AutoLearner
import convex_opt

X, y = load_iris(return_X_y=True)
m = AutoLearner(runtime_limit=20)

# Produce runtime prediction based on dataset size:
t_predicted = convex_opt.predict_runtime(X.shape,
                                         runtime_matrix=m.runtime_matrix)
# zip(list(m.runtime_matrix.columns), t_predicted)  # relative(?) runtime per algorithm

# Create an ensemble of learners for the dataset (this emits many warnings):
m.fit(X, y)
# [el.algorithm for el in m.ensemble.base_learners]  # algorithms in ensemble
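
As a hedged follow-up, predictions come from the fitted ensemble; its predict method is the one applied to the validation split in Examples #3, #5 and #6:

y_pred = m.ensemble.predict(X)  # labels predicted by the fitted ensemble
print(y_pred[:10])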