def _predict_runtime(self, x_train):
    """Predict pipeline runtimes on the training set of the new dataset."""
    if self.verbose:
        print("Predicting pipeline running time...")
    return convex_opt.predict_runtime(x_train.shape,
                                      saved_model='Class',
                                      model=self.runtime_predictor)
def fit(self, x_train, y_train, rank=None, runtime_limit=None):
    """Fit an AutoLearner object on a new dataset. This will sample the
    performance of several algorithms on the new dataset, predict performance
    on the rest, then construct an optimal ensemble model.

    Args:
        x_train (np.ndarray): Features of the training dataset.
        y_train (np.ndarray): Labels of the training dataset.
        rank (int): Rank of error matrix factorization.
        runtime_limit (float): Maximum time to allocate to AutoLearner fitting.
    """
    # set to defaults if not provided
    rank = rank or linalg.approx_rank(self.error_matrix, threshold=0.01)
    runtime_limit = runtime_limit or self.runtime_limit

    if self.verbose:
        print('Fitting AutoLearner with max. runtime {}s'.format(runtime_limit))
    t_predicted = convex_opt.predict_runtime(x_train.shape,
                                             runtime_matrix=self.runtime_matrix)

    t0 = time.time()
    # spend the first half of the budget cross-validating randomly sampled pipelines
    while time.time() - t0 < runtime_limit / 2:
        # set of algorithms that are predicted to run within the remaining budget
        options = np.where(t_predicted <= runtime_limit / 2 - (time.time() - t0))[0]
        # remove algorithms that have been sampled already
        options = list(set(options) - self.sampled_indices)
        if len(options) == 0:
            if len(self.ensemble.candidate_learners) == 0:
                # fall back to the fastest pipeline so the candidate list is never empty
                to_sample = np.argmin(t_predicted)
            else:
                break
        else:
            to_sample = np.random.choice(options)
        self.sampled_indices.add(to_sample)
        self.sampled_models[to_sample] = Model(
            self.p_type,
            self.column_headings[to_sample]['algorithm'],
            self.column_headings[to_sample]['hyperparameters'],
            self.verbose, to_sample)
        self.sampled_models[to_sample].kfold_fit_validate(x_train, y_train, 5)
        self.ensemble.candidate_learners.append(self.sampled_models[to_sample])

    if self.verbose:
        print('\nFitting ensemble of max. size {}...'.format(
            len(self.ensemble.candidate_learners)))
    # spend the remaining budget on ensemble selection and fitting
    remaining = runtime_limit - (time.time() - t0)
    self.ensemble.fit(x_train, y_train, remaining, self.fitted_models)
    for model in self.ensemble.base_learners:
        assert model.index is not None
        self.fitted_indices.add(model.index)
        self.fitted_models[model.index] = model
    self.ensemble.fitted = True

    if self.verbose:
        print('\nAutoLearner fitting complete.')
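# A minimal usage sketch of fit() above (hypothetical variable names; assumes a
# fully initialized AutoLearner and a numeric classification dataset). The
# first half of the budget goes to sampling pipelines, the second half to
# ensemble selection:
#
#   m = AutoLearner(runtime_limit=60)
#   m.fit(x_train, y_train)                 # rank and runtime_limit default
#   y_pred = m.ensemble.predict(x_test)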
def fit(self, x_train, y_train, verbose=False):
    """Fit an AutoLearner object, iteratively doubling the allowed runtime,
    and terminate when the time limit is reached."""
    num_points, num_features = x_train.shape
    if self.verbose:
        print('\nShape of training dataset: {} data points, {} features'.format(
            num_points, num_features))

    # subsample overly large datasets, preserving class proportions
    if num_points > 10000 and num_points / num_features > self.dataset_ratio_threshold:
        num_points_new = int(min(5000, num_features * self.dataset_ratio_threshold))
        sampling_ratio = num_points_new / num_points
        if self.verbose:
            print('Sampling ratio: {}'.format(sampling_ratio))
        df_x_train = pd.DataFrame(x_train)
        df_y_train = pd.DataFrame(y_train, columns=['labels'])
        df_resampled = df_x_train.join(df_y_train).groupby('labels').apply(
            pd.DataFrame.sample, frac=sampling_ratio).reset_index(drop=True)
        x_train = df_resampled.drop(['labels'], axis=1).values
        y_train = df_resampled['labels'].values
        if self.verbose:
            print('\nTraining dataset resampled.'
                  '\nShape of resampled training dataset: {} data points, {} features'.format(
                      x_train.shape[0], x_train.shape[1]))

    t_predicted = convex_opt.predict_runtime(x_train.shape,
                                             model_name=self.runtime_predictor)

    # split data into training and validation sets; fall back to an
    # unstratified split if some class is too small to stratify on
    try:
        x_tr, x_va, y_tr, y_va = train_test_split(x_train, y_train,
                                                  test_size=0.15,
                                                  stratify=y_train,
                                                  random_state=0)
    except ValueError:
        x_tr, x_va, y_tr, y_va = train_test_split(x_train, y_train,
                                                  test_size=0.15,
                                                  random_state=0)

    ranks = [linalg.approx_rank(self.error_matrix, threshold=0.05)]
    if self.build_ensemble:
        t_init = 2**np.floor(np.log2(np.sort(t_predicted)[:int(1.1 * ranks[0])].sum()))
        t_init = max(1, t_init)
    else:
        t_init = self.runtime_limit / 2
    times = [t_init]
    losses = [0.5]
    e_hat, actual_times, sampled, ensembles = [], [], [], []

    start = time.time()

    def doubling():
        k, t = ranks[0], times[0]
        counter, self.best = 0, 0
        while time.time() - start < self.runtime_limit - t:
            if verbose:
                print('Fitting with k={}, t={}'.format(k, t))
            if self.build_ensemble:
                self.ensemble = Ensemble(self.p_type, self.ensemble_method,
                                         self.stacking_hyperparams)
            else:
                self.ensemble = Model_collection(self.p_type)
            self._fit(x_tr, y_tr, rank=k, runtime_limit=t)

            if self.build_ensemble and self.ensemble.fitted:
                if self.verbose:
                    print('\nGot a new ensemble in the round with runtime '
                          'target {} seconds'.format(t))
                loss = util.error(y_va, self.ensemble.predict(x_va), self.p_type)
                # TEMPORARY: Record intermediate results
                e_hat.append(np.copy(self.new_row))
                actual_times.append(time.time() - start)
                sampled.append(self.sampled_indices)
                ensembles.append(self.ensemble)
                losses.append(loss)
                # grow the rank only if this round improved the validation loss
                if loss == min(losses):
                    ranks.append(k + 1)
                    self.best = counter
                else:
                    ranks.append(k)
                counter += 1
            times.append(2 * t)
            k = ranks[-1]
            t = times[-1]

    class TimeoutException(Exception):
        pass

    @contextmanager
    def time_limit(seconds):
        def signal_handler(signum, frame):
            raise TimeoutException("Time limit reached.")
        signal.signal(signal.SIGALRM, signal_handler)
        signal.alarm(seconds)
        try:
            yield
        finally:
            signal.alarm(0)

    try:
        # set aside 3 seconds for initial and final processing steps;
        # signal.alarm accepts whole seconds only
        with time_limit(int(self.runtime_limit - 3)):
            doubling()
    except TimeoutException:
        if verbose:
            print("Time limit reached.")

    if self.build_ensemble:
        # after all iterations, restore the best model
        try:
            self.new_row = e_hat[self.best]
            self.ensemble = ensembles[self.best]
            return {
                'ranks': ranks[:-1],
                'runtime_limits': times[:-1],
                'validation_loss': losses,
                'predicted_new_row': e_hat,
                'actual_runtimes': actual_times,
                'sampled_indices': sampled,
                'models': ensembles,
            }
        except IndexError:
            print('No ensemble built within the time limit. Please try '
                  'increasing the time limit or allocating more '
                  'computational resources.')
    else:
        return
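# The doubling schedule in isolation: a minimal numpy-only sketch (not part of
# the class) of how the initial budget t_init is derived from the predicted
# runtimes of the ~1.1*k fastest pipelines and then doubled each round. The
# runtimes below are illustrative, not real predictions.
#
#   import numpy as np
#   t_predicted = np.random.rand(100) * 10   # hypothetical per-pipeline runtimes
#   k = 10                                   # initial rank of the error matrix
#   t_init = max(1, 2 ** np.floor(np.log2(np.sort(t_predicted)[:int(1.1 * k)].sum())))
#   schedule = [t_init * 2 ** i for i in range(4)]  # budgets for 4 rounds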
def _fit(self, x_train, y_train, rank=None, runtime_limit=None):
    """A single round of the doubling process: fit an AutoLearner object on a
    new dataset. This will sample the performance of several algorithms on the
    new dataset, predict performance on the rest, then construct an optimal
    ensemble model.

    Args:
        x_train (np.ndarray): Features of the training dataset.
        y_train (np.ndarray): Labels of the training dataset.
        rank (int): Rank of error matrix factorization.
        runtime_limit (float): Maximum time to allocate to AutoLearner fitting.
    """
    if self.verbose:
        print("\nSingle round runtime target: {}".format(runtime_limit))

    # set to defaults if not provided
    rank = rank or linalg.approx_rank(self.error_matrix, threshold=0.01)
    runtime_limit = runtime_limit or self.runtime_limit

    if self.verbose:
        print('Fitting AutoLearner with max. runtime {}s'.format(runtime_limit))
    t_predicted = convex_opt.predict_runtime(x_train.shape,
                                             runtime_matrix=self.runtime_matrix)

    if self.selection_method == 'qr':
        to_sample = linalg.pivot_columns(self.error_matrix)
    elif self.selection_method == 'min_variance':
        # select algorithms to sample only from the subset predicted to run
        # within the allocated time
        valid = np.where(t_predicted <= self.n_cores * runtime_limit / 2)[0]
        Y = self.Y[:rank, valid]
        # TODO: check if Y is rank-deficient, i.e. will the experiment design problem fail?
        v_opt = convex_opt.solve(t_predicted[valid], runtime_limit / 4,
                                 self.n_cores, Y, self.scalarization)
        to_sample = valid[np.where(v_opt > 0.9)[0]]
        if np.isnan(to_sample).any():
            to_sample = np.argsort(t_predicted)[:rank]
    elif self.selection_method == 'random':
        to_sample = []
        # set of algorithms that are predicted to finish within the given budget
        to_sample_candidates = np.where(t_predicted <= runtime_limit / 2)[0]
        # remove algorithms that have been sampled already
        to_sample_candidates = list(set(to_sample_candidates) - self.sampled_indices)
        # if the remaining time is not sufficient for random sampling,
        # fall back to the single fastest pipeline
        if len(to_sample_candidates) == 0:
            to_sample = np.array([np.argmin(t_predicted)])
        else:
            to_sample = np.random.choice(to_sample_candidates,
                                         min(self.n_cores, len(to_sample_candidates)),
                                         replace=False)
    else:
        to_sample = np.arange(0, self.new_row.shape[1])

    if len(to_sample) == 0 and len(self.sampled_indices) == 0:
        # if no columns are selected in the first iteration (log-det
        # instability), sample the n fastest columns
        n = len(np.where(np.cumsum(np.sort(t_predicted)) <= runtime_limit / 4)[0])
        if n > 0:
            to_sample = np.argsort(t_predicted)[:n]
        else:
            self.ensemble.fitted = False
            return

    start = time.time()
    if self.selection_method != 'random':
        # only need to compute a column entry if it has not been computed already
        to_sample = list(set(to_sample) - self.sampled_indices)
        if self.verbose:
            print('Sampling {} entries of new row...'.format(len(to_sample)))
        p1 = mp.Pool(self.n_cores)
        sample_models = [Model(self.p_type,
                               self.column_headings[i]['algorithm'],
                               self.column_headings[i]['hyperparameters'],
                               self.verbose, i) for i in to_sample]
        sample_model_errors = [p1.apply_async(Model.kfold_fit_validate,
                                              args=[m, x_train, y_train, 5])
                               for m in sample_models]
        p1.close()
        p1.join()

        # update sampled indices
        self.sampled_indices = self.sampled_indices.union(set(to_sample))
        for i, error in enumerate(sample_model_errors):
            cv_error, cv_predictions = error.get()
            sample_models[i].cv_error, sample_models[i].cv_predictions = \
                cv_error.mean(), cv_predictions
            sample_models[i].sampled = True
            self.new_row[:, to_sample[i]] = cv_error.mean()
            self.sampled_models[to_sample[i]] = sample_models[i]
        # impute ALL entries
        imputed = linalg.impute(self.error_matrix, self.new_row,
                                list(self.sampled_indices), rank=rank)
        # unknown = sorted(list(set(range(self.new_row.shape[1])) - self.sampled_indices))
        # self.new_row[:, unknown] = imputed[:, unknown]
        self.new_row = imputed.copy()

        # k-fold fit candidate learners of the ensemble
        remaining = (runtime_limit - (time.time() - start)) * self.n_cores

        # add the best sampled model to the list of candidate learners to
        # avoid empty lists
        best_sampled_idx = list(self.sampled_indices)[int(
            np.argmin(self.new_row[:, list(self.sampled_indices)]))]
        assert self.sampled_models[best_sampled_idx] is not None
        candidate_indices = [best_sampled_idx]
        self.ensemble.candidate_learners.append(self.sampled_models[best_sampled_idx])
        for i in np.argsort(self.new_row[0]):
            if t_predicted[i] + t_predicted[candidate_indices].sum() <= remaining:
                last = candidate_indices.pop()
                assert last == best_sampled_idx
                candidate_indices.append(i)
                candidate_indices.append(last)
                # if the model has already been k-fold fitted, add it to the
                # candidate learners immediately
                if i in self.sampled_indices:
                    assert self.sampled_models[i] is not None
                    self.ensemble.candidate_learners.append(self.sampled_models[i])
        # candidate learners that still need to be k-fold fitted
        to_fit = list(set(candidate_indices) - self.sampled_indices)
    else:
        remaining = (runtime_limit - (time.time() - start)) * self.n_cores
        to_fit = to_sample.copy()

    p2 = mp.Pool(self.n_cores)
    candidate_models = [Model(self.p_type,
                              self.column_headings[i]['algorithm'],
                              self.column_headings[i]['hyperparameters'],
                              self.verbose, i) for i in to_fit]
    candidate_model_errors = [p2.apply_async(Model.kfold_fit_validate,
                                             args=[m, x_train, y_train, 5])
                              for m in candidate_models]
    p2.close()
    p2.join()

    # update sampled indices
    self.sampled_indices = self.sampled_indices.union(set(to_fit))
    for i, error in enumerate(candidate_model_errors):
        cv_error, cv_predictions = error.get()
        candidate_models[i].cv_error, candidate_models[i].cv_predictions = \
            cv_error.mean(), cv_predictions
        candidate_models[i].sampled = True
        self.new_row[:, to_fit[i]] = cv_error.mean()
        self.sampled_models[to_fit[i]] = candidate_models[i]
        self.ensemble.candidate_learners.append(candidate_models[i])
    # self.new_row = linalg.impute(self.error_matrix, self.new_row,
    #                              list(self.sampled_indices), rank=rank)

    if self.verbose:
        print('\nFitting ensemble of max. size {}...'.format(
            len(self.ensemble.candidate_learners)))
    # ensemble selection and fitting in the remaining time budget
    self.ensemble.fit(x_train, y_train, remaining, self.fitted_models)
    for model in self.ensemble.base_learners:
        assert model.index is not None
        self.fitted_indices.add(model.index)
        self.fitted_models[model.index] = model
    self.ensemble.fitted = True

    if self.verbose:
        print('\nAutoLearner fitting complete.')
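# The idea behind the imputation step in _fit, as a standalone numpy sketch
# (an assumption about what linalg.impute does, not its actual implementation):
# given a rank-k SVD of the error matrix, regress the observed entries of the
# new row onto the row-space basis, then use the fitted coefficients to
# predict the unobserved entries.
#
#   import numpy as np
#   U, s, Vt = np.linalg.svd(error_matrix, full_matrices=False)
#   Vk = Vt[:rank]                                # (rank, n) row-space basis
#   obs = sorted(sampled_indices)                 # observed column indices
#   coef, *_ = np.linalg.lstsq(Vk[:, obs].T, new_row[0, obs], rcond=None)
#   imputed_row = coef @ Vk                       # estimates for every column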
def fit(self, x_train, y_train, verbose=False):
    """Fit an AutoLearner object, iteratively doubling the allowed runtime,
    and terminate when the time limit is reached."""
    t_predicted = convex_opt.predict_runtime(x_train.shape)

    # split data into training and validation sets; fall back to an
    # unstratified split if some class is too small to stratify on
    try:
        x_tr, x_va, y_tr, y_va = train_test_split(x_train, y_train,
                                                  test_size=0.15,
                                                  stratify=y_train,
                                                  random_state=0)
    except ValueError:
        x_tr, x_va, y_tr, y_va = train_test_split(x_train, y_train,
                                                  test_size=0.15,
                                                  random_state=0)

    ranks = [linalg.approx_rank(self.error_matrix, threshold=0.05)]
    if self.build_ensemble:
        t_init = 2**np.floor(np.log2(np.sort(t_predicted)[:int(1.1 * ranks[0])].sum()))
        t_init = max(1, t_init)
    else:
        t_init = self.runtime_limit / 2
    times = [t_init]
    losses = [0.5]
    e_hat, actual_times, sampled, ensembles = [], [], [], []

    start = time.time()

    def doubling():
        k, t = ranks[0], times[0]
        counter, self.best = 0, 0
        while time.time() - start < self.runtime_limit - t:
            if verbose:
                print('Fitting with k={}, t={}'.format(k, t))
            t0 = time.time()
            if self.build_ensemble:
                self.ensemble = Ensemble(self.p_type, self.ensemble_method,
                                         self.stacking_hyperparams)
            else:
                self.ensemble = Model_collection(self.p_type)
            self._fit(x_tr, y_tr, rank=k, runtime_limit=t)

            if self.build_ensemble:
                loss = util.error(y_va, self.ensemble.predict(x_va), self.p_type)
                # TEMPORARY: Record intermediate results
                e_hat.append(np.copy(self.new_row))
                actual_times.append(time.time() - start)
                sampled.append(self.sampled_indices)
                ensembles.append(self.ensemble)
                losses.append(loss)
                # grow the rank only if this round improved the validation loss
                if loss == min(losses):
                    ranks.append(k + 1)
                    self.best = counter
                else:
                    ranks.append(k)
            times.append(2 * t)
            k = ranks[-1]
            t = times[-1]
            counter += 1

    class TimeoutException(Exception):
        pass

    @contextmanager
    def time_limit(seconds):
        def signal_handler(signum, frame):
            raise TimeoutException("Time limit reached.")
        signal.signal(signal.SIGALRM, signal_handler)
        signal.alarm(seconds)
        try:
            yield
        finally:
            signal.alarm(0)

    try:
        # signal.alarm accepts whole seconds only
        with time_limit(int(self.runtime_limit)):
            doubling()
    except TimeoutException:
        if verbose:
            print("Time limit reached.")

    if self.build_ensemble:
        # after all iterations, restore the best model
        self.new_row = e_hat[self.best]
        self.ensemble = ensembles[self.best]
        return {
            'ranks': ranks[:-1],
            'runtime_limits': times[:-1],
            'validation_loss': losses,
            'predicted_new_row': e_hat,
            'actual_runtimes': actual_times,
            'sampled_indices': sampled,
            'models': ensembles,
        }
    else:
        return
def fit_doubling(self, x_train, y_train, verbose=False):
    """Fit an AutoLearner object, iteratively doubling the allowed runtime."""
    t_predicted = convex_opt.predict_runtime(x_train.shape)

    # split data into training and validation sets
    try:
        x_tr, x_va, y_tr, y_va = train_test_split(x_train, y_train,
                                                  test_size=0.15,
                                                  stratify=y_train,
                                                  random_state=0)
    except ValueError:
        x_tr, x_va, y_tr, y_va = train_test_split(x_train, y_train,
                                                  test_size=0.15,
                                                  random_state=0)

    ranks = [linalg.approx_rank(self.error_matrix, threshold=0.05)]
    t_init = 2**np.floor(np.log2(np.sort(t_predicted)[:int(1.1 * ranks[0])].sum()))
    t_init = max(1, t_init)
    times = [t_init]
    losses = [1.0]
    e_hat, actual_times, sampled, ensembles = [], [], [], []

    k, t = ranks[0], times[0]
    start = time.time()
    counter, best = 0, 0
    while time.time() - start < self.runtime_limit - t:
        if verbose:
            print('Fitting with k={}, t={}'.format(k, t))
        t0 = time.time()
        self.ensemble = Ensemble(self.p_type, self.stacking_alg,
                                 self.stacking_hyperparams)
        self.fit(x_tr, y_tr, rank=k, runtime_limit=t)
        loss = util.error(y_va, self.ensemble.predict(x_va), self.p_type)

        # TEMPORARY: Record intermediate results
        e_hat.append(np.copy(self.new_row))
        actual_times.append(time.time() - start)
        sampled.append(self.sampled_indices)
        ensembles.append(self.ensemble)
        losses.append(loss)

        if loss == min(losses):
            ranks.append(k + 1)
            best = counter
        else:
            ranks.append(k)
        times.append(2 * t)
        k = ranks[-1]
        t = times[-1]
        counter += 1

    # after all iterations, restore best model
    self.new_row = e_hat[best]
    self.ensemble = ensembles[best]

    return {
        'ranks': ranks[:-1],
        'runtime_limits': times[:-1],
        'validation_loss': losses,
        'predicted_new_row': e_hat,
        'actual_runtimes': actual_times,
        'sampled_indices': sampled,
        'models': ensembles,
    }
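# A hypothetical usage sketch of fit_doubling() and its diagnostics dict (keys
# taken from the return statement above; losses[0] is the 1.0 sentinel, so it
# is skipped when pairing losses with per-round budgets):
#
#   m = AutoLearner(runtime_limit=120)
#   history = m.fit_doubling(x_train, y_train, verbose=True)
#   for t, loss in zip(history['runtime_limits'], history['validation_loss'][1:]):
#       print('budget {:>6.0f}s -> validation loss {:.4f}'.format(t, loss))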
""" This is a script which calls uses the runtime prediction model of OBOE. OBOE: https://github.com/udellgroup/oboe """ from sklearn.datasets import load_iris from auto_learner import AutoLearner import convex_opt X, y = load_iris(return_X_y=True) m = AutoLearner(runtime_limit=20) # Produce runtime prediction based on dataset size: t_predicted = convex_opt.predict_runtime(X.shape, runtime_matrix=m.runtime_matrix) # zip(list(m.runtime_matrix.columns), t_predicted) # relative(?) runtime per algorithm # Create an ensemble of learners for dataset (lots of warnings): m.fit(X, y) # [el.algorithm for el in m.ensemble.base_learners] # algorithms in ensemble