def fit(self, datanode):
    model_cnt = 0
    for algo_id in self.stats["include_algorithms"]:
        train_list = self.stats[algo_id]['train_data_list']
        configs = self.stats[algo_id]['configurations']
        for idx in range(len(train_list)):
            X, y = train_list[idx].data
            for _config in configs:
                if self.base_model_mask[model_cnt] == 1:
                    estimator = fetch_predict_estimator(self.task_type, _config, X, y)
                    with open(os.path.join(self.output_dir,
                                           '%s-bagging-model%d' % (self.timestamp, model_cnt)),
                              'wb') as f:
                        pkl.dump(estimator, f)
                model_cnt += 1
    return self
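# A minimal predict sketch (not part of the original source): it assumes the
# '%s-bagging-model%d' file-naming scheme used in fit() above and simply
# averages the saved bagging members' predictions. The helper name is
# hypothetical.
def predict_bagging_sketch(self, X_test):
    model_cnt = 0
    preds = []
    for algo_id in self.stats["include_algorithms"]:
        for _ in self.stats[algo_id]['train_data_list']:
            for _ in self.stats[algo_id]['configurations']:
                if self.base_model_mask[model_cnt] == 1:
                    path = os.path.join(self.output_dir,
                                        '%s-bagging-model%d' % (self.timestamp, model_cnt))
                    with open(path, 'rb') as f:
                        estimator = pkl.load(f)
                    if self.task_type in CLS_TASKS:
                        preds.append(estimator.predict_proba(X_test))
                    else:
                        preds.append(estimator.predict(X_test))
                model_cnt += 1
    # Uniform average over the selected base models.
    return np.mean(preds, axis=0)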
def fit(self, data):
    # Split training data for phase 1 and phase 2.
    if self.task_type in CLS_TASKS:
        kf = StratifiedKFold(n_splits=self.kfold)
    else:
        kf = KFold(n_splits=self.kfold)

    # Train base models on the phase-1 folds.
    model_cnt = 0
    suc_cnt = 0
    feature_p2 = None
    for algo_id in self.stats["include_algorithms"]:
        train_list = self.stats[algo_id]['train_data_list']
        configs = self.stats[algo_id]['configurations']
        for idx in range(len(train_list)):
            X, y = train_list[idx].data
            for _config in configs:
                if self.base_model_mask[model_cnt] == 1:
                    for j, (train, test) in enumerate(kf.split(X, y)):
                        x_p1, x_p2, y_p1, _ = X[train], X[test], y[train], y[test]
                        estimator = fetch_predict_estimator(self.task_type, _config, x_p1, y_p1)
                        with open(os.path.join(self.output_dir,
                                               '%s-model%d_part%d' % (self.timestamp, model_cnt, j)),
                                  'wb') as f:
                            pkl.dump(estimator, f)
                        if self.task_type in CLS_TASKS:
                            pred = estimator.predict_proba(x_p2)
                            n_dim = np.array(pred).shape[1]
                            if n_dim == 2:
                                # Binary classification: one probability column suffices.
                                n_dim = 1
                            # Initialize the training matrix for phase 2 lazily.
                            if feature_p2 is None:
                                num_samples = len(train) + len(test)
                                feature_p2 = np.zeros((num_samples, self.ensemble_size * n_dim))
                            if n_dim == 1:
                                feature_p2[test, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred[:, 1:2]
                            else:
                                feature_p2[test, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred
                        else:
                            pred = estimator.predict(x_p2).reshape(-1, 1)
                            n_dim = 1
                            # Initialize the training matrix for phase 2 lazily.
                            if feature_p2 is None:
                                num_samples = len(train) + len(test)
                                feature_p2 = np.zeros((num_samples, self.ensemble_size * n_dim))
                            feature_p2[test, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred
                    suc_cnt += 1
                model_cnt += 1
    # Train the stacking meta-learner on the out-of-fold predictions, which
    # now cover every training sample exactly once.
    self.meta_learner.fit(feature_p2, y)
    return self
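# A minimal predict sketch (not part of the original source): it mirrors the
# stacking fit() above, loading each selected base model's k per-fold
# estimators, averaging their test-set predictions to rebuild the phase-2
# feature matrix, and delegating the final decision to the meta-learner
# (assumed to expose a scikit-learn style predict()). The helper name is
# hypothetical.
def predict_stacking_sketch(self, X_test):
    feature_p2 = None
    model_cnt = 0
    suc_cnt = 0
    for algo_id in self.stats["include_algorithms"]:
        for _ in self.stats[algo_id]['train_data_list']:
            for _ in self.stats[algo_id]['configurations']:
                if self.base_model_mask[model_cnt] == 1:
                    fold_preds = []
                    for j in range(self.kfold):
                        path = os.path.join(self.output_dir,
                                            '%s-model%d_part%d' % (self.timestamp, model_cnt, j))
                        with open(path, 'rb') as f:
                            estimator = pkl.load(f)
                        if self.task_type in CLS_TASKS:
                            fold_preds.append(estimator.predict_proba(X_test))
                        else:
                            fold_preds.append(estimator.predict(X_test).reshape(-1, 1))
                    pred = np.mean(fold_preds, axis=0)
                    n_dim = pred.shape[1]
                    if n_dim == 2:  # binary classification: keep the positive column
                        pred, n_dim = pred[:, 1:2], 1
                    if feature_p2 is None:
                        feature_p2 = np.zeros((len(X_test), self.ensemble_size * n_dim))
                    feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred
                    suc_cnt += 1
                model_cnt += 1
    return self.meta_learner.predict(feature_p2)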
def __init__(self, stats, ensemble_method: str,
             ensemble_size: int, task_type: int,
             metric: _BaseScorer, save_model=False, output_dir=None):
    self.stats = stats
    self.ensemble_method = ensemble_method
    self.ensemble_size = ensemble_size
    self.task_type = task_type
    self.metric = metric
    self.model_cnt = 0
    self.save_model = save_model
    self.output_dir = output_dir
    self.train_predictions = []
    self.config_list = []
    self.train_data_dict = {}
    self.train_labels = None
    self.seed = self.stats['split_seed']
    self.timestamp = str(time.time())
    for algo_id in self.stats["include_algorithms"]:
        train_list = self.stats[algo_id]['train_data_list']
        configs = self.stats[algo_id]['configurations']
        for idx in range(len(train_list)):
            X, y = train_list[idx].data
            # TODO: Hyperparameter
            test_size = 0.2
            if self.task_type in CLS_TASKS:
                ss = StratifiedShuffleSplit(n_splits=1, test_size=test_size,
                                            random_state=self.seed)
            else:
                ss = ShuffleSplit(n_splits=1, test_size=test_size,
                                  random_state=self.seed)
            for train_index, test_index in ss.split(X, y):
                X_train, X_valid = X[train_index], X[test_index]
                y_train, y_valid = y[train_index], y[test_index]
            # Every configuration must be evaluated on the same validation labels.
            if self.train_labels is not None:
                assert (self.train_labels == y_valid).all()
            else:
                self.train_labels = y_valid
            for _config in configs:
                self.config_list.append(_config)
                self.train_data_dict[self.model_cnt] = (X, y)
                estimator = fetch_predict_estimator(self.task_type, _config,
                                                    X_train, y_train)
                if self.save_model:
                    with open(os.path.join(self.output_dir,
                                           '%s-model%d' % (self.timestamp, self.model_cnt)),
                              'wb') as f:
                        pkl.dump(estimator, f)
                if self.task_type in CLS_TASKS:
                    y_valid_pred = estimator.predict_proba(X_valid)
                else:
                    y_valid_pred = estimator.predict(X_valid)
                self.train_predictions.append(y_valid_pred)
                self.model_cnt += 1
    if len(self.train_predictions) < self.ensemble_size:
        self.ensemble_size = len(self.train_predictions)
    if ensemble_method == 'ensemble_selection':
        return
    # Select a diverse subset of base models for the ensemble.
    if task_type in CLS_TASKS:
        self.base_model_mask = choose_base_models_classification(
            np.array(self.train_predictions), self.ensemble_size)
    else:
        self.base_model_mask = choose_base_models_regression(
            np.array(self.train_predictions), np.array(y_valid), self.ensemble_size)
    self.ensemble_size = sum(self.base_model_mask)
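# Illustrative only (not part of the original source): the layout of `stats`
# implied by the attribute accesses above. All values are placeholders.
#
# stats = {
#     'split_seed': 1,
#     'include_algorithms': ['random_forest', 'libsvm_svc'],
#     'random_forest': {
#         'train_data_list': [data_node],           # DataNode objects; .data == (X, y)
#         'configurations': [config_a, config_b],   # candidate hyperparameter configs
#     },
#     'libsvm_svc': {...},
# }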
def fit(self, train_data: DataNode, dataset_id=None):
    """
    This function performs the following two procedures:
        1. tune each algorithm's hyperparameters.
        2. engineer each algorithm's features automatically.
    :param train_data: the training data node.
    :return:
    """
    # Initialize each algorithm's solver.
    for _algo in self.include_algorithms:
        self.solvers[_algo] = SecondLayerBandit(
            self.task_type, _algo, train_data,
            metric=self.metric,
            output_dir=self.output_dir,
            per_run_time_limit=self.per_run_time_limit,
            seed=self.seed,
            eval_type=self.evaluation_type,
            dataset_id=dataset_id,
            n_jobs=self.n_jobs,
            mth='alter_hpo')

    # Set the resource limit.
    if self.time_limit is not None:
        time_limit_per_algo = self.time_limit / len(self.include_algorithms)
        max_iter_num = 999999
    else:
        time_limit_per_algo = None
        max_iter_num = self.iter_num_per_algo

    # Optimize each algorithm with its corresponding solver.
    for algo in self.include_algorithms:
        _start_time, _iter_id = time.time(), 0
        solver = self.solvers[algo]
        while _iter_id < max_iter_num:
            result = solver.play_once()
            print('optimize %s in %d-th iteration: %.3f' % (algo, _iter_id, result))
            _iter_id += 1
            if self.time_limit is not None:
                if time.time() - _start_time >= time_limit_per_algo:
                    break
            if solver.early_stopped_flag:
                break

    # Pick the best-performing algorithm found so far.
    for algo_id in self.include_algorithms:
        if self.solvers[algo_id].incumbent_perf > self.best_perf:
            self.best_perf = self.solvers[algo_id].incumbent_perf
            self.best_algo_id = algo_id
    self.best_data_node = self.solvers[self.best_algo_id].inc['fe']
    self.fe_optimizer = self.solvers[self.best_algo_id].optimizer['fe']
    self.best_config = self.solvers[self.best_algo_id].inc['hpo']

    if self.ensemble_method is not None:
        self.stats = self.fetch_ensemble_members()
        # Ensemble all intermediate/final models found in the above optimization process.
        # TODO: version 1.0, support multiple ensemble methods.
        self.es = EnsembleBuilder(stats=self.stats,
                                  ensemble_method=self.ensemble_method,
                                  ensemble_size=self.ensemble_size,
                                  task_type=self.task_type,
                                  metric=self.metric,
                                  output_dir=self.output_dir)
        self.es.fit(data=train_data)
    else:
        best_estimator = fetch_predict_estimator(self.task_type, self.best_config,
                                                 self.best_data_node.data[0],
                                                 self.best_data_node.data[1])
        with open(os.path.join(self.output_dir,
                               '%s-best_model' % str(self.timestamp)), 'wb') as f:
            pkl.dump(best_estimator, f)
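# Hypothetical usage sketch (not part of the original source). The owning
# class name `FirstLayerBandit` and its constructor arguments are assumptions
# inferred from the attributes read in fit() above; only fit() itself is
# taken from the source.
#
# bandit = FirstLayerBandit(task_type=task_type,
#                           include_algorithms=['random_forest', 'libsvm_svc'],
#                           metric=metric,
#                           time_limit=3600,
#                           ensemble_method='ensemble_selection',
#                           output_dir='./logs')
# bandit.fit(train_data, dataset_id='some_dataset')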
def fit(self, data):
    # Hold out part of the training data for phase 2.
    test_size = 0.2

    # Train base models on the phase-1 split.
    model_cnt = 0
    suc_cnt = 0
    feature_p2 = None
    for algo_id in self.stats["include_algorithms"]:
        train_list = self.stats[algo_id]['train_data_list']
        configs = self.stats[algo_id]['configurations']
        for idx in range(len(train_list)):
            X, y = train_list[idx].data
            if self.task_type in CLS_TASKS:
                x_p1, x_p2, y_p1, y_p2 = train_test_split(X, y, test_size=test_size,
                                                          stratify=data.data[1],
                                                          random_state=self.seed)
            else:
                x_p1, x_p2, y_p1, y_p2 = train_test_split(X, y, test_size=test_size,
                                                          random_state=self.seed)
            for _config in configs:
                if self.base_model_mask[model_cnt] == 1:
                    estimator = fetch_predict_estimator(self.task_type, _config, x_p1, y_p1)
                    with open(os.path.join(self.output_dir,
                                           '%s-blending-model%d' % (self.timestamp, model_cnt)),
                              'wb') as f:
                        pkl.dump(estimator, f)
                    if self.task_type in CLS_TASKS:
                        pred = estimator.predict_proba(x_p2)
                        n_dim = np.array(pred).shape[1]
                        if n_dim == 2:
                            # Binary classification: one probability column suffices.
                            n_dim = 1
                        # Initialize the training matrix for phase 2 lazily.
                        if feature_p2 is None:
                            num_samples = len(x_p2)
                            feature_p2 = np.zeros((num_samples, self.ensemble_size * n_dim))
                        if n_dim == 1:
                            feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred[:, 1:2]
                        else:
                            feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred
                    else:
                        pred = estimator.predict(x_p2).reshape(-1, 1)
                        n_dim = 1
                        # Initialize the training matrix for phase 2 lazily.
                        if feature_p2 is None:
                            num_samples = len(x_p2)
                            feature_p2 = np.zeros((num_samples, self.ensemble_size * n_dim))
                        feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred
                    suc_cnt += 1
                model_cnt += 1
    # Train the blending meta-learner on the held-out phase-2 predictions.
    self.meta_learner.fit(feature_p2, y_p2)
    return self
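# A minimal predict sketch (not part of the original source): it assumes the
# '%s-blending-model%d' naming scheme from fit() above, rebuilds the phase-2
# feature matrix on the test set, and lets the meta-learner decide. The
# helper name is hypothetical.
def predict_blending_sketch(self, X_test):
    feature_p2 = None
    model_cnt = 0
    suc_cnt = 0
    for algo_id in self.stats["include_algorithms"]:
        for _ in self.stats[algo_id]['train_data_list']:
            for _ in self.stats[algo_id]['configurations']:
                if self.base_model_mask[model_cnt] == 1:
                    path = os.path.join(self.output_dir,
                                        '%s-blending-model%d' % (self.timestamp, model_cnt))
                    with open(path, 'rb') as f:
                        estimator = pkl.load(f)
                    if self.task_type in CLS_TASKS:
                        pred = estimator.predict_proba(X_test)
                    else:
                        pred = estimator.predict(X_test).reshape(-1, 1)
                    n_dim = pred.shape[1]
                    if n_dim == 2:  # binary classification: keep the positive column
                        pred, n_dim = pred[:, 1:2], 1
                    if feature_p2 is None:
                        feature_p2 = np.zeros((len(X_test), self.ensemble_size * n_dim))
                    feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred
                    suc_cnt += 1
                model_cnt += 1
    return self.meta_learner.predict(feature_p2)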
def predict_proba(self, X_test, is_weighted=False):
    """
    Weight sources:
        model 1: local_inc['fe'], default_hpo
        model 2: default_fe, local_inc['hpo']
        model 3: local_inc['fe'], local_inc['hpo']
        model 4: default_fe, default_hpo
    :param X_test: test features.
    :param is_weighted: if True, combine the four models with weights learned
        via ensemble selection on a held-out validation split.
    :return: the combined class-probability predictions.
    """
    X_train_ori, y_train_ori = self.original_data.data
    X_train_inc, y_train_inc = self.local_inc['fe'].data
    model1_clf = fetch_predict_estimator(self.task_type, self.default_config, X_train_inc, y_train_inc)
    model2_clf = fetch_predict_estimator(self.task_type, self.local_inc['hpo'], X_train_ori, y_train_ori)
    model3_clf = fetch_predict_estimator(self.task_type, self.local_inc['hpo'], X_train_inc, y_train_inc)
    model4_clf = fetch_predict_estimator(self.task_type, self.default_config, X_train_ori, y_train_ori)

    if is_weighted:
        # Weights are based on performance on the validation set.
        # TODO: Save the results so that the models will not be trained again.
        from automlToolkit.components.ensemble.ensemble_selection import EnsembleSelection
        from autosklearn.metrics import balanced_accuracy
        sss = StratifiedShuffleSplit(n_splits=1, test_size=0.33, random_state=1)
        X, y = X_train_ori.copy(), y_train_ori.copy()
        _X, _y = X_train_inc.copy(), y_train_inc.copy()
        for train_index, test_index in sss.split(X, y):
            X_train, X_val, y_train, y_val = X[train_index], X[test_index], y[train_index], y[test_index]
            _X_train, _X_val, _y_train, _y_val = _X[train_index], _X[test_index], _y[train_index], _y[test_index]
        assert (y_val == _y_val).all()
        model1_clf_temp = fetch_predict_estimator(self.task_type, self.default_config, _X_train, _y_train)
        model2_clf_temp = fetch_predict_estimator(self.task_type, self.local_inc['hpo'], X_train, y_train)
        model3_clf_temp = fetch_predict_estimator(self.task_type, self.local_inc['hpo'], _X_train, _y_train)
        model4_clf_temp = fetch_predict_estimator(self.task_type, self.default_config, X_train, y_train)
        pred1 = model1_clf_temp.predict_proba(_X_val)
        pred2 = model2_clf_temp.predict_proba(X_val)
        pred3 = model3_clf_temp.predict_proba(_X_val)
        pred4 = model4_clf_temp.predict_proba(X_val)
        # Ensemble size is a hyperparameter.
        es = EnsembleSelection(ensemble_size=20, task_type=1, metric=balanced_accuracy,
                               random_state=np.random.RandomState(self.seed))
        es.fit([pred1, pred2, pred3, pred4], y_val, None)
        weights = es.weights_
        print("weights " + str(weights))

    # Apply the incumbent feature transformation to the test data.
    # Make sure that the estimator has "predict_proba".
    _test_node = DataNode(data=[X_test, None], feature_type=self.original_data.feature_types.copy())
    _X_test = self.optimizer['fe'].apply(_test_node, self.local_inc['fe']).data[0]
    pred1 = model1_clf.predict_proba(_X_test)
    pred2 = model2_clf.predict_proba(X_test)
    pred3 = model3_clf.predict_proba(_X_test)
    pred4 = model4_clf.predict_proba(X_test)
    if is_weighted:
        final_pred = weights[0] * pred1 + weights[1] * pred2 + weights[2] * pred3 + weights[3] * pred4
    else:
        final_pred = (pred1 + pred2 + pred3 + pred4) / 4
    return final_pred
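# A self-contained numpy equivalent (not part of the original source) of the
# final combination step above: stack the per-model probability matrices and
# contract with the ensemble-selection weights. The helper name is
# hypothetical; uniform averaging is the special case weights = [0.25] * 4.
def weighted_combination_sketch(preds, weights):
    # preds: list of (n_samples, n_classes) arrays; weights: (n_models,) array.
    # tensordot sums over the model axis, yielding an (n_samples, n_classes) result.
    return np.tensordot(np.asarray(weights), np.stack(preds), axes=1)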