Example No. 1
    def fit(self, datanode):
        # Refit every selected base model on its full training split and
        # persist it to disk for later bagging-style prediction.
        model_cnt = 0
        for algo_id in self.stats["include_algorithms"]:
            train_list = self.stats[algo_id]['train_data_list']
            configs = self.stats[algo_id]['configurations']
            for idx in range(len(train_list)):
                X, y = train_list[idx].data
                for _config in configs:
                    # Skip models excluded by the base-model mask.
                    if self.base_model_mask[model_cnt] == 1:
                        estimator = fetch_predict_estimator(
                            self.task_type, _config, X, y)
                        with open(
                                os.path.join(
                                    self.output_dir, '%s-bagging-model%d' %
                                    (self.timestamp, model_cnt)), 'wb') as f:
                            pkl.dump(estimator, f)
                    model_cnt += 1
        return self
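
A minimal sketch (not from the source) of the matching prediction path: the models pickled above can be loaded back and soft-voted. The helper name bagging_predict_proba and the model_indices argument are hypothetical; only the '%s-bagging-model%d' file-name pattern comes from fit.

import os
import pickle as pkl
import numpy as np

def bagging_predict_proba(output_dir, timestamp, model_indices, X_test):
    # Load each persisted base model and collect its class probabilities.
    preds = []
    for model_cnt in model_indices:
        path = os.path.join(output_dir,
                            '%s-bagging-model%d' % (timestamp, model_cnt))
        with open(path, 'rb') as f:
            preds.append(pkl.load(f).predict_proba(X_test))
    # Uniform soft voting over the ensemble members.
    return np.mean(preds, axis=0)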
Example No. 2
    def fit(self, data):
        # Split training data for phase 1 and phase 2
        if self.task_type in CLS_TASKS:
            kf = StratifiedKFold(n_splits=self.kfold)
        else:
            kf = KFold(n_splits=self.kfold)

        # Train basic models using a part of training data
        model_cnt = 0
        suc_cnt = 0
        feature_p2 = None
        for algo_id in self.stats["include_algorithms"]:
            train_list = self.stats[algo_id]['train_data_list']
            configs = self.stats[algo_id]['configurations']
            for idx in range(len(train_list)):
                X, y = train_list[idx].data
                for _config in configs:
                    if self.base_model_mask[model_cnt] == 1:
                        for j, (train, test) in enumerate(kf.split(X, y)):
                            x_p1, x_p2 = X[train], X[test]
                            y_p1 = y[train]
                            estimator = fetch_predict_estimator(
                                self.task_type, _config, x_p1, y_p1)
                            with open(
                                    os.path.join(
                                        self.output_dir, '%s-model%d_part%d' %
                                        (self.timestamp, model_cnt, j)),
                                    'wb') as f:
                                pkl.dump(estimator, f)
                            if self.task_type in CLS_TASKS:
                                pred = estimator.predict_proba(x_p2)
                                n_dim = np.array(pred).shape[1]
                                if n_dim == 2:
                                    # Binary classification
                                    n_dim = 1
                                # Initialize training matrix for phase 2
                                if feature_p2 is None:
                                    num_samples = len(train) + len(test)
                                    feature_p2 = np.zeros(
                                        (num_samples,
                                         self.ensemble_size * n_dim))
                                if n_dim == 1:
                                    feature_p2[test,
                                               suc_cnt * n_dim:(suc_cnt + 1) *
                                               n_dim] = pred[:, 1:2]
                                else:
                                    feature_p2[test,
                                               suc_cnt * n_dim:(suc_cnt + 1) *
                                               n_dim] = pred
                            else:
                                pred = estimator.predict(x_p2).reshape(-1, 1)
                                n_dim = 1
                                # Initialize training matrix for phase 2
                                if feature_p2 is None:
                                    num_samples = len(train) + len(test)
                                    feature_p2 = np.zeros(
                                        (num_samples,
                                         self.ensemble_size * n_dim))
                                feature_p2[test, suc_cnt *
                                           n_dim:(suc_cnt + 1) * n_dim] = pred
                        suc_cnt += 1
                    model_cnt += 1
        # Train the stacking meta-learner on the out-of-fold predictions
        self.meta_learner.fit(feature_p2, y)
        return self
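
A self-contained check (an illustration, not source code) of the indexing invariant the loop above relies on: the k test folds of KFold partition the sample indices, so the writes to feature_p2[test, ...] fill each row exactly once and every sample receives an out-of-fold prediction.

import numpy as np
from sklearn.model_selection import KFold

n_samples = 10
covered = np.zeros(n_samples, dtype=int)
for train, test in KFold(n_splits=5).split(np.arange(n_samples)):
    covered[test] += 1
assert (covered == 1).all()  # each row of feature_p2 is written exactly once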
Example No. 3
    def __init__(self, stats, ensemble_method: str,
                 ensemble_size: int,
                 task_type: int,
                 metric: _BaseScorer,
                 save_model=False,
                 output_dir=None):
        self.stats = stats
        self.ensemble_method = ensemble_method
        self.ensemble_size = ensemble_size
        self.task_type = task_type
        self.metric = metric
        self.model_cnt = 0
        self.save_model = save_model
        self.output_dir = output_dir

        self.train_predictions = []
        self.config_list = []
        self.train_data_dict = {}
        self.train_labels = None
        self.seed = self.stats['split_seed']
        self.timestamp = str(time.time())
        for algo_id in self.stats["include_algorithms"]:
            train_list = self.stats[algo_id]['train_data_list']
            configs = self.stats[algo_id]['configurations']
            for idx in range(len(train_list)):
                X, y = train_list[idx].data

                # TODO: Hyperparameter
                test_size = 0.2

                if self.task_type in CLS_TASKS:
                    ss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=self.seed)
                else:
                    ss = ShuffleSplit(n_splits=1, test_size=test_size, random_state=self.seed)

                # n_splits=1, so the split loop body executes exactly once.
                for train_index, test_index in ss.split(X, y):
                    X_train, X_valid = X[train_index], X[test_index]
                    y_train, y_valid = y[train_index], y[test_index]

                # Every feature-engineered view keeps the original sample
                # order and split seed, so the validation labels must match.
                if self.train_labels is not None:
                    assert (self.train_labels == y_valid).all()
                else:
                    self.train_labels = y_valid

                for _config in configs:
                    self.config_list.append(_config)
                    self.train_data_dict[self.model_cnt] = (X, y)
                    estimator = fetch_predict_estimator(self.task_type, _config, X_train, y_train)
                    if self.save_model:
                        with open(os.path.join(self.output_dir, '%s-model%d' % (self.timestamp, self.model_cnt)),
                                  'wb') as f:
                            pkl.dump(estimator, f)
                    if self.task_type in CLS_TASKS:
                        y_valid_pred = estimator.predict_proba(X_valid)
                    else:
                        y_valid_pred = estimator.predict(X_valid)
                    self.train_predictions.append(y_valid_pred)
                    self.model_cnt += 1
        if len(self.train_predictions) < self.ensemble_size:
            self.ensemble_size = len(self.train_predictions)

        if ensemble_method == 'ensemble_selection':
            return

        if task_type in CLS_TASKS:
            self.base_model_mask = choose_base_models_classification(np.array(self.train_predictions),
                                                                     self.ensemble_size)
        else:
            self.base_model_mask = choose_base_models_regression(np.array(self.train_predictions), np.array(y_valid),
                                                                 self.ensemble_size)
        self.ensemble_size = sum(self.base_model_mask)
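
A small demonstration of why the label assertion in __init__ holds: scikit-learn's StratifiedShuffleSplit derives its indices from the labels, the sample count, and the seed, not from the feature values, so every feature-engineered view of the same samples receives the same validation indices.

import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

y = np.array([0, 1] * 10)
X_a = np.random.rand(20, 3)  # one feature-engineered view of the samples
X_b = np.random.rand(20, 5)  # another view with different features
ss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1)
_, idx_a = next(iter(ss.split(X_a, y)))
_, idx_b = next(iter(ss.split(X_b, y)))
assert (y[idx_a] == y[idx_b]).all()  # same validation labels across views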
Example No. 4
    def fit(self, train_data: DataNode, dataset_id=None):
        """
        this function includes this following two procedures.
            1. tune each algorithm's hyperparameters.
            2. engineer each algorithm's features automatically.
        :param train_data:
        :return:
        """
        # Initialize each algorithm's solver.
        for _algo in self.include_algorithms:
            self.solvers[_algo] = SecondLayerBandit(self.task_type, _algo, train_data,
                                                    metric=self.metric,
                                                    output_dir=self.output_dir,
                                                    per_run_time_limit=self.per_run_time_limit,
                                                    seed=self.seed,
                                                    eval_type=self.evaluation_type,
                                                    dataset_id=dataset_id,
                                                    n_jobs=self.n_jobs,
                                                    mth='alter_hpo')

        # Set the resource limit.
        if self.time_limit is not None:
            time_limit_per_algo = self.time_limit / len(self.include_algorithms)
            max_iter_num = 999999
        else:
            time_limit_per_algo = None
            max_iter_num = self.iter_num_per_algo

        # Optimize each algorithm with corresponding solver.
        for algo in self.include_algorithms:
            _start_time, _iter_id = time.time(), 0
            solver = self.solvers[algo]

            while _iter_id < max_iter_num:
                result = solver.play_once()
                print('optimize %s in %d-th iteration: %.3f' % (algo, _iter_id, result))
                _iter_id += 1
                if self.time_limit is not None:
                    if time.time() - _start_time >= time_limit_per_algo:
                        break
                if solver.early_stopped_flag:
                    break

        for algo_id in self.include_algorithms:
            if self.solvers[algo_id].incumbent_perf > self.best_perf:
                self.best_perf = self.solvers[algo_id].incumbent_perf
                self.best_algo_id = algo_id
        self.best_data_node = self.solvers[self.best_algo_id].inc['fe']
        self.fe_optimizer = self.solvers[self.best_algo_id].optimizer['fe']
        self.best_config = self.solvers[self.best_algo_id].inc['hpo']

        if self.ensemble_method is not None:
            self.stats = self.fetch_ensemble_members()
            # Ensemble all intermediate/final models found in the above optimization process.
            # TODO: version1.0, support multiple ensemble methods.
            self.es = EnsembleBuilder(stats=self.stats,
                                      ensemble_method=self.ensemble_method,
                                      ensemble_size=self.ensemble_size,
                                      task_type=self.task_type,
                                      metric=self.metric,
                                      output_dir=self.output_dir)
            self.es.fit(data=train_data)
        else:
            best_estimator = fetch_predict_estimator(self.task_type, self.best_config, self.best_data_node.data[0],
                                                     self.best_data_node.data[1])
            with open(os.path.join(self.output_dir, '%s-best_model' % str(self.timestamp)), 'wb') as f:
                pkl.dump(best_estimator, f)
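
A compact restatement of the resource rule above (per_algo_budget is a hypothetical helper for illustration): a global time limit is split evenly across the included algorithms with the iteration cap effectively disabled; without a time limit, the per-algorithm iteration count bounds the work instead.

def per_algo_budget(time_limit, n_algorithms, iter_num_per_algo):
    # Mirrors the branch in fit: time-bounded or iteration-bounded search.
    if time_limit is not None:
        return time_limit / n_algorithms, 999999
    return None, iter_num_per_algo

assert per_algo_budget(600, 4, 10) == (150.0, 999999)
assert per_algo_budget(None, 4, 10) == (None, 10)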
Example No. 5
    def fit(self, data):
        # Split training data for phase 1 and phase 2
        test_size = 0.2

        # Train basic models using a part of training data
        model_cnt = 0
        suc_cnt = 0
        feature_p2 = None
        for algo_id in self.stats["include_algorithms"]:
            train_list = self.stats[algo_id]['train_data_list']
            configs = self.stats[algo_id]['configurations']
            for idx in range(len(train_list)):
                X, y = train_list[idx].data
                if self.task_type in CLS_TASKS:
                    x_p1, x_p2, y_p1, y_p2 = train_test_split(
                        X,
                        y,
                        test_size=test_size,
                        stratify=data.data[1],
                        random_state=self.seed)
                else:
                    x_p1, x_p2, y_p1, y_p2 = train_test_split(
                        X, y, test_size=test_size, random_state=self.seed)
                for _config in configs:
                    if self.base_model_mask[model_cnt] == 1:
                        estimator = fetch_predict_estimator(
                            self.task_type, _config, x_p1, y_p1)
                        with open(
                                os.path.join(
                                    self.output_dir, '%s-blending-model%d' %
                                    (self.timestamp, model_cnt)), 'wb') as f:
                            pkl.dump(estimator, f)
                        if self.task_type in CLS_TASKS:
                            pred = estimator.predict_proba(x_p2)
                            n_dim = np.array(pred).shape[1]
                            if n_dim == 2:
                                # Binary classification
                                n_dim = 1
                            # Initialize training matrix for phase 2
                            if feature_p2 is None:
                                num_samples = len(x_p2)
                                feature_p2 = np.zeros(
                                    (num_samples, self.ensemble_size * n_dim))
                            if n_dim == 1:
                                feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) *
                                           n_dim] = pred[:, 1:2]
                            else:
                                feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) *
                                           n_dim] = pred
                        else:
                            pred = estimator.predict(x_p2).reshape(-1, 1)
                            n_dim = 1
                            # Initialize training matrix for phase 2
                            if feature_p2 is None:
                                num_samples = len(x_p2)
                                feature_p2 = np.zeros(
                                    (num_samples, self.ensemble_size * n_dim))
                            feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) *
                                       n_dim] = pred
                        suc_cnt += 1
                    model_cnt += 1
        self.meta_learner.fit(feature_p2, y_p2)

        return self
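
A hedged sketch (not part of the source) of the blending prediction path that mirrors fit above, for the classification case: each persisted base model contributes a column block to the phase-2 feature matrix, which the meta-learner then consumes. The helper name blending_features and its arguments are assumptions; the '%s-blending-model%d' pattern comes from fit.

import os
import pickle as pkl
import numpy as np

def blending_features(output_dir, timestamp, model_indices, X_test, n_dim):
    # Rebuild the phase-2 feature layout used in fit for unseen data.
    feature = np.zeros((len(X_test), len(model_indices) * n_dim))
    for suc_cnt, model_cnt in enumerate(model_indices):
        path = os.path.join(output_dir,
                            '%s-blending-model%d' % (timestamp, model_cnt))
        with open(path, 'rb') as f:
            pred = pkl.load(f).predict_proba(X_test)
        if n_dim == 1:
            pred = pred[:, 1:2]  # binary case: keep the positive-class column
        feature[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred
    return feature

The final prediction would then be something like meta_learner.predict(blending_features(...)).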
Example No. 6
    def predict_proba(self, X_test, is_weighted=False):
        """
            weight source: ...
            model 1: local_inc['fe'], default_hpo
            model 2: default_fe, local_inc['hpo']
            model 3: local_inc['fe'], local_inc['hpo']
        :param X_test:
        :param is_weighted:
        :return:
        """
        X_train_ori, y_train_ori = self.original_data.data
        X_train_inc, y_train_inc = self.local_inc['fe'].data

        model1_clf = fetch_predict_estimator(self.task_type, self.default_config, X_train_inc, y_train_inc)
        model2_clf = fetch_predict_estimator(self.task_type, self.local_inc['hpo'], X_train_ori, y_train_ori)
        model3_clf = fetch_predict_estimator(self.task_type, self.local_inc['hpo'], X_train_inc, y_train_inc)
        model4_clf = fetch_predict_estimator(self.task_type, self.default_config, X_train_ori, y_train_ori)

        if is_weighted:
            # Based on performance on the validation set
            # TODO: Save the results so that the models will not be trained again
            from automlToolkit.components.ensemble.ensemble_selection import EnsembleSelection
            from autosklearn.metrics import balanced_accuracy
            sss = StratifiedShuffleSplit(n_splits=1, test_size=0.33, random_state=1)
            X, y = X_train_ori.copy(), y_train_ori.copy()
            _X, _y = X_train_inc.copy(), y_train_inc.copy()
            for train_index, test_index in sss.split(X, y):
                X_train, X_val, y_train, y_val = X[train_index], X[test_index], y[train_index], y[test_index]
                _X_train, _X_val, _y_train, _y_val = _X[train_index], _X[test_index], _y[train_index], _y[test_index]

            assert (y_val == _y_val).all()
            model1_clf_temp = fetch_predict_estimator(self.task_type, self.default_config, _X_train, _y_train)
            model2_clf_temp = fetch_predict_estimator(self.task_type, self.local_inc['hpo'], X_train, y_train)
            model3_clf_temp = fetch_predict_estimator(self.task_type, self.local_inc['hpo'], _X_train, _y_train)
            model4_clf_temp = fetch_predict_estimator(self.task_type, self.default_config, X_train, y_train)
            pred1 = model1_clf_temp.predict_proba(_X_val)
            pred2 = model2_clf_temp.predict_proba(X_val)
            pred3 = model3_clf_temp.predict_proba(_X_val)
            pred4 = model4_clf_temp.predict_proba(X_val)

            # Ensemble size is a hyperparameter
            es = EnsembleSelection(ensemble_size=20, task_type=1, metric=balanced_accuracy,
                                   random_state=np.random.RandomState(self.seed))
            es.fit([pred1, pred2, pred3, pred4], y_val, None)
            weights = es.weights_
            print("weights " + str(weights))

        # Make sure that the estimator has "predict_proba"
        _test_node = DataNode(data=[X_test, None], feature_type=self.original_data.feature_types.copy())
        _X_test = self.optimizer['fe'].apply(_test_node, self.local_inc['fe']).data[0]
        pred1 = model1_clf.predict_proba(_X_test)
        pred2 = model2_clf.predict_proba(X_test)
        pred3 = model3_clf.predict_proba(_X_test)
        pred4 = model4_clf.predict_proba(X_test)

        if is_weighted:
            final_pred = weights[0] * pred1 + weights[1] * pred2 + weights[2] * pred3 + weights[3] * pred4
        else:
            final_pred = (pred1 + pred2 + pred3 + pred4) / 4

        return final_pred
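
Both branches above are weighted sums of the four probability matrices; the unweighted case is simply uniform weights of 1/4. A minimal sketch (combine is a hypothetical helper):

import numpy as np

def combine(preds, weights=None):
    # preds: (n_models, n_samples, n_classes); uniform weights by default.
    preds = np.asarray(preds)
    if weights is None:
        weights = np.full(len(preds), 1.0 / len(preds))
    return np.tensordot(weights, preds, axes=1)  # weighted sum over models

combine([pred1, pred2, pred3, pred4]) reproduces the unweighted average, and combine([pred1, pred2, pred3, pred4], es.weights_) the weighted branch.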