Пример #1
0
    def compute_feature_importance(self,
                                   X,
                                   y,
                                   features_to_use=None,
                                   preprocess=True,
                                   is_oof=True,
                                   silent=False,
                                   **kwargs) -> pd.Series:
        """Return child feature importances averaged with row-count weights.

        Each child model scores a subset of rows of ``X``/``y``; its result is
        weighted by the number of rows it scored, and the weighted results are
        summed per feature.

        Parameters
        ----------
        X, y
            Features and labels; both are indexed positionally via ``.iloc``.
        features_to_use, preprocess, silent, **kwargs
            Forwarded unchanged to every child's ``compute_feature_importance``.
        is_oof
            When true, each child scores only the test rows of its own split
            as regenerated by ``generate_kfold``. When false, every child
            scores all rows.

        Returns
        -------
        pd.Series
            Weighted importance per feature, sorted in descending order.

        Raises
        ------
        AssertionError
            If ``is_oof`` is true and ``self.bagged_mode`` is false.
        """
        per_child_importance = []
        per_child_rows = []
        # TODO: Preprocess data here instead of repeatedly
        first_child = 0  # position in self.models of the current repeat's first child
        for repeat_idx, n_splits in enumerate(self._k_per_n_repeat):
            if not is_oof:
                # No fold structure needed: every child is scored on all rows.
                every_row = list(range(len(X)))
                repeat_folds = [(None, every_row)] * n_splits
            else:
                if not self.bagged_mode:
                    raise AssertionError(
                        'Model trained with no validation data cannot get feature importances on training data, please specify new test data to compute feature importances (model=%s)'
                        % self.name)
                all_folds = generate_kfold(X=X,
                                           y=y,
                                           n_splits=n_splits,
                                           stratified=self.is_stratified(),
                                           random_state=self._random_state,
                                           n_repeats=repeat_idx + 1)
                # Keep only the folds that belong to this repeat.
                lower = repeat_idx * n_splits
                upper = (repeat_idx + 1) * n_splits
                repeat_folds = all_folds[lower:upper]

            for offset, (_, holdout_rows) in enumerate(repeat_folds):
                child = self.load_child(self.models[first_child + offset])
                child_importance = child.compute_feature_importance(
                    X=X.iloc[holdout_rows, :],
                    y=y.iloc[holdout_rows],
                    features_to_use=features_to_use,
                    preprocess=preprocess,
                    silent=silent,
                    **kwargs)
                per_child_importance.append(child_importance)
                per_child_rows.append(len(holdout_rows))
            first_child += n_splits

        # Children that scored more rows contribute proportionally more.
        total_rows = sum(per_child_rows)
        weighted = [
            importance * (rows / total_rows)
            for importance, rows in zip(per_child_importance, per_child_rows)
        ]

        # TODO: Consider utilizing z scores and stddev to make threshold decisions
        combined = pd.concat(weighted, axis=1, sort=True)
        return combined.sum(1).sort_values(ascending=False)
Пример #2
0
    def _hyperparameter_tune(self,
                             X_train,
                             y_train,
                             k_fold,
                             scheduler_options=None,
                             preprocess_kwargs=None,
                             **kwargs):
        """Tune the base model on the first k-fold split and wrap each result in a bag.

        The data is preprocessed once, split with ``generate_kfold``, and only
        the first (train, validation) split is handed to
        ``self.model_base.hyperparameter_tune``. Every model path returned by
        the tuner is loaded and stored as fold 0 of a deep copy of this
        ensemble, which is then saved.

        Parameters
        ----------
        X_train, y_train
            Training features and labels; both are indexed with ``.iloc``.
        k_fold
            Number of splits requested from ``generate_kfold``; also recorded
            on every produced bag as ``_k``.
        scheduler_options
            Indexable whose element ``[1]`` is a mapping with a ``'time_out'``
            entry. Although the default is ``None``, the value is subscripted
            unconditionally, so callers must supply it.
        preprocess_kwargs
            Extra keyword arguments for ``self.preprocess``; ``None`` means none.
        **kwargs
            Forwarded to ``self.model_base.hyperparameter_tune``.

        Returns
        -------
        tuple
            ``(bags, bags_performance, hpo_results)`` where ``bags`` maps bag
            name to bag path, ``bags_performance`` maps bag name to validation
            score, and ``hpo_results`` is returned by the tuner unchanged.

        Raises
        ------
        ValueError
            If ``self.models`` is not empty.
        """
        if len(self.models) != 0:
            raise ValueError(
                'self.models must be empty to call hyperparameter_tune, value: %s'
                % self.models)

        self.model_base.feature_metadata = self.feature_metadata  # TODO: Move this

        # TODO: Preprocess data here instead of repeatedly
        if preprocess_kwargs is None:
            preprocess_kwargs = dict()
        X_train = self.preprocess(X=X_train,
                                  preprocess=False,
                                  fit=True,
                                  **preprocess_kwargs)
        kfolds = generate_kfold(X=X_train,
                                y=y_train,
                                n_splits=k_fold,
                                stratified=self.is_stratified(),
                                random_state=self._random_state,
                                n_repeats=1)

        # Only the first split is used for tuning.
        train_index, test_index = kfolds[0]
        X_train_fold, X_val_fold = X_train.iloc[train_index, :], X_train.iloc[
            test_index, :]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[
            test_index]

        # TODO: Scheduler doesn't early stop on final model, this is a safety net. Scheduler should be updated to early stop
        orig_time = scheduler_options[1]['time_out']
        scheduler_options[1]['time_out'] = orig_time * 0.8
        try:
            hpo_models, _, hpo_results = self.model_base.hyperparameter_tune(
                X_train=X_train_fold,
                y_train=y_train_fold,
                X_val=X_val_fold,
                y_val=y_val_fold,
                scheduler_options=scheduler_options,
                **kwargs)
        finally:
            # The options object belongs to the caller: restore its time-out
            # even when tuning fails, so the 0.8 scaling never leaks out.
            scheduler_options[1]['time_out'] = orig_time

        bags = {}
        bags_performance = {}
        for i, model_path in enumerate(hpo_models.values()):
            child: AbstractModel = self._child_type.load(path=model_path)
            y_pred_proba = child.predict_proba(X_val_fold)

            # TODO: Create new Ensemble Here
            bag = copy.deepcopy(self)
            bag.name = bag.name + os.path.sep + str(i)
            bag.set_contexts(self.path_root + bag.name + os.path.sep)

            # Only the validation rows of the tuning split have predictions.
            oof_pred_proba, oof_pred_model_repeats = self._construct_empty_oof(
                X=X_train, y=y_train)
            oof_pred_proba[test_index] += y_pred_proba
            oof_pred_model_repeats[test_index] += 1

            bag.model_base = None
            child.set_contexts(bag.path + child.name + os.path.sep)
            bag.save_model_base(child.convert_to_template())

            # Record that exactly one fold of one repeat has been fit.
            bag._k = k_fold
            bag._k_fold_end = 1
            bag._n_repeats = 1
            bag._oof_pred_proba = oof_pred_proba
            bag._oof_pred_model_repeats = oof_pred_model_repeats
            child.name = child.name + '_fold_0'
            child.set_contexts(bag.path + child.name + os.path.sep)
            if not self.save_bagged_folds:
                child.model = None
            if bag.low_memory:
                # Persist the child and keep only its name in the bag.
                bag.save_child(child, verbose=False)
                bag.models.append(child.name)
            else:
                bag.models.append(child)
            bag.val_score = child.val_score
            bag._add_child_times_to_bag(model=child)

            bag.save()
            bags[bag.name] = bag.path
            bags_performance[bag.name] = bag.val_score

        # TODO: hpo_results likely not correct because no renames
        return bags, bags_performance, hpo_results
Пример #3
0
    def compute_feature_importance(self,
                                   X,
                                   y,
                                   features=None,
                                   is_oof=True,
                                   time_limit=None,
                                   silent=False,
                                   **kwargs) -> pd.DataFrame:
        """Compute feature importance from the children's out-of-fold rows.

        When ``is_oof`` is false the call is delegated to the parent class.
        Otherwise each child scores the test rows of its own split (as
        regenerated by ``generate_kfold``) and the per-feature importance
        lists of all completed children are concatenated and passed to
        ``_compute_fi_with_stddev``.

        Parameters
        ----------
        X, y
            Features and labels; both are indexed positionally via ``.iloc``.
        features
            Features to evaluate; defaults to the ``features`` attribute of
            the first child model.
        is_oof
            Whether to use the out-of-fold path described above.
        time_limit
            Optional overall budget in seconds. It is split evenly across
            children, and the loop stops early when the remaining time is
            below 1.1x the average time spent per completed child.
        silent
            Suppresses this method's log messages and is forwarded to the
            children.
        **kwargs
            Forwarded to the child (or parent-class) implementation.

        Returns
        -------
        pd.DataFrame
            The result of ``_compute_fi_with_stddev``.

        Raises
        ------
        AssertionError
            If the out-of-fold path is taken while ``self.bagged_mode`` is
            false.
        """
        if features is None:
            features = self.load_child(model=self.models[0]).features
        if not is_oof:
            return super().compute_feature_importance(X,
                                                      y,
                                                      features=features,
                                                      time_limit=time_limit,
                                                      silent=silent,
                                                      **kwargs)

        # From here on is_oof is always true.
        fi_fold_list = []
        model_index = 0
        num_children = len(self.models)
        time_limit_per_child = None if time_limit is None else time_limit / num_children
        if not silent:
            logging_message = f'Computing feature importance via permutation shuffling for {len(features)} features using out-of-fold (OOF) data aggregated across {num_children} child models...'
            if time_limit is not None:
                logging_message = f'{logging_message} Time limit: {time_limit}s...'
            logger.log(20, logging_message)

        time_start = time.time()
        early_stop = False
        children_completed = 0
        log_final_suffix = ''
        for n_repeat, k in enumerate(self._k_per_n_repeat):
            # Kept inside the loop (not hoisted) so that an ensemble with no
            # recorded repeats behaves exactly as before instead of raising.
            if not self.bagged_mode:
                raise AssertionError(
                    'Model trained with no validation data cannot get feature importances on training data, please specify new test data to compute feature importances (model=%s)'
                    % self.name)
            kfolds = generate_kfold(X=X,
                                    y=y,
                                    n_splits=k,
                                    stratified=self.is_stratified(),
                                    random_state=self._random_state,
                                    n_repeats=n_repeat + 1)
            # Keep only the folds that belong to this repeat.
            cur_kfolds = kfolds[n_repeat * k:(n_repeat + 1) * k]
            for i, (_, test_index) in enumerate(cur_kfolds):
                model = self.load_child(self.models[model_index + i])
                fi_fold = model.compute_feature_importance(
                    X=X.iloc[test_index, :],
                    y=y.iloc[test_index],
                    features=features,
                    time_limit=time_limit_per_child,
                    silent=silent,
                    log_prefix='\t',
                    importance_as_list=True,
                    **kwargs)
                fi_fold_list.append(fi_fold)

                children_completed += 1
                if time_limit is not None and children_completed != num_children:
                    time_now = time.time()
                    time_spent = time_now - time_start
                    time_left = time_limit - time_spent
                    time_child_average = time_spent / children_completed
                    # Stop unless there is room for one more average child
                    # plus a 10% margin.
                    if time_left < (time_child_average * 1.1):
                        log_final_suffix = ' (Early stopping due to lack of time...)'
                        early_stop = True
                        break
            if early_stop:
                break
            model_index += k

        # TODO: DON'T THROW AWAY SAMPLES! USE LARGER N
        # Concatenate each feature's importance samples across children.
        fi_list_dict = dict()
        for fi_fold in fi_fold_list:
            # TODO: Don't throw away stddev information of children
            for feature, samples in fi_fold['importance'].to_dict().items():
                fi_list_dict.setdefault(feature, []).extend(samples)
        fi_df = _compute_fi_with_stddev(fi_list_dict)

        if not silent:
            logger.log(
                20,
                f'\t{round(time.time() - time_start, 2)}s\t= Actual runtime (Completed {children_completed} of {num_children} children){log_final_suffix}'
            )

        return fi_df
Пример #4
0
    def _fit(self,
             X_train,
             y_train,
             k_fold=5,
             k_fold_start=0,
             k_fold_end=None,
             n_repeats=1,
             n_repeat_start=0,
             time_limit=None,
             **kwargs):
        """Fit a range of fold models and accumulate their out-of-fold predictions.

        With ``k_fold == 1`` (values below 1 are clamped to 1) a single copy
        of the base model is fit on all of ``X_train`` and the ensemble is
        flagged as not bagged. Otherwise one deep copy of the base model is
        fit per fold for the flat fold range ``[fold_start, fold_end)``
        derived from the repeat/fold arguments, and each fold's validation
        predictions are added to the out-of-fold arrays.

        Parameters
        ----------
        X_train, y_train
            Training features and labels; both are indexed with ``.iloc``.
        k_fold
            Number of folds per repeat.
        k_fold_start, k_fold_end
            Fold range to fit within a repeat. ``k_fold_start`` must equal
            the stored ``self._k_fold_end``; ``k_fold_end`` defaults to
            ``k_fold``.
        n_repeats, n_repeat_start
            Repeat range to fit. ``n_repeat_start`` must equal
            ``self._n_repeats_finished``.
        time_limit
            Optional overall budget in seconds, shared across the folds that
            remain to be fit.
        **kwargs
            Forwarded to each model's ``fit``.

        Raises
        ------
        ValueError
            If the fold/repeat arguments are inconsistent with each other or
            with the state recorded by a previous call.
        TimeLimitExceeded
            If the time budget is spent or the projected time for the
            remaining folds exceeds what is left.
        """
        if k_fold < 1:
            k_fold = 1
        if k_fold_end is None:
            k_fold_end = k_fold

        # Resuming a partially fit ensemble: make sure earlier OOF state is loaded.
        if self._oof_pred_proba is None and (k_fold_start != 0
                                             or n_repeat_start != 0):
            self._load_oof()
        if n_repeat_start != self._n_repeats_finished:
            raise ValueError(
                f'n_repeat_start must equal self._n_repeats_finished, values: ({n_repeat_start}, {self._n_repeats_finished})'
            )
        if n_repeats <= n_repeat_start:
            raise ValueError(
                f'n_repeats must be greater than n_repeat_start, values: ({n_repeats}, {n_repeat_start})'
            )
        if k_fold_start != self._k_fold_end:
            raise ValueError(
                f'k_fold_start must equal previous k_fold_end, values: ({k_fold_start}, {self._k_fold_end})'
            )
        if k_fold_start >= k_fold_end:
            # TODO: Remove this limitation if n_repeats > 1
            raise ValueError(
                f'k_fold_end must be greater than k_fold_start, values: ({k_fold_end}, {k_fold_start})'
            )
        if (n_repeats - n_repeat_start) > 1 and k_fold_end != k_fold:
            # TODO: Remove this limitation
            raise ValueError(
                f'k_fold_end must equal k_fold when (n_repeats - n_repeat_start) > 1, values: ({k_fold_end}, {k_fold})'
            )
        if self._k is not None and self._k != k_fold:
            raise ValueError(
                f'k_fold must equal previously fit k_fold value for the current n_repeat, values: (({k_fold}, {self._k})'
            )
        # Flat indices into the list of folds across all repeats (used to index kfolds below).
        fold_start = n_repeat_start * k_fold + k_fold_start
        fold_end = (n_repeats - 1) * k_fold + k_fold_end
        time_start = time.time()

        model_base = self._get_model_base()
        if self.features is not None:
            model_base.features = self.features
        model_base.feature_metadata = self.feature_metadata  # TODO: Don't pass this here

        # Persist the template via save_model_base and drop the in-memory reference.
        if self.model_base is not None:
            self.save_model_base(self.model_base)
            self.model_base = None

        # Single-model path: no folds, the model is fit on all of X_train.
        if k_fold == 1:
            if self._n_repeats != 0:
                raise ValueError(
                    f'n_repeats must equal 0 when fitting a single model with k_fold < 2, values: ({self._n_repeats}, {k_fold})'
                )
            model_base.set_contexts(path_context=self.path + model_base.name +
                                    os.path.sep)
            time_start_fit = time.time()
            model_base.fit(X_train=X_train,
                           y_train=y_train,
                           time_limit=time_limit,
                           **kwargs)
            model_base.fit_time = time.time() - time_start_fit
            model_base.predict_time = None
            # These "out-of-fold" predictions are made on the training rows themselves.
            self._oof_pred_proba = model_base.predict_proba(
                X=X_train)  # TODO: Cheater value, will be overfit to valid set
            self._oof_pred_model_repeats = np.ones(shape=len(X_train),
                                                   dtype=np.uint8)
            self._n_repeats = 1
            self._n_repeats_finished = 1
            self._k_per_n_repeat = [1]
            self.bagged_mode = False
            model_base.reduce_memory_size(remove_fit=True,
                                          remove_info=False,
                                          requires_save=True)
            if not self.save_bagged_folds:
                model_base.model = None
            # In low-memory mode the child is saved and only its name is kept.
            if self.low_memory:
                self.save_child(model_base, verbose=False)
                self.models = [model_base.name]
            else:
                self.models = [model_base]
            self._add_child_times_to_bag(model=model_base)
            return

        # TODO: Preprocess data here instead of repeatedly
        kfolds = generate_kfold(X=X_train,
                                y=y_train,
                                n_splits=k_fold,
                                stratified=self.is_stratified(),
                                random_state=self._random_state,
                                n_repeats=n_repeats)

        # Fresh accumulators for this call; merged into the stored OOF state after the loop.
        oof_pred_proba, oof_pred_model_repeats = self._construct_empty_oof(
            X=X_train, y=y_train)

        models = []
        folds_to_fit = fold_end - fold_start
        for j in range(n_repeat_start, n_repeats):  # For each n_repeat
            cur_repeat_count = j - n_repeat_start
            fold_start_n_repeat = fold_start + cur_repeat_count * k_fold
            fold_end_n_repeat = min(fold_start_n_repeat + k_fold, fold_end)
            # TODO: Consider moving model fit inner for loop to a function to simplify this code
            for i in range(fold_start_n_repeat,
                           fold_end_n_repeat):  # For each fold
                folds_finished = i - fold_start
                folds_left = fold_end - i
                fold = kfolds[i]
                time_elapsed = time.time() - time_start
                if time_limit is not None:
                    time_left = time_limit - time_elapsed
                    # Each fold gets 80% of an even share of the remaining time.
                    required_time_per_fold = time_left / folds_left
                    time_limit_fold = required_time_per_fold * 0.8
                    if folds_finished > 0:
                        # Extrapolate from the average time per finished fold; abort if the rest cannot fit.
                        expected_time_required = time_elapsed * folds_to_fit / folds_finished
                        expected_remaining_time_required = expected_time_required * folds_left / folds_to_fit
                        if expected_remaining_time_required > time_left:
                            raise TimeLimitExceeded
                    if time_left <= 0:
                        raise TimeLimitExceeded
                else:
                    time_limit_fold = None

                time_start_fold = time.time()
                train_index, val_index = fold
                X_train_fold, X_val_fold = X_train.iloc[
                    train_index, :], X_train.iloc[val_index, :]
                y_train_fold, y_val_fold = y_train.iloc[
                    train_index], y_train.iloc[val_index]
                # Each fold trains its own deep copy of the base model, named after the flat fold index.
                fold_model = copy.deepcopy(model_base)
                fold_model.name = f'{fold_model.name}_fold_{i}'
                fold_model.set_contexts(self.path + fold_model.name +
                                        os.path.sep)
                fold_model.fit(X_train=X_train_fold,
                               y_train=y_train_fold,
                               X_val=X_val_fold,
                               y_val=y_val_fold,
                               time_limit=time_limit_fold,
                               **kwargs)
                time_train_end_fold = time.time()
                if time_limit is not None:  # Check to avoid unnecessarily predicting and saving a model when an Exception is going to be raised later
                    if i != (fold_end - 1):
                        time_elapsed = time.time() - time_start
                        time_left = time_limit - time_elapsed
                        expected_time_required = time_elapsed * folds_to_fit / (
                            folds_finished + 1)
                        expected_remaining_time_required = expected_time_required * (
                            folds_left - 1) / folds_to_fit
                        if expected_remaining_time_required > time_left:
                            raise TimeLimitExceeded
                pred_proba = fold_model.predict_proba(X_val_fold)
                time_predict_end_fold = time.time()
                fold_model.fit_time = time_train_end_fold - time_start_fold
                fold_model.predict_time = time_predict_end_fold - time_train_end_fold
                fold_model.val_score = fold_model.score_with_y_pred_proba(
                    y=y_val_fold, y_pred_proba=pred_proba)
                fold_model.reduce_memory_size(remove_fit=True,
                                              remove_info=False,
                                              requires_save=True)
                if not self.save_bagged_folds:
                    fold_model.model = None
                # In low-memory mode the child is saved and only its name is kept.
                if self.low_memory:
                    self.save_child(fold_model, verbose=False)
                    models.append(fold_model.name)
                else:
                    models.append(fold_model)
                oof_pred_proba[val_index] += pred_proba
                oof_pred_model_repeats[val_index] += 1
                self._add_child_times_to_bag(model=fold_model)
            # Record k for this repeat when it is not the last repeat of this call,
            # or when this call fits through the final fold (k_fold_end == k_fold).
            if (fold_end_n_repeat != fold_end) or (k_fold == k_fold_end):
                self._k_per_n_repeat.append(k_fold)
        # New children are attached only after every requested fold has been fit.
        self.models += models

        self.bagged_mode = True

        # Merge this call's OOF accumulators into any previously stored ones.
        if self._oof_pred_proba is None:
            self._oof_pred_proba = oof_pred_proba
            self._oof_pred_model_repeats = oof_pred_model_repeats
        else:
            self._oof_pred_proba += oof_pred_proba
            self._oof_pred_model_repeats += oof_pred_model_repeats

        self._n_repeats = n_repeats
        if k_fold == k_fold_end:
            # The final repeat is complete: reset the partial-repeat bookkeeping.
            self._k = None
            self._k_fold_end = 0
            self._n_repeats_finished = self._n_repeats
        else:
            # The final repeat is only partially fit: remember where to resume.
            self._k = k_fold
            self._k_fold_end = k_fold_end
            self._n_repeats_finished = self._n_repeats - 1
Пример #5
0
    def hyperparameter_tune(self, X, y, k_fold, scheduler_options=None, compute_base_preds=True, **kwargs):
        """Tune the base model on the first k-fold split and wrap each result in a stacker.

        Stack-column feature metadata is registered, the data is preprocessed
        once and split with ``generate_kfold``, and only the first
        (train, validation) split is handed to
        ``self.model_base.hyperparameter_tune``. Every model path returned by
        the tuner is loaded and stored as fold 0 of a deep copy of this
        stacker, which is then saved.

        Parameters
        ----------
        X, y
            Training features and labels; both are indexed with ``.iloc``.
        k_fold
            Number of splits requested from ``generate_kfold``; also recorded
            on every produced stacker as ``_k``.
        scheduler_options
            Indexable whose element ``[1]`` is a mapping with a ``'time_out'``
            entry. Although the default is ``None``, the value is subscripted
            unconditionally, so callers must supply it.
        compute_base_preds
            Forwarded to ``self.preprocess``.
        **kwargs
            Forwarded to ``self.model_base.hyperparameter_tune``.

        Returns
        -------
        tuple
            ``(stackers, stackers_performance, hpo_results)`` where
            ``stackers`` maps stacker name to stacker path,
            ``stackers_performance`` maps stacker name to validation score,
            and ``hpo_results`` is returned by the tuner unchanged.

        Raises
        ------
        ValueError
            If ``self.models`` is not empty.
        """
        if len(self.models) != 0:
            raise ValueError('self.models must be empty to call hyperparameter_tune, value: %s' % self.models)

        # self.models is known to be empty past the guard above, so the stack
        # columns are always registered in the feature metadata.
        type_map_raw = {column: R_FLOAT for column in self.stack_columns}
        type_group_map_special = {S_STACK: self.stack_columns}
        stacker_feature_metadata = FeatureMetadata(type_map_raw=type_map_raw, type_group_map_special=type_group_map_special)
        if self.feature_metadata is None:  # TODO: This is probably not the best way to do this
            self.feature_metadata = stacker_feature_metadata
        else:
            self.feature_metadata = self.feature_metadata.join_metadata(stacker_feature_metadata)
        self.model_base.feature_metadata = self.feature_metadata  # TODO: Move this

        # TODO: Preprocess data here instead of repeatedly
        X = self.preprocess(X=X, preprocess=False, fit=True, compute_base_preds=compute_base_preds)
        kfolds = generate_kfold(X=X, y=y, n_splits=k_fold, stratified=self.is_stratified(), random_state=self._random_state, n_repeats=1)

        # Only the first split is used for tuning.
        train_index, test_index = kfolds[0]
        X_train, X_val = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]

        # TODO: Scheduler doesn't early stop on final model, this is a safety net. Scheduler should be updated to early stop
        orig_time = scheduler_options[1]['time_out']
        scheduler_options[1]['time_out'] = orig_time * 0.8
        try:
            hpo_models, _, hpo_results = self.model_base.hyperparameter_tune(
                X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, scheduler_options=scheduler_options, **kwargs)
        finally:
            # The options object belongs to the caller: restore its time-out
            # even when tuning fails, so the 0.8 scaling never leaks out.
            scheduler_options[1]['time_out'] = orig_time

        stackers = {}
        stackers_performance = {}
        for i, model_path in enumerate(hpo_models.values()):
            child: AbstractModel = self._child_type.load(path=model_path)
            y_pred_proba = child.predict_proba(X_val)

            # TODO: Create new StackerEnsemble Here
            stacker = copy.deepcopy(self)
            stacker.name = stacker.name + os.path.sep + str(i)
            stacker.set_contexts(self.path_root + stacker.name + os.path.sep)

            # Only the validation rows of the tuning split have predictions.
            if self.problem_type == MULTICLASS:
                oof_pred_proba = np.zeros(shape=(len(X), len(y.unique())))
            else:
                oof_pred_proba = np.zeros(shape=len(X))
            oof_pred_model_repeats = np.zeros(shape=len(X))
            oof_pred_proba[test_index] += y_pred_proba
            oof_pred_model_repeats[test_index] += 1

            stacker.model_base = None
            child.set_contexts(stacker.path + child.name + os.path.sep)
            stacker.save_model_base(child.convert_to_template())

            # Record that exactly one fold of one repeat has been fit.
            stacker._k = k_fold
            stacker._k_fold_end = 1
            stacker._n_repeats = 1
            stacker._oof_pred_proba = oof_pred_proba
            stacker._oof_pred_model_repeats = oof_pred_model_repeats
            child.name = child.name + '_fold_0'
            child.set_contexts(stacker.path + child.name + os.path.sep)
            if not self.save_bagged_folds:
                child.model = None
            if stacker.low_memory:
                # Persist the child and keep only its name in the stacker.
                stacker.save_child(child, verbose=False)
                stacker.models.append(child.name)
            else:
                stacker.models.append(child)
            stacker.val_score = child.val_score
            stacker._add_child_times_to_bag(model=child)

            stacker.save()
            stackers[stacker.name] = stacker.path
            stackers_performance[stacker.name] = stacker.val_score

        # TODO: hpo_results likely not correct because no renames
        return stackers, stackers_performance, hpo_results