예제 #1
0
    def fit(self, source_df, y, experiment: ExperimentBackend) -> pd.DataFrame:
        """
        Train the models with the out-of-fold loop and return the out-of-fold output.

        Args:
            source_df:
                input features. Its `.values` are used as the training array.
            y:
                target, handed to the training loop unchanged.
            experiment:
                experiment that receives the `fit` timing and the `n_cv` mark.

        Returns:
            the out-of-fold array returned by `run_oof_train`, wrapped in a DataFrame.
        """
        features = source_df.values
        params = self.generate_default_model_parameter(features, y, experiment)

        # the whole out-of-fold loop is timed under the `fit` prefix
        with experiment.mark_time(prefix='fit'):
            fitted, out_of_fold = self.run_oof_train(features,
                                                     y,
                                                     params,
                                                     experiment=experiment)

        self._fitted_models = fitted
        experiment.mark('n_cv', len(fitted))
        return pd.DataFrame(out_of_fold)
예제 #2
0
    def frozen(self, experiment: ExperimentBackend) -> 'MetaBlock':
        """
        Persist every fitted model into the experiment.

        Each model is stored under the key `model` inside its own nested
        environment, and the list of those environment names is marked as
        `cv_dirs`.

        Args:
            experiment:
                the experiment to save the models into

        Raises:
            NotFittedError
                `_check_has_fitted_models` reports that there is nothing to save.

        Returns:
            myself
        """
        if not self._check_has_fitted_models():
            raise NotFittedError()

        # all directory names are resolved first, before anything is written
        fold_dirs = [
            self._get_fold_dir(fold_index)
            for fold_index, _ in enumerate(self._fitted_models)
        ]
        for fold_dir, fitted in zip(fold_dirs, self._fitted_models):
            with experiment.as_environment(fold_dir, style='nested') as fold_env:
                fold_env.save_as_python_object('model', fitted)

        experiment.mark('cv_dirs', fold_dirs)
        return self
예제 #3
0
    def unzip(self, experiment: ExperimentBackend) -> 'MetaBlock':
        """Restore the fitted models from an experiment.

        The names of the per-fold environments are read from the `cv_dirs`
        entry of the experiment's marked object. The object stored as `model`
        in each of those nested environments becomes one fitted model.

        Raises:
            NotFittedError
                there is no `cv_dirs` in the marked object. In that case the
                models are considered not to have been saved.
                NOTE(review): in the visible code `cv_dirs` is marked by
                `frozen`, while the error message mentions `fit` - confirm.

        Args:
            experiment:
                the experiment to load from

        Returns:
            myself
        """
        not_found_message = '`cv_dirs` is not found in marked object. Must be call fit before `unzip`.'

        if not self._check_has_models_in_exp(experiment):
            raise NotFittedError(not_found_message)

        fold_dirs = experiment.get_marked().get('cv_dirs', None)  # type: List[str]
        # second guard, in case the marked object changed since the check above
        if fold_dirs is None:
            raise NotFittedError(not_found_message)

        restored = []
        for fold_dir in fold_dirs:
            with experiment.as_environment(fold_dir, style='nested') as fold_env:
                loaded = fold_env.load_object('model')
            restored.append(loaded)

        self._fitted_models = restored
        return self
예제 #4
0
    def _fit_model(self, X: np.ndarray, y: np.ndarray, default_params: dict,
                   validation_set: tuple, indexes_set: tuple,
                   experiment: ExperimentBackend) -> PrePostProcessModel:
        """
        Create a model for one fold and fit it.

        Args:
            X: training feature. numpy array. shape = (n_train, n_features)
            y: target. shape = (n_train, n_classes)
            default_params: base parameters. They are passed through
                `get_model_params_on_each_fold` before reaching `create_model`.
            validation_set: pair of (x_valid, y_valid) for this fold.
            indexes_set: forwarded to `get_model_params_on_each_fold` and
                `get_fit_params_on_each_fold`.
            experiment: experiment used to time the fit and forwarded to
                `get_fit_params_on_each_fold`.

        Returns:
            trained model
        """
        params_on_fold = self.get_model_params_on_each_fold(default_params,
                                                            indexes_set)
        estimator = self.create_model(params_on_fold)

        # NOTE(review): presumably `_before_fit` prepares the input / target
        # transformers used just below - confirm. It must run before them.
        estimator._before_fit(X, y)
        raw_x_valid, raw_y_valid = validation_set
        transformed_valid = (
            estimator.input_transformer.transform(raw_x_valid),
            estimator.target_transformer.transform(raw_y_valid),
        )

        fit_kwargs = self.get_fit_params_on_each_fold(
            params_on_fold,
            training_set=(X, y),
            validation_set=transformed_valid,
            indexes_set=indexes_set,
            experiment=experiment)
        # a hook returning None means "no extra fit arguments"
        if fit_kwargs is None:
            fit_kwargs = {}

        with experiment.mark_time('fit'):
            estimator.fit(X, y, **fit_kwargs)
        return estimator
예제 #5
0
def test_set_silent(backend: ExperimentBackend):
    """The backend's logger must be disabled while inside `silent()`."""
    with backend.silent():
        # plain truthiness check; comparing a boolean with `== True` is discouraged by PEP 8
        assert backend.logger.disabled
예제 #6
0
def test_can_call_method(backend: ExperimentBackend):
    """Both save methods can be called; afterwards the backend reports that
    it cannot save and that it has no marked object."""
    backend.save_object('foo', {})
    backend.save_as_python_object('foo', {})

    saving_enabled = backend.can_save
    # `backend.to` is shown as the failure message
    assert not saving_enabled, backend.to

    marked = backend.get_marked()
    assert marked is None
예제 #7
0
import os

import pytest

from vivid.backends.experiments import LocalExperimentBackend, ExperimentBackend


@pytest.mark.parametrize(
    'backend',
    [ExperimentBackend(), LocalExperimentBackend()])
def test_can_call_method(backend: ExperimentBackend):
    """Backends built without arguments accept both save calls, report that
    they cannot save and have no marked object."""
    backend.save_object('foo', {})
    backend.save_as_python_object('foo', {})

    saving_enabled = backend.can_save
    # `backend.to` is shown as the failure message
    assert not saving_enabled, backend.to

    marked = backend.get_marked()
    assert marked is None


@pytest.mark.parametrize(
    'backend',
    [ExperimentBackend(), LocalExperimentBackend()])
def test_set_silent(backend: ExperimentBackend):
    """The backend's logger must be disabled while inside `silent()`."""
    with backend.silent():
        # plain truthiness check; comparing a boolean with `== True` is discouraged by PEP 8
        assert backend.logger.disabled


def test_local_mark(tmpdir):
    """A `LocalExperimentBackend` created with a directory reports `can_save`."""
    # `tmpdir` is presumably pytest's temporary-directory fixture - confirm.
    experiment = LocalExperimentBackend(tmpdir)
    assert experiment.can_save

    # NOTE(review): `obj` is not used in the visible lines; the rest of this
    # test appears to be cut off here.
    obj = {'bar': [1, 2]}
예제 #8
0
파일: base.py 프로젝트: nyk510/vivid
 def frozen(self, experiment: ExperimentBackend):
     """Save `fitted_models_` into the experiment under the key `mapping`.

     Args:
         experiment: the experiment to save into

     Returns:
         myself
     """
     fitted = self.fitted_models_
     experiment.save_as_python_object('mapping', fitted)
     return self
예제 #9
0
파일: base.py 프로젝트: nyk510/vivid
 def unzip(self, experiment: ExperimentBackend):
     """Load the object stored as `mapping` and set it as `fitted_models_`.

     Args:
         experiment: the experiment to load from

     Returns:
         myself
     """
     restored = experiment.load_object('mapping')
     self.fitted_models_ = restored
     return self
예제 #10
0
    def run_oof_train(
        self,
        X,
        y,
        default_params,
        n_max: Union[int, None] = None,
        experiment: Optional[ExperimentBackend] = None
    ) -> ([List[PrePostProcessModel], np.ndarray]):
        """
        main training loop.

        For every split returned by `get_fold_splitting`, a model is fitted on
        the training part, the validation part is predicted into the
        out-of-fold array, and metrics / parameters / split sizes are marked
        on a nested per-fold experiment.

        Args:
            X:
                training array.
            y:
                target array
            default_params:
                model parameter using by default. pass to model constructor (not fit)
                If you change fit parameter like `eval_metric`, override get_fit_params_on_each_fold.
            n_max:
                Number of fold to fit. If set None, learn for all folds.
                If set number, stop fit model reach to the value.
                    * if n_max = None, run all folds
                    * if n_max = 1, stop one fold.
                    * if n_max > num_cv, run all folds
                    * if n_max <= 0, no fold run, return empty list and zero vector out-of-fold
            experiment:
                experiment to record into. If None, a new `ExperimentBackend()` is used.

        Returns:
            list of fitted models and out-of-fold numpy array.

            out-of-fold: shape = (n_train, output_dim)

        """

        def _n_samples(index) -> int:
            # Number of rows selected by a split. A split may be positional
            # indexes or a boolean mask. Summing positional indexes would give
            # the sum of row numbers, not a count.
            index = np.asarray(index)
            if index.dtype == bool:
                return int(np.count_nonzero(index))
            return int(index.size)

        if self.is_regression_model:
            self._output_dim = 1
        else:
            le = LabelEncoder()
            le.fit(y)
            n_classes = len(le.classes_)
            # binary classification keeps a single output column
            self._output_dim = 1 if n_classes == 2 else n_classes

        oof = np.zeros(shape=(len(y), self._output_dim), dtype=np.float32)

        splits = self.get_fold_splitting(X, y)
        models = []
        if experiment is None:
            experiment = ExperimentBackend()
        self.n_splits_ = len(splits)

        for i, (idx_train, idx_valid) in enumerate(splits):

            with experiment.as_environment(self._get_fold_dir(i),
                                           style='nested') as exp_i:
                # negative n_max is treated as 0, so no fold is fitted
                if n_max is not None and i >= max(0, n_max):
                    exp_i.logger.info(f'Stop K-Fold at {i}')
                    break

                exp_i.logger.info('start k-fold: {}/{}'.format(
                    i, self.n_splits_))

                X_i, y_i = X[idx_train], y[idx_train]
                X_valid, y_valid = X[idx_valid], y[idx_valid]

                clf = self._fit_model(X_i,
                                      y_i,
                                      default_params=default_params,
                                      validation_set=(X_valid, y_valid),
                                      indexes_set=(idx_train, idx_valid),
                                      experiment=exp_i)

                pred_i = run_predict(clf,
                                     X_valid,
                                     is_regression=self.is_regression_model)
                oof[idx_valid] = pred_i
                models.append(clf)

                calculator = self.get_calculate_metrics()
                metric = calculator(y_valid, pred_i)
                exp_i.mark('metrics', metric)
                for line in to_pretty_lines(metric):
                    exp_i.logger.info(line)

                exp_i.mark('model_params', clf.get_params(deep=True))
                exp_i.mark('n_fold', i)
                exp_i.mark('split_info', {
                    'train_shape': _n_samples(idx_train),
                    'valid_shape': _n_samples(idx_valid)
                })
        return models, oof
예제 #11
0
 def _check_has_models_in_exp(self, experiment: ExperimentBackend) -> bool:
     """Tell whether the experiment's marked object has a `cv_dirs` entry.

     Args:
         experiment: the experiment to inspect

     Returns:
         True when `cv_dirs` is present and not None. False otherwise,
         including when reading the marked object raises
         `FileNotFoundError` or `AttributeError`.
     """
     try:
         marked = experiment.get_marked()
         cv_dirs = marked.get('cv_dirs', None)
     except (FileNotFoundError, AttributeError):
         # AttributeError covers e.g. `get_marked()` returning None
         return False
     return cv_dirs is not None