def fit(self, source_df, y, experiment: ExperimentBackend) -> pd.DataFrame:
    """fit out-of-fold models and return the out-of-fold predictions as a DataFrame."""
    X = source_df.values
    default_params = self.generate_default_model_parameter(X, y, experiment)

    with experiment.mark_time(prefix='fit'):
        models, oof = self.run_oof_train(X, y, default_params, experiment=experiment)

    self._fitted_models = models
    experiment.mark('n_cv', len(models))
    return pd.DataFrame(oof)
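# A minimal sketch of driving `fit` end to end. `MyRidgeBlock` is a
# hypothetical concrete MetaBlock subclass and `./outputs/ridge` an assumed
# scratch directory; nothing below is guaranteed by this file beyond the
# `fit(source_df, y, experiment)` signature itself.
import numpy as np
import pandas as pd

from vivid.backends.experiments import LocalExperimentBackend

block = MyRidgeBlock(name='ridge')  # hypothetical subclass
experiment = LocalExperimentBackend('./outputs/ridge')

source_df = pd.DataFrame(np.random.randn(100, 5))
y = np.random.randn(100)

# out-of-fold predictions come back as a DataFrame, shape = (n_train, output_dim)
oof_df = block.fit(source_df, y, experiment=experiment)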
def frozen(self, experiment: ExperimentBackend) -> 'MetaBlock':
    """save fitted models to the experiment

    Args:
        experiment: the experiment environment to save the models to

    Returns:
        myself
    """
    if not self._check_has_fitted_models():
        raise NotFittedError()

    dir_names = [
        self._get_fold_dir(i) for i in range(len(self._fitted_models))
    ]
    for name, model in zip(dir_names, self._fitted_models):
        with experiment.as_environment(name, style='nested') as fold_env:
            fold_env.save_as_python_object('model', model)
    experiment.mark('cv_dirs', dir_names)
    return self
def unzip(self, experiment: ExperimentBackend) -> 'MetaBlock':
    """load fitted models from the experiment.

    Notes:
        When `fit` is called, the MetaBlock class marks the per-fold
        directories as `cv_dirs`. If `cv_dirs` cannot be found, we assume
        `fit` has not been called yet and raise `NotFittedError`.

    Raises:
        NotFittedError: there is no `cv_dirs` in the marked object.

    Args:
        experiment: the experiment to load the models from

    Returns:
        myself
    """
    if not self._check_has_models_in_exp(experiment):
        raise NotFittedError(
            '`cv_dirs` is not found in the marked object. Call `fit` before `unzip`.'
        )

    mark = experiment.get_marked()
    output_dirs = mark.get('cv_dirs', None)  # type: List[str]
    if output_dirs is None:
        raise NotFittedError(
            '`cv_dirs` is not found in the marked object. Call `fit` before `unzip`.'
        )

    models = []
    for out_dir in output_dirs:
        with experiment.as_environment(out_dir, style='nested') as fold_env:
            model = fold_env.load_object('model')
        models.append(model)
    self._fitted_models = models
    return self
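# The persistence layout `frozen` writes and `unzip` reads can be reproduced
# with the backend primitives alone. A hedged sketch, assuming a writable
# LocalExperimentBackend; the fold dir names are stand-ins for whatever
# `_get_fold_dir` actually produces.
from vivid.backends.experiments import LocalExperimentBackend

experiment = LocalExperimentBackend('./outputs/meta')  # assumed scratch dir
dir_names = ['fold_00', 'fold_01']                     # assumed naming
fitted_models = [{'w': 1}, {'w': 2}]                   # stand-in "models"

for name, model in zip(dir_names, fitted_models):
    with experiment.as_environment(name, style='nested') as fold_env:
        fold_env.save_as_python_object('model', model)
experiment.mark('cv_dirs', dir_names)

# later, recover the fold dirs exactly the way `unzip` does
restored_dirs = experiment.get_marked().get('cv_dirs', None)
assert restored_dirs == dir_names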
def _fit_model(self,
               X: np.ndarray,
               y: np.ndarray,
               default_params: dict,
               validation_set: tuple,
               indexes_set: tuple,
               experiment: ExperimentBackend) -> PrePostProcessModel:
    """fit a new model instance.

    Notes:
        `model_params` may additionally carry scaling parameters for the
        target / input (e.g. target_scaling = False)

    Args:
        X: training feature. numpy array. shape = (n_train, n_features)
        y: target. shape = (n_train, n_classes)
        default_params: parameters passed to the model constructor
        validation_set: tuple of validation feature and target arrays
        indexes_set: tuple of training and validation indexes
        experiment: experiment backend for the current fold

    Returns:
        trained model
    """
    model_params = self.get_model_params_on_each_fold(default_params, indexes_set)
    model = self.create_model(model_params)

    # NOTE: validation data are not transformed automatically, so a raw
    # validation score would be invalid (e.g. `eval_set` in boosting models).
    # Transform them explicitly here.
    model._before_fit(X, y)
    x_valid, y_valid = validation_set
    x_valid = model.input_transformer.transform(x_valid)
    y_valid = model.target_transformer.transform(y_valid)

    fit_params = self.get_fit_params_on_each_fold(model_params,
                                                  training_set=(X, y),
                                                  validation_set=(x_valid, y_valid),
                                                  indexes_set=indexes_set,
                                                  experiment=experiment)
    if fit_params is None:
        fit_params = {}

    with experiment.mark_time('fit'):
        model.fit(X, y, **fit_params)
    return model
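# `get_fit_params_on_each_fold` is the hook for injecting fold-specific fit
# arguments; its keyword signature below is taken from the call site in
# `_fit_model`, and the returned dict is splatted into `model.fit`. A hedged
# sketch for a boosting-style model -- the `eval_set` / `early_stopping_rounds`
# names are what an LGBM-like estimator accepts, not something this file
# guarantees.
class MyBoostingBlock(MetaBlock):  # hypothetical subclass
    def get_fit_params_on_each_fold(self, model_params, training_set,
                                    validation_set, indexes_set, experiment):
        x_valid, y_valid = validation_set
        return {
            'eval_set': [(x_valid, y_valid)],
            'early_stopping_rounds': 100,
        }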
import os

import pytest

from vivid.backends.experiments import LocalExperimentBackend, ExperimentBackend


@pytest.mark.parametrize('backend', [ExperimentBackend(), LocalExperimentBackend()])
def test_can_call_method(backend: ExperimentBackend):
    backend.save_object('foo', {})
    backend.save_as_python_object('foo', {})
    assert not backend.can_save, backend.to
    assert backend.get_marked() is None


@pytest.mark.parametrize('backend', [ExperimentBackend(), LocalExperimentBackend()])
def test_set_silent(backend: ExperimentBackend):
    with backend.silent():
        assert backend.logger.disabled


def test_local_mark(tmpdir):
    experiment = LocalExperimentBackend(tmpdir)
    assert experiment.can_save

    obj = {'bar': [1, 2]}
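# A hedged, standalone round-trip sketch using only calls that appear in this
# suite; the assumption is that a value written via `mark` is readable back
# via `get_marked` within the same process (which is what `MetaBlock.unzip`
# relies on).
def test_mark_roundtrip_sketch(tmpdir):
    experiment = LocalExperimentBackend(tmpdir)
    assert experiment.can_save
    experiment.mark('bar', {'bar': [1, 2]})
    assert experiment.get_marked().get('bar') == {'bar': [1, 2]}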
def frozen(self, experiment: ExperimentBackend):
    """save the fitted mapping to the experiment"""
    experiment.save_as_python_object('mapping', self.fitted_models_)
    return self

def unzip(self, experiment: ExperimentBackend):
    """load the fitted mapping back from the experiment"""
    self.fitted_models_ = experiment.load_object('mapping')
    return self
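# The pair above is a plain object round trip through the backend; a hedged
# sketch with a stand-in mapping (the output path is an assumption):
from vivid.backends.experiments import LocalExperimentBackend

experiment = LocalExperimentBackend('./outputs/mapping-demo')
experiment.save_as_python_object('mapping', {'a': 0, 'b': 1})
assert experiment.load_object('mapping') == {'a': 0, 'b': 1}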
def run_oof_train(self, X, y, default_params,
                  n_max: Union[int, None] = None,
                  experiment: Optional[ExperimentBackend] = None
                  ) -> Tuple[List[PrePostProcessModel], np.ndarray]:
    """main training loop.

    Args:
        X: training array.
        y: target array.
        default_params: model parameters used by default. These are passed to
            the model constructor (not to fit). To change fit parameters such
            as `eval_metric`, override `get_fit_params_on_each_fold`.
        n_max: number of folds to fit. If None, train on all folds.
            If a number is given, stop fitting once that many folds are done.

            * if n_max is None, run all folds
            * if n_max = 1, stop after one fold
            * if n_max > num_cv, run all folds
            * if n_max <= 0, no folds are run; return an empty model list and
              an all-zero out-of-fold array

    Returns:
        list of fitted models and an out-of-fold numpy array.
        out-of-fold shape = (n_train, output_dim)
    """
    if self.is_regression_model:
        self._output_dim = 1
    else:
        le = LabelEncoder()
        le.fit(y)
        n_classes = len(le.classes_)
        self._output_dim = 1 if n_classes == 2 else n_classes

    oof = np.zeros(shape=(len(y), self._output_dim), dtype=np.float32)
    splits = self.get_fold_splitting(X, y)
    models = []

    if experiment is None:
        experiment = ExperimentBackend()

    self.n_splits_ = len(splits)

    for i, (idx_train, idx_valid) in enumerate(splits):
        with experiment.as_environment(self._get_fold_dir(i), style='nested') as exp_i:
            if n_max is not None and i >= max(0, n_max):
                exp_i.logger.info(f'Stop K-Fold at {i}')
                break

            exp_i.logger.info(f'start k-fold: {i}/{self.n_splits_}')

            X_i, y_i = X[idx_train], y[idx_train]
            X_valid, y_valid = X[idx_valid], y[idx_valid]

            clf = self._fit_model(X_i, y_i,
                                  default_params=default_params,
                                  validation_set=(X_valid, y_valid),
                                  indexes_set=(idx_train, idx_valid),
                                  experiment=exp_i)

            pred_i = run_predict(clf, X_valid, is_regression=self.is_regression_model)
            oof[idx_valid] = pred_i
            models.append(clf)

            calculator = self.get_calculate_metrics()
            metric = calculator(y_valid, pred_i)
            exp_i.mark('metrics', metric)
            for line in to_pretty_lines(metric):
                exp_i.logger.info(line)

            exp_i.mark('model_params', clf.get_params(deep=True))
            exp_i.mark('n_fold', i)
            # count rows via the sliced arrays: correct whether the splitter
            # yields integer indexes or boolean masks
            exp_i.mark('split_info', {
                'train_shape': len(X_i),
                'valid_shape': len(X_valid)
            })

    return models, oof
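# To make the `n_max` semantics concrete: a hedged sketch, assuming `block`
# is an already-configured 5-fold regression block (hypothetical here) and
# X / y / params are prepared as in `fit` above.
models, oof = block.run_oof_train(X, y, default_params=params, n_max=2)

assert len(models) == 2           # only the first two folds were fitted
assert oof.shape == (len(y), 1)   # the OOF array still covers every row;
# rows belonging to folds that never ran stay at their initialized value of zero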
def _check_has_models_in_exp(self, experiment: ExperimentBackend) -> bool:
    """return True when the experiment has `cv_dirs` marked, i.e. fitted models were saved."""
    try:
        output_dirs = experiment.get_marked().get('cv_dirs', None)
    except (FileNotFoundError, AttributeError):
        return False
    return output_dirs is not None