예제 #1
0
    def test_save_load(self):
        """Save the k-fold model, reload it, and predict with the reloaded copy.

        Checks that ``save`` writes the pickled stub (``dt.pkl``) and one
        weights file per fold (three folds), and that the model returned by
        ``DeepTable.load`` produces one prediction per evaluation row.
        """
        import time
        from deeptables.utils import fs

        # Timestamped directory name so repeated runs do not collide.
        filepath = f'{type(self).__name__}_{time.strftime("%Y%m%d%H%M%S")}'
        self.dt.save(filepath)
        assert fs.exists(f'{filepath}/dt.pkl')
        for fold in (1, 2, 3):
            assert fs.exists(f'{filepath}/dnn_nets-kfold-{fold}.h5')

        newdt = deeptable.DeepTable.load(filepath)
        preds = newdt.predict(self.X_eval)
        # The previous form, `assert preds.shape, (200, )`, used the tuple as
        # the assertion *message* and therefore always passed. Compare the
        # row count explicitly instead.
        # NOTE(review): only the leading dimension is pinned; whether predict()
        # returns (200,) or (200, 1) cannot be confirmed from this test alone.
        assert preds.shape[0] == 200
예제 #2
0
    def test_default_settings(self):
        """Train with a basic config, then verify a save/load/predict round trip.

        The model is built via ``self.run_dt`` with AUC as the metric, GBM
        features disabled and class weighting enabled. After saving, the test
        checks the expected artefacts exist, reloads the model and predicts on
        the fixture frame with the target column removed.
        """
        config = deeptable.ModelConfig(metrics=['AUC'], apply_gbm_features=False, apply_class_weight=True)
        dt, _ = self.run_dt(config)

        # test save and load
        # Timestamped directory name so repeated runs do not collide.
        filepath = f'{type(self).__name__}_{time.strftime("%Y%m%d%H%M%S")}'
        dt.save(filepath)
        assert fs.exists(f'{filepath}/dt.pkl')
        assert fs.exists(f'{filepath}/dnn_nets.h5')

        newdt = deeptable.DeepTable.load(filepath)
        # Work on a copy so popping the target does not mutate the shared fixture.
        X_eval = self.df.copy()
        X_eval.pop(self.target)
        preds = newdt.predict(X_eval)
        # The previous form, `assert preds.shape, (self.df_row_count,)`, used
        # the tuple as the assertion *message* and therefore always passed.
        # Compare the row count explicitly instead.
        assert preds.shape[0] == self.df_row_count
예제 #3
0
    def load(model_path):
        """Restore an estimator stub and its DeepTable model from ``model_path``.

        The directory must contain ``dt_estimator.pkl`` (the pickled estimator
        stub). The stub is unpickled, the DeepTable model is loaded from the
        same directory via ``DeepTable.load`` and attached as ``stub.model``.

        Args:
            model_path: Storage directory; a trailing ``fs.sep`` is appended
                when missing.

        Returns:
            The unpickled estimator with its ``model`` attribute set.

        Raises:
            ValueError: If the directory or the estimator pickle is missing.
        """
        if not fs.exists(model_path):
            raise ValueError(f'Not found storage path: {model_path}')

        # Normalise to a directory prefix so file names can be appended directly.
        model_dir = model_path if model_path.endswith(fs.sep) else model_path + fs.sep
        estimator_file = model_dir + 'dt_estimator.pkl'

        if not fs.exists(estimator_file):
            raise ValueError(
                f'Not found storage path of estimator: {estimator_file}')

        # NOTE: pickle executes arbitrary code on load; only use trusted paths.
        with fs.open(estimator_file, 'rb') as handle:
            estimator = pickle.load(handle)

        estimator.model = DeepTable.load(model_dir)
        return estimator
예제 #4
0
    def fit_cross_validation(estimator_type,
                             fit_fn,
                             X,
                             y,
                             X_test=None,
                             score_fn=roc_auc_score,
                             estimator_params=None,
                             categorical_feature=None,
                             task_type=consts.TASK_BINARY,
                             num_folds=5,
                             stratified=True,
                             iterators=None,
                             batch_size=None,
                             preds_filepath=None, ):
        """Fit one model per fold and return out-of-fold probabilities.

        For every split, ``fit_fn`` is called with the train/validation parts
        and must return an object exposing ``predict_proba``. The second
        column of the validation probabilities is stored as the out-of-fold
        prediction and scored with ``score_fn``. When ``X_test`` is given, the
        fold's test probabilities are written to
        ``<preds_filepath><score>_fold<k>.csv``.

        Args:
            estimator_type: Label used only in the default output directory name.
            fit_fn: Callable ``fit_fn(x_train, y_train, x_val, y_val, cat_vars=,
                task=, estimator_params=)`` returning a fitted model.
            X: Training features; indexed with ``.iloc`` so it is expected to
                be a pandas object.
            y: Training targets (converted with ``np.array``).
            X_test: Optional test features. When ``None`` no test predictions
                are made and nothing is written to disk.
            score_fn: ``score_fn(y_true, proba)`` metric.
            estimator_params: Parameters forwarded to ``fit_fn``; defaults to
                a fresh empty dict.
            categorical_feature: Forwarded to ``fit_fn`` as ``cat_vars``.
            task_type: Forwarded to ``fit_fn`` as ``task``.
            num_folds: Number of splits when ``iterators`` is not supplied.
            stratified: Use ``StratifiedKFold`` instead of ``KFold`` when
                ``iterators`` is not supplied.
            iterators: Optional pre-built splitter exposing ``split(X, y)``.
            batch_size: Only echoed in the start-up log line.
            preds_filepath: Output directory for per-fold test predictions.
                Falls back to the ``consts.ENV_DEEPTABLES_HOME`` environment
                variable, then to a timestamped ``./preds_...`` directory.

        Returns:
            Out-of-fold probabilities; flattened to 1-D when there is a single
            output column.
        """
        # A fresh dict per call: a shared `{}` default would leak any mutation
        # made by fit_fn into later calls.
        if estimator_params is None:
            estimator_params = {}

        print("Start cross validation")
        print(f'X.Shape={np.shape(X)}, y.Shape={np.shape(y)}, batch_size={batch_size}')

        # Cross validation model
        if iterators is None:
            splitter_cls = StratifiedKFold if stratified else KFold
            iterators = splitter_cls(n_splits=num_folds, shuffle=True, random_state=1001)
        print(f'Iterators:{iterators}')

        # Convert first so plain sequences (without `.shape`) are accepted too.
        y = np.array(y)
        if len(y.shape) > 1:
            oof_proba = np.zeros(y.shape)
        else:
            oof_proba = np.zeros((y.shape[0], 1))

        # The output directory is only needed when test predictions are written.
        if X_test is not None:
            if preds_filepath is None:
                preds_filepath = os.environ.get(consts.ENV_DEEPTABLES_HOME)
            if preds_filepath is None:
                preds_filepath = f'./preds_{estimator_type}_{datetime.datetime.now().strftime("%Y_%m_%d %H:%M:%S")}/'
            elif preds_filepath and not preds_filepath.endswith(('/', os.sep)):
                # File names are appended by plain concatenation below, so the
                # directory must end with a separator; otherwise the files
                # would be created next to the directory instead of inside it.
                preds_filepath = preds_filepath + '/'

            if not fs.exists(preds_filepath):
                fs.makedirs(preds_filepath, exist_ok=True)

        for n_fold, (train_idx, valid_idx) in enumerate(iterators.split(X, y)):
            print(f'\nFold:{n_fold + 1}\n')

            x_train_fold, y_train_fold = X.iloc[train_idx], y[train_idx]
            x_val_fold, y_val_fold = X.iloc[valid_idx], y[valid_idx]

            model = fit_fn(
                x_train_fold,
                y_train_fold,
                x_val_fold,
                y_val_fold,
                cat_vars=categorical_feature,
                task=task_type,
                estimator_params=estimator_params,
            )
            print(f'Fold {n_fold + 1} finished.')

            # Keep only the second probability column, as a (n, 1) slice.
            # NOTE(review): this looks binary-specific (positive class) — confirm
            # before using with other task types.
            proba = model.predict_proba(x_val_fold)[:, 1:2]
            oof_proba[valid_idx] = proba
            score = round(score_fn(y_val_fold, proba), 5)

            if X_test is not None:
                test_fold_proba = model.predict_proba(X_test)
                file = f'{preds_filepath}{score}_fold{n_fold + 1}.csv'
                with fs.open(file, 'w', encoding='utf-8') as f:
                    pd.DataFrame(test_fold_proba).to_csv(f, index=False)
            print(f'Fold {n_fold + 1} Score:{score}')

        if oof_proba.shape[-1] == 1:
            oof_proba = oof_proba.reshape(-1)
        print(f'OOF score:{score_fn(y, oof_proba)}')
        return oof_proba