def save(self, model_path):
    if not model_path.endswith(fs.sep):
        model_path = model_path + fs.sep
    if not fs.exists(model_path):
        fs.mkdirs(model_path, exist_ok=True)

    # the stub keeps the ensemble structure but none of the fitted estimators
    stub = copy.copy(self)
    estimators = self.estimators
    if estimators is not None:
        stub.estimators = [None for _ in estimators]  # keep size
        for i, est in enumerate(estimators):
            est_pkl = f'{model_path}{i}.pkl'
            est_model = f'{model_path}{i}.model'
            for t in [est_pkl, est_model]:
                if fs.exists(t):
                    fs.rm(t)
            if est is None:
                continue
            with fs.open(est_pkl, 'wb') as f:
                pickle.dump(est, f, protocol=pickle.HIGHEST_PROTOCOL)
            # estimators with native persistence also get a '.model' file
            if hasattr(est, 'save') and hasattr(est, 'load'):
                est.save(est_model)

    with fs.open(f'{model_path}ensemble.pkl', 'wb') as f:
        pickle.dump(stub, f, protocol=pickle.HIGHEST_PROTOCOL)
def test_save_load(self):
    import time
    from hypernets.utils import fs

    filepath = f'{type(self).__name__}_{time.strftime("%Y%m%d%H%M%S")}'
    self.dt.save(filepath)
    assert fs.exists(f'{filepath}/dt.pkl')
    assert fs.exists(f'{filepath}/dnn_nets.h5')

    newdt = deeptable.DeepTable.load(filepath)
    print(newdt.config)
    preds = newdt.predict(self.X_test)
    assert preds.shape == (200,)
def _prepare_cache_dir(self, cache_home, clear_cache=False):
    if cache_home is None:
        cache_home = 'cache'
    if cache_home[-1] == '/':
        cache_home = cache_home[:-1]
    cache_home = os.path.expanduser(cache_home)

    if not fs.exists(cache_home):
        fs.makedirs(cache_home, exist_ok=True)
    elif clear_cache:
        fs.rm(cache_home, recursive=True)
        fs.mkdirs(cache_home, exist_ok=True)

    cache_dir = f'{cache_home}/{self.signature}'
    if not fs.exists(cache_dir):
        fs.makedirs(cache_dir, exist_ok=True)
    return cache_dir
def test_dataframe_fs(self):
    file_path = f'/{type(self).__name__}/test_df_fs.parquet'
    df = dsutils.load_bank()
    p.store(df, file_path, filesystem=fs)
    assert fs.exists(file_path)

    # read it back and compare
    df_read = p.load(file_path, filesystem=fs)
    assert self.is_same_df(df, df_read)
def load(model_path):
    if not model_path.endswith(fs.sep):
        model_path = model_path + fs.sep

    with fs.open(f'{model_path}ensemble.pkl', 'rb') as f:
        stub = pickle.load(f)

    if stub.estimators is not None:
        for i in range(len(stub.estimators)):
            if fs.exists(f'{model_path}{i}.pkl'):
                with fs.open(f'{model_path}{i}.pkl', 'rb') as f:
                    est = pickle.load(f)
                if fs.exists(f'{model_path}{i}.model') and hasattr(est, 'load'):
                    est = est.load(f'{model_path}{i}.model')
                stub.estimators[i] = est
    return stub
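# Usage sketch (illustrative, not from the source): round-trip an ensemble through
# save()/load(). save() writes one pickle per base estimator ('0.pkl', '1.pkl', ...),
# an optional native '{i}.model' file for estimators exposing save()/load(), and
# 'ensemble.pkl' for the stub. 'my_ensemble' is a hypothetical fitted instance.
model_dir = '/tmp/my_ensemble'
my_ensemble.save(model_dir)    # 0.pkl, 1.pkl, ..., ensemble.pkl under model_dir
restored = load(model_dir)     # unpickle the stub, then re-attach each estimator
assert len(restored.estimators) == len(my_ensemble.estimators)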
def clear(cache_dir=None, fn=None):
    assert fn is None or callable(fn)

    if cache_dir is None:
        cache_dir = cfg.cache_dir
    if callable(fn):
        cache_dir = f'{cache_dir}{fs.sep}{".".join([fn.__module__, fn.__qualname__])}'

    if fs.exists(cache_dir):
        fs.rm(cache_dir, recursive=True)
    fs.mkdirs(cache_dir, exist_ok=True)
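# Usage sketch (illustrative; 'my_cached_fn' is a hypothetical cached function):
clear()                   # wipe and recreate the whole cache root under cfg.cache_dir
clear(fn=my_cached_fn)    # drop only that function's '<module>.<qualname>' sub-directory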
def _prepare_output_dir(self, home_dir, nets):
    if home_dir is None:
        home_dir = 'dt_output'
    if home_dir[-1] == '/':
        home_dir = home_dir[:-1]

    running_dir = f'dt_{datetime.datetime.now().strftime("%Y%m%d %H%M%S")}_{"_".join(nets)}'
    output_path = os.path.expanduser(f'{home_dir}/{running_dir}/')
    if not fs.exists(output_path):
        fs.makedirs(output_path, exist_ok=True)
    return output_path
def load_transformers_from_cache(self):
    transformer_path = f'{self.cache_dir}/transformers.pkl'
    if fs.exists(transformer_path):
        try:
            with fs.open(transformer_path, 'rb') as f:
                preprocessor = pickle.load(f)
                self.__dict__.update(preprocessor.__dict__)
                return True
        except Exception as e:
            logger.error(e)
            fs.rm(transformer_path)
    return False
def load_deepmodel(self, filepath):
    if fs.exists(filepath):
        print(f'Load model from: {filepath}.')
        dm = DeepModel(self.task, self.num_classes, self.config,
                       self.preprocessor.categorical_columns,
                       self.preprocessor.continuous_columns,
                       model_file=filepath)
        return dm
    else:
        raise ValueError(f'Invalid model filename: {filepath}.')
def save_transformed_X_y_to_cache(self, sign, X, y):
    filepath = f'{self.cache_dir}/X_y_{sign}.pkl.gz'
    try:
        # note: inserting the y column mutates the caller's X in place
        # x_t = X.copy(deep=True)
        X.insert(0, 'saved__y__', y)
        with fs.open(filepath, mode='wb') as f:
            X.to_pickle(f, compression='gzip')
        return True
    except Exception as e:
        logger.error(e)
        if fs.exists(filepath):
            fs.rm(filepath)
    return False
def get_transformed_X_y_from_cache(self, sign):
    file_x_y = f'{self.cache_dir}/X_y_{sign}.pkl.gz'
    X_t, y_t = None, None
    if fs.exists(file_x_y):
        try:
            with fs.open(file_x_y, mode='rb') as f:
                df = pd.read_pickle(f, compression='gzip')
            y_t = df.pop('saved__y__')
            X_t = df
        except Exception as e:
            logger.error(e)
            fs.rm(file_x_y)
    return X_t, y_t
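# Usage sketch (illustrative only; 'proc' stands for a preprocessor with a cache_dir
# and 'sign' for a data signature). The two methods above form a round trip: the
# saver packs y into the frame as the reserved 'saved__y__' column (mutating X in
# place) and pickles it gzip-compressed; the getter pops that column back out.
if proc.save_transformed_X_y_to_cache(sign, X_train, y_train):
    X_cached, y_cached = proc.get_transformed_X_y_from_cache(sign)
    assert X_cached is not None and len(X_cached) == len(y_cached)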
def save(self, filepath, deepmodel_basename=None):
    if filepath[-1] != '/':
        filepath = filepath + '/'
    if not fs.exists(filepath):
        fs.makedirs(filepath, exist_ok=True)

    num_model = len(self.__modelset.get_modelinfos())
    for mi in self.__modelset.get_modelinfos():
        if isinstance(mi.model, str):
            dm = self.load_deepmodel(mi.model)
            mi.model = dm
        if not isinstance(mi.model, DeepModel):
            raise ValueError('Currently does not support saving non-DeepModel models.')
        if num_model == 1 and deepmodel_basename is not None:
            mi.name = deepmodel_basename
            self.__current_model = deepmodel_basename
        modelfile = f'{filepath}{mi.name}.h5'
        mi.model.save(modelfile)
        mi.model = modelfile

    with fs.open(f'{filepath}dt.pkl', 'wb') as output:
        pickle.dump(self, output, protocol=4)
def _cache_call(*args, **kwargs):
    assert len(args) > 0

    obj = None
    cache_path = None
    loaded = False
    result = None
    tb = _get_tool_box_for_cache(*args, **kwargs)

    try:
        for c in callbacks:
            c.on_enter(fn, *args, **kwargs)

        # bind arguments
        bind_args = sig.bind(*args, **kwargs)
        bind_args.apply_defaults()
        obj = bind_args.arguments.get('self', None)

        # calc cache_key
        key_items = {}
        arg_kwargs = bind_args.arguments.get('kwargs', {}).copy()
        arg_items = {k: v for k, v in bind_args.arguments.items()
                     if k not in ['self', ]}  # as dict
        arg_items.update(arg_kwargs)
        if arg_keys is not None and len(arg_keys) > 0:
            key_items.update({k: arg_items.get(k) for k in arg_keys})
        else:
            key_items.update(arg_items)
        if attr_keys is not None:
            key_items.update({k: getattr(obj, k, None) for k in attr_keys})
        elif isinstance(obj, BaseEstimator) and 'params_' not in key_items:
            key_items['params_'] = obj.get_params(deep=False)
        if attrs_to_restore is not None:
            key_items['attrs_to_restore_'] = attrs_to_restore
        cache_key = tb.data_hasher()(key_items)

        # join cache_path
        if not fs.exists(cache_dir):
            fs.mkdirs(cache_dir, exist_ok=True)
        cache_path = f'{cache_dir}{fs.sep}{cache_key}'

        # detect and load cache
        if fs.exists(f'{cache_path}.meta'):
            # load
            cached_data, meta = _load_cache(tb, cache_path)
            for c in callbacks:
                c.on_apply(fn, cached_data, *args, **kwargs)

            # restore attributes
            if attrs_to_restore is not None:
                cached_attributes = meta.get('attributes', {})
                for k in attrs_to_restore:
                    setattr(obj, k, cached_attributes.get(k))

            if meta['strategy'] == _STRATEGY_DATA:
                result = cached_data
            else:  # strategy == transform
                if isinstance(transformer, str):
                    tfn = getattr(obj, transformer)
                    assert callable(tfn)
                    result = tfn(*args[1:], **kwargs)  # exclude args[0] == self
                elif callable(transformer):
                    result = transformer(*args, **kwargs)
            loaded = True
    except SkipCache:
        pass
    except Exception as e:
        logger.warning(e)

    if not loaded:
        result = fn(*args, **kwargs)

    if cache_path is not None and not loaded:
        try:
            for c in callbacks:
                c.on_store(fn, result, *args, **kwargs)

            # store cache
            cache_strategy = strategy if strategy is not None else cfg.cache_strategy
            if cache_strategy == _STRATEGY_TRANSFORM and (result is None or transformer is not None):
                cache_data = None
                meta = {'strategy': _STRATEGY_TRANSFORM}
            else:
                cache_data = result
                meta = {'strategy': _STRATEGY_DATA}
            if attrs_to_restore is not None:
                meta['attributes'] = {k: getattr(obj, k, None) for k in attrs_to_restore}
            if isinstance(obj, BaseEstimator):
                meta['params_'] = obj.get_params(deep=False)  # for info
            _store_cache(tb, cache_path, cache_data, meta=meta)

            for c in callbacks:
                c.on_leave(fn, *args, **kwargs)
        except Exception as e:
            logger.warning(e)

    return result
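# Usage sketch, inferred from the closure variables above (fn, sig, arg_keys, attr_keys,
# attrs_to_restore, transformer, strategy, callbacks, cache_dir): _cache_call reads like
# the wrapper returned by a 'cache(...)' decorator factory. The decorator name and its
# exact parameters below are assumptions, not confirmed by this snippet.
class MyScaler(BaseEstimator):
    @cache(attrs_to_restore=['mean_'], transformer='transform')
    def fit_transform(self, X):
        # cache miss: this body runs and 'mean_' is stored in the cache meta;
        # cache hit: 'mean_' is restored and self.transform(X) is replayed instead
        self.mean_ = X.mean()
        return X - self.mean_

    def transform(self, X):
        return X - self.mean_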
def fit_cross_validation(estimator_type, fit_fn, X, y, X_test=None,
                         score_fn=roc_auc_score, estimator_params=None,
                         categorical_feature=None, task_type=consts.TASK_BINARY,
                         num_folds=5, stratified=True, iterators=None,
                         batch_size=None, preds_filepath=None):
    if estimator_params is None:  # avoid a mutable default argument
        estimator_params = {}

    print("Start cross validation")
    print(f'X.Shape={np.shape(X)}, y.Shape={np.shape(y)}, batch_size={batch_size}')

    # cross-validation iterator
    if iterators is None:
        if stratified:
            iterators = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1001)
        else:
            iterators = KFold(n_splits=num_folds, shuffle=True, random_state=1001)
    print(f'Iterators:{iterators}')

    if len(y.shape) > 1:
        oof_proba = np.zeros(y.shape)
    else:
        oof_proba = np.zeros((y.shape[0], 1))
    y = np.array(y)

    if preds_filepath is None and os.environ.get(consts.ENV_DEEPTABLES_HOME) is not None:
        preds_filepath = os.environ.get(consts.ENV_DEEPTABLES_HOME)
    if preds_filepath is None:
        preds_filepath = f'./preds_{estimator_type}_{datetime.datetime.now().strftime("%Y_%m_%d %H:%M:%S")}/'
    if not fs.exists(preds_filepath):
        fs.makedirs(preds_filepath, exist_ok=True)

    for n_fold, (train_idx, valid_idx) in enumerate(iterators.split(X, y)):
        print(f'\nFold:{n_fold + 1}\n')
        x_train_fold, y_train_fold = X.iloc[train_idx], y[train_idx]
        x_val_fold, y_val_fold = X.iloc[valid_idx], y[valid_idx]

        model = fit_fn(x_train_fold, y_train_fold, x_val_fold, y_val_fold,
                       cat_vars=categorical_feature, task=task_type,
                       estimator_params=estimator_params)
        print(f'Fold {n_fold + 1} finished.')

        proba = model.predict_proba(x_val_fold)[:, 1:2]
        oof_proba[valid_idx] = proba
        test_fold_proba = model.predict_proba(X_test)
        score = round(score_fn(y_val_fold, proba), 5)
        file = f'{preds_filepath}{score}_fold{n_fold + 1}.csv'
        with fs.open(file, 'w', encoding='utf-8') as f:
            pd.DataFrame(test_fold_proba).to_csv(f, index=False)
        print(f'Fold {n_fold + 1} Score:{score}')

    if oof_proba.shape[-1] == 1:
        oof_proba = oof_proba.reshape(-1)
    print(f'OOF score:{score_fn(y, oof_proba)}')
    return oof_proba
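# Usage sketch (illustrative; 'lgb_fit_fn' is a hypothetical fit_fn matching the
# contract above: it takes the fold's train/validation split plus cat_vars/task/
# estimator_params and returns a fitted model exposing predict_proba).
def lgb_fit_fn(x_train, y_train, x_val, y_val, cat_vars=None, task=None, estimator_params=None):
    from lightgbm import LGBMClassifier
    model = LGBMClassifier(**(estimator_params or {}))
    model.fit(x_train, y_train, eval_set=[(x_val, y_val)])
    return model

oof = fit_cross_validation('lgbm', lgb_fit_fn, X_train, y_train, X_test=X_test,
                           num_folds=5, preds_filepath='./preds_lgbm/')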