def predict(self, X, **kwargs):
    model, _, _, _ = self.get_model_properties()
    X = dt.Frame(X)
    X = self.inf_impute(X)
    h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
    model_path = os.path.join(exp_dir(), self.id)
    model_file = os.path.join(model_path, "h2o_model." + str(uuid.uuid4()) + ".bin")
    os.makedirs(model_path, exist_ok=True)
    with open(model_file, "wb") as f:
        f.write(model)
    model = h2o.load_model(os.path.abspath(model_file))
    test_frame = h2o.H2OFrame(X.to_pandas(), column_types=self.col_types)
    preds_frame = None

    try:
        if kwargs.get("pred_contribs"):
            orig_cols = list(X.names)
            df_varimp_orig, df_varimp, df_varimp_merged = self.get_df_varimp(model, orig_cols)
            dfmap = {k: v for k, v in zip(df_varimp_orig.index, df_varimp.index)}
            preds_df = model.predict_contributions(test_frame).as_data_frame(header=False)
            # this only has to work for regression and binary since h2o-3 does not support multiclass shapley
            preds_df.columns = [dfmap.get(x, x) for x in preds_df.columns]
            preds_df = preds_df.groupby(preds_df.columns, axis=1).sum()
            return preds_df.values
        preds_frame = model.predict(test_frame)
        preds = preds_frame.as_data_frame(header=False)

        is_final = 'IS_FINAL' in kwargs
        struuid = str(uuid.uuid4())
        json_file = os.path.join(exp_dir(), 'stderr_is_final_%s_%s.json' % (is_final, struuid))

        if self.num_classes == 1:
            if self.doing_p_values():
                df = preds.iloc[:, 1]
                with open(json_file, "wt") as f:
                    pd.set_option('precision', 16)
                    f.write(json.dumps(json.loads(df.to_json()), indent=4))
                    pd.set_option('precision', 6)
                return preds.iloc[:, 0].values.ravel()
            else:
                return preds.values.ravel()
        elif self.num_classes == 2:
            if self.doing_p_values():
                df = preds.iloc[:, 2]
                with open(json_file, "wt") as f:
                    pd.set_option('precision', 16)
                    f.write(json.dumps(json.loads(df.to_json()), indent=4))
                    pd.set_option('precision', 6)
                return preds.iloc[:, -1 - 1].values.ravel()
            else:
                return preds.iloc[:, -1].values.ravel()
        else:
            return preds.iloc[:, 1:].values
    finally:
        # h2o.remove(self.id)  # Cannot remove id, do multiple predictions on same model
        h2o.remove(test_frame)
        remove(model_file)
        if preds_frame is not None:
            h2o.remove(preds_frame)
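# Illustrative sketch (not part of the recipe): the pred_contribs branch above maps h2o-3's
# expanded contribution columns (e.g. one column per categorical level) back onto the original
# feature names and then sums the per-level contributions with a column-wise groupby, as done
# with dfmap above. The helper below shows that merge on a toy frame; the column names, the
# mapping, and the values are made up for illustration.
def _sketch_merge_contribs_to_original_columns():
    import pandas as pd

    # contributions per expanded column, plus the bias term h2o-3 appends
    contribs = pd.DataFrame({'color.red': [0.1], 'color.blue': [0.2], 'age': [0.5], 'BiasTerm': [0.3]})
    # map expanded names back to their original feature; unmapped names stay as they are
    dfmap = {'color.red': 'color', 'color.blue': 'color'}
    contribs.columns = [dfmap.get(c, c) for c in contribs.columns]
    # one column per original feature, levels summed: BiasTerm=0.3, age=0.5, color=0.3
    return contribs.groupby(contribs.columns, axis=1).sum()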
def logger(self):
    from h2oaicore import application_context
    from h2oaicore.systemutils import exp_dir
    # Don't assign to self, not picklable
    return make_experiment_logger(experiment_id=application_context.context.experiment_id,
                                  tmp_dir=None,
                                  experiment_tmp_dir=exp_dir())
def predict(self, X, y=None, **kwargs):
    model, features, importances, iterations = self.get_model_properties()
    if not self._save_by_pickle:
        from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType
        if self.num_classes >= 2:
            from_file = CatBoostClassifier()
        else:
            from_file = CatBoostRegressor()
        with open(self.model_path, mode='wb') as f:
            f.write(model)
        model = from_file.load_model(self.model_path)

    # FIXME: Do equivalent throttling of predict size like def _predict_internal(self, X, **kwargs), wrap-up.
    if isinstance(X, dt.Frame) and len(self.params['cat_features']) == 0:
        # dt -> catboost internally using buffer leaks, so convert here
        # assume predict is after pipeline collection or in subprocess so needs no protection
        X = X.to_numpy()  # don't assign back to X so don't damage during predict
        X = np.ascontiguousarray(X, dtype=np.float32 if config.data_precision == "float32" else np.float64)

    X, eval_set = self.process_cats(X, None, self.feature_names_fitted)

    pred_contribs = kwargs.get('pred_contribs', False)
    output_margin = kwargs.get('output_margin', False)
    fast_approx = kwargs.pop('fast_approx', False)
    if fast_approx:
        iterations = min(config.fast_approx_num_trees, iterations)

    # implicit import
    from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType, Pool

    n_jobs = max(1, physical_cores_count)
    if not pred_contribs and not output_margin:
        if self.num_classes >= 2:
            preds = model.predict_proba(
                X,
                ntree_start=0,
                ntree_end=iterations,  # index of first tree *not* to be used
                thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported
            )
            if preds.shape[1] == 2:
                return preds[:, 1]
            else:
                return preds
        else:
            return model.predict(
                X,
                ntree_start=0,
                ntree_end=iterations,  # index of first tree *not* to be used
                thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported
            )
    elif output_margin:
        # uses "predict" for raw for any class
        preds = model.predict(
            X,
            prediction_type="RawFormulaVal",
            ntree_start=0,
            ntree_end=iterations,  # index of first tree *not* to be used
            thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported
        )
        if len(preds.shape) > 1 and preds.shape[1] == 2 and self.num_classes == 2:
            return preds[:, 1]
        else:
            return preds
    elif pred_contribs:
        # For Shapley, doesn't come from predict
        # For regression/binary, shap is shape of (rows, features + bias)
        # for multiclass, shap is shape of (rows, classes, features + bias)
        data = Pool(X, label=y, cat_features=self.params['cat_features'])
        if fast_approx:
            # https://github.com/catboost/catboost/issues/1146
            # https://github.com/catboost/catboost/issues/1535
            # can't specify trees, but they have approx version
            # Regular, Exact, or Approximate
            shap_calc_type = "Approximate"
        else:
            shap_calc_type = "Regular"
        # See also shap_mode
        # help(CatBoostClassifier.get_feature_importance)
        print_debug("shap_calc_type: %s" % shap_calc_type)

        pickle_path = None
        if config.debug_daimodel_level >= 2:
            self.uuid = str(uuid.uuid4())[:6]
            pickle_path = os.path.join(exp_dir(), "catboost_shappredict%s.tmp.pickle" % self.uuid)
            model.save_model(os.path.join(exp_dir(), "catshapproblem%s.catboost.model" % self.uuid))
            # save_obj((self, self.model, model, X, y, kwargs, shap_calc_type, self.params['cat_features']), pickle_path)
            save_obj((model, X, y, kwargs, shap_calc_type, self.params['cat_features']), pickle_path)

        preds_shap = model.get_feature_importance(
            data=data,
            thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported
            type=EFstrType.ShapValues,
            shap_calc_type=shap_calc_type,
        )

        # repair broken shap sum: https://github.com/catboost/catboost/issues/1125
        print_debug("shap_fix")
        preds_raw = model.predict(
            X,
            prediction_type="RawFormulaVal",
            ntree_start=0,
            ntree_end=iterations,  # index of first tree *not* to be used
            thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported
        )
        if self.num_classes <= 2:
            axis = 1
        else:
            axis = 2
        orig_sum = np.sum(preds_shap, axis=axis)
        print_debug("shap_fix2")
        # avoid division by 0, need different trick, e.g. change baseline, to fix that case
        if axis == 1:
            orig_sum[orig_sum[:] == 0.0] = 1.0
            preds_shap = preds_shap * preds_raw[:, None] / orig_sum[:, None]
        else:
            # each feature and each class must sum up
            orig_sum[orig_sum[:, :] == 0.0] = 1.0
            preds_shap = preds_shap * preds_raw[:, :, None] / orig_sum[:, :, None]

        if config.hard_asserts and config.debug_daimodel_level >= 2:
            print_debug("shap_check")
            model.save_model(os.path.join(exp_dir(), "catshapproblem"))
            pickle.dump((X, y, self.params['cat_features']),
                        open(os.path.join(exp_dir(), "catshapproblem.pkl"), "wb"))
            preds_raw = model.predict(
                X,
                prediction_type="RawFormulaVal",
                ntree_start=0,
                ntree_end=iterations,  # index of first tree *not* to be used
                thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported
            )
            assert np.isclose(preds_raw, np.sum(preds_shap, axis=axis)).all(), \
                "catboost shapley does not sum up correctly"

        if config.debug_daimodel_level <= 2:
            remove(pickle_path)

        if axis == 1:
            return preds_shap
        else:
            # DAI expects (shape rows) * (classes x (features + 1)) with "columns" as blocks of
            # feature_0_class_0 feature_1_class_0 ... feature_0_class_1 feature_1_class_1 ...
            return preds_shap.reshape(preds_shap.shape[0], preds_shap.shape[1] * preds_shap.shape[2])
    else:
        raise RuntimeError("No such case")
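# Illustrative sketch (not part of the recipe): the pred_contribs branch above rescales each row
# of CatBoost's SHAP output so it sums to the raw prediction, working around
# https://github.com/catboost/catboost/issues/1125. The toy example below shows the same
# rescaling for the regression/binary case; the arrays are made up.
def _sketch_repair_shap_sum():
    import numpy as np

    preds_shap = np.array([[0.2, 0.3, 0.1],   # (rows, features + bias)
                           [0.0, 0.0, 0.0]])  # degenerate row to show the zero-sum guard
    preds_raw = np.array([0.9, 0.0])          # raw margin per row
    orig_sum = np.sum(preds_shap, axis=1)
    orig_sum[orig_sum == 0.0] = 1.0           # avoid division by zero
    fixed = preds_shap * preds_raw[:, None] / orig_sum[:, None]
    # each row of `fixed` now sums to the corresponding raw prediction
    return fixed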
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    logger = None
    if self._make_logger:
        # Example use of logger, with required import of:
        #  from h2oaicore.systemutils import make_experiment_logger, loggerinfo
        # Can use loggerwarning, loggererror, etc. for different levels
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(experiment_id=self.context.experiment_id,
                                            tmp_dir=self.context.tmp_dir,
                                            experiment_tmp_dir=self.context.experiment_tmp_dir)

    if self._show_logger_test:
        loggerinfo(logger, "TestLOGGER: Fit CatBoost")

    if self._show_task_test:
        # Example task sync operations
        if hasattr(self, 'test_count'):
            self.test_count += 1
        else:
            self.test_count = 0
        # The below generates a message in the GUI notifications panel
        if self.test_count == 0 and self.context and self.context.experiment_id:
            warning = "TestWarning: First CatBoost fit for this model instance"
            loggerwarning(logger, warning)
            task = kwargs.get('task')
            if task:
                task.sync(key=self.context.experiment_id, progress=dict(type='warning', data=warning))
                task.flush()
        # The below generates a message in the GUI top-middle panel above the progress wheel
        if self.test_count == 0 and self.context and self.context.experiment_id:
            message = "Tuning CatBoost"
            loggerinfo(logger, message)
            task = kwargs.get('task')
            if task:
                task.sync(key=self.context.experiment_id, progress=dict(type='update', message=message))
                task.flush()

    from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType

    # label encode target and setup type of problem
    lb = LabelEncoder()
    if self.num_classes >= 2:
        lb.fit(self.labels)
        y = lb.transform(y)
        if eval_set is not None:
            valid_X = eval_set[0][0]
            valid_y = eval_set[0][1]
            valid_y = lb.transform(valid_y)
            eval_set = [(valid_X, valid_y)]
        self.params.update({'objective': 'Logloss'})
    if self.num_classes > 2:
        self.params.update({'objective': 'MultiClass'})

    if isinstance(X, dt.Frame):
        orig_cols = list(X.names)
        numeric_cols = list(X[:, [bool, int, float]].names)
    else:
        orig_cols = list(X.columns)
        numeric_cols = list(X.select_dtypes([np.number]).columns)

    # unlike lightgbm that needs label encoded categoricals, catboost can take raw strings etc.
    self.params['cat_features'] = [i for i, x in enumerate(orig_cols)
                                   if 'CatOrig:' in x or 'Cat:' in x or x not in numeric_cols]

    if not self.get_uses_gpus(self.params):
        # monotonicity constraints not available for GPU for catboost
        # get names of columns in same order
        X_names = list(dt.Frame(X).names)
        X_numeric = self.get_X_ordered_numerics(X)
        X_numeric_names = list(X_numeric.names)
        constraints = self.set_monotone_constraints(X=X_numeric, y=y)  # one constraint per numeric column
        # if non-numerics, then fix those to have 0 constraint
        self.params['monotone_constraints'] = [0] * len(X_names)
        colnumi = 0
        for coli, col in enumerate(X_names):
            if col in X_numeric_names:
                self.params['monotone_constraints'][coli] = constraints[colnumi]
                colnumi += 1

    if isinstance(X, dt.Frame) and len(self.params['cat_features']) == 0:
        # dt -> catboost internally using buffer leaks, so convert here
        # assume predict is after pipeline collection or in subprocess so needs no protection
        X = X.to_numpy()  # don't assign back to X so don't damage during predict
        X = np.ascontiguousarray(X, dtype=np.float32 if config.data_precision == "float32" else np.float64)
        if eval_set is not None:
            valid_X = eval_set[0][0].to_numpy()  # don't assign back to X so don't damage during predict
            valid_X = np.ascontiguousarray(valid_X,
                                           dtype=np.float32 if config.data_precision == "float32" else np.float64)
            valid_y = eval_set[0][1]
            eval_set = [(valid_X, valid_y)]

    if eval_set is not None:
        valid_X_shape = eval_set[0][0].shape
    else:
        valid_X_shape = None

    X, eval_set = self.process_cats(X, eval_set, orig_cols)

    # modify self.params_base['gpu_id'] based upon actually-available GPU based upon training and valid shapes
    self.acquire_gpus_function(train_shape=X.shape, valid_shape=valid_X_shape)

    params = copy.deepcopy(self.params)  # keep separate, since then can be pulled from lightgbm params
    params = self.transcribe_params(params=params, **kwargs)

    if logger is not None:
        loggerdata(logger,
                   "CatBoost parameters: params_base : %s params: %s catboost_params: %s" %
                   (str(self.params_base), str(self.params), str(params)))

    if self.num_classes == 1:
        self.model = CatBoostRegressor(**params)
    else:
        self.model = CatBoostClassifier(**params)

    # Hit sometimes: Exception: catboost/libs/data_new/quantization.cpp:779: All features are either constant or ignored.
    if self.num_classes == 1:
        # assume not mae, which would use median
        # baseline = [np.mean(y)] * len(y)
        baseline = None
    else:
        baseline = None
    kwargs_fit = dict(baseline=baseline, eval_set=eval_set)

    pickle_path = None
    if config.debug_daimodel_level >= 2:
        self.uuid = str(uuid.uuid4())[:6]
        pickle_path = os.path.join(exp_dir(), "catboost%s.tmp.pickle" % self.uuid)
        save_obj((self.model, X, y, sample_weight, kwargs_fit), pickle_path)

    # FIT (with migration safety before hyperopt/Optuna function added)
    try:
        if hasattr(self, 'dask_or_hyper_or_normal_fit'):
            self.dask_or_hyper_or_normal_fit(X, y, sample_weight=sample_weight, kwargs=kwargs, **kwargs_fit)
        else:
            self.model.fit(X, y, sample_weight=sample_weight, **kwargs_fit)
    except Exception as e:
        if "All features are either constant or ignored" in str(e):
            raise IgnoreEntirelyError(str(e))
        raise

    if config.debug_daimodel_level <= 2:
        remove(pickle_path)

    # https://catboost.ai/docs/concepts/python-reference_catboostclassifier.html
    # need to move to wrapper
    if self.model.get_best_iteration() is not None:
        iterations = self.model.get_best_iteration() + 1
    else:
        iterations = self.params['n_estimators']
    # must always set best_iterations

    self.model_path = None
    importances = copy.deepcopy(self.model.feature_importances_)
    if not self._save_by_pickle:
        self.uuid = str(uuid.uuid4())[:6]
        model_file = "catboost_%s.bin" % str(self.uuid)
        self.model_path = os.path.join(self.context.experiment_tmp_dir, model_file)
        self.model.save_model(self.model_path)
        with open(self.model_path, mode='rb') as f:
            model = f.read()
    else:
        model = self.model

    self.set_model_properties(model=model,  # overwrites self.model object with bytes if not using pickle
                              features=orig_cols,
                              importances=importances,
                              iterations=iterations)
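# Illustrative sketch (not part of the recipe): fit() above expands the monotonicity constraints
# computed on the numeric columns into a full-width list, leaving 0 (no constraint) for every
# non-numeric column. A standalone version of that mapping, with made-up column names and
# constraint values:
def _sketch_expand_monotone_constraints():
    X_names = ['age', 'color', 'income']   # all columns, in training order
    X_numeric_names = ['age', 'income']    # numeric subset, same relative order
    constraints = [1, -1]                  # one constraint per numeric column
    full = [0] * len(X_names)
    colnumi = 0
    for coli, col in enumerate(X_names):
        if col in X_numeric_names:
            full[coli] = constraints[colnumi]
            colnumi += 1
    return full                            # -> [1, 0, -1]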
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    X = dt.Frame(X)
    X = self.inf_impute(X)
    self.transcribe(X=X)

    h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
    model_path = None

    if isinstance(self, H2ONBModel):
        # NB can only handle weights of 0 / 1
        if sample_weight is not None:
            sample_weight = (sample_weight != 0).astype(int)
        if sample_weight_eval_set is not None and len(sample_weight_eval_set) > 0 and \
                sample_weight_eval_set[0] is not None:
            sample_weight_eval_set1 = sample_weight_eval_set[0]
            sample_weight_eval_set1[sample_weight_eval_set1 != 0] = 1
            sample_weight_eval_set1 = sample_weight_eval_set1.astype(int)
            sample_weight_eval_set = [sample_weight_eval_set1]

    X_pd = X.to_pandas()

    # fix if few levels for "enum" type.  h2o-3 auto-type is too greedy and only looks at very first rows
    np_real_types = [np.int8, np.int16, np.int32, np.int64, np.float16, np.float32, np.float64]
    column_types = {}
    for col in X_pd.columns:
        if X_pd[col].dtype.type in np_real_types:
            column_types[col] = 'real'
    nuniques = {}
    for col in X_pd.columns:
        nuniques[col] = len(pd.unique(X_pd[col]))
        print_debug("NumUniques for col: %s: %d" % (col, nuniques[col]))
        if nuniques[col] <= config.max_int_as_cat_uniques and X_pd[col].dtype.type in np_real_types:
            # override original "real"
            column_types[col] = 'enum'
    # if column_types is partially filled, that is ok to h2o-3

    train_X = h2o.H2OFrame(X_pd, column_types=column_types)
    self.col_types = train_X.types
    # see uniques-types dict
    nuniques_and_types = {}
    for col, typ in self.col_types.items():
        nuniques_and_types[col] = [typ, nuniques[col]]
        print_debug("NumUniques and types for col: %s : %s" % (col, nuniques_and_types[col]))

    train_y = h2o.H2OFrame(y,
                           column_names=[self.target],
                           column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
    train_frame = train_X.cbind(train_y)
    if sample_weight is not None:
        train_w = h2o.H2OFrame(sample_weight,
                               column_names=[self.weight],
                               column_types=['numeric'])
        train_frame = train_frame.cbind(train_w)

    valid_frame = None
    valid_X = None
    valid_y = None
    model = None
    if eval_set is not None:
        valid_X = h2o.H2OFrame(eval_set[0][0].to_pandas(), column_types=self.col_types)
        valid_y = h2o.H2OFrame(eval_set[0][1],
                               column_names=[self.target],
                               column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
        valid_frame = valid_X.cbind(valid_y)
        if sample_weight is not None:
            if sample_weight_eval_set is None:
                sample_weight_eval_set = [np.ones(len(eval_set[0][1]))]
            valid_w = h2o.H2OFrame(sample_weight_eval_set[0],
                                   column_names=[self.weight],
                                   column_types=['numeric'])
            valid_frame = valid_frame.cbind(valid_w)

    try:
        train_kwargs = dict()
        params = copy.deepcopy(self.params)
        if not isinstance(self, H2OAutoMLModel):
            # AutoML needs max_runtime_secs in initializer, all others in train() method
            max_runtime_secs = params.pop('max_runtime_secs', 0)
            train_kwargs = dict(max_runtime_secs=max_runtime_secs)
        if valid_frame is not None:
            train_kwargs['validation_frame'] = valid_frame
        if sample_weight is not None:
            train_kwargs['weights_column'] = self.weight

        # Don't ever use the offset column as a feature
        offset_col = None  # if no column is called offset we will pass "None" and not use this feature
        cols_to_train = []  # list of all non-offset columns
        for col in list(train_X.names):
            if not col.lower() == "offset":
                cols_to_train.append(col)
            else:
                offset_col = col

        orig_cols = cols_to_train  # not training on offset

        if self.doing_p_values():
            # https://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/algo-params/compute_p_values.html
            # take a look at the coefficients_table to see the p_values
            params['remove_collinear_columns'] = True
            params['compute_p_values'] = True
            # h2o-3 only supports p-values if lambda=0
            params['lambda_'] = 0
            if self.num_classes == 2:
                params['family'] = 'binomial'
            params['solver'] = 'IRLSM'
            params.pop('beta_constraints', None)

        trials = 2
        for trial in range(0, trials):
            try:
                # Models that can use an offset column
                loggerinfo(self.get_logger(**kwargs), "%s (%s) fit parameters: %s" % (
                    self.display_name, self.__class__.__module__, dict(params)))
                model = self.make_instance(**params)
                if isinstance(model, H2OGBMModel) | isinstance(model, H2ODLModel) | isinstance(model, H2OGLMModel):
                    model.train(x=cols_to_train, y=self.target, training_frame=train_frame,
                                offset_column=offset_col, **train_kwargs)
                else:
                    model.train(x=train_X.names, y=self.target, training_frame=train_frame, **train_kwargs)
                break
            except Exception as e:
                print(str(e))
                t, v, tb = sys.exc_info()
                ex = ''.join(traceback.format_exception(t, v, tb))
                if 'Training data must have at least 2 features' in str(ex) and X.ncols != 0:
                    # if had non-zero features but h2o-3 saw as constant, ignore h2o-3 in that case
                    raise IgnoreEntirelyError
                elif "min_rows: The dataset size is too small to split for min_rows" in str(e) and trial == 0:
                    # then h2o-3 counted as rows some reduced set, since we already protect against actual rows vs. min_rows
                    params['min_rows'] = 1  # go down to lowest value
                    # permit another trial
                elif "min_rows: The dataset size is too small to split for min_rows" in str(e) and trial == 1:
                    raise IgnoreEntirelyError
                elif " java.lang.AssertionError" in str(ex):
                    # bug in h2o-3, nothing can be done
                    raise IgnoreEntirelyError
                elif "NotStrictlyPositiveException" in str(ex):
                    # bad input data for given hyperparameters
                    raise IgnoreEntirelyError
                else:
                    raise
                if trial == trials - 1:
                    # if at end of trials, raise no matter what
                    raise

        if self._show_performance:
            # retrieve the model performance
            perf_train = model.model_performance(train_frame)
            loggerinfo(self.get_logger(**kwargs), self.perf_to_list(perf_train, which="training"))
            if valid_frame is not None:
                perf_valid = model.model_performance(valid_frame)
                loggerinfo(self.get_logger(**kwargs), self.perf_to_list(perf_valid, which="validation"))

        struuid = str(uuid.uuid4())

        if self._show_coefficients:
            coeff_table = model._model_json['output']['coefficients_table']
            # convert table to a pandas dataframe
            coeff_table = coeff_table.as_data_frame()
            is_final = 'IS_FINAL' in kwargs
            json_file = os.path.join(exp_dir(), 'coefficients_table_is_final_%s_%s.json' % (is_final, struuid))
            with open(json_file, "wt") as f:
                pd.set_option('precision', 16)
                f.write(json.dumps(json.loads(coeff_table.to_json()), indent=4))
                pd.set_option('precision', 6)

        if isinstance(model, H2OAutoML):
            model = model.leader
        self.id = model.model_id
        model_path = os.path.join(exp_dir(), "h2o_model." + struuid)
        model_path = h2o.save_model(model=model, path=model_path)
        with open(model_path, "rb") as f:
            raw_model_bytes = f.read()
    finally:
        if model_path is not None:
            remove(model_path)
        for xx in [train_frame, train_X, train_y, model, valid_frame, valid_X, valid_y]:
            if xx is not None:
                if isinstance(xx, H2OAutoML):
                    h2o.remove(xx.project_name)
                else:
                    h2o.remove(xx)

    df_varimp = model.varimp(True)
    if df_varimp is None:
        varimp = np.ones(len(orig_cols))
    else:
        _, _, df_varimp = self.get_df_varimp(model, orig_cols)
        missing_features_set = set([x for x in orig_cols if x not in list(df_varimp.index)])
        # must not keep "missing features", even as zero, since h2o-3 won't have them in pred_contribs output
        orig_cols = [x for x in orig_cols if x not in missing_features_set]
        self.col_types = {k: v for k, v in self.col_types.items() if k not in missing_features_set}
        varimp = df_varimp[orig_cols].values  # order by (and select) fitted features
        varimp = np.nan_to_num(varimp)

    self.set_model_properties(model=raw_model_bytes,
                              features=orig_cols,
                              importances=varimp,
                              iterations=self.get_iterations(model))
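# Illustrative sketch (not part of the recipe): fit() above pre-computes h2o-3 column types so
# that low-cardinality numeric columns are forced to 'enum' rather than relying on h2o-3's
# auto-typing, which only inspects the first rows. A standalone version of that rule; the
# threshold (a stand-in for config.max_int_as_cat_uniques) and the toy data are made up.
def _sketch_h2o_column_types():
    import numpy as np
    import pandas as pd

    X_pd = pd.DataFrame({'group': [1, 2, 1, 2], 'amount': [0.5, 1.2, 3.4, 2.2]})
    max_int_as_cat_uniques = 3  # stand-in for config.max_int_as_cat_uniques
    np_real_types = [np.int8, np.int16, np.int32, np.int64, np.float16, np.float32, np.float64]
    column_types = {}
    for col in X_pd.columns:
        if X_pd[col].dtype.type in np_real_types:
            column_types[col] = 'real'
        if X_pd[col].dtype.type in np_real_types and len(pd.unique(X_pd[col])) <= max_int_as_cat_uniques:
            column_types[col] = 'enum'  # few levels -> treat as categorical
    return column_types  # -> {'group': 'enum', 'amount': 'real'}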