def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    X = dt.Frame(X)
    orig_cols = list(X.names)
    if self.num_classes >= 2:
        mod = linsvc(random_state=self.random_state, C=self.params["C"],
                     penalty=self.params["penalty"], loss=self.params["loss"],
                     dual=self.params["dual"])
        kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=self.random_state)
        model = CalibratedClassifierCV(base_estimator=mod, method='isotonic', cv=kf)
        lb = LabelEncoder()
        lb.fit(self.labels)
        y = lb.transform(y)
    else:
        model = LinearSVR(epsilon=self.params["epsilon"], C=self.params["C"],
                          loss=self.params["loss"], dual=self.params["dual"],
                          random_state=self.random_state)
    self.means = dict()
    self.standard_scaler = StandardScaler()
    X = self.basic_impute(X)
    X = X.to_numpy()
    X = self.standard_scaler.fit_transform(X)
    try:
        model.fit(X, y, sample_weight=sample_weight)
    except Exception as e:
        if 'cross-validation but provided less than' in str(e):
            raise IgnoreEntirelyError(str(e))
        raise
    importances = np.array([0.0 for k in range(len(orig_cols))])
    if self.num_classes >= 2:
        # sum absolute LinearSVC coefficients across the calibrated folds
        for classifier in model.calibrated_classifiers_:
            importances += np.array(abs(classifier.base_estimator.coef_[0]))
    else:
        importances += np.array(abs(model.coef_[0]))
    self.set_model_properties(model=model,
                              features=orig_cols,
                              importances=importances.tolist(),  # abs(model.coef_[0])
                              iterations=0)
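For reference, a minimal standalone sketch (assuming only scikit-learn and NumPy; `X_demo`, `y_demo`, and the attribute fallback are illustrative, not part of the recipe) of how per-fold LinearSVC coefficients can be aggregated into feature importances:

import numpy as np
from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X_demo, y_demo = make_classification(n_samples=300, n_features=6, random_state=0)
calibrated = CalibratedClassifierCV(LinearSVC(dual=False, random_state=0),
                                    method='isotonic', cv=3)
calibrated.fit(X_demo, y_demo)

importances = np.zeros(X_demo.shape[1])
for clf in calibrated.calibrated_classifiers_:
    # each calibrated fold wraps one fitted LinearSVC; the attribute is
    # `base_estimator` in older scikit-learn and `estimator` in newer releases
    svc = getattr(clf, 'estimator', None) or getattr(clf, 'base_estimator', None)
    importances += np.abs(svc.coef_[0])
print(importances)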
def transform(self, X: dt.Frame, y: np.array = None):
    if ngpus_vis == 0:
        raise IgnoreEntirelyError("Transformer cannot run without GPUs")
    import cudf
    import cuml
    cuml.common.memory_utils.set_global_output_type('numpy')
    X = X.to_pandas().fillna(0)
    X = cudf.DataFrame(X)
    return self.model.predict(X)
def fit_transform(self, X: dt.Frame, y: np.array = None):
    if ngpus_vis == 0:
        raise IgnoreEntirelyError("Transformer cannot run without GPUs")
    import cudf
    import cuml
    cuml.common.memory_utils.set_global_output_type('numpy')
    self.n_clusters = min(self.n_clusters, X.nrows)
    self.model = cuml.cluster.KMeans(n_clusters=self.n_clusters,
                                     max_iter=self.max_iters,
                                     tol=self.tol)
    X = X.to_pandas().fillna(0)
    X = cudf.DataFrame(X)
    return self.model.fit_predict(X)
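A CPU-only sketch of the same clustering step, using scikit-learn's KMeans in place of cuML (illustrative values; the transformer above deliberately requires cuDF, cuML, and visible GPUs):

import numpy as np
from sklearn.cluster import KMeans

X_demo = np.random.RandomState(0).rand(100, 4)
n_clusters = min(10, X_demo.shape[0])            # same capping of n_clusters by row count
km = KMeans(n_clusters=n_clusters, max_iter=300, tol=1e-4, n_init=10)
labels = km.fit_predict(np.nan_to_num(X_demo))   # rough analogue of fillna(0)
print(labels[:10])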
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    orig_cols = list(X.names)
    self.params_override()
    params = self.params.copy()
    if params.get('model_type', 'lda') == 'lda':
        model_class = LinearDiscriminantAnalysis
        params.pop('reg_param', None)
    else:
        model_class = QuadraticDiscriminantAnalysis
        params.pop('solver', None)
    params.pop('model_type', None)
    if self.num_classes >= 2:
        lb = LabelEncoder()
        lb.fit(self.labels)
        y = lb.transform(y)
        model = model_class(**params)
    else:
        model = model_class(**params)
    X = self.basic_impute(X)
    X = X.to_numpy()
    try:
        model.fit(X, y)
    except np.linalg.LinAlgError as e:
        # nothing can be done, just revert to constant predictions
        raise IgnoreEntirelyError(str(e))
    importances = np.array([1 for x in range(len(orig_cols))])
    self.set_model_properties(model=model,
                              features=orig_cols,
                              importances=importances.tolist(),
                              iterations=1)
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    y_ = y.copy()
    orig_cols = list(X.names)
    text_names = X[:, [str]].names

    self.loaded = False
    self.load_path = get_value(config, self.load_key)
    self.save_path = get_value(config, self.save_key)
    if self.load_path:
        data = joblib.load(self.load_path)
        self.tfidf_objs = data["tf_idf_obj"]
        self.tf_idf_data = data["tf_idf_data"]
        self.prev_params = data["params"]
        self.target = data["target"]
        self.loaded = True

    if not self.loaded:
        if self.num_classes >= 2:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)
        self.tfidf_objs = {}
        self.tf_idf_data = {}
        new_X = None
        for col in text_names:
            XX = X[:, col].to_pandas()
            XX = XX[col].astype(str).fillna("NA").values.tolist()
            tfidf_vec = TfidfVectorizer(**self.return_tfidf_params())
            try:
                tfidf_vec = self._fit_vectorizer(tfidf_vec, XX)
            except ValueError as e:
                if 'vocab' in str(e):
                    # skip non-text-like column
                    continue
                else:
                    raise
            XX = tfidf_vec.transform(XX)
            tfidf_vec.N_ = XX.shape[0]
            self.tfidf_objs[col] = tfidf_vec
            self.tf_idf_data[col] = XX
            if new_X is None:
                new_X = XX
            else:
                new_X = sp.sparse.hstack([new_X, XX])
    else:
        y_ = np.hstack([self.target, y_])
        y = y_.copy()
        if self.num_classes >= 2:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)
        new_X = None
        for col in text_names:
            XX = X[:, col].to_pandas()
            XX = XX[col].astype(str).fillna("NA").values.tolist()
            N_ = len(XX)
            tfidf_vec = TfidfVectorizer()
            tfidf_vec.set_params(**self.tfidf_objs[col].get_params())
            try:
                tfidf_vec.fit(XX)
                new_data_avail = True
            except ValueError as e:
                if 'vocab' in str(e):
                    # skip non-text-like column
                    continue
                new_data_avail = False
            if new_data_avail:
                tfidf_vec.N_ = N_
                pre_trained = self.tfidf_objs[col]
                pre_trained = self.sync_vectorizers(pre_trained, tfidf_vec)
            else:
                pre_trained = self.tfidf_objs[col]
            XX = pre_trained.transform(XX)
            self.tfidf_objs[col] = pre_trained
            XX = self.sync_tfidf(self.tf_idf_data[col], XX)
            self.tf_idf_data[col] = XX
            if new_X is None:
                new_X = XX
            else:
                new_X = sp.sparse.hstack([new_X, XX])

    models = [LogisticRegression(**self.return_lin_params())]
    if self.params["add_rf"]:
        from h2oaicore.lightgbm_dynamic import import_lightgbm
        lgbm = import_lightgbm()
        import lightgbm as lgbm
        models.append(lgbm.LGBMClassifier(
            boosting_type='rf',
            colsample_bytree=.5,
            subsample=.632,  # standard RF bagging fraction
            min_child_weight=2.5,
            min_child_samples=5,
            subsample_freq=1,
            min_split_gain=0,
            n_jobs=-1,
            **self.return_rf_params()
        ))

    for mi, m in enumerate(models):
        try:
            m.fit(new_X, y)
        except ValueError as e:
            # the general mutation as specified is not allowed, see the logistic_regression recipe.
            # Could use restricted choices there, but for simplicity just ignore the error entirely
            if mi == 0:
                raise IgnoreEntirelyError(str(e))
            raise

    importances = [1] * len(orig_cols)
    self.set_model_properties(
        model={
            "model": models,
            "tf-idfs": self.tfidf_objs
        },
        features=orig_cols,
        importances=importances,
        iterations=0
    )
    if self.save_path:
        joblib.dump({
            "tf_idf_obj": self.tfidf_objs,
            "tf_idf_data": self.tf_idf_data,
            "params": self.params,
            "target": y_,
        }, self.save_path)
    # clear large objects to avoid sending large data through the subprocess pipe
    self.tfidf_objs = None
    self.tf_idf_data = None
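The per-column TF-IDF stacking above reduces to the following minimal sketch (assuming only pandas, scipy, and scikit-learn; the column names and toy strings are made up):

import pandas as pd
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer

df = pd.DataFrame({"title": ["red car", "blue bike"],
                   "body": ["fast and red", "slow but blue"]})
blocks, vectorizers = [], {}
for col in ["title", "body"]:
    vec = TfidfVectorizer()
    # one TF-IDF vocabulary per text column, kept for later transform/sync
    blocks.append(vec.fit_transform(df[col].astype(str).fillna("NA")))
    vectorizers[col] = vec
new_X = sp.hstack(blocks)  # one wide sparse matrix, column blocks grouped per text column
print(new_X.shape)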
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    logger = None
    if self._make_logger:
        # Example use of logger, with required import of:
        #   from h2oaicore.systemutils import make_experiment_logger, loggerinfo
        # Can use loggerwarning, loggererror, etc. for different levels
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

    if self._show_logger_test:
        loggerinfo(logger, "TestLOGGER: Fit CatBoost")

    if self._show_task_test:
        # Example task sync operations
        if hasattr(self, 'test_count'):
            self.test_count += 1
        else:
            self.test_count = 0
        # The below generates a message in the GUI notifications panel
        if self.test_count == 0 and self.context and self.context.experiment_id:
            warning = "TestWarning: First CatBoost fit for this model instance"
            loggerwarning(logger, warning)
            task = kwargs.get('task')
            if task:
                task.sync(key=self.context.experiment_id,
                          progress=dict(type='warning', data=warning))
                task.flush()
        # The below generates a message in the GUI top-middle panel above the progress wheel
        if self.test_count == 0 and self.context and self.context.experiment_id:
            message = "Tuning CatBoost"
            loggerinfo(logger, message)
            task = kwargs.get('task')
            if task:
                task.sync(key=self.context.experiment_id,
                          progress=dict(type='update', message=message))
                task.flush()

    from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType

    # label encode target and set up the type of problem
    lb = LabelEncoder()
    if self.num_classes >= 2:
        lb.fit(self.labels)
        y = lb.transform(y)
        if eval_set is not None:
            valid_X = eval_set[0][0]
            valid_y = eval_set[0][1]
            valid_y = lb.transform(valid_y)
            eval_set = [(valid_X, valid_y)]
        self.params.update({'objective': 'Logloss'})
    if self.num_classes > 2:
        self.params.update({'objective': 'MultiClass'})

    if isinstance(X, dt.Frame):
        orig_cols = list(X.names)
        numeric_cols = list(X[:, [bool, int, float]].names)
    else:
        orig_cols = list(X.columns)
        numeric_cols = list(X.select_dtypes([np.number]).columns)

    # unlike lightgbm, which needs label-encoded categoricals, catboost can take raw strings etc.
    self.params['cat_features'] = [
        i for i, x in enumerate(orig_cols)
        if 'CatOrig:' in x or 'Cat:' in x or x not in numeric_cols
    ]

    if not self.get_uses_gpus(self.params):
        # monotonicity constraints not available on GPU for catboost
        # get names of columns in the same order
        X_names = list(dt.Frame(X).names)
        X_numeric = self.get_X_ordered_numerics(X)
        X_numeric_names = list(X_numeric.names)
        # per-numeric-column constraint signs (assumed return value of set_monotone_constraints)
        constraints = self.set_monotone_constraints(X=X_numeric, y=y)
        # if non-numerics, then fix those to have 0 constraint
        self.params['monotone_constraints'] = [0] * len(X_names)
        colnumi = 0
        for coli, col in enumerate(X_names):
            if col in X_numeric_names:
                self.params['monotone_constraints'][coli] = constraints[colnumi]
                colnumi += 1

    if isinstance(X, dt.Frame) and len(self.params['cat_features']) == 0:
        # dt -> catboost internally using buffer leaks, so convert here
        # assume predict is after pipeline collection or in subprocess so needs no protection
        X = X.to_numpy()  # don't assign back to X so don't damage during predict
        X = np.ascontiguousarray(
            X, dtype=np.float32 if config.data_precision == "float32" else np.float64)
        if eval_set is not None:
            valid_X = eval_set[0][0].to_numpy()  # don't assign back to X so don't damage during predict
            valid_X = np.ascontiguousarray(
                valid_X, dtype=np.float32 if config.data_precision == "float32" else np.float64)
            valid_y = eval_set[0][1]
            eval_set = [(valid_X, valid_y)]

    if eval_set is not None:
        valid_X_shape = eval_set[0][0].shape
    else:
        valid_X_shape = None

    X, eval_set = self.process_cats(X, eval_set, orig_cols)

    # modify self.params_base['gpu_id'] based upon actually-available GPUs given training and valid shapes
    self.acquire_gpus_function(train_shape=X.shape, valid_shape=valid_X_shape)

    params = copy.deepcopy(self.params)  # keep separate, since these can then be pulled from lightgbm params
    params = self.transcribe_params(params=params, **kwargs)

    if logger is not None:
        loggerdata(logger,
                   "CatBoost parameters: params_base : %s params: %s catboost_params: %s" %
                   (str(self.params_base), str(self.params), str(params)))

    if self.num_classes == 1:
        self.model = CatBoostRegressor(**params)
    else:
        self.model = CatBoostClassifier(**params)
    # Hit sometimes: Exception: catboost/libs/data_new/quantization.cpp:779: All features are either constant or ignored.
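To make the constraint expansion above concrete, a toy sketch with made-up column names (each numeric column's learned monotonicity sign is mapped back to its position in the full column list, with 0 everywhere else):

X_names = ["num_a", "cat_b", "num_c"]   # all training columns, in order
X_numeric_names = ["num_a", "num_c"]    # numeric subset, in the same order
constraints = [1, -1]                   # one sign per numeric column

monotone_constraints = [0] * len(X_names)   # non-numeric columns stay unconstrained
colnumi = 0
for coli, col in enumerate(X_names):
    if col in X_numeric_names:
        monotone_constraints[coli] = constraints[colnumi]
        colnumi += 1
print(monotone_constraints)  # [1, 0, -1]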
    if self.num_classes == 1:
        # assume not mae, which would use median
        # baseline = [np.mean(y)] * len(y)
        baseline = None
    else:
        baseline = None
    kwargs_fit = dict(baseline=baseline, eval_set=eval_set)

    pickle_path = None
    if config.debug_daimodel_level >= 2:
        self.uuid = str(uuid.uuid4())[:6]
        pickle_path = os.path.join(exp_dir(), "catboost%s.tmp.pickle" % self.uuid)
        save_obj((self.model, X, y, sample_weight, kwargs_fit), pickle_path)

    # FIT (with migration safety before hyperopt/Optuna function added)
    try:
        if hasattr(self, 'dask_or_hyper_or_normal_fit'):
            self.dask_or_hyper_or_normal_fit(X, y, sample_weight=sample_weight,
                                             kwargs=kwargs, **kwargs_fit)
        else:
            self.model.fit(X, y, sample_weight=sample_weight, **kwargs_fit)
    except Exception as e:
        if "All features are either constant or ignored" in str(e):
            raise IgnoreEntirelyError(str(e))
        raise

    if config.debug_daimodel_level <= 2:
        remove(pickle_path)

    # https://catboost.ai/docs/concepts/python-reference_catboostclassifier.html
    # need to move to wrapper
    if self.model.get_best_iteration() is not None:
        iterations = self.model.get_best_iteration() + 1
    else:
        iterations = self.params['n_estimators']
    # must always set best_iterations

    self.model_path = None
    importances = copy.deepcopy(self.model.feature_importances_)
    if not self._save_by_pickle:
        self.uuid = str(uuid.uuid4())[:6]
        model_file = "catboost_%s.bin" % str(self.uuid)
        self.model_path = os.path.join(self.context.experiment_tmp_dir, model_file)
        self.model.save_model(self.model_path)
        with open(self.model_path, mode='rb') as f:
            model = f.read()
    else:
        model = self.model

    self.set_model_properties(
        model=model,  # overwrites self.model object with bytes if not using pickle
        features=orig_cols,
        importances=importances,
        iterations=iterations)
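When `_save_by_pickle` is off, `set_model_properties` receives the raw bytes of the saved .bin file. A hedged sketch of how such bytes could be reloaded on the predict side (the helper name `load_catboost_from_bytes` is illustrative, not part of the recipe; only the catboost package is assumed):

import os
import tempfile
from catboost import CatBoostClassifier

def load_catboost_from_bytes(model_bytes):
    # write the raw .bin payload back to disk and use CatBoost's native loader
    with tempfile.NamedTemporaryFile(suffix=".bin", delete=False) as f:
        f.write(model_bytes)
        path = f.name
    try:
        model = CatBoostClassifier()
        model.load_model(path)
    finally:
        os.remove(path)
    return model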