def fit(self, X, y, X_validation=None, y_validation=None, log_to_file=None):
    """Fit the wrapped model and optionally dump its learning curves.

    Args:
        X: Training features; ``X.shape[1]`` and ``X.iloc`` are used, so a
            pandas DataFrame is expected.
        y: Training target, forwarded to ``self.model.fit``.
        X_validation: Optional validation features.
        y_validation: Optional validation target. The evaluation set is
            only used when both validation arguments are given.
        log_to_file: Optional path or buffer. When given, one CSV row per
            iteration is written as ``iteration,train,validation`` without
            a header line.
    """
    # Detect categorical columns only once; later calls reuse the result.
    if self.cat_features is None:
        self.cat_features = [
            i
            for i in range(X.shape[1])
            if PreprocessingUtils.is_categorical(X.iloc[:, i])
        ]

    eval_set = None
    if X_validation is not None and y_validation is not None:
        eval_set = (X_validation, y_validation)

    self.model.fit(
        X,
        y,
        cat_features=self.cat_features,
        # When the model already reports a tree count, it is passed as the
        # initial model for this fit.
        init_model=None if self.model.tree_count_ is None else self.model,
        eval_set=eval_set,
        early_stopping_rounds=self.early_stopping_rounds,
        verbose_eval=False,
    )

    if log_to_file is None:
        return

    evals = self.model.evals_result_
    metric_name = next(iter(evals["learn"]))
    train_scores = evals["learn"][metric_name]
    # Without an evaluation set there may be no "validation" entry; the
    # column is then written empty instead of raising KeyError.
    validation_scores = evals.get("validation", {}).get(metric_name)

    result = pd.DataFrame(
        {
            "iteration": range(len(train_scores)),
            "train": train_scores,
            "validation": validation_scores,
        }
    )
    result.to_csv(log_to_file, index=False, header=False)
def compute(X, y, machinelearning_task):
    """Describe every column of ``X`` and the target ``y`` with string tags.

    Args:
        X: Feature frame; iterated through ``X.columns``.
        y: Target values, inspected through ``PreprocessingUtils``.
        machinelearning_task: Task constant compared against
            ``BINARY_CLASSIFICATION``, ``REGRESSION`` and
            ``MULTICLASS_CLASSIFICATION``.

    Returns:
        dict with ``"columns_info"`` (column name -> list of tags),
        ``"target_info"`` (list of tags) and ``"num_class"`` (``None``
        unless the task is multiclass classification).
    """
    columns_info = {}
    for col in X.columns:
        values = X[col]
        missing = pd.isnull(values)

        # A column holding nothing but nulls gets a single tag.
        if missing.all():
            columns_info[col] = ["empty_column"]
            continue

        # Exactly one distinct non-null value.
        if len(np.unique(values[~missing])) == 1:
            columns_info[col] = ["constant_column"]
            continue

        tags = []
        if PreprocessingUtils.is_na(values):
            tags.append("missing_values")

        if PreprocessingUtils.is_categorical(values):
            tags.append("categorical")
            tags.append(EncodingSelector.get(X, y, col))
        elif PreprocessingUtils.is_datetime(values):
            tags.append("datetime_transform")
        elif PreprocessingUtils.is_text(values):
            # Text replaces every tag collected so far for this column.
            tags = ["text_transform"]
        elif PreprocessingUtils.is_scale_needed(values):
            # Remaining columns are treated as numeric.
            tags.append("scale")

        columns_info[col] = tags

    target_info = []
    if (
        machinelearning_task == BINARY_CLASSIFICATION
        and not PreprocessingUtils.is_0_1(y)
    ):
        target_info.append("convert_0_1")

    if machinelearning_task == REGRESSION:
        if PreprocessingUtils.is_log_scale_needed(y):
            target_info.append("scale_log")
        elif PreprocessingUtils.is_scale_needed(y):
            target_info.append("scale")

    num_class = (
        PreprocessingUtils.num_class(y)
        if machinelearning_task == MULTICLASS_CLASSIFICATION
        else None
    )

    return {
        "columns_info": columns_info,
        "target_info": target_info,
        "num_class": num_class,
    }
def fit(self, X, y):
    """Fit the wrapped model on ``X`` and ``y``.

    Categorical column positions are detected on the first call (when
    ``self.cat_features`` is ``None``) and reused afterwards.
    """
    if self.cat_features is None:
        self.cat_features = [
            position
            for position in range(X.shape[1])
            if PreprocessingUtils.is_categorical(X.iloc[:, position])
        ]

    # When the model already reports a tree count, it is handed over as the
    # initial model for this fit.
    starting_model = self.model if self.model.tree_count_ is not None else None

    self.model.fit(
        X,
        y,
        cat_features=self.cat_features,
        init_model=starting_model,
    )
def fit(
    self,
    X,
    y,
    sample_weight=None,
    X_validation=None,
    y_validation=None,
    sample_weight_validation=None,
    log_to_file=None,
    max_time=None,
):
    """Fit the CatBoost model once and record the best tree limit.

    Does nothing (apart from printing a notice) when ``self.is_fitted()``
    is true.

    Args:
        X: Training features; ``X.shape[1]`` and ``X.iloc`` are used.
        y: Training target.
        sample_weight: Optional training sample weights.
        X_validation: Optional validation features.
        y_validation: Optional validation target. A validation ``Pool`` is
            built only when both validation arguments are given.
        sample_weight_validation: Optional weights for the validation pool.
        log_to_file: Optional path or buffer receiving one CSV row per
            iteration (``iteration,train,validation``, no header).
        max_time: Forwarded to ``self._assess_iterations``.
    """
    if self.is_fitted():
        print("CatBoost model already fitted. Skip fit().")
        return

    if self.cat_features is None:
        self.cat_features = [
            i
            for i in range(X.shape[1])
            if PreprocessingUtils.is_categorical(X.iloc[:, i])
        ]

    eval_set = None
    if X_validation is not None and y_validation is not None:
        eval_set = Pool(
            data=X_validation,
            label=y_validation,
            cat_features=self.cat_features,
            weight=sample_weight_validation,
        )

    num_boost_round = self.params.get("num_boost_round")
    if num_boost_round is None:
        # No explicit round count: _assess_iterations supplies an initial
        # model and the number of iterations to run.
        model_init, new_iterations = self._assess_iterations(
            X, y, sample_weight, eval_set, max_time
        )
        self.model.set_params(iterations=new_iterations)
    else:
        model_init = None
        self.model.set_params(iterations=num_boost_round)
        self.early_stopping_rounds = self.params.get("early_stopping_rounds", 50)

    self.model.fit(
        X,
        y,
        sample_weight=sample_weight,
        cat_features=self.cat_features,
        init_model=model_init,
        eval_set=eval_set,
        early_stopping_rounds=self.early_stopping_rounds,
        verbose_eval=False,
    )

    if self.model.best_iteration_ is not None:
        self.best_ntree_limit = self.model.best_iteration_ + 1
        if model_init is not None:
            self.best_ntree_limit += model_init.tree_count_
    else:
        # just take all the trees
        # the warm-up trees are already included
        # dont need to add +1
        self.best_ntree_limit = self.model.tree_count_

    if log_to_file is None:
        return

    evals = self.model.evals_result_
    # Either section may be absent (e.g. no "validation" entry when no
    # evaluation set was used), so read both tolerantly.
    train_scores = evals.get("learn", {}).get(self.log_metric_name)
    validation_scores = evals.get("validation", {}).get(self.log_metric_name)

    if model_init is not None:
        # Prepend the scores of the initial model so the log covers
        # every tree.
        warmup = model_init.evals_result_
        if train_scores is not None:
            train_scores = warmup["learn"].get(self.log_metric_name) + train_scores
        if validation_scores is not None:
            validation_scores = (
                warmup["validation"].get(self.log_metric_name) + validation_scores
            )

    # The index follows whichever curve exists. The train branch used to
    # measure validation_scores, which fails when only train scores exist.
    if train_scores is not None:
        iteration = range(len(train_scores))
    elif validation_scores is not None:
        iteration = range(len(validation_scores))
    else:
        # Nothing was recorded for the logged metric; skip the log.
        return

    result = pd.DataFrame(
        {
            "iteration": iteration,
            "train": train_scores,
            "validation": validation_scores,
        }
    )
    result.to_csv(log_to_file, index=False, header=False)
def get(required_preprocessing, data, machinelearning_task):
    """Plan the preprocessing steps for the training data.

    Args:
        required_preprocessing: Container of step names; checked for
            ``"missing_values_inputation"``, ``"convert_categorical"`` and
            ``"scale"``.
        data: Mapping whose ``data["train"]["X"]`` and
            ``data["train"]["y"]`` hold the training features and target.
        machinelearning_task: Task constant compared against
            ``BINARY_CLASSIFICATION``, ``MULTICLASS_CLASSIFICATION`` and
            ``REGRESSION``.

    Returns:
        dict with ``"columns_preprocessing"`` (column -> list of steps,
        only for columns that need any) and ``"target_preprocessing"``
        (list of steps for the target).
    """
    train = data["train"]
    X, y = train["X"], train["y"]

    fill_missing = "missing_values_inputation" in required_preprocessing
    convert_categorical = "convert_categorical" in required_preprocessing
    scale = "scale" in required_preprocessing

    columns_preprocessing = {}
    for col in X.columns:
        values = X[col]
        missing = pd.isnull(values)

        # Drop columns that are entirely null or hold one distinct value.
        if missing.all() or len(np.unique(values[~missing])) == 1:
            columns_preprocessing[col] = ["remove_column"]
            continue

        steps = []
        if fill_missing and PreprocessingUtils.is_na(values):
            steps.append(PreprocessingMissingValues.FILL_NA_MEDIAN)

        # Integer conversion is applied to categorical columns only.
        to_integer = convert_categorical and PreprocessingUtils.is_categorical(values)
        if to_integer:
            steps.append(PreprocessingCategorical.CONVERT_INTEGER)

        # Converted categoricals are always scaled; other columns only
        # when the utility says scaling is needed.
        if scale and (to_integer or PreprocessingUtils.is_scale_needed(values)):
            steps.append(PreprocessingScale.SCALE_NORMAL)

        # Remember only the columns that need some preprocessing.
        if steps:
            columns_preprocessing[col] = steps

    # Missing target values are always excluded; they may occur in both
    # the train and the validation datasets.
    target_preprocessing = [PreprocessingMissingValues.NA_EXCLUDE]

    if (
        machinelearning_task == BINARY_CLASSIFICATION
        and not PreprocessingUtils.is_0_1(y)
    ):
        target_preprocessing.append(PreprocessingCategorical.CONVERT_INTEGER)

    if (
        machinelearning_task == MULTICLASS_CLASSIFICATION
        and PreprocessingUtils.is_categorical(y)
    ):
        target_preprocessing.append(PreprocessingCategorical.CONVERT_INTEGER)

    if machinelearning_task == REGRESSION:
        if PreprocessingUtils.is_log_scale_needed(y):
            target_preprocessing.append(PreprocessingScale.SCALE_LOG_AND_NORMAL)
        elif PreprocessingUtils.is_scale_needed(y):
            target_preprocessing.append(PreprocessingScale.SCALE_NORMAL)

    return {
        "columns_preprocessing": columns_preprocessing,
        "target_preprocessing": target_preprocessing,
    }
def fit(
    self,
    X,
    y,
    sample_weight=None,
    X_validation=None,
    y_validation=None,
    sample_weight_validation=None,
    log_to_file=None,
    max_time=None,
):
    """Fit the CatBoost model once and record the best tree limit.

    Does nothing (apart from printing a notice) when the model already
    reports a tree count.

    Args:
        X: Training features; ``X.shape[1]`` and ``X.iloc`` are used.
        y: Training target.
        sample_weight: Optional training sample weights.
        X_validation: Optional validation features.
        y_validation: Optional validation target. A validation ``Pool`` is
            built only when both validation arguments are given.
        sample_weight_validation: Optional weights for the validation pool.
        log_to_file: Optional path or buffer receiving one CSV row per
            iteration (``iteration,train,validation``, no header).
        max_time: Forwarded to ``self._assess_iterations``.
    """
    if self.model.tree_count_ is not None:
        print("CatBoost model already fitted. Skip fit().")
        return

    if self.cat_features is None:
        self.cat_features = [
            i
            for i in range(X.shape[1])
            if PreprocessingUtils.is_categorical(X.iloc[:, i])
        ]

    eval_set = None
    if X_validation is not None and y_validation is not None:
        eval_set = Pool(
            data=X_validation,
            label=y_validation,
            cat_features=self.cat_features,
            weight=sample_weight_validation,
        )

    # _assess_iterations supplies an initial model and the number of
    # iterations to run.
    model_init, new_iterations = self._assess_iterations(X, y, eval_set, max_time)
    self.model.set_params(iterations=new_iterations)

    self.model.fit(
        X,
        y,
        sample_weight=sample_weight,
        cat_features=self.cat_features,
        init_model=model_init,
        eval_set=eval_set,
        early_stopping_rounds=self.early_stopping_rounds,
        verbose_eval=False,
    )

    if self.model.best_iteration_ is not None:
        # NOTE(review): presumably warmup_iterations is the tree count of
        # the initial model - confirm against _assess_iterations.
        self.best_ntree_limit = (
            self.model.best_iteration_ + self.warmup_iterations + 1
        )
    else:
        # just take all the trees
        # the warm-up trees are already included
        # dont need to add +1
        self.best_ntree_limit = self.model.tree_count_

    if log_to_file is None:
        return

    evals = self.model.evals_result_
    metric_name = next(iter(evals["learn"]))
    train_scores = evals["learn"][metric_name]
    # Without an evaluation set there may be no "validation" entry; the
    # column is then written empty instead of raising KeyError.
    validation_scores = evals.get("validation", {}).get(metric_name)

    if model_init is not None:
        # Prepend the scores of the initial model so the log covers
        # every tree.
        warmup = model_init.evals_result_
        train_scores = warmup["learn"][metric_name] + train_scores
        if validation_scores is not None:
            validation_scores = warmup["validation"][metric_name] + validation_scores

    result = pd.DataFrame(
        {
            "iteration": range(len(train_scores)),
            "train": train_scores,
            "validation": validation_scores,
        }
    )
    result.to_csv(log_to_file, index=False, header=False)
def get(required_preprocessing, data, machinelearning_task):
    """Plan the preprocessing steps for the training data.

    Args:
        required_preprocessing: Container of step names; checked for
            ``"missing_values_inputation"``, ``"convert_categorical"``,
            ``"scale"``, ``"target_as_integer"``, ``"target_as_one_hot"``
            and ``"target_scale"``.
        data: Mapping whose ``data["train"]["X"]`` and
            ``data["train"]["y"]`` hold the training features and target.
        machinelearning_task: Task constant compared against
            ``BINARY_CLASSIFICATION``, ``MULTICLASS_CLASSIFICATION`` and
            ``REGRESSION``.

    Returns:
        dict with ``"columns_preprocessing"`` (column -> list of steps,
        only for columns that need any), ``"target_preprocessing"`` (list
        of steps for the target) and ``"ml_task"`` (the task passed in).
    """
    X = data["train"]["X"]
    y = data["train"]["y"]

    columns_preprocessing = {}
    for col in X.columns:
        values = X[col]
        missing = pd.isnull(values)

        # remove empty columns and columns with only one distinct value
        empty_column = bool(missing.all())
        constant_column = len(np.unique(values[~missing])) == 1
        if empty_column or constant_column:
            columns_preprocessing[col] = ["remove_column"]
            continue

        preprocessing_to_apply = []

        # always check for missing values
        if (
            "missing_values_inputation" in required_preprocessing
            and PreprocessingUtils.is_na(values)
        ):
            preprocessing_to_apply.append(PreprocessingMissingValues.FILL_NA_MEDIAN)

        # convert to integers only for categorical types
        convert_to_integer_will_be_applied = False
        if (
            "convert_categorical" in required_preprocessing
            and PreprocessingUtils.is_categorical(values)
        ):
            preprocessing_to_apply.append(PreprocessingCategorical.CONVERT_INTEGER)
            convert_to_integer_will_be_applied = True

        if "scale" in required_preprocessing:
            # Converted categoricals are always scaled; other columns only
            # when the utility says scaling is needed.
            if (
                convert_to_integer_will_be_applied
                or PreprocessingUtils.is_scale_needed(values)
            ):
                preprocessing_to_apply.append(Scale.SCALE_NORMAL)

        # remember which preprocessing we need to apply
        if preprocessing_to_apply:
            columns_preprocessing[col] = preprocessing_to_apply

    # always remove missing values from target,
    # target with missing values might be in the train and in the
    # validation datasets
    target_preprocessing = [PreprocessingMissingValues.NA_EXCLUDE]

    if "target_as_integer" in required_preprocessing:
        if machinelearning_task == BINARY_CLASSIFICATION:
            if not PreprocessingUtils.is_0_1(y):
                target_preprocessing.append(PreprocessingCategorical.CONVERT_INTEGER)
        if machinelearning_task == MULTICLASS_CLASSIFICATION:
            # always convert to integer, there can be many situations that
            # can break, for example classes starting from 1, or classes
            # not for every number (0, 2, 3, 4)
            target_preprocessing.append(PreprocessingCategorical.CONVERT_INTEGER)
    elif "target_as_one_hot" in required_preprocessing:
        target_preprocessing.append(PreprocessingCategorical.CONVERT_ONE_HOT)

    if (
        machinelearning_task == REGRESSION
        and "target_scale" in required_preprocessing
    ):
        if PreprocessingUtils.is_log_scale_needed(y):
            target_preprocessing.append(Scale.SCALE_LOG_AND_NORMAL)
        elif PreprocessingUtils.is_scale_needed(y):
            target_preprocessing.append(Scale.SCALE_NORMAL)

    return {
        "columns_preprocessing": columns_preprocessing,
        "target_preprocessing": target_preprocessing,
        "ml_task": machinelearning_task,
    }
def optimize(
    self,
    algorithm,
    data_type,
    X_train,
    y_train,
    sample_weight,
    X_validation,
    y_validation,
    sample_weight_validation,
    learner_params,
):
    """Tune hyperparameters for ``algorithm`` with Optuna and cache them.

    The incoming ``learner_params`` are returned untouched when
    ``data_type`` is not ``"original"`` or when ``algorithm`` has no
    objective defined here. Parameters already stored in ``self.tuning``
    for this data type and algorithm are reused without a new search.

    Args:
        algorithm: Algorithm name, e.g. ``"LightGBM"`` or ``"Xgboost"``.
        data_type: Only ``"original"`` is tuned.
        X_train: Training features; ``X_train.shape[1]`` and
            ``X_train.iloc`` are used.
        y_train: Training target.
        sample_weight: Training sample weights.
        X_validation: Validation features.
        y_validation: Validation target.
        sample_weight_validation: Validation sample weights.
        learner_params: Current learner parameters.

    Returns:
        The result of ``self.update_learner_params`` applied to the tuned
        (or cached) parameters, or ``learner_params`` itself when nothing
        is tuned.
    """
    # only tune models with original data type
    if data_type != "original":
        return learner_params

    key = f"{data_type}_{algorithm}"
    if key in self.tuning:
        return self.update_learner_params(learner_params, self.tuning[key])

    supported_algorithms = (
        "LightGBM",
        "Xgboost",
        "CatBoost",
        "Random Forest",
        "Extra Trees",
        "Nearest Neighbors",
        "Neural Network",
    )
    if algorithm not in supported_algorithms:
        # No objective exists for this algorithm. Previously this ended in
        # an unbound `objective` after a study had already been created.
        return learner_params

    if self.verbose:
        print(
            f"Optuna optimizes {algorithm} with time budget {self.time_budget} seconds "
            f"eval_metric {self.eval_metric.name} ({self.direction})"
        )

    self.cat_features_indices = [
        i
        for i in range(X_train.shape[1])
        if PreprocessingUtils.is_categorical(X_train.iloc[:, i])
    ]

    study = optuna.create_study(
        direction=self.direction,
        sampler=optuna.samplers.TPESampler(seed=self.random_state),
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=self.n_warmup_steps),
    )

    # Every objective takes the same leading and trailing positional
    # arguments; LightGBM and CatBoost also take the categorical indices
    # in between.
    data_args = (
        self.ml_task,
        X_train,
        y_train,
        sample_weight,
        X_validation,
        y_validation,
        sample_weight_validation,
        self.eval_metric,
    )
    run_args = (self.n_jobs, self.random_state)

    if algorithm == "LightGBM":
        objective = LightgbmObjective(
            *data_args, self.cat_features_indices, *run_args
        )
    elif algorithm == "Xgboost":
        objective = XgboostObjective(*data_args, *run_args)
    elif algorithm == "CatBoost":
        objective = CatBoostObjective(
            *data_args, self.cat_features_indices, *run_args
        )
    elif algorithm == "Random Forest":
        objective = RandomForestObjective(*data_args, *run_args)
    elif algorithm == "Extra Trees":
        objective = ExtraTreesObjective(*data_args, *run_args)
    elif algorithm == "Nearest Neighbors":
        objective = KNNObjective(*data_args, *run_args)
    else:  # "Neural Network"
        objective = NeuralNetworkObjective(*data_args, *run_args)

    study.optimize(objective, n_trials=5000, timeout=self.time_budget)

    self.plot_study(algorithm, data_type, study)

    joblib.dump(study, os.path.join(self.study_dir, key + ".joblib"))

    # Complete the best trial's parameters with the fixed settings taken
    # from the objective.
    best = study.best_params

    if algorithm == "LightGBM":
        best["metric"] = objective.eval_metric_name
        best["custom_eval_metric_name"] = objective.custom_eval_metric_name
        best["num_boost_round"] = objective.rounds
        best["early_stopping_rounds"] = objective.early_stopping_rounds
        best["cat_feature"] = self.cat_features_indices
        best["feature_pre_filter"] = False
        best["seed"] = objective.seed
    elif algorithm == "CatBoost":
        best["eval_metric"] = objective.eval_metric_name
        best["num_boost_round"] = objective.rounds
        best["early_stopping_rounds"] = objective.early_stopping_rounds
        best["seed"] = objective.seed
    elif algorithm == "Xgboost":
        best["objective"] = objective.objective
        best["eval_metric"] = objective.eval_metric_name
        best["max_rounds"] = objective.rounds
        best["early_stopping_rounds"] = objective.early_stopping_rounds
        best["seed"] = objective.seed
    elif algorithm in ("Extra Trees", "Random Forest"):
        # Neither Extra Trees nor Random Forest uses early stopping.
        best["max_steps"] = objective.max_steps  # each step has 100 trees
        best["seed"] = objective.seed
        best["eval_metric_name"] = self.eval_metric.name
    elif algorithm == "Nearest Neighbors":
        best["rows_limit"] = 100000
    # "Neural Network" needs no extra parameters.

    self.tuning[key] = best
    self.save()

    return self.update_learner_params(learner_params, best)