def save(self, results_path, model_subpath):
    start_time = time.time()
    model_path = os.path.join(results_path, model_subpath)
    logger.info(f"Save the model {model_path}")

    type_of_predictions = (
        "validation" if "k_folds" not in self.validation_params else "out_of_folds"
    )
    predictions_fname = os.path.join(
        model_subpath, f"predictions_{type_of_predictions}.csv"
    )
    self._oof_predictions_fname = os.path.join(results_path, predictions_fname)
    predictions = self.get_out_of_folds()
    predictions.to_csv(self._oof_predictions_fname, index=False)

    saved = [os.path.join(model_subpath, l.get_fname()) for l in self.learners]

    with open(os.path.join(model_path, "framework.json"), "w") as fout:
        preprocessing = [p.to_json() for p in self.preprocessings]
        learners_params = [learner.get_params() for learner in self.learners]
        desc = {
            "uid": self.uid,
            "name": self._name,
            "preprocessing": preprocessing,
            "learners": learners_params,
            "params": self.params,
            "saved": saved,
            "predictions_fname": predictions_fname,
            "metric_name": self.get_metric_name(),
            "final_loss": self.get_final_loss(),
            "train_time": self.get_train_time(),
            "is_stacked": self._is_stacked,
        }
        if self._threshold is not None:
            desc["threshold"] = self._threshold
        fout.write(json.dumps(desc, indent=4))

    learning_curve_metric = self.learners[0].get_metric_name()
    if learning_curve_metric is None:
        learning_curve_metric = self.get_metric_name()

    LearningCurves.plot(
        [l.name for l in self.learners],
        learning_curve_metric,
        model_path,
        trees_in_iteration=self.additional_params.get("trees_in_step"),
    )

    # call additional metrics just to be sure they are computed
    self._additional_metrics = self.get_additional_metrics()

    AdditionalMetrics.save(
        self._additional_metrics, self._ml_task, self.model_markdown(), model_path
    )

    with open(os.path.join(model_path, "status.txt"), "w") as fout:
        fout.write("ALL OK!")
    # add the save time to the total train time,
    # because save is always called right after training
    self.train_time += time.time() - start_time
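# --- Hedged usage sketch (not from the source) -------------------------------
# Illustrates how the artifacts written by the save() above could be read back.
# The directory names ("AutoML_1", "1_DecisionTree") are hypothetical; the
# "framework.json" keys and the predictions CSV path mirror the `desc` dict and
# `predictions_fname` built in save().
import json
import os

import pandas as pd

results_path, model_subpath = "AutoML_1", "1_DecisionTree"  # hypothetical paths
model_path = os.path.join(results_path, model_subpath)

with open(os.path.join(model_path, "framework.json")) as fin:
    desc = json.load(fin)

# out-of-folds (or validation) predictions stored relative to results_path
oof = pd.read_csv(os.path.join(results_path, desc["predictions_fname"]))
print(desc["name"], desc["metric_name"], desc["final_loss"], oof.shape)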
def save(self, model_path):
    logger.info(f"Save the ensemble to {model_path}")

    with open(os.path.join(model_path, "ensemble.json"), "w") as fout:
        ms = []
        for selected in self.selected_models:
            ms += [{"model": selected["model"]._name, "repeat": selected["repeat"]}]

        desc = {
            "name": self._name,
            "ml_task": self._ml_task,
            "optimize_metric": self._optimize_metric,
            "selected_models": ms,
        }
        if self._threshold is not None:
            desc["threshold"] = self._threshold
        fout.write(json.dumps(desc, indent=4))

    predictions = self.get_out_of_folds()
    predictions.to_csv(
        os.path.join(model_path, "predictions_ensemble.csv"), index=False
    )

    LearningCurves.plot_for_ensemble(self._scores, self.metric.name, model_path)

    self._additional_metrics = self.get_additional_metrics()

    AdditionalMetrics.save(
        self._additional_metrics, self._ml_task, self.model_markdown(), model_path
    )

    with open(os.path.join(model_path, "status.txt"), "w") as fout:
        fout.write("ALL OK!")
def save(self, model_path):
    start_time = time.time()
    logger.info(f"Save the model {model_path}")

    type_of_predictions = (
        "validation" if "k_folds" not in self.validation_params else "out_of_folds"
    )
    predictions_fname = os.path.join(
        model_path, f"predictions_{type_of_predictions}.csv"
    )
    predictions = self.get_out_of_folds()
    predictions.to_csv(predictions_fname, index=False)

    saved = []
    for i, l in enumerate(self.learners):
        p = os.path.join(model_path, f"learner_{i+1}.{l.file_extension()}")
        # l.save(p)
        saved += [p]

    with open(os.path.join(model_path, "framework.json"), "w") as fout:
        preprocessing = [p.to_json() for p in self.preprocessings]
        learners_params = [learner.get_params() for learner in self.learners]
        desc = {
            "uid": self.uid,
            "name": self._name,
            "preprocessing": preprocessing,
            "learners": learners_params,
            "params": self.params,
            "saved": saved,
            "predictions_fname": predictions_fname,
            "metric_name": self.get_metric_name(),
            "final_loss": self.get_final_loss(),
            "train_time": self.get_train_time(),
            "is_stacked": self._is_stacked,
        }
        if self._threshold is not None:
            desc["threshold"] = self._threshold
        fout.write(json.dumps(desc, indent=4))

    LearningCurves.plot(
        self.validation.get_n_splits(),
        self.get_metric_name(),
        model_path,
        trees_in_iteration=self.additional_params.get("trees_in_step"),
    )

    self._additional_metrics = self.get_additional_metrics()

    AdditionalMetrics.save(
        self._additional_metrics, self._ml_task, self.model_markdown(), model_path
    )

    with open(os.path.join(model_path, "status.txt"), "w") as fout:
        fout.write("ALL OK!")
    # add the save time to the total train time,
    # because save is always called right after training
    self.train_time += time.time() - start_time
def get_additional_metrics(self):
    if self._additional_metrics is None:
        # 'target' - the target after processing used for model training
        # 'prediction' - out of folds predictions of the model
        oof_predictions = self.get_out_of_folds()
        prediction_cols = [c for c in oof_predictions.columns if "prediction" in c]
        target_cols = [c for c in oof_predictions.columns if "target" in c]

        target = oof_predictions[target_cols]

        oof_preds = None
        if self._ml_task == MULTICLASS_CLASSIFICATION:
            oof_preds = self.preprocessings[0].prepare_target_labels(
                oof_predictions[prediction_cols].values
            )
        else:
            oof_preds = oof_predictions[prediction_cols]

        sample_weight = None
        if "sample_weight" in oof_predictions.columns:
            sample_weight = oof_predictions["sample_weight"]

        self._additional_metrics = AdditionalMetrics.compute(
            target, oof_preds, sample_weight, self._ml_task
        )
        if self._ml_task == BINARY_CLASSIFICATION:
            self._threshold = float(self._additional_metrics["threshold"])
    return self._additional_metrics
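# --- Hedged illustration (assumption, not from the source) --------------------
# Shows the out-of-folds column convention that get_additional_metrics() above
# relies on: get_out_of_folds() is expected to return "prediction*" columns,
# "target*" columns and an optional "sample_weight" column. The frame below is
# made-up data used only to show how the column selection works.
import pandas as pd

oof_predictions = pd.DataFrame(
    {
        "target": [0, 1, 1, 0],
        "prediction": [0.2, 0.8, 0.7, 0.3],
        "sample_weight": [1.0, 1.0, 2.0, 1.0],
    }
)
prediction_cols = [c for c in oof_predictions.columns if "prediction" in c]
target_cols = [c for c in oof_predictions.columns if "target" in c]
print(prediction_cols, target_cols)  # ['prediction'] ['target']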
def test_compute_for_regression(self):
    target = np.array([0, 0, 0, 0, 1, 1, 1, 1])
    pred = np.array([0.01, 0.2, 0.1, 0.1, 0.8, 0.8, 0.8, 0.8])
    info = AdditionalMetrics.compute(target, pred, None, REGRESSION)
    all_metrics = list(info["max_metrics"]["Metric"].values)
    for m in ["MAE", "MSE", "RMSE", "R2"]:
        self.assertTrue(m in all_metrics)
def get_additional_metrics(self):
    if self._additional_metrics is None:
        logger.debug("Get additional metrics for Ensemble")
        # 'target' - the target after processing used for model training
        # 'prediction' - out of folds predictions of the model
        oof_predictions = self.get_out_of_folds()
        prediction_cols = [c for c in oof_predictions.columns if "prediction" in c]
        target_cols = [c for c in oof_predictions.columns if "target" in c]

        oof_preds = oof_predictions[prediction_cols]
        if self._ml_task == MULTICLASS_CLASSIFICATION:
            cols = oof_preds.columns.tolist()
            # columns are named "prediction_<label>", strip the 11-char prefix
            labels = {i: v[11:] for i, v in enumerate(cols)}
            oof_preds["label"] = np.argmax(
                np.array(oof_preds[prediction_cols]), axis=1
            )
            oof_preds["label"] = oof_preds["label"].map(labels)

        sample_weight = None
        if "sample_weight" in oof_predictions.columns:
            sample_weight = oof_predictions["sample_weight"]

        self._additional_metrics = AdditionalMetrics.compute(
            oof_predictions[target_cols], oof_preds, sample_weight, self._ml_task
        )
        if self._ml_task == BINARY_CLASSIFICATION:
            self._threshold = float(self._additional_metrics["threshold"])
    return self._additional_metrics
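# --- Hedged illustration (assumption, not from the source) --------------------
# Demonstrates the 11-character slice used in get_additional_metrics() above:
# the prediction columns are assumed to be named "prediction_<label>", so
# v[11:] strips the "prediction_" prefix and the argmax index is mapped back
# to the class name. The data below is invented for the example.
import numpy as np
import pandas as pd

oof_preds = pd.DataFrame(
    {"prediction_cat": [0.7, 0.1], "prediction_dog": [0.3, 0.9]}
)
cols = oof_preds.columns.tolist()
labels = {i: v[11:] for i, v in enumerate(cols)}  # len("prediction_") == 11
oof_preds["label"] = np.argmax(oof_preds[cols].values, axis=1)
oof_preds["label"] = oof_preds["label"].map(labels)
print(oof_preds["label"].tolist())  # ['cat', 'dog']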
def save(self, results_path, model_subpath):
    model_path = os.path.join(results_path, model_subpath)
    logger.info(f"Save the ensemble to {model_path}")

    predictions = self.get_out_of_folds()
    predictions_fname = os.path.join(model_subpath, "predictions_ensemble.csv")
    self._oof_predictions_fname = os.path.join(results_path, predictions_fname)
    predictions.to_csv(self._oof_predictions_fname, index=False)

    with open(os.path.join(model_path, "ensemble.json"), "w") as fout:
        ms = []
        for selected in self.selected_models:
            ms += [{"model": selected["model"]._name, "repeat": selected["repeat"]}]

        desc = {
            "name": self._name,
            "ml_task": self._ml_task,
            "optimize_metric": self._optimize_metric,
            "selected_models": ms,
            "predictions_fname": predictions_fname,
            "metric_name": self.get_metric_name(),
            "final_loss": self.get_final_loss(),
            "train_time": self.get_train_time(),
            "is_stacked": self._is_stacked,
        }
        if self._threshold is not None:
            desc["threshold"] = self._threshold
        fout.write(json.dumps(desc, indent=4))

    LearningCurves.plot_for_ensemble(self._scores, self.metric.name, model_path)

    # call additional metrics just to be sure they are computed
    self._additional_metrics = self.get_additional_metrics()

    AdditionalMetrics.save(
        self._additional_metrics, self._ml_task, self.model_markdown(), model_path
    )

    with open(os.path.join(model_path, "status.txt"), "w") as fout:
        fout.write("ALL OK!")
def save(self, model_path):
    logger.info(f"Save the model {model_path}")

    saved = []
    for i, l in enumerate(self.learners):
        p = os.path.join(model_path, f"learner_{i+1}.{l.file_extension()}")
        l.save(p)
        saved += [p]

    with open(os.path.join(model_path, "framework.json"), "w") as fout:
        preprocessing = [p.to_json() for p in self.preprocessings]
        learners_params = [learner.get_params() for learner in self.learners]
        desc = {
            "uid": self.uid,
            "name": self._name,
            "preprocessing": preprocessing,
            "learners": learners_params,
            "params": self.params,
            "saved": saved,
        }
        if self._threshold is not None:
            desc["threshold"] = self._threshold
        fout.write(json.dumps(desc, indent=4))

    type_of_predictions = (
        "validation" if "k_folds" not in self.validation_params else "out_of_folds"
    )
    predictions = self.get_out_of_folds()
    predictions.to_csv(
        os.path.join(model_path, f"predictions_{type_of_predictions}.csv"),
        index=False,
    )

    LearningCurves.plot(
        self.validation.get_n_splits(), self.get_metric_name(), model_path
    )

    self._additional_metrics = self.get_additional_metrics()

    AdditionalMetrics.save(
        self._additional_metrics, self._ml_task, self.model_markdown(), model_path
    )

    with open(os.path.join(model_path, "status.txt"), "w") as fout:
        fout.write("ALL OK!")
def test_compute_constant_preds(self):
    target = np.array([0, 0, 1, 1, 0, 0, 0, 0])
    pred = np.array([0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1])
    info = AdditionalMetrics.compute(target, pred, BINARY_CLASSIFICATION)
    details = info["metric_details"]
    max_metrics = info["max_metrics"]
    conf = info["confusion_matrix"]
    self.assertTrue(max_metrics["f1"]["score"] < 1)
    self.assertTrue(max_metrics["mcc"]["score"] < 1)
def test_compute_f1(self):
    target = np.array([0, 0, 0, 0, 1, 1, 1, 1])
    pred = np.array([0.01, 0.2, 0.1, 0.1, 0.8, 0.8, 0.8, 0.8])
    info = AdditionalMetrics.compute(target, pred, BINARY_CLASSIFICATION)
    details = info["metric_details"]
    max_metrics = info["max_metrics"]
    conf = info["confusion_matrix"]
    self.assertEqual(max_metrics["f1"]["score"], 1)
    self.assertTrue(details is not None)
    self.assertTrue(conf is not None)
def test_compute(self):
    target = np.array([0, 0, 0, 0, 1, 1, 1, 1])
    pred = np.array([0.1, 0.8, 0.1, 0.1, 0.8, 0.1, 0.8, 0.8])
    info = AdditionalMetrics.compute(target, pred, BINARY_CLASSIFICATION)
    details = info["metric_details"]
    max_metrics = info["max_metrics"]
    conf = info["confusion_matrix"]
    self.assertEqual(conf.iloc[0, 0], 3)
    self.assertEqual(conf.iloc[1, 1], 3)
    self.assertTrue(details is not None)
    self.assertTrue(max_metrics is not None)
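# --- Hedged usage sketch (assumption, not from the source) --------------------
# Rough illustration of using the binary-classification result: the
# get_additional_metrics() methods above read a "threshold" entry from the
# dict returned by AdditionalMetrics.compute(), so it is assumed here that the
# same entry is available and can turn probabilities into labels. Whether the
# library uses a strict or non-strict comparison is not confirmed; the names
# AdditionalMetrics and BINARY_CLASSIFICATION are assumed to be imported as in
# the tests above.
import numpy as np

target = np.array([0, 0, 0, 0, 1, 1, 1, 1])
pred = np.array([0.1, 0.8, 0.1, 0.1, 0.8, 0.1, 0.8, 0.8])
info = AdditionalMetrics.compute(target, pred, BINARY_CLASSIFICATION)

threshold = float(info["threshold"])       # assumed entry, see note above
labels = (pred > threshold).astype(int)    # comparison direction is an assumption
print(threshold, labels)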
def get_additional_metrics(self):
    if self._additional_metrics is None:
        logger.debug("Get additional metrics for Ensemble")
        # 'target' - the target after processing used for model training
        # 'prediction' - out of folds predictions of the model
        oof_predictions = self.get_out_of_folds()
        prediction_cols = [c for c in oof_predictions.columns if "prediction" in c]
        target_cols = [c for c in oof_predictions.columns if "target" in c]

        # prepare the label for multiclass: map the argmax over the
        # "prediction_<label>" columns back to the class name
        oof_preds = oof_predictions[prediction_cols]
        if self._ml_task == MULTICLASS_CLASSIFICATION:
            cols = oof_preds.columns.tolist()
            labels = {i: v[11:] for i, v in enumerate(cols)}  # strip "prediction_"
            oof_preds["label"] = np.argmax(
                np.array(oof_preds[prediction_cols]), axis=1
            )
            oof_preds["label"] = oof_preds["label"].map(labels)

        self._additional_metrics = AdditionalMetrics.compute(
            oof_predictions[target_cols], oof_preds, self._ml_task
        )
        if self._ml_task == BINARY_CLASSIFICATION:
            self._threshold = float(self._additional_metrics["threshold"])
    return self._additional_metrics