Example #1
    def save(self, results_path, model_subpath):
        start_time = time.time()
        model_path = os.path.join(results_path, model_subpath)
        logger.info(f"Save the model {model_path}")

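        # with k-fold validation the stored predictions are out-of-folds; otherwise they come from the validation split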
        type_of_predictions = (
            "validation" if "k_folds" not in self.validation_params else "out_of_folds"
        )
        predictions_fname = os.path.join(
            model_subpath, f"predictions_{type_of_predictions}.csv"
        )
        self._oof_predictions_fname = os.path.join(results_path, predictions_fname)
        predictions = self.get_out_of_folds()
        predictions.to_csv(self._oof_predictions_fname, index=False)

        saved = [os.path.join(model_subpath, l.get_fname()) for l in self.learners]

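        # describe the whole framework (preprocessing, learners, params, metrics) in framework.json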
        with open(os.path.join(model_path, "framework.json"), "w") as fout:
            preprocessing = [p.to_json() for p in self.preprocessings]
            learners_params = [learner.get_params() for learner in self.learners]
            desc = {
                "uid": self.uid,
                "name": self._name,
                "preprocessing": preprocessing,
                "learners": learners_params,
                "params": self.params,
                "saved": saved,
                "predictions_fname": predictions_fname,
                "metric_name": self.get_metric_name(),
                "final_loss": self.get_final_loss(),
                "train_time": self.get_train_time(),
                "is_stacked": self._is_stacked,
            }
            if self._threshold is not None:
                desc["threshold"] = self._threshold
            fout.write(json.dumps(desc, indent=4))

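        # fall back to the framework-level metric when the learner does not report its own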
        learning_curve_metric = self.learners[0].get_metric_name()
        if learning_curve_metric is None:
            learning_curve_metric = self.get_metric_name()

        LearningCurves.plot(
            [l.name for l in self.learners],
            learning_curve_metric,
            model_path,
            trees_in_iteration=self.additional_params.get("trees_in_step"),
        )

        # call additional metrics just to be sure they are computed
        self._additional_metrics = self.get_additional_metrics()

        AdditionalMetrics.save(
            self._additional_metrics, self._ml_task, self.model_markdown(), model_path
        )

        with open(os.path.join(model_path, "status.txt"), "w") as fout:
            fout.write("ALL OK!")
        # add the save time to the total train time;
        # save is always called right after training
        self.train_time += time.time() - start_time
Example #2
    def save(self, model_path):
        logger.info(f"Save the ensemble to {model_path}")

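        # store the ensemble composition (selected models with repeat counts) in ensemble.json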
        with open(os.path.join(model_path, "ensemble.json"), "w") as fout:
            ms = []
            for selected in self.selected_models:
                ms += [{"model": selected["model"]._name, "repeat": selected["repeat"]}]

            desc = {
                "name": self._name,
                "ml_task": self._ml_task,
                "optimize_metric": self._optimize_metric,
                "selected_models": ms,
            }

            if self._threshold is not None:
                desc["threshold"] = self._threshold
            fout.write(json.dumps(desc, indent=4))

        predictions = self.get_out_of_folds()
        predictions.to_csv(
            os.path.join(model_path, f"predictions_ensemble.csv"), index=False
        )

        LearningCurves.plot_for_ensemble(self._scores, self.metric.name, model_path)

        self._additional_metrics = self.get_additional_metrics()

        AdditionalMetrics.save(
            self._additional_metrics, self._ml_task, self.model_markdown(), model_path
        )

        with open(os.path.join(model_path, "status.txt"), "w") as fout:
            fout.write("ALL OK!")
Example #3
    def save(self, model_path):
        start_time = time.time()
        logger.info(f"Save the model {model_path}")

        type_of_predictions = ("validation"
                               if "k_folds" not in self.validation_params else
                               "out_of_folds")
        predictions_fname = os.path.join(
            model_path, f"predictions_{type_of_predictions}.csv")
        predictions = self.get_out_of_folds()
        predictions.to_csv(predictions_fname, index=False)

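        # save each learner to a numbered file and record its path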
        saved = []
        for i, l in enumerate(self.learners):
            p = os.path.join(model_path, f"learner_{i+1}.{l.file_extension()}")
            l.save(p)
            saved += [p]

        with open(os.path.join(model_path, "framework.json"), "w") as fout:
            preprocessing = [p.to_json() for p in self.preprocessings]
            learners_params = [
                learner.get_params() for learner in self.learners
            ]
            desc = {
                "uid": self.uid,
                "name": self._name,
                "preprocessing": preprocessing,
                "learners": learners_params,
                "params": self.params,
                "saved": saved,
                "predictions_fname": predictions_fname,
                "metric_name": self.get_metric_name(),
                "final_loss": self.get_final_loss(),
                "train_time": self.get_train_time(),
                "is_stacked": self._is_stacked,
            }
            if self._threshold is not None:
                desc["threshold"] = self._threshold
            fout.write(json.dumps(desc, indent=4))

        LearningCurves.plot(
            self.validation.get_n_splits(),
            self.get_metric_name(),
            model_path,
            trees_in_iteration=self.additional_params.get("trees_in_step"),
        )

        self._additional_metrics = self.get_additional_metrics()

        AdditionalMetrics.save(self._additional_metrics, self._ml_task,
                               self.model_markdown(), model_path)

        with open(os.path.join(model_path, "status.txt"), "w") as fout:
            fout.write("ALL OK!")
        # add the save time to the total train time;
        # save is always called right after training
        self.train_time += time.time() - start_time
Example #4
    def get_additional_metrics(self):

        if self._additional_metrics is None:
            # 'target' - the target after preprocessing, used for model training
            # 'prediction' - the model's out-of-folds predictions
            oof_predictions = self.get_out_of_folds()
            prediction_cols = [
                c for c in oof_predictions.columns if "prediction" in c
            ]
            target_cols = [c for c in oof_predictions.columns if "target" in c]

            target = oof_predictions[target_cols]

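            # for multiclass, map the predicted values back to the original class labels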
            oof_preds = None
            if self._ml_task == MULTICLASS_CLASSIFICATION:
                oof_preds = self.preprocessings[0].prepare_target_labels(
                    oof_predictions[prediction_cols].values)
            else:
                oof_preds = oof_predictions[prediction_cols]

            sample_weight = None
            if "sample_weight" in oof_predictions.columns:
                sample_weight = oof_predictions["sample_weight"]

            self._additional_metrics = AdditionalMetrics.compute(
                target, oof_preds, sample_weight, self._ml_task)
            if self._ml_task == BINARY_CLASSIFICATION:
                self._threshold = float(self._additional_metrics["threshold"])
        return self._additional_metrics
Example #5
 def test_compute_for_regression(self):
     target = np.array([0, 0, 0, 0, 1, 1, 1, 1])
     pred = np.array([0.01, 0.2, 0.1, 0.1, 0.8, 0.8, 0.8, 0.8])
     info = AdditionalMetrics.compute(target, pred, None, REGRESSION)
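     # the report should list MAE, MSE, RMSE and R2 among the computed metrics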
     all_metrics = list(info["max_metrics"]["Metric"].values)
     for m in ["MAE", "MSE", "RMSE", "R2"]:
         self.assertTrue(m in all_metrics)
Example #6
    def get_additional_metrics(self):
        if self._additional_metrics is None:
            logger.debug("Get additional metrics for Ensemble")
            # 'target' - the target after preprocessing, used for model training
            # 'prediction' - the model's out-of-folds predictions
            oof_predictions = self.get_out_of_folds()
            prediction_cols = [
                c for c in oof_predictions.columns if "prediction" in c
            ]
            target_cols = [c for c in oof_predictions.columns if "target" in c]

            oof_preds = oof_predictions[prediction_cols]
            if self._ml_task == MULTICLASS_CLASSIFICATION:
                cols = oof_preds.columns.tolist()
                # strip the "prediction_" prefix (11 characters) from the column names
                labels = {i: v[11:] for i, v in enumerate(cols)}

                oof_preds["label"] = np.argmax(
                    np.array(oof_preds[prediction_cols]), axis=1
                )
                oof_preds["label"] = oof_preds["label"].map(labels)

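            # use per-row sample weights if they were stored with the OOF predictions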
            sample_weight = None
            if "sample_weight" in oof_predictions.columns:
                sample_weight = oof_predictions["sample_weight"]

            self._additional_metrics = AdditionalMetrics.compute(
                oof_predictions[target_cols], oof_preds, sample_weight,
                self._ml_task)
            if self._ml_task == BINARY_CLASSIFICATION:
                self._threshold = float(self._additional_metrics["threshold"])

        return self._additional_metrics
Example #7
    def save(self, results_path, model_subpath):
        model_path = os.path.join(results_path, model_subpath)
        logger.info(f"Save the ensemble to {model_path}")

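        # out-of-folds predictions are stored relative to the results path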
        predictions = self.get_out_of_folds()
        predictions_fname = os.path.join(model_subpath, "predictions_ensemble.csv")
        self._oof_predictions_fname = os.path.join(results_path,
                                                   predictions_fname)
        predictions.to_csv(self._oof_predictions_fname, index=False)

        with open(os.path.join(model_path, "ensemble.json"), "w") as fout:
            ms = []
            for selected in self.selected_models:
                ms += [{
                    "model": selected["model"]._name,
                    "repeat": selected["repeat"]
                }]

            desc = {
                "name": self._name,
                "ml_task": self._ml_task,
                "optimize_metric": self._optimize_metric,
                "selected_models": ms,
                "predictions_fname": predictions_fname,
                "metric_name": self.get_metric_name(),
                "final_loss": self.get_final_loss(),
                "train_time": self.get_train_time(),
                "is_stacked": self._is_stacked,
            }

            if self._threshold is not None:
                desc["threshold"] = self._threshold
            fout.write(json.dumps(desc, indent=4))

        LearningCurves.plot_for_ensemble(self._scores, self.metric.name,
                                         model_path)

        # call additional metrics just to be sure they are computed
        self._additional_metrics = self.get_additional_metrics()

        AdditionalMetrics.save(self._additional_metrics, self._ml_task,
                               self.model_markdown(), model_path)

        with open(os.path.join(model_path, "status.txt"), "w") as fout:
            fout.write("ALL OK!")
Example #8
    def save(self, model_path):
        logger.info(f"Save the model {model_path}")

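        # save each learner to its own file and record the path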
        saved = []
        for i, l in enumerate(self.learners):
            p = os.path.join(model_path,
                             f"learner_{i+1}.{l.file_extenstion()}")
            l.save(p)
            saved += [p]

        with open(os.path.join(model_path, "framework.json"), "w") as fout:
            preprocessing = [p.to_json() for p in self.preprocessings]
            learners_params = [
                learner.get_params() for learner in self.learners
            ]
            desc = {
                "uid": self.uid,
                "name": self._name,
                "preprocessing": preprocessing,
                "learners": learners_params,
                "params": self.params,
                "saved": saved,
            }
            if self._threshold is not None:
                desc["threshold"] = self._threshold
            fout.write(json.dumps(desc, indent=4))

        type_of_predictions = ("validation"
                               if "k_folds" not in self.validation_params else
                               "out_of_folds")
        predictions = self.get_out_of_folds()
        predictions.to_csv(
            os.path.join(model_path, f"predictions_{type_of_predictions}.csv"),
            index=False,
        )

        LearningCurves.plot(self.validation.get_n_splits(),
                            self.get_metric_name(), model_path)

        self._additional_metrics = self.get_additional_metrics()

        AdditionalMetrics.save(self._additional_metrics, self._ml_task,
                               self.model_markdown(), model_path)

        with open(os.path.join(model_path, "status.txt"), "w") as fout:
            fout.write("ALL OK!")
 def test_compute_constant_preds(self):
     target = np.array([0, 0, 1, 1, 0, 0, 0, 0])
     pred = np.array([0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1])
     info = AdditionalMetrics.compute(target, pred, BINARY_CLASSIFICATION)
     details = info["metric_details"]
     max_metrics = info["max_metrics"]
     conf = info["confusion_matrix"]
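     # constant predictions cannot separate the classes, so F1 and MCC stay below 1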
     self.assertTrue(max_metrics["f1"]["score"] < 1)
     self.assertTrue(max_metrics["mcc"]["score"] < 1)
Example #10
 def test_compute_f1(self):
     target = np.array([0, 0, 0, 0, 1, 1, 1, 1])
     pred = np.array([0.01, 0.2, 0.1, 0.1, 0.8, 0.8, 0.8, 0.8])
     info = AdditionalMetrics.compute(target, pred, BINARY_CLASSIFICATION)
     details = info["metric_details"]
     max_metrics = info["max_metrics"]
     conf = info["confusion_matrix"]
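     # the predictions separate the classes perfectly, so the best achievable F1 is 1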
     self.assertEqual(max_metrics["f1"]["score"], 1)
     self.assertTrue(details is not None)
     self.assertTrue(conf is not None)
Example #11
 def test_compute(self):
     target = np.array([0, 0, 0, 0, 1, 1, 1, 1])
     pred = np.array([0.1, 0.8, 0.1, 0.1, 0.8, 0.1, 0.8, 0.8])
     info = AdditionalMetrics.compute(target, pred, BINARY_CLASSIFICATION)
     details = info["metric_details"]
     max_metrics = info["max_metrics"]
     conf = info["confusion_matrix"]
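     # one misclassified sample per class: 3 true negatives and 3 true positives on the diagonal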
     self.assertEqual(conf.iloc[0, 0], 3)
     self.assertEqual(conf.iloc[1, 1], 3)
     self.assertTrue(details is not None)
     self.assertTrue(max_metrics is not None)
Example #12
    def get_additional_metrics(self):
        if self._additional_metrics is None:
            logger.debug("Get additional metrics for Ensemble")
            # 'target' - the target after preprocessing, used for model training
            # 'prediction' - the model's out-of-folds predictions
            oof_predictions = self.get_out_of_folds()
            prediction_cols = [
                c for c in oof_predictions.columns if "prediction" in c
            ]
            target_cols = [c for c in oof_predictions.columns if "target" in c]

            logger.debug(oof_predictions)
            logger.debug(prediction_cols)
            logger.debug(target_cols)

            # for multiclass, the class label has to be recovered from the prediction columns
            oof_preds = oof_predictions[prediction_cols]
            if self._ml_task == MULTICLASS_CLASSIFICATION:
                cols = oof_preds.columns.tolist()
                # strip the "prediction_" prefix (11 characters) from the column names
                labels = {i: v[11:] for i, v in enumerate(cols)}

                oof_preds["label"] = np.argmax(
                    np.array(oof_preds[prediction_cols]), axis=1
                )
                oof_preds["label"] = oof_preds["label"].map(labels)

            self._additional_metrics = AdditionalMetrics.compute(
                oof_predictions[target_cols], oof_preds, self._ml_task
            )
            if self._ml_task == BINARY_CLASSIFICATION:
                self._threshold = float(self._additional_metrics["threshold"])
                print(self._additional_metrics["max_metrics"])
                print(self._threshold)
        return self._additional_metrics