예제 #1
0
    def load(model_path):
        logger.info(f"Loading model framework from {model_path}")

        json_desc = json.load(open(os.path.join(model_path, "framework.json")))
        mf = ModelFramework(json_desc["params"])
        mf.uid = json_desc.get("uid", mf.uid)
        mf._name = json_desc.get("name", mf._name)
        mf._threshold = json_desc.get("threshold")
        mf.train_time = json_desc.get("train_time", mf.train_time)
        mf.final_loss = json_desc.get("final_loss", mf.final_loss)
        mf.metric_name = json_desc.get("metric_name", mf.metric_name)
        mf._is_stacked = json_desc.get("is_stacked", mf._is_stacked)
        predictions_fname = json_desc.get("predictions_fname")
        if predictions_fname is not None:
            mf.oof_predictions = pd.read_csv(predictions_fname)

        mf.learners = []
        for learner_desc, learner_path in zip(json_desc.get("learners"),
                                              json_desc.get("saved")):

            l = AlgorithmFactory.load(learner_desc, learner_path)
            mf.learners += [l]

        mf.preprocessings = []
        for p in json_desc.get("preprocessing"):
            ps = Preprocessing()
            ps.from_json(p)
            mf.preprocessings += [ps]

        return mf
    def test_empty_column(self):
        # training data
        d = {
            "col1": [np.nan, np.nan, np.nan, np.nan],
            "col2": [5, 6, 7, 0],
            "col3": [1, 1, 1, 3],
            "col4": [2, 2, 4, 3],
            "y": [0, 1, 0, 1],
        }
        df = pd.DataFrame(data=d)
        X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
        y_train = df.loc[:, "y"]

        preprocessing_params = {"columns_preprocessing": {"col1": ["remove_column"]}}

        ps = Preprocessing(preprocessing_params)
        X_train1, _ = ps.fit_and_transform(X_train, y_train)

        self.assertTrue("col1" not in X_train1.columns)
        self.assertEqual(3, len(X_train1.columns))
        X_train2, _ = ps.transform(X_train, y_train)
        self.assertTrue("col1" not in X_train2.columns)
        self.assertEqual(3, len(X_train2.columns))
        for col in ["col2", "col3", "col4"]:
            self.assertTrue(col in X_train2.columns)

        params_json = ps.to_json()
        ps2 = Preprocessing()
        ps2.from_json(params_json)

        X_train3, _ = ps2.transform(X_train, y_train)
        self.assertTrue("col1" not in X_train3.columns)
        self.assertEqual(3, len(X_train3.columns))
        for col in ["col2", "col3", "col4"]:
            self.assertTrue(col in X_train3.columns)
    def test_to_and_from_json_run_fill_median_convert_integer(self):
        # training data
        d = {
            "col1": [1, 1, np.nan, 3],
            "col2": ["a", "a", np.nan, "a"],
            "col3": [1, 1, 1, 3],
            "col4": ["a", "a", "b", "c"],
            "y": [0, 1, 0, 1],
        }
        df = pd.DataFrame(data=d)
        X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
        y_train = df.loc[:, "y"]

        preprocessing_params = {
            "columns_preprocessing": {
                "col1": [PreprocessingMissingValues.FILL_NA_MEDIAN],
                "col2": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
                "col4": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
            },
            "target_preprocessing": [],
        }

        ps = Preprocessing(preprocessing_params)
        _, _ = ps.fit_and_transform(X_train, y_train)

        ps2 = Preprocessing()
        ps2.from_json(ps.to_json())
        del ps

        d_test = {
            "col1": [1, 1, np.nan, 3],
            "col2": ["a", "a", np.nan, "a"],
            "col3": [1, 1, 1, 3],
            "col4": ["a", "a", "b", "c"],
            "y": [np.nan, np.nan, 1, 1],
        }
        df_test = pd.DataFrame(data=d_test)
        X_test = df_test.loc[:, ["col1", "col2", "col3", "col4"]]
        y_test = df_test.loc[:, "y"]

        X_test, y_test = ps2.transform(X_test, y_test)

        self.assertEqual(2, y_test.shape[0])
        self.assertEqual(2, np.sum(y_test))
        self.assertEqual(1, X_test["col1"].iloc[0])
        self.assertEqual(0, X_test["col2"].iloc[0])
예제 #4
0
    def load(model_path):
        logger.info(f"Loading model framework from {model_path}")

        json_desc = json.load(open(os.path.join(model_path, "framework.json")))
        mf = ModelFramework(json_desc["params"])
        mf.uid = json_desc.get("uid", mf.uid)
        mf._name = json_desc.get("name", mf._name)
        mf._threshold = json_desc.get("threshold")
        mf.learners = []
        for learner_desc, learner_path in zip(json_desc.get("learners"),
                                              json_desc.get("saved")):

            l = AlgorithmFactory.load(learner_desc, learner_path)
            mf.learners += [l]

        mf.preprocessings = []
        for p in json_desc.get("preprocessing"):
            ps = Preprocessing()
            ps.from_json(p)
            mf.preprocessings += [ps]

        return mf