예제 #1
0
 def __init__(self, optimize_metric="logloss"):
     """Set up ensemble bookkeeping and the metric to optimize.

     optimize_metric: name of the metric the ensemble minimizes.
     """
     self.library_version = "0.1"
     # unique id drives the on-disk model file name
     self.uid = str(uuid.uuid4())
     self.model_file = "{}.ensemble.model".format(self.uid)
     self.model_file_path = os.path.join(storage_path, self.model_file)
     self.metric = Metric({"name": optimize_metric})
     # start from the metric's worst possible value
     self.best_loss = self.metric.get_maximum()
     self.models = None
     self.selected_models = []
     self.train_time = None
     # total sum of predictions, the oof of ensemble
     self.total_best_sum = None
     self.target = None
 def test_copy(self):
     """A copied CatBoost learner predicts identically and stays frozen
     even after the source learner keeps training."""
     metric = Metric({"name": "logloss"})
     original = CatBoostLearner(self.params)
     original.fit(self.X, self.y)
     loss_original = metric(self.y, original.predict(self.X))

     clone = CatBoostLearner(self.params)
     # CatBoost builds its model already in the constructor
     self.assertTrue(clone.model is not None)
     # replace it with a real copy and use that for predictions
     clone = original.copy()
     self.assertEqual(type(original), type(clone))
     loss_clone = metric(self.y, clone.predict(self.X))
     self.assertEqual(loss_original, loss_clone)

     # more training improves the original...
     original.fit(self.X, self.y)
     self.assertTrue(metric(self.y, original.predict(self.X)) < loss_original)
     # ...but must leave the copy untouched
     assert_almost_equal(loss_clone, metric(self.y, clone.predict(self.X)))
    def test_copy(self):
        """A copied XGBoost learner predicts identically and stays frozen."""
        metric = Metric({"name": "logloss"})
        params = {"objective": "binary:logistic", "eval_metric": "logloss"}
        original = XgbLearner(params)
        original.fit(self.X, self.y)
        loss_original = metric(self.y, original.predict(self.X))

        clone = XgbLearner(params)
        # model is set to None, while initialized
        self.assertTrue(clone.model is None)
        clone = original.copy()
        self.assertEqual(type(original), type(clone))
        loss_clone = metric(self.y, clone.predict(self.X))
        self.assertEqual(loss_original, loss_clone)

        # extra training improves the source learner...
        original.fit(self.X, self.y)
        self.assertTrue(metric(self.y, original.predict(self.X)) < loss_original)
        # ...while the copy's loss must stay put
        assert_almost_equal(loss_clone, metric(self.y, clone.predict(self.X)))
예제 #4
0
 def __init__(self, params):
     """Callback that tracks a list of metrics during training.

     params: dict with optional "name" and required "metric_names" list.
     """
     super(MetricLogger, self).__init__(params)
     self.name = params.get("name", "metric_logger")
     self.loss_values = {}
     # one Metric instance per requested metric name
     self.metrics = [
         Metric({"name": metric_name})
         for metric_name in params.get("metric_names")
     ]
예제 #5
0
    def test_fit_and_predict(self):
        """AutoML round-trips through JSON and keeps sane predictions."""
        metric = Metric({"name": "logloss"})

        automl = AutoML(
            total_time_limit=5,
            algorithms=["Xgboost"],
            start_random_models=5,
            hill_climbing_steps=0,
            seed=13,
        )
        automl.fit(self.X, self.y)

        predictions = automl.predict(self.X)["p_1"]
        self.assertTrue(predictions is not None)
        self.assertTrue(metric(self.y, predictions) < 0.7)

        # serialize and restore into a fresh AutoML object
        restored = AutoML()
        restored.from_json(automl.to_json())

        predictions2 = restored.predict(self.X)["p_1"]
        self.assertTrue(predictions2 is not None)
        self.assertTrue(metric(self.y, predictions2) < 0.7)

        # the decision threshold must survive the round-trip as well
        assert_almost_equal(automl._threshold, restored._threshold)
예제 #6
0
 def test_copy(self):
     """A copied neural network predicts identically and stays frozen."""
     metric = Metric({"name": "logloss"})
     original = NeuralNetworkLearner(self.params)
     original.fit(self.X, self.y)
     loss_original = metric(self.y, original.predict(self.X))

     clone = NeuralNetworkLearner(self.params)
     # the network is not built in the constructor
     self.assertTrue(clone.model is None)
     clone = original.copy()
     self.assertEqual(type(original), type(clone))
     loss_clone = metric(self.y, clone.predict(self.X))
     self.assertEqual(loss_original, loss_clone)

     # more training improves the original...
     original.fit(self.X, self.y)
     self.assertTrue(metric(self.y, original.predict(self.X)) < loss_original)
     # ...but must leave the copy untouched
     assert_almost_equal(loss_clone, metric(self.y, clone.predict(self.X)))
예제 #7
0
    def test_save_and_load(self):
        """Round-trip through to_json/from_json must preserve uid,
        sub-learner uids and predictions."""
        self.assertTrue(
            "Private" in list(self.data["train"]["X"]["workclass"]))
        early_stop = EarlyStopping({"metric": {"name": "logloss"}})
        metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})

        learner = IterativeLearner(
            self.train_params, callbacks=[early_stop, metric_logger])
        learner.train(self.data)
        metric = Metric({"name": "logloss"})
        loss_before = metric(
            self.data["train"]["y"], learner.predict(self.data["train"]["X"]))

        json_desc = learner.to_json()

        restored = IterativeLearner(self.train_params, callbacks=[])
        self.assertTrue(learner.uid != restored.uid)
        restored.from_json(json_desc)
        self.assertTrue(learner.uid == restored.uid)
        loss_after = metric(
            self.data["train"]["y"], restored.predict(self.data["train"]["X"]))

        assert_almost_equal(loss_before, loss_after)

        # every original sub-learner uid must be present after restore
        restored_uids = [sub.uid for sub in restored.learners]
        for sub in learner.learners:
            self.assertTrue(sub.uid in restored_uids)
 def test_copy(self):
     """A copied LightGBM learner predicts identically and stays frozen."""
     metric = Metric({"name": "logloss"})
     original = LightgbmLearner(self.params)
     original.fit(self.X, self.y)
     loss_original = metric(self.y, original.predict(self.X))

     clone = LightgbmLearner(self.params)
     # model #2 is set to None, while initialized
     self.assertTrue(clone.model is None)
     clone = original.copy()
     self.assertEqual(type(original), type(clone))
     loss_clone = metric(self.y, clone.predict(self.X))
     self.assertEqual(loss_original, loss_clone)

     # more training improves the original...
     original.fit(self.X, self.y)
     self.assertTrue(metric(self.y, original.predict(self.X)) < loss_original)
     # ...but must leave the copy untouched
     assert_almost_equal(loss_clone, metric(self.y, clone.predict(self.X)))
예제 #9
0
    def test_fit_and_predict_kfold(self):
        """Training with 5-fold CV must leave input data untouched and
        produce out-of-fold predictions aligned with the training index."""
        self.assertTrue(
            "Private" in list(self.data["train"]["X"]["workclass"]))

        early_stop = EarlyStopping({"metric": {"name": "logloss"}})
        metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})

        params = copy.deepcopy(self.train_params)
        params["validation"] = {
            "validation_type": "kfold",
            "k_folds": 5,
            "shuffle": True,
        }
        il = IterativeLearner(params, callbacks=[early_stop, metric_logger])
        il.train(self.data)
        oof = il.get_out_of_folds()

        # every training row gets exactly one out-of-fold prediction
        self.assertEqual(len(np.unique(oof.index)), oof.shape[0])
        self.assertTrue(
            np.array_equal(oof.index, self.data["train"]["X"].index))
        # BUG FIX: assertTrue(a, b) treats b as the failure message and
        # never compares the shapes — use assertEqual to actually check it
        self.assertEqual(oof.shape[0], self.data["train"]["X"].shape[0])

        self.assertTrue(
            "Private" in list(self.data["train"]["X"]["workclass"]))

        y_predicted = il.predict(self.data["train"]["X"])
        self.assertTrue(
            "Private" in list(self.data["train"]["X"]["workclass"]))

        metric = Metric({"name": "logloss"})
        loss = metric(self.data["train"]["y"], y_predicted)
        self.assertTrue(loss < 0.6)
    def test_fit_and_predict(self):
        """Plain training without callbacks must reach a decent logloss."""
        learner = IterativeLearner(self.train_params, callbacks=[])
        learner.train(self.data)

        predictions = learner.predict(self.X)
        metric = Metric({"name": "logloss"})
        self.assertTrue(metric(self.y, predictions) < 0.4)
예제 #11
0
    def test_fit_predict(self):
        """A single fit of the random forest must give logloss below 0.6."""
        metric = Metric({"name": "logloss"})
        forest = RandomForestLearner({"trees_in_step": 50})

        forest.fit(self.X, self.y)
        self.assertTrue(metric(self.y, forest.predict(self.X)) < 0.6)
예제 #12
0
    def __init__(self, params):
        """Callback that stops training after too many rounds without
        improvement, optionally keeping the best model seen so far.

        params: dict with "metric" (required) plus optional "name",
        "max_no_improvement_cnt" and "keep_best_model".
        """
        super(EarlyStopping, self).__init__(params)
        self.name = params.get("name", "early_stopping")
        self.metric = Metric(params.get("metric"))
        self.max_no_improvement_cnt = params.get("max_no_improvement_cnt", 5)
        self.keep_best_model = params.get("keep_best_model", True)

        # per-learner bookkeeping dicts
        self.best_loss = {}
        self.loss_values = {}
        self.best_models = {}
        self.best_y_predicted = {}
        # predictions computed on out of folds or on validation set
        self.best_y_oof = None
        # final score computed on combined predictions from all learners
        self.final_loss = None
        # path to best model local copy, only used if cannot deep copy
        self.best_model_paths = {}
예제 #13
0
 def __init__(self):
     """Set up ensemble state; only logloss optimization is supported."""
     self.library_version = "0.1"
     # unique id drives the on-disk model file name
     self.uid = str(uuid.uuid4())
     self.model_file = "{}.ensemble.model".format(self.uid)
     self.model_file_path = "/tmp/" + self.model_file
     # right now only logloss can be optimized by ensemble
     self.metric = Metric({"name": "logloss"})
     self.best_loss = 10e12  # the best loss obtained by ensemble
     self.models = None
     self.selected_models = []
     self.train_time = None
예제 #14
0
 def test_fit_predict(self):
     """Each extra fit of the network must strictly improve training loss."""
     metric = Metric({"name": "logloss"})
     network = NeuralNetworkLearner(self.params)
     previous = None
     for _ in range(5):
         network.fit(self.X, self.y)
         current = metric(self.y, network.predict(self.X))
         if previous is not None:
             self.assertTrue(current + 0.000001 < previous)
         previous = current
 def test_fit_predict(self):
     """Each extra fit of CatBoost must strictly improve training loss."""
     metric = Metric({"name": "logloss"})
     learner = CatBoostLearner(self.params)
     previous = None
     for _ in range(5):
         learner.fit(self.X, self.y)
         current = metric(self.y, learner.predict(self.X))
         if previous is not None:
             self.assertTrue(current + 0.001 < previous)
         previous = current
    def test_fit_and_predict(self):
        """Training with callbacks attached must still reach a good loss."""
        early_stop = EarlyStopping({"metric": {"name": "logloss"}})
        metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})
        learner = IterativeLearner(
            self.train_params, callbacks=[early_stop, metric_logger])
        learner.train(self.data)

        predictions = learner.predict(self.X)
        metric = Metric({"name": "logloss"})
        self.assertTrue(metric(self.y, predictions) < 0.4)
예제 #17
0
 def test_reproduce_fit(self):
     """With a fixed seed, repeated trainings must give identical loss."""
     metric = Metric({"name": "logloss"})
     params = {"trees_in_step": 1, "seed": 1}
     losses = []
     for _ in range(3):
         model = RandomForestLearner(params)
         model.fit(self.X, self.y)
         losses.append(metric(self.y, model.predict(self.X)))
     # consecutive runs must agree
     for earlier, later in zip(losses, losses[1:]):
         assert_almost_equal(earlier, later)
    def test_fit_predict(self):
        """Each additional fit of the forest must strictly improve loss."""
        metric = Metric({"name": "logloss"})
        forest = RandomForestLearner({"trees_in_step": 1})

        previous = None
        for _ in range(2):
            forest.fit(self.X, self.y)
            current = metric(self.y, forest.predict(self.X))
            if previous is not None:
                self.assertTrue(current + 0.00001 < previous)
            previous = current
예제 #19
0
 def __init__(self):
     """Set up ensemble state; only logloss optimization is supported."""
     self.library_version = "0.1"
     # unique id drives the on-disk model file name
     self.uid = str(uuid.uuid4())
     self.model_file = "{}.ensemble.model".format(self.uid)
     self.model_file_path = os.path.join(storage_path, self.model_file)
     # right now only logloss can be optimized by ensemble
     self.metric = Metric({"name": "logloss"})
     self.best_loss = 10e12  # the best loss obtained by ensemble
     self.models = None
     self.selected_models = []
     self.train_time = None
     # total sum of predictions, the oof of ensemble
     self.total_best_sum = None
     self.target = None
    def test_fit_predict(self):
        """Each extra boosting round must strictly improve training loss."""
        metric = Metric({"name": "logloss"})
        params = {"objective": "binary:logistic", "eval_metric": "logloss"}
        learner = XgbLearner(params)

        previous = None
        for _ in range(5):
            learner.fit(self.X, self.y)
            current = metric(self.y, learner.predict(self.X))
            if previous is not None:
                self.assertTrue(current + 0.001 < previous)
            previous = current
    def test_fit_predict(self):
        """Each extra fit of LightGBM must strictly improve training loss."""
        metric = Metric({"name": "logloss"})
        learner = LightgbmLearner(self.params)

        previous = None
        for _ in range(5):
            learner.fit(self.X, self.y)
            current = metric(self.y, learner.predict(self.X))
            if previous is not None:
                self.assertTrue(current + 0.001 < previous)
            previous = current
    def test_fit_and_predict_split(self):
        """Training and predicting must not mutate the input frame, and the
        resulting logloss must be reasonable."""
        self.assertTrue("Private" in list(self.data["train"]["X"]["workclass"]))

        early_stop = EarlyStopping({"metric": {"name": "logloss"}})
        metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})
        learner = IterativeLearner(
            self.train_params, callbacks=[early_stop, metric_logger])
        learner.train(self.data)

        # the categorical column must survive training untouched
        self.assertTrue("Private" in list(self.data["train"]["X"]["workclass"]))

        y_predicted = learner.predict(self.data["train"]["X"])
        # ...and prediction as well
        self.assertTrue("Private" in list(self.data["train"]["X"]["workclass"]))

        metric = Metric({"name": "logloss"})
        self.assertTrue(metric(self.data["train"]["y"], y_predicted) < 0.6)
 def test_reproduce_fit(self):
     """With a fixed seed, repeated trainings must give identical loss."""
     metric = Metric({"name": "logloss"})
     params = {
         "objective": "binary:logistic",
         "eval_metric": "logloss",
         "seed": 1,
     }
     losses = []
     for _ in range(3):
         learner = XgbLearner(params)
         learner.fit(self.X, self.y)
         losses.append(metric(self.y, learner.predict(self.X)))
     # consecutive runs must agree
     for earlier, later in zip(losses, losses[1:]):
         assert_almost_equal(earlier, later)
    def test_save_and_load(self):
        """A saved and reloaded forest must keep its uid and predictions."""
        metric = Metric({"name": "logloss"})
        forest = RandomForestLearner({})
        forest.fit(self.X, self.y)
        loss_before = metric(self.y, forest.predict(self.X))

        json_desc = forest.save()
        restored = RandomForestLearner({})
        self.assertTrue(forest.uid != restored.uid)
        restored.load(json_desc)
        # loading adopts the saved learner's uid
        self.assertTrue(forest.uid == restored.uid)

        loss_after = metric(self.y, restored.predict(self.X))
        assert_almost_equal(loss_before, loss_after)
    def test_save_and_load(self):
        """A saved and reloaded LightGBM learner must keep its uid and
        predictions."""
        metric = Metric({"name": "logloss"})
        learner = LightgbmLearner(self.params)
        learner.fit(self.X, self.y)
        loss_before = metric(self.y, learner.predict(self.X))

        json_desc = learner.save()
        restored = LightgbmLearner({})
        self.assertTrue(learner.uid != restored.uid)
        # a fresh LightGBM learner has no model before loading
        self.assertTrue(restored.model is None)
        restored.load(json_desc)
        self.assertTrue(learner.uid == restored.uid)

        loss_after = metric(self.y, restored.predict(self.X))
        assert_almost_equal(loss_before, loss_after)
    def test_save_and_load(self):
        """A saved and reloaded CatBoost learner must keep its uid and
        predictions."""
        metric = Metric({"name": "logloss"})
        learner = CatBoostLearner(self.params)
        learner.fit(self.X, self.y)
        loss_before = metric(self.y, learner.predict(self.X))

        json_desc = learner.save()
        restored = CatBoostLearner({})
        self.assertTrue(learner.uid != restored.uid)
        # CatBoost builds its model already in the constructor
        self.assertTrue(restored.model is not None)
        restored.load(json_desc)
        self.assertTrue(learner.uid == restored.uid)

        loss_after = metric(self.y, restored.predict(self.X))
        assert_almost_equal(loss_before, loss_after)
예제 #27
0
    def test_save_and_load(self):
        """A saved and reloaded neural network must keep its uid and
        predictions."""
        metric = Metric({"name": "logloss"})
        learner = NeuralNetworkLearner(self.params)
        learner.fit(self.X, self.y)
        loss_before = metric(self.y, learner.predict(self.X))

        json_desc = learner.save()
        restored = NeuralNetworkLearner({})
        self.assertTrue(learner.uid != restored.uid)
        # a fresh network learner has no model before loading
        self.assertTrue(restored.model is None)
        restored.load(json_desc)
        self.assertTrue(learner.uid == restored.uid)

        loss_after = metric(self.y, restored.predict(self.X))
        assert_almost_equal(loss_before, loss_after)
    def test_save_and_load(self):
        """A saved and reloaded XGBoost learner must keep its uid and
        predictions."""
        metric = Metric({"name": "logloss"})
        params = {"objective": "binary:logistic", "eval_metric": "logloss"}
        learner = XgbLearner(params)
        learner.fit(self.X, self.y)
        loss_before = metric(self.y, learner.predict(self.X))

        json_desc = learner.save()
        restored = XgbLearner(params)
        self.assertTrue(learner.uid != restored.uid)
        # a fresh XGBoost learner has no model before loading
        self.assertTrue(restored.model is None)
        restored.load(json_desc)
        self.assertTrue(learner.uid == restored.uid)

        loss_after = metric(self.y, restored.predict(self.X))
        assert_almost_equal(loss_before, loss_after)
예제 #29
0
 def test_reproduce_fit(self):
     """Two AutoML runs with the same seed must reach (almost) the same
     final loss."""
     metric = Metric({"name": "logloss"})
     losses = []
     for _ in range(2):
         automl = AutoML(
             # the time limit should be big enough too not interrupt the training
             total_time_limit=10000,
             algorithms=["Xgboost"],
             start_random_models=2,
             hill_climbing_steps=1,
             train_ensemble=True,
             verbose=True,
             seed=12,
         )
         automl.fit(self.X, self.y)
         losses.append(metric(self.y, automl.predict(self.X)["p_1"]))
     assert_almost_equal(losses[0], losses[1], decimal=4)
    def test_save_and_load(self):
        """Serialized learner must restore uid, sub-learner uids and loss."""
        learner = IterativeLearner(self.train_params, callbacks=[])
        learner.train(self.data)

        metric = Metric({"name": "logloss"})
        loss_before = metric(self.y, learner.predict(self.X))

        json_desc = learner.to_json()
        restored = IterativeLearner(json_desc.get("params"), callbacks=[])
        self.assertTrue(learner.uid != restored.uid)

        restored.from_json(json_desc)
        self.assertTrue(learner.uid == restored.uid)
        loss_after = metric(self.y, restored.predict(self.X))
        assert_almost_equal(loss_before, loss_after)

        # every original sub-learner uid must be present after restore
        restored_uids = [sub.uid for sub in restored.learners]
        for sub in learner.learners:
            self.assertTrue(sub.uid in restored_uids)