예제 #1
0
    def test_save_and_load(self):
        self.assertTrue(
            "Private" in list(self.data["train"]["X"]["workclass"]))
        early_stop = EarlyStopping({"metric": {"name": "logloss"}})
        metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})

        il = IterativeLearner(self.train_params,
                              callbacks=[early_stop, metric_logger])
        il.train(self.data)
        y_predicted = il.predict(self.data["train"]["X"])
        metric = Metric({"name": "logloss"})
        loss_1 = metric(self.data["train"]["y"], y_predicted)

        json_desc = il.to_json()

        il2 = IterativeLearner(self.train_params, callbacks=[])
        self.assertTrue(il.uid != il2.uid)
        il2.from_json(json_desc)
        self.assertTrue(il.uid == il2.uid)
        y_predicted_2 = il2.predict(self.data["train"]["X"])
        loss_2 = metric(self.data["train"]["y"], y_predicted_2)

        assert_almost_equal(loss_1, loss_2)

        uids = [i.uid for i in il.learners]
        uids2 = [i.uid for i in il2.learners]
        for u in uids:
            self.assertTrue(u in uids2)
예제 #2
0
    def test_fit_and_predict_kfold(self):
        self.assertTrue(
            "Private" in list(self.data["train"]["X"]["workclass"]))

        early_stop = EarlyStopping({"metric": {"name": "logloss"}})
        metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})

        params = copy.deepcopy(self.train_params)
        params["validation"] = {
            "validation_type": "kfold",
            "k_folds": 5,
            "shuffle": True,
        }
        il = IterativeLearner(params, callbacks=[early_stop, metric_logger])
        il.train(self.data)
        oof = il.get_out_of_folds()

        self.assertEqual(len(np.unique(oof.index)), oof.shape[0])
        self.assertTrue(
            np.array_equal(oof.index, self.data["train"]["X"].index))
        self.assertTrue(oof.shape[0], self.data["train"]["X"].shape[0])

        self.assertTrue(
            "Private" in list(self.data["train"]["X"]["workclass"]))

        y_predicted = il.predict(self.data["train"]["X"])
        self.assertTrue(
            "Private" in list(self.data["train"]["X"]["workclass"]))

        metric = Metric({"name": "logloss"})
        loss = metric(self.data["train"]["y"], y_predicted)
        self.assertTrue(loss < 0.6)
    def test_fit_and_predict(self):
        il = IterativeLearner(self.train_params, callbacks=[])
        il.train(self.data)

        y_predicted = il.predict(self.X)
        metric = Metric({"name": "logloss"})
        loss = metric(self.y, y_predicted)
        self.assertTrue(loss < 0.4)
    def test_save_and_load(self):
        il = IterativeLearner(self.train_params, callbacks=[])
        il.train(self.data)

        metric = Metric({"name": "logloss"})
        loss = metric(self.y, il.predict(self.X))

        json_desc = il.to_json()
        il2 = IterativeLearner(json_desc.get("params"), callbacks=[])
        self.assertTrue(il.uid != il2.uid)

        il2.from_json(json_desc)
        self.assertTrue(il.uid == il2.uid)
        loss2 = metric(self.y, il2.predict(self.X))
        assert_almost_equal(loss, loss2)

        uids = [i.uid for i in il.learners]
        uids2 = [i.uid for i in il2.learners]
        for u in uids:
            self.assertTrue(u in uids2)
    def test_fit_and_predict(self):

        early_stop = EarlyStopping({"metric": {"name": "logloss"}})
        metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})
        il = IterativeLearner(self.train_params,
                              callbacks=[early_stop, metric_logger])
        il.train(self.data)

        y_predicted = il.predict(self.X)
        metric = Metric({"name": "logloss"})
        loss = metric(self.y, y_predicted)
        self.assertTrue(loss < 0.4)
    def test_fit_and_predict_split(self):
        self.assertTrue("Private" in list(self.data["train"]["X"]["workclass"]))

        early_stop = EarlyStopping({"metric": {"name": "logloss"}})
        metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})
        il = IterativeLearner(self.train_params, callbacks=[early_stop, metric_logger])
        il.train(self.data)

        self.assertTrue("Private" in list(self.data["train"]["X"]["workclass"]))

        y_predicted = il.predict(self.data["train"]["X"])
        self.assertTrue("Private" in list(self.data["train"]["X"]["workclass"]))

        metric = Metric({"name": "logloss"})
        loss = metric(self.data["train"]["y"], y_predicted)
        self.assertTrue(loss < 0.6)
예제 #7
0
    def run(self):
        # read data
        logger.info("ComputeBatchPrediction::run")

        batch = MLBatchPrediction.objects.get(pk=self.job_params.get("db_id"))
        logger.info("batch", batch)
        # {'parent_mlmodel': 9, 'parent_dataframe': 1, 'db_id': 1, 'created_by_id': 1, 'parent_organization_id': 1, 'parent_project_id': 1}

        mlmodel = MLModel.objects.get(pk=self.job_params.get("parent_mlmodel"))
        logger.info(mlmodel.save_details)
        logger.info(mlmodel.all_params)
        il = IterativeLearner(mlmodel.all_params)
        il.load(mlmodel.save_details)
        logger.info(batch.parent_dataframe.absolute_path)
        input_df = DataServe.get(batch.parent_dataframe.absolute_path)

        predictions = il.predict(input_df)
        logger.info(predictions)

        filename = "predictions-{0}.csv".format(str(uuid.uuid4())[:8])
        organization_slug = batch.parent_organization.slug
        project_id = batch.parent_project.id
        relative_dir = "org_{0}_proj_{1}".format(organization_slug, project_id)
        relative_path = os.path.join(relative_dir, filename)
        result_absolute_path = Storage().get_path(relative_dir, filename)

        logger.info(result_absolute_path)

        df = pd.DataFrame({"prediction": predictions})
        df.to_csv(result_absolute_path, index=False)

        # create mljar data frame
        result_df = DataFrame(
            source_id=self.job_params.get("parent_dataframe"),  # fix this
            absolute_path=result_absolute_path,
            file_size=1,  # TODO fix the file size
            columns_details="",  # we can describe any data frame (always :-))
            preview_absolute_path="",
            created_by_id=self.job_params["created_by_id"],
            parent_organization_id=self.job_params["parent_organization_id"],
            parent_project_id=self.job_params["parent_project_id"],
        )
        result_df.save()

        batch.result_dataframe = result_df
        batch.status = "done"
        batch.save()