示例#1
0
                def hyperopt_objective(search_params):
                    """Evaluate one HyperOpt candidate inside a nested MLflow run.

                    This function is called for each set of hyper-parameters
                    being tested by HyperOpt. The flattened and prepared
                    candidate values are laid over a copy of the enclosing
                    scope's ``estimator_params``; an estimator is trained with
                    ``train_run``, combined with the enclosing ``preprocessor``
                    into a pipeline and logged through ``log_sk_model``.

                    Args:
                        search_params: The hyper-parameter sample supplied by
                            HyperOpt for this evaluation.

                    Returns:
                        A dict with ``loss`` (one minus the configured primary
                        metric), ``status`` (``STATUS_OK``) and the trained
                        ``model`` together with its ``params``, ``tags``,
                        ``metrics`` and ``artifacts``.
                    """
                    # The nested run is named after the size of the enclosing
                    # `trials` object (presumably the index of the trial being
                    # evaluated -- TODO confirm against HyperOpt's bookkeeping).
                    trial_name = str(len(trials) - 1)

                    # Candidate values override the fixed estimator parameters;
                    # the copy keeps the shared base dict untouched.
                    candidate = prep_params(flatten_params(search_params))
                    trial_estimator_params = estimator_params.copy()
                    trial_estimator_params.update(candidate)

                    with mlflow.start_run(nested=True, run_name=trial_name):
                        (trial_estimator,
                         raw_tags,
                         raw_metrics,
                         raw_artifacts) = train_run(
                             estimator_params=trial_estimator_params,
                             x_train_prep=x_train_prep,
                             y_train=y_train,
                             x_test_prep=x_test_prep,
                             y_test=y_test,
                             # Each trial writes below its own sub-path.
                             temp_dir=temp_dir / trial_name)

                        trial_model = make_pipeline(preprocessor, trial_estimator)

                        # Estimator-level params and tags are namespaced with an
                        # "estimator_" prefix; metrics and artifacts are copied
                        # as they are.
                        trial_params = {
                            f"estimator_{k}": v
                            for k, v in trial_estimator_params.items()
                        }
                        trial_tags = {
                            f"estimator_{k}": v
                            for k, v in raw_tags.items()
                        }
                        trial_metrics = dict(raw_metrics)
                        trial_artifacts = dict(raw_artifacts)

                        # Marks the run as one produced by the search.
                        trial_tags['hyperopt'] = True

                        log_sk_model(trial_model,
                                     registered_model_name=None,
                                     params=trial_params,
                                     tags=trial_tags,
                                     metrics=trial_metrics,
                                     artifacts=trial_artifacts)

                    # HyperOpt minimises the loss, so the primary metric is
                    # inverted.
                    primary_metric = trial_metrics[config.evaluation.primary_metric]

                    return {
                        'loss': 1 - primary_metric,
                        'status': STATUS_OK,
                        'model': trial_model,
                        'params': trial_params,
                        'tags': trial_tags,
                        'metrics': trial_metrics,
                        'artifacts': trial_artifacts
                    }
示例#2
0
def calibrate_model(run_id, x, y):
    """Calibrate the estimator of a logged pipeline and log the result.

    The pipeline stored under ``model`` for ``run_id`` is loaded; its first
    step is used as the (already fitted) preprocessor and its second step as
    the prefit estimator wrapped by ``CalibratedClassifierCV``. The calibrated
    pipeline is evaluated with ``evaluate_calibration`` and logged to the same
    run under ``model_calibrated``.

    Args:
        run_id: Identifier of the MLflow run holding the model; the run is
            re-opened to receive the calibrated model.
        x: Features used both to fit the calibrator and to evaluate it.
        y: Targets matching ``x``.

    Returns:
        A tuple ``((x, y), model, params, tags, metrics, artifacts)``.

        NOTE(review): ``model`` is the pipeline loaded from the run, not the
        calibrated pipeline built here -- confirm this is what callers expect.
        NOTE(review): ``artifacts`` is returned after the temporary directory
        has been removed; presumably it references files in that directory --
        verify before reading them.
    """
    with tempfile.TemporaryDirectory() as td:
        _logger.debug(f"Creating temporary directory: '{td}'")
        temp_dir = Path(td)

        _logger.info("Loading the model")
        model = get_model(run_id, model_path='model')

        with mlflow.start_run(run_id):
            _logger.info("Preprocessing the training data")
            preprocessor = model.steps[0][1]
            x_prep = preprocessor.transform(x)

            # Wrap the fitted estimator; cv='prefit' fits only the calibrator.
            base_estimator = model.steps[1][1]
            calibrated_estimator = CalibratedClassifierCV(base_estimator, cv='prefit')
            calibrated_estimator.fit(x_prep, y)

            # The calibration data doubles as the evaluation set.
            eval_data = {'test': {'x': x_prep, 'y': y}}
            estimator_metrics, estimator_artifacts = evaluate_calibration(
                model=calibrated_estimator,
                data=eval_data,
                temp_dir=temp_dir)

            calibrated_model = make_pipeline(preprocessor, calibrated_estimator)

            # No estimator parameters are recorded for the calibration step;
            # tags are namespaced with the "estimator_" prefix.
            estimator_params = {}
            estimator_tags = {'calibrated': True}
            params = {f"estimator_{k}": v for k, v in estimator_params.items()}
            tags = {f"estimator_{k}": v for k, v in estimator_tags.items()}
            metrics = dict(estimator_metrics)
            artifacts = dict(estimator_artifacts)

            log_sk_model(calibrated_model,
                         registered_model_name=None,
                         params=params,
                         tags=tags,
                         metrics=metrics,
                         artifacts=artifacts,
                         model_artifact_path='model_calibrated')

    return (x, y), model, params, tags, metrics, artifacts
示例#3
0
def _ensure_log_dirs(config):
    """Create the parent directories of the debug and info log files."""
    handlers = config.logging.handlers
    for handler_name in ('debug_file_handler', 'info_file_handler'):
        log_file = Path(getattr(handlers, handler_name).filename)
        log_file.parent.mkdir(parents=True, exist_ok=True)


def _prefix_estimator_keys(mapping):
    """Return a new dict with every key of *mapping* prefixed by ``estimator_``."""
    return {f"estimator_{k}": v for k, v in mapping.items()}


def _train_pipeline(estimator_params, preprocessor, x_train_prep, y_train,
                    x_test_prep, y_test, temp_dir):
    """Train one estimator and bundle it with the preprocessor.

    Returns:
        A tuple ``(model, params, tags, metrics, artifacts)`` where ``model``
        is the preprocessor + estimator pipeline, ``params`` and ``tags`` carry
        the ``estimator_`` prefix, and ``metrics`` / ``artifacts`` are copies
        of what ``train_run`` returned.
    """
    estimator, estimator_tags, estimator_metrics, estimator_artifacts = train_run(
        estimator_params=estimator_params,
        x_train_prep=x_train_prep,
        y_train=y_train,
        x_test_prep=x_test_prep,
        y_test=y_test,
        temp_dir=temp_dir)

    model = make_pipeline(preprocessor, estimator)
    return (model,
            _prefix_estimator_keys(estimator_params),
            _prefix_estimator_keys(estimator_tags),
            dict(estimator_metrics),
            dict(estimator_artifacts))


def _search_hyperparams(config, estimator_params, search_space, preprocessor,
                        x_train_prep, y_train, x_test_prep, y_test, temp_dir):
    """Run the HyperOpt search and return the best trial's results.

    Every candidate is trained and logged in its own nested MLflow run.

    Returns:
        The ``(model, params, tags, metrics, artifacts)`` stored in the result
        of ``trials.best_trial``.
    """
    trials = Trials()

    def hyperopt_objective(search_params):
        # This function is called for each set of hyper-parameters being
        # tested by HyperOpt.
        # The run is named after the size of `trials` (presumably the index of
        # the trial being evaluated -- TODO confirm).
        run_name = str(len(trials) - 1)

        # Candidate values override a copy of the fixed estimator parameters.
        search_params = prep_params(flatten_params(search_params))
        ho_estimator_params = estimator_params.copy()
        ho_estimator_params.update(search_params)

        with mlflow.start_run(nested=True, run_name=run_name):
            ho_model, ho_params, ho_tags, ho_metrics, ho_artifacts = _train_pipeline(
                estimator_params=ho_estimator_params,
                preprocessor=preprocessor,
                x_train_prep=x_train_prep,
                y_train=y_train,
                x_test_prep=x_test_prep,
                y_test=y_test,
                temp_dir=temp_dir / run_name)

            ho_tags['hyperopt'] = True

            log_sk_model(ho_model,
                         registered_model_name=None,
                         params=ho_params,
                         tags=ho_tags,
                         metrics=ho_metrics,
                         artifacts=ho_artifacts)

        # HyperOpt minimises the loss, so the primary metric is inverted.
        loss = 1 - ho_metrics[config.evaluation.primary_metric]

        return {
            'loss': loss,
            'status': STATUS_OK,
            'model': ho_model,
            'params': ho_params,
            'tags': ho_tags,
            'metrics': ho_metrics,
            'artifacts': ho_artifacts
        }

    fmin(fn=hyperopt_objective,
         space=search_space,
         algo=tpe.suggest,
         trials=trials,
         max_evals=config.training.max_evals,
         # Fixed seed for the search.
         rstate=np.random.RandomState(1),
         show_progressbar=False)

    # Look the best trial up once instead of once per field.
    best = trials.best_trial['result']
    return (best['model'], best['params'], best['tags'], best['metrics'],
            best['artifacts'])


def main():
    """Train a model (optionally with a HyperOpt search) and log it to MLflow.

    Loads the environment file and configuration, sets up logging, splits the
    training data, fits the preprocessor and then either trains a single
    estimator (when ``get_params`` returns no search space) or runs a HyperOpt
    search and keeps the best trial. Optionally runs the SHAP analysis, then
    logs the final pipeline in the parent MLflow run.

    Returns:
        A tuple ``((x_train, y_train, x_test, y_test), model, params, tags,
        metrics, artifacts)``.

        NOTE(review): ``artifacts`` is returned after the temporary directory
        has been removed; presumably it references files in that directory --
        verify before reading them.
    """
    load_dotenv('.env.general')
    config = load_config('config.yml')
    # The log directories must exist before the handlers are configured.
    _ensure_log_dirs(config)
    logging.config.dictConfig(config.logging)

    _logger.info("Loading the data")
    x, y = load_training_data()
    x_train, x_test, y_train, y_test = split_data(x, y)

    with tempfile.TemporaryDirectory() as td:
        temp_dir = Path(td)
        mlflow.set_experiment(config.experiment.name)

        with mlflow.start_run():
            _logger.info("Fitting the preprocessor")
            preprocessor = get_preprocessor()
            preprocessor.fit(x_train, y_train)

            _logger.info("Preprocessing the training data")
            x_train_prep = preprocessor.transform(x_train)
            x_test_prep = preprocessor.transform(x_test)

            estimator_params, search_space = get_params()

            # Inputs shared by the single-fit and the search code paths.
            training_inputs = {
                'estimator_params': estimator_params,
                'preprocessor': preprocessor,
                'x_train_prep': x_train_prep,
                'y_train': y_train,
                'x_test_prep': x_test_prep,
                'y_test': y_test,
                'temp_dir': temp_dir,
            }

            if search_space is None:
                model, params, tags, metrics, artifacts = _train_pipeline(
                    **training_inputs)
            else:
                model, params, tags, metrics, artifacts = _search_hyperparams(
                    config=config,
                    search_space=search_space,
                    **training_inputs)

            if config.evaluation.shap_analysis:
                _logger.info("Starting shap analysis")
                # The analysis receives the raw training features.
                shap_tags, shap_artifacts = shap_analyse(
                    model=model, x=x_train, temp_dir=temp_dir / 'shap')
                tags.update(shap_tags)
                artifacts.update(shap_artifacts)
            else:
                _logger.info("Shap analysis skipped")

            # Log the final (single-fit or best-trial) model in the parent run.
            log_sk_model(model,
                         registered_model_name=None,
                         params=params,
                         tags=tags,
                         metrics=metrics,
                         artifacts=artifacts)

    return (x_train, y_train, x_test,
            y_test), model, params, tags, metrics, artifacts