Пример #1
0
def run_model(params, run, X_train, X_test, y_train, y_test):
    """Fit a pipeline for ``params.estimator``, persist it and report its metrics.

    Args:
        params: Run configuration. The attributes read here are ``estimator``
            (key into ``MODELS``), ``model_name``, ``cv``, ``data_path`` and
            ``email_address``.
        run: Tracking run (Neptune) that receives the metrics and artifacts,
            or ``None`` to skip tracking and the email notification.
        X_train, y_train: Data the pipeline is fitted on, and cross-validated
            on when ``params.cv`` is truthy.
        X_test, y_test: Data the test metrics are computed on.

    Returns:
        dict: ``{"metrics": <test metrics>, "email_sent": <result>}`` where
        ``"email_sent"`` is the value returned by ``send_email`` or the string
        ``"Not sent"`` when no email was attempted.
    """
    try:
        # run model
        hyper_params = {}
        if params.estimator == "SVC":
            # presumably required so probability-based metrics can be
            # computed for SVC -- TODO confirm against compute_metrics
            hyper_params["probability"] = True
        model = create_pipeline(
            model_estimator=MODELS[params.estimator]["fct"],
            params=hyper_params,
        )
        model.fit(X_train, y_train)

        # save model; create the target directory first so a missing folder
        # does not discard an already trained model
        model_dir = os.path.join(module_path, "models")
        os.makedirs(model_dir, exist_ok=True)
        model_file = os.path.join(model_dir, f"{params.model_name}.joblib")
        joblib.dump(model, model_file)

        # get CV metrics and test metrics
        if params.cv:
            cv_metrics = compute_metrics_cv(X_train, y_train, model)
        metrics = compute_metrics(X_test, y_test, model)

        res = "Not sent"

        if run is not None:
            # log metrics and artifacts in the tracking run
            if params.cv:
                record_metadata(cv_metrics, run)
            record_metadata(metrics, run)
            save_artifact(data_path=params.data_path, model_file=model_file, run=run)

            # notify user
            if params.email_address is not None:
                url = f"{run._backend.get_display_address()}/{os.getenv('NEPTUNE_USER')}/{os.getenv('NEPTUNE_PROJECT')}/e/{run['sys/id'].fetch()}"
                res = send_email(url, params.email_address)

        return {'metrics' : metrics, "email_sent" : res}
    finally:
        # Always close the tracking run, including when training, logging or
        # the notification raised; the exception still propagates.
        if run is not None:
            run.stop()
def test_create_pipeline_params():
    """Check the step types of a pipeline built with explicit hyper-parameters."""
    pipeline = create_pipeline(params={'C': 50, 'gamma': 0.01})

    # (expected step type, failure message), in pipeline order
    expected_steps = (
        (NLPCleaner, FIRST_MSG),
        (TfidfVectorizer, SECOND_MSG),
        (SVC, THIRD_MSG),
    )
    for position, (step_type, failure_msg) in enumerate(expected_steps):
        assert isinstance(pipeline.steps[position][1], step_type), failure_msg
Пример #3
0
def grid_run_model(params, run, X_train, X_test, y_train, y_test):
    """Grid-search a pipeline for ``params.estimator``, persist it and report metrics.

    The search scores precision, recall, accuracy, weighted F1 and ROC AUC,
    and refits on ``"roc_auc"``.

    Args:
        params: Run configuration. The attributes read here are ``estimator``
            (key into ``MODELS``), ``parameters`` (passed to
            ``run_grid_search`` as the search parameters), ``model_name``,
            ``data_path`` and ``email_address``.
        run: Tracking run (Neptune) that receives the best parameters,
            metrics and artifacts, or ``None`` to skip tracking and the
            email notification.
        X_train, y_train: Data handed to the grid search.
        X_test, y_test: Data the test metrics are computed on.

    Returns:
        dict: ``{"metrics": <test metrics>, "email_sent": <result>}`` where
        ``"email_sent"`` is the value returned by ``send_email`` or the string
        ``"Not sent"`` when no email was attempted.
    """
    try:
        # run model
        list_metrics = ['precision', 'recall', 'accuracy', 'f1_weighted', 'roc_auc']
        refit = "roc_auc"

        pipe = create_pipeline(model_estimator=MODELS[params.estimator]["fct"], params=None)

        model = run_grid_search(model=pipe,
                                params=params.parameters,
                                data=(X_train, y_train),
                                metrics=list_metrics,
                                refit=refit)

        # record best params
        if run is not None:
            run['best_params'] = model.best_params_

        # collect cv_results and test metrics
        cv_results = get_grid_search_best_metrics(model, list_metrics)
        metrics = compute_metrics(X_test, y_test, model)

        # save model; create the target directory first so a missing folder
        # does not discard an already completed search
        model_dir = os.path.join(module_path, "models")
        os.makedirs(model_dir, exist_ok=True)
        model_file = os.path.join(model_dir, f"{params.model_name}.joblib")
        joblib.dump(model, model_file)

        res = "Not sent"

        if run is not None:
            record_metadata(cv_results, run)
            record_metadata(metrics, run)
            save_artifact(data_path=params.data_path, model_file=model_file, run=run)

            # notify user
            if params.email_address is not None:
                url = f"{run._backend.get_display_address()}/{os.getenv('NEPTUNE_USER')}/{os.getenv('NEPTUNE_PROJECT')}/e/{run['sys/id'].fetch()}"
                res = send_email(url, params.email_address)

        return {'metrics' : metrics, "email_sent" : res}
    finally:
        # Always close the tracking run, including when the search, logging
        # or the notification raised; the exception still propagates.
        if run is not None:
            run.stop()
def test_run_grid_search():
    """Check that ``run_grid_search`` returns a configured ``GridSearchCV``.

    Runs a single-candidate search over an ``LGBMClassifier`` pipeline using
    the CSV at ``data_path`` and verifies the wrapped estimator, the parameter
    grid and the refit metric of the returned object.
    """
    features, target = split_data(pd.read_csv(data_path))

    search_space = {
        "clf__max_depth": [3],
        "clf__n_estimators": [50],
        "clf__class_weight": ['balanced'],
        "clf__random_state": [43],
    }
    scoring = ['precision', 'recall']
    base_pipeline = create_pipeline(model_estimator=LGBMClassifier)

    searcher = run_grid_search(
        model=base_pipeline,
        params=search_space,
        data=(features, target),
        metrics=scoring,
        refit="precision",
    )
    print(searcher)

    # Compared against a fresh literal rather than ``search_space`` so the
    # check does not depend on the input dict being left unmodified.
    expected_grid = {
        'clf__class_weight': ['balanced'],
        'clf__max_depth': [3],
        'clf__n_estimators': [50],
        'clf__random_state': [43],
    }

    assert isinstance(searcher, GridSearchCV), "Should be a grid search"
    assert isinstance(searcher.estimator, Pipeline)
    assert searcher.param_grid == expected_grid
    assert searcher.refit == 'precision'
                                                        test_size=0.2,
                                                        random_state=43,
                                                        stratify=y)

    # get model
    if model_file:
        model = get_model(model_file=model_file)
        hyper_params = model.steps[1][1].get_params()
    else:
        if estimator is None and estimator not in estimators.keys():
            estimator = "SVC"

        model = estimators[estimator]["name"]
        if estimator == "SVC":
            hyper_params["probability"] = True
        model = create_pipeline(model_estimator=model, params=hyper_params)

    if grid_search:
        model_name = f"grid_search_{estimator}"
        hyper_params = estimators[estimator]["hyperparams"]

        if estimator == "SVC":
            hyper_params["clf__probability"] = [True]

        if run is not None:
            create_exp(hyper_params, tags, run)

        # run model
        list_metrics = [
            'precision', 'recall', 'accuracy', 'f1_weighted', 'roc_auc'
        ]
def test_create_pipeline():
    """Check the step types of a pipeline built with default arguments."""
    pipeline = create_pipeline()

    # (expected step type, failure message), in pipeline order
    expected_steps = (
        (NLPCleaner, FIRST_MSG),
        (TfidfVectorizer, SECOND_MSG),
        (SVC, THIRD_MSG),
    )
    for position, (step_type, failure_msg) in enumerate(expected_steps):
        assert isinstance(pipeline.steps[position][1], step_type), failure_msg