def run_model(params, run, X_train, X_test, y_train, y_test):
    """Train one pipeline, persist it, and optionally log the results to Neptune.

    Args:
        params: Configuration object. The attributes read here are
            ``estimator``, ``model_name``, ``cv``, ``data_path`` and
            ``email_address``.
        run: Tracking run used for logging, or ``None`` to skip all logging
            and the e-mail notification.
        X_train: Training features the pipeline is fitted on.
        X_test: Hold-out features passed to ``compute_metrics``.
        y_train: Training targets.
        y_test: Hold-out targets.

    Returns:
        dict: ``{"metrics": <test metrics>, "email_sent": <result>}`` where
        ``email_sent`` is whatever ``send_email`` returned, or ``"Not sent"``
        when no e-mail was attempted.
    """
    try:
        # run model
        hyper_params = {}
        if params.estimator == "SVC":
            # Presumably required so the metric helpers can use predicted
            # probabilities -- TODO confirm against compute_metrics.
            hyper_params["probability"] = True
        model = create_pipeline(
            model_estimator=MODELS[params.estimator]["fct"],
            params=hyper_params,
        )
        model.fit(X_train, y_train)

        # save model
        model_file = os.path.join(module_path, "models", f"{params.model_name}.joblib")
        joblib.dump(model, model_file)

        # get CV metrics and test metrics and log them in Neptune
        if params.cv:
            cv_metrics = compute_metrics_cv(X_train, y_train, model)
        metrics = compute_metrics(X_test, y_test, model)

        res = "Not sent"
        if run is not None:
            if params.cv:
                record_metadata(cv_metrics, run)
            record_metadata(metrics, run)
            save_artifact(data_path=params.data_path, model_file=model_file, run=run)

            # notify user
            if params.email_address is not None:
                url = f"{run._backend.get_display_address()}/{os.getenv('NEPTUNE_USER')}/{os.getenv('NEPTUNE_PROJECT')}/e/{run['sys/id'].fetch()}"
                res = send_email(url, params.email_address)

        return {'metrics': metrics, "email_sent": res}
    finally:
        # Always close the run, even when training, saving, logging or the
        # e-mail step raises; otherwise the run is left open on failure.
        if run is not None:
            run.stop()
def test_create_pipeline_params():
    """Pipeline built with explicit params keeps the expected step types.

    Steps are checked in order: cleaner, vectorizer, classifier.
    """
    pipeline = create_pipeline(params={'C': 50, 'gamma': 0.01})

    # (expected step type, failure message), listed in pipeline order.
    expected_steps = (
        (NLPCleaner, FIRST_MSG),
        (TfidfVectorizer, SECOND_MSG),
        (SVC, THIRD_MSG),
    )
    for position, (step_type, failure_msg) in enumerate(expected_steps):
        assert isinstance(pipeline.steps[position][1], step_type), failure_msg
def grid_run_model(params, run, X_train, X_test, y_train, y_test):
    """Grid-search a pipeline, persist the result, and optionally log it to Neptune.

    Args:
        params: Configuration object. The attributes read here are
            ``estimator``, ``parameters`` (the search grid), ``model_name``,
            ``data_path`` and ``email_address``.
        run: Tracking run used for logging, or ``None`` to skip all logging
            and the e-mail notification.
        X_train: Training features handed to the grid search.
        X_test: Hold-out features passed to ``compute_metrics``.
        y_train: Training targets.
        y_test: Hold-out targets.

    Returns:
        dict: ``{"metrics": <test metrics>, "email_sent": <result>}`` where
        ``email_sent`` is whatever ``send_email`` returned, or ``"Not sent"``
        when no e-mail was attempted.
    """
    try:
        # run model
        list_metrics = ['precision', 'recall', 'accuracy', 'f1_weighted', 'roc_auc']
        refit = "roc_auc"
        pipe = create_pipeline(
            model_estimator=MODELS[params.estimator]["fct"],
            params=None,
        )
        model = run_grid_search(
            model=pipe,
            params=params.parameters,
            data=(X_train, y_train),
            metrics=list_metrics,
            refit=refit,
        )

        # record best params
        if run is not None:
            run['best_params'] = model.best_params_

        # collect cv_results and test metrics
        cv_results = get_grid_search_best_metrics(model, list_metrics)
        metrics = compute_metrics(X_test, y_test, model)

        # save model (the whole search object returned by run_grid_search is dumped)
        model_file = os.path.join(module_path, "models", f"{params.model_name}.joblib")
        joblib.dump(model, model_file)

        res = "Not sent"
        if run is not None:
            record_metadata(cv_results, run)
            record_metadata(metrics, run)
            save_artifact(data_path=params.data_path, model_file=model_file, run=run)

            # notify user
            if params.email_address is not None:
                url = f"{run._backend.get_display_address()}/{os.getenv('NEPTUNE_USER')}/{os.getenv('NEPTUNE_PROJECT')}/e/{run['sys/id'].fetch()}"
                res = send_email(url, params.email_address)

        return {'metrics': metrics, "email_sent": res}
    finally:
        # Always close the run, even when the search, saving, logging or the
        # e-mail step raises; otherwise the run is left open on failure.
        if run is not None:
            run.stop()
def test_run_grid_search():
    """``run_grid_search`` returns a GridSearchCV configured as requested.

    Loads the CSV at ``data_path``, runs a single-candidate grid over an
    LGBMClassifier pipeline, then checks the wrapped estimator, the
    parameter grid and the refit metric.
    """
    features, target = split_data(pd.read_csv(data_path))

    search_space = {
        "clf__max_depth": [3],
        "clf__n_estimators": [50],
        "clf__class_weight": ['balanced'],
        "clf__random_state": [43],
    }
    lgbm_pipeline = create_pipeline(model_estimator=LGBMClassifier)
    scoring_names = ['precision', 'recall']

    grid_pipe = run_grid_search(
        model=lgbm_pipeline,
        params=search_space,
        data=(features, target),
        metrics=scoring_names,
        refit="precision",
    )
    print(grid_pipe)

    # Compared against a fresh literal so the check does not depend on the
    # dict object that was handed to run_grid_search.
    expected_grid = {
        'clf__class_weight': ['balanced'],
        'clf__max_depth': [3],
        'clf__n_estimators': [50],
        'clf__random_state': [43],
    }
    assert isinstance(grid_pipe, GridSearchCV), "Should be a grid search"
    assert isinstance(grid_pipe.estimator, Pipeline)
    assert grid_pipe.param_grid == expected_grid
    assert grid_pipe.refit == 'precision'
test_size=0.2, random_state=43, stratify=y) # get model if model_file: model = get_model(model_file=model_file) hyper_params = model.steps[1][1].get_params() else: if estimator is None and estimator not in estimators.keys(): estimator = "SVC" model = estimators[estimator]["name"] if estimator == "SVC": hyper_params["probability"] = True model = create_pipeline(model_estimator=model, params=hyper_params) if grid_search: model_name = f"grid_search_{estimator}" hyper_params = estimators[estimator]["hyperparams"] if estimator == "SVC": hyper_params["clf__probability"] = [True] if run is not None: create_exp(hyper_params, tags, run) # run model list_metrics = [ 'precision', 'recall', 'accuracy', 'f1_weighted', 'roc_auc' ]
def test_create_pipeline():
    """Default pipeline is cleaner -> TF-IDF vectorizer -> SVC, in that order."""
    pipeline = create_pipeline()

    # (expected step type, failure message), listed in pipeline order.
    expected_steps = (
        (NLPCleaner, FIRST_MSG),
        (TfidfVectorizer, SECOND_MSG),
        (SVC, THIRD_MSG),
    )
    for position, (step_type, failure_msg) in enumerate(expected_steps):
        assert isinstance(pipeline.steps[position][1], step_type), failure_msg