        }
    }
}

scores = []
models = {}

# run a grid search for every candidate model and record its best cross-validation score
for model_name, mp in model_params.items():
    clf = GridSearchCV(mp["model"], mp["params"], cv=6, return_train_score=False)
    clf.fit(X_train, y_train)
    scores.append({
        "model_name": model_name,
        "best_score": clf.best_score_,
        "best_params": clf.best_params_
    })
    models[model_name] = clf

scores_df = pd.DataFrame(
    scores, columns=['model_name', 'best_score', 'best_params'])
scores_df.sort_values(by='best_score', ascending=False, inplace=True)
print(scores_df)

# keep the fitted GridSearchCV with the highest cross-validation score
best_model_name = scores_df.iloc[0]['model_name']
best_clf = models[best_model_name]

model_filepath = args['model']
print(f'≫ Storing "{best_model_name}" in "{model_filepath}"')
store_model(best_clf, model_filepath)
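The closing braces at the top of this listing terminate the `model_params` dictionary defined just before the excerpt; its actual contents are not shown here. A purely hypothetical sketch of the shape the loop expects (the model names, estimators, and grids below are placeholders, not the original values) would be:

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# hypothetical shape of model_params; entries and grids are illustrative only
model_params = {
    "random_forest": {
        "model": RandomForestClassifier(),
        "params": {"n_estimators": [10, 50, 100]},
    },
    "logistic_regression": {
        "model": LogisticRegression(max_iter=1000),
        "params": {"C": [0.1, 1, 10]},
    },
}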
# Linear Regression grid search
parameters = {
    'fit_intercept': [True, False],
    'normalize': [True, False],  # note: 'normalize' was removed from LinearRegression in scikit-learn 1.2, so this grid requires an older release
}
model = GridSearchCV(linreg, param_grid=parameters)
model.fit(X_train, y_train)
print('> Best parameters:', model.best_params_)

y_pred = model.predict(X_test)
r2 = evaluate_regression(y_test, y_pred)
models[r2] = model.best_estimator_

print('Random Forest')
forest = RandomForestRegressor()
parameters = {
    'n_estimators': [75, 100, 300],
    'max_depth': [2, 5, None],
}
model = GridSearchCV(forest, param_grid=parameters)
model.fit(X_train, y_train)
print('> Best parameters:', model.best_params_)

y_pred = model.predict(X_test)
r2 = evaluate_regression(y_test, y_pred)
models[r2] = model.best_estimator_

# models maps test-set R² to the fitted estimator; pick the one with the highest score
best = models[max(models)]

model_filepath = args['model']
print(f'≫ Storing Model {type(best)} in "{model_filepath}"')
store_model(best, model_filepath)
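The `evaluate_regression` helper is defined elsewhere; from its use here it only needs to return the test-set R². A minimal sketch under that assumption (the printed label is illustrative):

from sklearn.metrics import r2_score

def evaluate_regression(y_true, y_pred):
    # assumption: the helper computes, prints, and returns the R² score
    r2 = r2_score(y_true, y_pred)
    print('> R²:', r2)
    return r2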
def test_filepath_format():
    with pytest.raises(ValueError, match="filepath should end with specific extension"):
        store_model(model, 'filepath')
def test_new_number_of_columns(dump_mock):
    store_model(model, 'filepath.pkl')
    dump_mock.assert_called_once_with(model, 'filepath.pkl')
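The `dump_mock` argument implies this test is decorated with a mock patch, although the decorator is not shown in the excerpt. Assuming `store_model` persists the estimator with `joblib.dump`, the decorator would look roughly like this (the patch target is an assumption):

from unittest.mock import patch

@patch('joblib.dump')  # assumed target; it must match the name used inside store_model
def test_new_number_of_columns(dump_mock):
    ...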
def test_model_datatype_error():
    with pytest.raises(TypeError, match="model should be a sklearn base estimator"):
        store_model(None, 'filepath')
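Taken together, the three tests pin down the contract of `store_model`: reject anything that is not a scikit-learn estimator, reject file paths without the expected extension, and delegate persistence to `joblib.dump`. A minimal sketch that would satisfy them, assuming a `.pkl` extension and the error messages used in the `match` patterns:

import joblib
from sklearn.base import BaseEstimator

MODEL_EXTENSION = '.pkl'  # assumed; the tests only require a "specific extension"

def store_model(model, filepath):
    # check the model type first, so an invalid model fails even with a bad filepath
    if not isinstance(model, BaseEstimator):
        raise TypeError('model should be a sklearn base estimator')
    if not filepath.endswith(MODEL_EXTENSION):
        raise ValueError(f'filepath should end with specific extension ({MODEL_EXTENSION})')
    joblib.dump(model, filepath)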