def run_training() -> None:
    """Train with cross-validation so no data is lost to a held-out validation set."""
    # divide dataset
    data = load_dataset(file_name=config.TRAIN_DATA_FN)
    # shuffle once so features and target stay aligned
    shuffled = data[config.TRAIN_FEATURES].sample(frac=1, random_state=42)
    X_train = shuffled.drop(columns=config.TARGET)
    y_train = shuffled[config.TARGET[0]]

    # fit and apply preprocessing
    pipeline.prep_pipeline.fit(X_train, y_train)
    prep_train = pipeline.prep_pipeline.transform(X_train)

    # fit the cross-validated estimator
    np.random.seed(42)
    pipeline.estimator_cv.fit(prep_train, y_train)

    full_pipeline = Pipeline([
        ('prep_pipeline', pipeline.prep_pipeline),
        ('estimator', pipeline.estimator_cv.best_estimator_)
    ])

    # print results
    print(f'best estimator: {pipeline.estimator_cv.best_estimator_}')
    print(f'best estimator cv score: {pipeline.estimator_cv.best_score_}')
    print(f'best estimator score: '
          f'{pipeline.estimator_cv.best_estimator_.score(prep_train, y_train)}')
    print(f'full pipeline score: {full_pipeline.score(X_train, y_train)}')

    _logger.info(f"save model version: {_version}")
    save_pipeline(pipeline=full_pipeline)
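# For reference, a minimal sketch of what the `pipeline` module used above
# might expose. Only the names `prep_pipeline` and `estimator_cv` come from
# run_training(); the scaler, the L1-penalised logistic estimator, the
# parameter grid, and the CV settings are illustrative assumptions.
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

prep_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # assumed preprocessing step
])

estimator_cv = GridSearchCV(
    estimator=LogisticRegression(penalty='l1', solver='liblinear'),
    param_grid={'C': np.logspace(-3, 2, 6)},  # assumed search space
    cv=5,
)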
def test_predict_proba():
    file_name = 'test.csv'
    data = load_dataset(file_name=file_name)
    test_json = data[0:1].to_json(orient='records')

    test_results = predict.make_prediction_proba(input_data=test_json)

    assert test_results is not None
    # binary classifier: one probability per class, each in [0, 1]
    assert len(test_results['predictions'][0]) == 2
    assert 0 <= test_results['predictions'][0][0] <= 1
def test_predict():
    file_name = 'test.csv'
    data = load_dataset(file_name=file_name)
    test_json = data[0:1].to_json(orient='records')

    test_results = predict.make_prediction(input_data=test_json)

    assert test_results is not None
    # binary class label: an int in {0, 1}
    assert isinstance(test_results['predictions'][0], int)
    assert 0 <= test_results['predictions'][0] <= 1
def test_prediction_endpoint(flask_test_client):
    data = load_dataset(file_name=config.TEST_FILENAME)
    post_json = data[0:1].to_json(orient='records')
    _logger.info(f'POST JSON in test prediction endpoint: {post_json}')
    _logger.info(
        f'type of POST JSON in test prediction endpoint: {type(post_json)}')

    response = flask_test_client.post('/v1/predict/lasso', json=post_json)
    assert response.status_code == 200

    response_json = json.loads(response.data)
    response_pred = response_json.get('predictions')
    response_version = response_json.get('version')
    assert 0 <= response_pred[0] <= 1
    assert response_version == model_version
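# For context, a minimal sketch of the endpoint this test exercises. The
# blueprint name and view body are assumptions; only the route
# '/v1/predict/lasso' and the {'predictions', 'version'} response keys come
# from the test above.
from flask import Blueprint, jsonify, request

from lasso.predict import make_prediction

prediction_app = Blueprint('prediction_app', __name__)


@prediction_app.route('/v1/predict/lasso', methods=['POST'])
def predict_lasso():
    input_json = request.get_json()  # the test posts a JSON string
    result = make_prediction(input_data=input_json)
    return jsonify({'predictions': result['predictions'],
                    'version': result['version']})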
def make_prediction(*, input_data: str) -> dict:
    data = pd.read_json(input_data)
    data = validate_data(data)
    pred = pipeline.predict(data[config.FEATURES])

    _logger.info(f"Making prediction with model version: {_version} "
                 f"Inputs: {data} "
                 f"Predictions: {pred}")

    response = {'predictions': pred.tolist(), 'version': _version}
    return response


def make_prediction_proba(*, input_data: str) -> dict:
    data = pd.read_json(input_data)
    pred = pipeline.predict_proba(data[config.FEATURES])
    response = {'predictions': pred.tolist(), 'version': _version}
    return response


if __name__ == '__main__':
    # sanity check: score the loaded model on the training data
    data = load_dataset(file_name=config.TRAIN_DATA_FN)
    shuffled = data[config.TRAIN_FEATURES].sample(frac=1, random_state=42)
    X_train = shuffled.drop(columns=config.TARGET)
    y_train = shuffled[config.TARGET[0]]
    print(pipeline.score(X_train, y_train))
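# Example round trip for make_prediction, with hypothetical feature names
# (placeholders, not the real config.FEATURES) and an illustrative version:
#
# >>> make_prediction(input_data='[{"feat_a": 0.5, "feat_b": 1.2}]')
# {'predictions': [0], 'version': '0.1.0'}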
import pandas as pd

from lasso.predict import make_prediction_proba
from lasso.preprocess.data_management import load_dataset
from myapi import config as api_config
from myapi.logger_config import get_logger

test_dataset = load_dataset(file_name=api_config.TEST_FILENAME)
_logger = get_logger(logger_name=__name__)


def capture_prev_ver_predictions():
    df = test_dataset[111:444].copy()
    test_json = df.to_json(orient='records')

    test_preds = make_prediction_proba(input_data=test_json)
    test_pred = pd.DataFrame(test_preds['predictions'])
    test_pred_version = test_preds['version']

    test_pred.to_csv(api_config.PACKAGE_ROOT / api_config.PREV_VER_PREDS_FILENAME,
                     index=False)
    _logger.debug(
        f'saved previous version {test_pred_version} predictions: '
        f'{api_config.PACKAGE_ROOT / api_config.PREV_VER_PREDS_FILENAME}')
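# A suggested entry point (an assumption; the original shows only the
# function definition): run this module before releasing a new model version
# so the differential test below has a baseline file to compare against.
if __name__ == '__main__':
    capture_prev_ver_predictions()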
import json

import numpy as np
import pandas as pd
import pytest

from myapi.config import PACKAGE_ROOT, PREV_VER_PREDS_FILENAME, ACCETPTABLE_DIFF
from myapi.logger_config import get_logger
from lasso import config
from lasso.preprocess.data_management import load_dataset

test_dataset = load_dataset(file_name=config.TEST_DATA_FN)
_logger = get_logger(logger_name=__name__)


@pytest.mark.differential
def test_differential(flask_test_client):
    prev_ver_test_pred = pd.read_csv(PACKAGE_ROOT / PREV_VER_PREDS_FILENAME)
    _logger.info('load previous version test predictions')

    df = test_dataset[111:444].copy()
    test_json = df.to_json(orient='records')
    _logger.info(f'Test differential Inputs: {test_json}')

    response = flask_test_client.post('/v1/predict_proba/lasso', json=test_json)
    assert response.status_code == 200

    # the original snippet ends at response.data; an assumed completion
    # follows, comparing current predictions with the saved baseline
    # within ACCETPTABLE_DIFF
    pred_json = json.loads(response.data)
    current_test_pred = pd.DataFrame(pred_json['predictions'])
    assert len(current_test_pred) == len(prev_ver_test_pred)
    assert np.allclose(current_test_pred.values,
                       prev_ver_test_pred.values,
                       atol=ACCETPTABLE_DIFF)
def _filter_error_inputs(errors, input_dic):
    """Drop records whose index appears in the validation errors.

    Reconstructed helper: the original definition is cut off above its
    trailing `return validated_data`; marshmallow keys errors by row index
    when loading with many=True, so filtering by index is assumed here.
    """
    validated_data = [record for idx, record in enumerate(input_dic)
                      if idx not in errors]
    return validated_data


def validate_inputs(*, input_data: str):
    data_schema = DataSchema(many=True)
    input_dic = json.loads(input_data)
    errors = None
    try:
        results = data_schema.load(input_dic)
    except ValidationError as exc:
        errors = exc.messages

    # filter input
    if errors:
        validated_data = _filter_error_inputs(errors, input_dic)
        validated_data = json.dumps(validated_data)
        return validated_data, errors

    return input_data, errors


if __name__ == '__main__':
    data = load_dataset(file_name=config.TEST_FILENAME)
    post_json = data[111:444].to_json(orient='records')
    val_data, err = validate_inputs(input_data=post_json)
    val_df = pd.read_json(val_data)
    print(val_df)
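# A minimal sketch of the DataSchema referenced in validate_inputs, assuming
# a marshmallow Schema with one field per input column; the field names and
# types below are placeholders, not the real schema.
from marshmallow import Schema, fields


class DataSchema(Schema):
    feat_a = fields.Float(required=True)    # hypothetical feature column
    feat_b = fields.Float(allow_none=True)  # hypothetical nullable column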