예제 #1
0
def run_training() -> None:
    """Fit the preprocessing pipeline and the CV estimator, then save the result.

    The full training file is used for fitting; cross-validation is used so
    that no data has to be set aside for validation.
    """
    raw = load_dataset(file_name=config.TRAIN_DATA_FN)

    # Shuffle once with a fixed seed. Features and target are both taken from
    # this same shuffled frame, so their rows stay aligned.
    shuffled = raw[config.TRAIN_FEATURES].sample(frac=1, random_state=42)
    X_train = shuffled.drop(columns=config.TARGET)
    y_train = shuffled[config.TARGET[0]]

    # Fit the preprocessing steps, then transform the training features.
    preprocessor = pipeline.prep_pipeline
    preprocessor.fit(X_train, y_train)
    prep_train = preprocessor.transform(X_train)

    # Seed NumPy's global RNG before fitting the estimator search.
    np.random.seed(42)
    search = pipeline.estimator_cv
    search.fit(prep_train, y_train)
    best = search.best_estimator_

    # Chain the fitted preprocessing with the selected estimator so the saved
    # object accepts untransformed features.
    full_pipeline = Pipeline([
        ('prep_pipeline', preprocessor),
        ('estimator', best),
    ])

    # Report results.
    print(f'best estimator: {best}')
    print(f'best estimator cv score: {search.best_score_}')
    print(f'best estimator score: {best.score(prep_train, y_train)}')
    print(f'full pipe score: {full_pipeline.score(X_train, y_train)}')

    _logger.info(f"save model version: {_version}")
    save_pipeline(pipeline=full_pipeline)
예제 #2
0
def test_predict_proba():
    """Check that the first probability prediction has two values, the first in [0, 1]."""
    # Build a one-record JSON payload from the first row of the test file.
    first_row = load_dataset(file_name='test.csv')[0:1]
    payload = first_row.to_json(orient='records')

    test_results = predict.make_prediction_proba(input_data=payload)

    assert test_results is not None
    first_pred = test_results['predictions'][0]
    assert len(first_pred) == 2
    assert 0 <= first_pred[0] <= 1
예제 #3
0
def test_predict():
    """Check that the first prediction is an int within [0, 1]."""
    # Build a one-record JSON payload from the first row of the test file.
    first_row = load_dataset(file_name='test.csv')[0:1]
    payload = first_row.to_json(orient='records')

    test_results = predict.make_prediction(input_data=payload)

    assert test_results is not None
    first_pred = test_results['predictions'][0]
    assert isinstance(first_pred, int)
    assert 0 <= first_pred <= 1
예제 #4
0
def test_prediction_endpoint(flask_test_client):
    """POST one test record to the lasso prediction endpoint and check the reply.

    The reply must have status 200, a first prediction within [0, 1], and a
    'version' equal to `model_version`.
    """
    # One-record JSON payload taken from the first row of the test file.
    first_row = load_dataset(file_name=config.TEST_FILENAME)[0:1]
    post_json = first_row.to_json(orient='records')
    _logger.info(f'POST JSON in test prediction endpoint: {post_json}')
    _logger.info(
        f'type of POST JSON in test prediction endpoint: {type(post_json)}')

    response = flask_test_client.post('/v1/predict/lasso', json=post_json)
    assert response.status_code == 200

    body = json.loads(response.data)
    predictions = body.get('predictions')
    version = body.get('version')
    assert 0 <= predictions[0] <= 1
    assert version == model_version
예제 #5
0
def make_prediction(*, input_data: str) -> dict:
    """Run the loaded pipeline's `predict` on JSON input.

    Args:
        input_data: JSON text describing the input rows. It is parsed with
            `pandas.read_json`; a non-string value (for example a file-like
            object) is handed to `read_json` unchanged.

    Returns:
        A dict with two keys: 'predictions' (the prediction array converted
        with `.tolist()`) and 'version' (the module-level `_version`).
    """
    # Local import keeps this block self-contained.
    from io import StringIO

    # Passing a literal JSON string to read_json is deprecated in recent
    # pandas releases; wrapping it in StringIO works on old and new versions.
    source = StringIO(input_data) if isinstance(input_data, str) else input_data
    data = pd.read_json(source)
    data = validate_data(data)

    pred = pipeline.predict(data[config.FEATURES])
    _logger.info(f"Making prediction with model version: {_version} "
                 f"Inputs: {data} "
                 f"Predictions: {pred}")

    return {'predictions': pred.tolist(), "version": _version}


def make_prediction_proba(*, input_data) -> dict:
    """Run the loaded pipeline's `predict_proba` on JSON input.

    Args:
        input_data: JSON text describing the input rows. It is parsed with
            `pandas.read_json`; a non-string value (for example a file-like
            object) is handed to `read_json` unchanged.

    Returns:
        A dict with two keys: 'predictions' (the `predict_proba` output
        converted with `.tolist()`) and 'version' (the module-level
        `_version`).
    """
    # Local import keeps this block self-contained.
    from io import StringIO

    # Passing a literal JSON string to read_json is deprecated in recent
    # pandas releases; wrapping it in StringIO works on old and new versions.
    source = StringIO(input_data) if isinstance(input_data, str) else input_data
    data = pd.read_json(source)

    pred = pipeline.predict_proba(data[config.FEATURES])
    return {'predictions': pred.tolist(), "version": _version}


if __name__ == '__main__':
    # Manual check: print the score of the module-level `pipeline` on the
    # shuffled training data.
    frame = load_dataset(file_name=config.TRAIN_DATA_FN)

    # Shuffle once with a fixed seed; features and target share the same rows.
    shuffled = frame[config.TRAIN_FEATURES].sample(frac=1, random_state=42)
    features = shuffled.drop(columns=config.TARGET)
    target = shuffled[config.TARGET[0]]

    print(pipeline.score(features, target))
예제 #6
0
import json

import pandas as pd
import numpy as np

from lasso.predict import make_prediction, make_prediction_proba
from lasso import config
from lasso.preprocess.data_management import load_dataset
from myapi import config as api_config
from myapi.logger_config import get_logger
from myapi.config import PREV_VER_PREDS_FILENAME

# Test dataset, loaded once at import time from the API config's TEST_FILENAME.
test_dataset = load_dataset(file_name=api_config.TEST_FILENAME)
# Module-level logger named after this module.
_logger = get_logger(logger_name=__name__)


def capture_prev_ver_predictions():
    """Save the current model's probability predictions to a CSV file.

    Rows 111-443 of the module-level `test_dataset` are sent through
    `make_prediction_proba`, and the returned predictions are written
    (without the index) to `PACKAGE_ROOT / PREV_VER_PREDS_FILENAME` from the
    API config.
    """
    subset = test_dataset[111:444].copy()
    payload = subset.to_json(orient='records')
    result = make_prediction_proba(input_data=payload)

    # Build the destination once so the file written and the path logged
    # cannot drift apart.
    preds_path = api_config.PACKAGE_ROOT / api_config.PREV_VER_PREDS_FILENAME
    pd.DataFrame(result['predictions']).to_csv(preds_path, index=False)

    # Adjacent f-strings instead of a backslash continuation, which embedded
    # the next line's indentation in the log message.
    _logger.debug(
        f"save previous version {result['version']} "
        f"predictions: {preds_path}"
    )
예제 #7
0
import json

import pandas as pd
import numpy as np

import pytest

from tests.conftest import flask_test_client
from myapi.config import PACKAGE_ROOT, PREV_VER_PREDS_FILENAME, ACCETPTABLE_DIFF
from myapi.logger_config import get_logger
from lasso import config
from lasso.predict import make_prediction_proba
from lasso.preprocess.data_management import load_dataset

# Test dataset, loaded once at import time from the lasso config's TEST_DATA_FN.
test_dataset = load_dataset(file_name=config.TEST_DATA_FN)
# Module-level logger named after this module.
_logger = get_logger(logger_name=__name__)


@pytest.mark.differential
def test_differential(flask_test_client):
    """Post a fixed slice of the test data to the lasso predict_proba endpoint.

    Loads the saved predictions CSV from `PACKAGE_ROOT / PREV_VER_PREDS_FILENAME`
    and posts rows 111-443 of `test_dataset` as JSON records.

    NOTE(review): no assertions are visible in this part of the function, and
    `prev_ver_test_pred` / `pred_json` are not used here. Presumably the
    comparison against the saved predictions follows; confirm.
    """
    # Predictions saved earlier, read back from disk.
    prev_ver_test_pred = pd.read_csv(PACKAGE_ROOT / PREV_VER_PREDS_FILENAME)
    _logger.info('load previous version test predictions')

    # Copy of the slice [111:444] of the module-level test dataset.
    df = test_dataset[111:444].copy()
    test_json = df.to_json(orient='records')
    _logger.info(f'Test differential Inputs: {test_json}')
    
    # The already-serialized JSON string is passed as the `json` argument.
    response = flask_test_client.post('/v1/predict_proba/lasso',
    json = test_json)
    # Raw response body, not yet decoded.
    pred_json = response.data
예제 #8
0
    return validated_data


def validate_inputs(*, input_data: json):
    """Validate a JSON payload against `DataSchema`.

    Args:
        input_data: JSON text; it is decoded with `json.loads` and loaded
            through `DataSchema(many=True)`.

    Returns:
        A `(payload, errors)` tuple. When validation reports no errors,
        `payload` is the original `input_data` and `errors` is None (or the
        empty messages object). Otherwise `payload` is the JSON dump of
        whatever `_filter_error_inputs` returns, and `errors` holds the
        validation messages.
    """
    schema = DataSchema(many=True)
    records = json.loads(input_data)

    try:
        schema.load(records)
    except ValidationError as exc:
        errors = exc.messages
    else:
        errors = None

    # Nothing to filter: hand back the original text untouched.
    if not errors:
        return input_data, errors

    kept = _filter_error_inputs(errors, records)
    return json.dumps(kept), errors


if __name__ == '__main__':
    # Manual smoke run: validate a slice of the test file and print the
    # validated JSON.
    frame = load_dataset(file_name=config.TEST_FILENAME)
    payload = frame[111:444].to_json(orient='records')
    val_data, err = validate_inputs(input_data=payload)
    # Parse the validated payload back into a DataFrame.
    val_df = pd.read_json(val_data)
    print(val_data)