Пример #1
0
def test_make_single_prediction():
    """Check that predicting on one row gives an int64 first prediction of 0."""
    # Given: only the first row of the test dataset
    first_row = load_dataset(file_name='test.csv')[0:1]

    # When
    result = make_prediction(input_data=first_row)

    # Then
    assert result is not None
    first_prediction = result.get('predictions')[0]
    assert isinstance(first_prediction, int64)
    assert first_prediction == 0
Пример #2
0
def test_make_multiple_predictions():
    """Check the prediction count and sum when scoring the whole test file."""
    # Given: the complete test dataset, with its size recorded up front
    full_input = load_dataset(file_name='test.csv')
    input_row_count = len(full_input)

    # When
    result = make_prediction(input_data=full_input)

    # Then
    assert result is not None
    predictions = result.get('predictions')
    assert len(predictions) == 1649
    assert predictions.sum() == 1213

    # Some rows are expected to be filtered out, so the number of
    # predictions must differ from the number of input rows.
    assert len(predictions) != input_row_count
Пример #3
0
def capture_predictions() -> None:
    """Predict on a slice of the test data and dump the result to a CSV file."""
    output_name = 'test_data_predictions.csv'

    # The [99:600] slice is used because it has no input validation issues.
    clean_rows = load_dataset(file_name='test.csv')[99:600]

    results = make_prediction(input_data=clean_rows)

    # Wrap the predictions in a DataFrame so they can be written as CSV.
    results_frame = pd.DataFrame(results)

    # Hack: the file is written under config.PACKAGE_ROOT so that it lands in
    # the regression model package of the repo, not the installed package.
    results_frame.to_csv(f'{config.PACKAGE_ROOT}/{output_name}')
Пример #4
0
def run_training() -> None:
    """Train the model pipeline and persist it.

    Steps, in order:

    1. Load ``config.DATA_FILE`` and apply the per-feature value replacements
       declared in ``config.DICTIONARY_REPLACER``.
    2. Drop every row that holds a zero or negative value in any of the
       ``config.NUMERICALS_LOG_VARS`` columns.
    3. Split features/target 90/10 with a fixed seed and save both feature
       sets via ``save_dataset``.
    4. Fit ``pipeline.mora_pipe``, sanity-check the features produced by
       ``pipeline.mora_transform`` against pinned reference totals, and save
       the fitted pipeline via ``save_pipeline``.

    Raises:
        AssertionError: if the transformed-feature totals do not match the
            pinned reference values.
    """
    # Totals are rounded to two decimals, so any genuine mismatch is at least
    # 0.01; this tolerance only absorbs floating-point representation noise.
    float_tolerance = 1e-6

    # read training data
    data = load_dataset(file_name=config.DATA_FILE)

    # replace variables with dictionary
    for feature, rule in config.DICTIONARY_REPLACER.items():
        data[feature] = data[feature].replace(
            rule["target"], rule["replace_value"]
        )

    # remove zeroes and negatives
    ver = ~(data[config.NUMERICALS_LOG_VARS] <= 0).any(axis=1)
    data = data.loc[ver]

    # divide train and test
    X_train, X_test, y_train, y_test = train_test_split(
        data[config.FEATURES], data[config.TARGET], test_size=0.1, random_state=0
    )  # we are setting the seed here
    print(X_train.shape, X_test.shape)

    # save training and testing
    save_dataset(file_name=config.TRAINING_DATA_FILE, df=X_train)
    save_dataset(file_name=config.TESTING_DATA_FILE, df=X_test)

    # NOTE: the target is used as-is; no log transform is applied to y_train.

    pipeline.mora_pipe.fit(X_train[config.FEATURES], y_train)
    Xt = pipeline.mora_transform.fit_transform(X_train[config.FEATURES], y_train)
    print('XT',Xt.shape)

    # Aggregate the transformed features into three totals that act as a
    # regression check on the feature-engineering steps.
    cat_tot = Xt[config.CATEGORICAL_VARS].sum().sum()
    num_tot = np.round(Xt[config.NUMERICALS_LOG_VARS].sum().sum(),2)
    dat_tot = Xt['cl_unq_act_act_fnacimiento_date'].sum()
    grand_tot = cat_tot+num_tot+dat_tot
    print(cat_tot, num_tot, dat_tot, grand_tot)

    assert cat_tot == 736197, f"unexpected categorical total: {cat_tot}"
    # Float totals are compared with a tolerance instead of ``==`` so that
    # representation error alone cannot fail the check.
    assert np.isclose(num_tot, 658249.58, rtol=0.0, atol=float_tolerance), (
        f"unexpected numerical total: {num_tot}"
    )
    assert dat_tot == 523532, f"unexpected date total: {dat_tot}"
    assert np.isclose(grand_tot, 1917978.58, rtol=0.0, atol=float_tolerance), (
        f"unexpected grand total: {grand_tot}"
    )

    pred = pipeline.mora_pipe.predict(X_train[config.FEATURES])
    print("OUTPUT PRED SUM",pred.sum())

    _logger.info(f"saving model version: {_version}")
    save_pipeline(pipeline_to_persist=pipeline.mora_pipe)
Пример #5
0
def test_prediction_endpoint_validation_200(flask_test_client):
    """Check that predictions plus errors account for every posted row."""
    # Given
    # The test data is loaded from the model package. Keeping it in one
    # place makes it harder for test data versions to get confused, since
    # the data is not spread across packages.
    test_data = load_dataset(file_name=config.TESTING_DATA_FILE)
    payload = json.loads(test_data.to_json(orient='records'))

    # When
    response = flask_test_client.post('/v1/predict/regression', json=payload)

    # Then
    assert response.status_code == 200
    body = json.loads(response.data)

    # Check correct number of errors removed: every input row must show up
    # either as a prediction or as an error.
    prediction_count = len(body.get('predictions'))
    error_count = len(body.get('errors'))
    assert prediction_count + error_count == len(test_data)
Пример #6
0
def test_prediction_endpoint_returns_prediction(flask_test_client):
    """Check that posting one row returns prediction 0 and the model version."""
    # Given
    # The test data is loaded from the model package. Keeping it in one
    # place makes it harder for test data versions to get confused, since
    # the data is not spread across packages.
    test_data = load_dataset(file_name=model_config.TESTING_DATA_FILE)
    payload = json.loads(test_data[0:1].to_json(orient='records'))
    print(payload)

    # When
    response = flask_test_client.post('/v1/predict/regression', json=payload)

    # Then
    assert response.status_code == 200
    body = json.loads(response.data)
    predictions = body['predictions']
    reported_version = body['version']
    assert predictions[0] == 0
    assert reported_version == _version