def test_prediction_quality_against_benchmark():
    """Prediction for the first training row should fall near its true price."""
    # Given
    train_data = load_dataset('train.csv')
    input_df = train_data.drop(config.TARGET, axis=1)
    output_df = train_data[config.TARGET]

    # Tolerance band around the first true target, rounded to the nearest
    # 10,000: 210000 - 50000 = 160000 and 210000 + 50000 = 260000.
    benchmark_flexibility = 50000
    rounded_truth = round(output_df.iloc[0], ndigits=-4)
    benchmark_lower_boundary = rounded_truth - benchmark_flexibility
    benchmark_upper_boundary = rounded_truth + benchmark_flexibility

    multiple_test_json = input_df.to_json(orient='records')

    # When
    subject = make_prediction(input_data=multiple_test_json)

    # Then
    assert subject is not None
    assert isinstance(subject.get('predictions')[0], float)
    value = math.ceil(subject.get('predictions')[0])
    assert benchmark_lower_boundary < value < benchmark_upper_boundary
def test_pipeline_drops_unnecessary_features():
    """Fitting the pipeline should keep only the configured feature columns."""
    # Given
    test_data = load_dataset('train.csv')
    X_train, X_test, y_train, y_test = train_test_split(
        test_data,
        test_data[config.TARGET],
        test_size=0.1,
        random_state=0,
    )

    # Sanity check: the raw frame carries more columns than the model needs.
    assert len(config.FEATURES) != len(X_train.columns)

    # When
    # NOTE(review): relies on the private `_fit` of the sklearn Pipeline;
    # kept as-is to preserve existing behavior.
    X_transformed, _ = price_pipe._fit(X_train, y_train)

    # Then
    assert len(X_transformed[0]) == len(config.FEATURES)
def test_make_multiple_predictions():
    """Predicting over the whole test set yields a non-trivial result set."""
    # Given
    test_data = load_dataset('test.csv')
    original_data_length = len(test_data)
    multiple_test_json = test_data.to_json(orient='records')

    # When
    subject = make_prediction(input_data=multiple_test_json)

    # Then
    assert subject is not None
    # The prediction count is expected to DIFFER from the raw row count.
    # NOTE(review): presumably input validation inside make_prediction drops
    # invalid rows — confirm against its implementation.
    assert len(subject.get('predictions')) != original_data_length
def test_transformer_drops_unnecessary_features():
    """KeepColumnsTransformer should retain only the configured features."""
    # Given
    test_data = load_dataset('train.csv')
    X_train, X_test, y_train, y_test = train_test_split(
        test_data,
        test_data[config.TARGET],
        test_size=0.1,
        random_state=0,
    )
    transformer = pp.KeepColumnsTransformer(variables=config.FEATURES)

    # Sanity check: the raw frame has more columns than the feature list.
    assert len(config.FEATURES) != len(X_train.columns)

    # When
    X_transformed = transformer.transform(X_train)

    # Then
    assert len(X_transformed.columns) == len(config.FEATURES)
def test_pipeline_transform_min_max_features():
    """Every transformed feature value should be scaled into [0, 1]."""
    # Given
    test_data = load_dataset('train.csv')
    X_train, X_test, y_train, y_test = train_test_split(
        test_data,
        test_data[config.TARGET],
        test_size=0.1,
        random_state=0,
    )

    # When
    # NOTE(review): relies on the private `_fit` of the sklearn Pipeline;
    # kept as-is to preserve existing behavior.
    X_transformed, _ = price_pipe._fit(X_train, y_train)

    # Then
    assert all(0.0 <= value <= 1.0 for row in X_transformed for value in row)
def test_prediction_endpoint_returns_prediction(flask_test_client):
    """POSTing five rows to /v1/predict should yield five predictions."""
    # Given
    test_data = load_dataset(file_name=model_config.TESTING_DATA_FILE)
    post_json = test_data[0:5].to_json(orient='records')

    # When
    response = flask_test_client.post('/v1/predict', json=post_json)

    # Then
    assert response.status_code == 200
    payload = json.loads(response.data)
    assert len(payload['predictions']) == 5
def test_make_single_prediction():
    """A single test row should reproduce the pinned benchmark prediction.

    Fix: removed a leftover debug ``print`` — test output should come from
    assertion failures, not stdout. Also hoisted the repeated
    ``subject.get('predictions')[0]`` lookup into a local.
    """
    # Given
    test_data = load_dataset('test.csv')
    single_test_json = test_data[0:1].to_json(orient='records')

    # When
    subject = make_prediction(input_data=single_test_json)

    # Then
    assert subject is not None
    prediction = subject.get('predictions')[0]
    assert isinstance(prediction, float)
    # Regression pin for the persisted model; update when the model is
    # retrained.
    assert math.ceil(prediction) == 112964
def run_training():
    """Train the price pipeline on the configured data and persist it.

    Fixes: the unused test split is discarded via throwaway names, and the
    redundant ``X_train[config.FEATURES]`` re-selection is dropped —
    ``X_train`` already contains exactly those columns because the split is
    performed on ``data[config.FEATURES]``.
    """
    print("training model")
    data = load_dataset(file_name=config.TRAINING_DATA_FILE)

    # Hold out 10% of rows; only the training split is used here.
    X_train, _X_test, y_train, _y_test = train_test_split(
        data[config.FEATURES],
        data[config.TARGET],
        test_size=0.1,
        random_state=0,
    )

    # The pipeline is fitted against the log of the target.
    # NOTE(review): presumably predictions are exponentiated downstream —
    # confirm in the prediction code.
    y_train = np.log(y_train)

    pipeline.price_pipe.fit(X_train, y_train)
    save_pipeline(pipeline_to_persist=pipeline.price_pipe)
    print("training finished")