def test_prediction_endpoint_returns_prediction(flask_test_client):
    """Check the classification endpoint on a single test row.

    Posts the first row of the configured testing data file (serialised as
    a JSON string of records) and expects HTTP 200, a prediction whose
    ceiling is 0, and a version equal to ``_version``.
    """
    # Given
    # The sample comes from the model package's own test dataset, so the
    # test data is kept in one place and its versions cannot drift apart
    # across packages.
    dataset = load_dataset(file_name=model_config.TESTING_DATA_FILE)
    first_row = dataset[0:1]
    post_json = first_row.to_json(orient='records')
    print("########################################")
    print(post_json)

    # When
    response = flask_test_client.post(
        '/v1/predict/classification',
        json=post_json,
    )

    # Then
    assert response.status_code == 200
    payload = json.loads(response.data)
    predicted, reported_version = payload['predictions'], payload['version']
    assert math.ceil(predicted) == 0
    assert reported_version == _version
def test_model_for_differential(*, save_file='test_data_predictions.csv'):
    """Compare current model predictions with previously captured ones.

    Reads the predictions stored in ``save_file`` under
    ``api_config.PACKAGE_ROOT``, runs ``make_prediction`` on the first 200
    rows of the testing data (``id`` dropped, discrete feature sets cast to
    ``str``) and requires each pair of values to agree within a relative
    tolerance.

    Args:
        save_file: Name of the CSV file holding the earlier predictions.
    """
    # The previous rel_tol=1 made math.isclose() accept any two
    # non-negative numbers (|a - b| <= max(|a|, |b|) always holds), so the
    # comparison could never fail. 5% keeps real drift detectable.
    rel_tol = 0.05

    # Given
    previous_model_df = pd.read_csv(f'{api_config.PACKAGE_ROOT}/{save_file}')
    previous_model_predictions = previous_model_df.predictions.values
    print('previous predictions:', previous_model_predictions)

    test_data = load_dataset(file_name=config.TESTING_DATA_FILE)
    test_data.drop('id', axis=1, inplace=True)
    discrete_columns = (
        config.DISCRETE_SET1_FEATURES
        + config.DISCRETE_SET2_FEATURES
        + config.DISCRETE_SET3_FEATURES
    )
    test_data[discrete_columns] = test_data[discrete_columns].astype(str)
    multiple_test_input = test_data[0:200]

    # When
    current_result = make_prediction(input_data=multiple_test_input)
    current_model_predictions = current_result.get('predictions')
    print('current predictions:', current_model_predictions)

    # Then
    assert len(previous_model_predictions) == len(current_model_predictions)
    for previous_value, current_value in zip(previous_model_predictions,
                                             current_model_predictions):
        # .item() converts the numpy scalars to plain Python numbers.
        assert math.isclose(previous_value.item(), current_value.item(),
                            rel_tol=rel_tol)
def run_training():
    """Train the model."""
    # read training data
    raw = load_dataset(config.TRAINING_DATA_FILE)
    print("------------")

    # keep only the customer messages and the two columns used below
    from_customer = raw["message_source"] == "customer"
    data = raw[from_customer][["message", "case_type"]]

    X = data.drop(config.TARGET, axis=1)
    # binary target: 0 for "cancel_order", 1 for every other label
    y = data[config.TARGET].apply(
        lambda label: 0 if label == "cancel_order" else 1)

    # divide train and test
    splitter = StratifiedShuffleSplit(
        n_splits=5, test_size=0.2, random_state=config.SEED)
    # All five splits are generated, but only the last one is used.
    *_, (train_index, test_index) = splitter.split(X, y)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    X_train = X_train[config.FEATURES]
    X_test = X_test[config.FEATURES]
    print(X_train.iloc[0])

    # fit on the training part and persist the fitted pipeline
    pipeline.full_pipe.fit(X_train, y_train)
    save_pipeline(pipeline_to_persist=pipeline.full_pipe)
def test_make_single_prediction():
    """Check that one row of ``test.csv`` yields a single 0 prediction.

    The first prediction must be a ``np.int64`` whose ceiling is 0.
    """
    # Given
    test_data = load_dataset(file_name='test.csv')
    single_test_input = test_data[0:1]

    # When
    subject = make_prediction(input_data=single_test_input)

    # Then
    # Guard against None before dereferencing the result; previously the
    # check ran only after subject.get() had already been called.
    assert subject is not None
    first_prediction = subject.get('predictions')[0]
    assert isinstance(first_prediction, np.int64)
    assert math.ceil(first_prediction) == 0
def capture_predictions(*, save_file: str = 'test_data_predictions.csv') -> None:
    """Write predictions for the first 200 rows of ``test.csv`` to a CSV.

    The ``id`` column is dropped and the discrete feature sets are cast to
    ``str`` before predicting. The file is written under
    ``api_config.PACKAGE_ROOT``.

    Args:
        save_file: Name of the CSV file to write.
    """
    frame = load_dataset(file_name='test.csv')
    frame.drop('id', axis=1, inplace=True)

    discrete_columns = (
        config.DISCRETE_SET1_FEATURES
        + config.DISCRETE_SET2_FEATURES
        + config.DISCRETE_SET3_FEATURES
    )
    frame[discrete_columns] = frame[discrete_columns].astype(str)

    sample = frame[0:200]
    predictions = make_prediction(input_data=sample)

    destination = f'{api_config.PACKAGE_ROOT}/{save_file}'
    pd.DataFrame(predictions).to_csv(destination)
def test_single_prediction():
    """Check that the first test row yields a single 0 prediction.

    The discrete feature sets are cast to ``str`` and only
    ``config.FEATURES`` columns are passed to ``make_prediction``. The
    first prediction must be a ``np.int64`` whose ceiling is 0.
    """
    # Given
    test_data = load_dataset(file_name=config.TESTING_DATA_FILE)
    discrete_columns = (
        config.DISCRETE_SET1_FEATURES
        + config.DISCRETE_SET2_FEATURES
        + config.DISCRETE_SET3_FEATURES
    )
    test_data[discrete_columns] = test_data[discrete_columns].astype(str)
    single_test_input = test_data[0:1]

    # When
    subject = make_prediction(input_data=single_test_input[config.FEATURES])

    # Then
    assert subject is not None
    first_prediction = subject.get('predictions')[0]
    assert isinstance(first_prediction, np.int64)
    # The closing parenthesis used to wrap the comparison, i.e.
    # math.ceil(pred == 0), which takes the ceiling of a bool.
    assert math.ceil(first_prediction) == 0
def test_make_single_prediction():
    """Check a single prediction made from JSON-serialised input.

    The first row of ``test.csv`` is converted to a JSON string of records
    and passed to ``make_prediction``; the ceiling of the first prediction
    must be 0.
    """
    # Given
    dataset = load_dataset(file_name='test.csv')
    single_test_json = dataset[0:1].to_json(orient='records')

    # When
    subject = make_prediction(input_data=single_test_json)

    # Then
    # Print the type of the first prediction (no type assertion is made).
    first_prediction_type = type(subject.get('predictions')[0])
    print(first_prediction_type)
    assert subject is not None
    assert math.ceil(subject.get('predictions')[0]) == 0
def test_make_multiple_predictions():
    """Check the number of predictions returned for the whole test set.

    Exactly 417 predictions are expected, and that count must differ from
    the number of rows loaded from ``test.csv``.
    """
    # Given
    dataset = load_dataset(file_name='test.csv')
    row_count = len(dataset)

    # When
    subject = make_prediction(input_data=dataset)

    # Then
    assert subject is not None
    assert len(subject.get('predictions')) == 417

    # We expect some rows to be filtered out
    assert len(subject.get('predictions')) != row_count
def run_training() -> None:
    """Train the model.

    Loads the training data, holds out 20% of it with a seeded split, fits
    ``pipeline.titanic_pipe`` on the remaining rows, and persists the
    fitted pipeline through ``save_pipeline``.
    """
    # read training data
    dataset = load_dataset(file_name=config.TRAINING_DATA_FILE)

    # separate predictors from the target column
    predictors = dataset.drop(config.TARGET, axis=1)
    target = dataset[config.TARGET]

    # divide train and test; the seed makes the split reproducible
    X_train, X_test, y_train, y_test = train_test_split(
        predictors,
        target,
        test_size=0.2,
        random_state=config.RANDOM_SEED,
    )

    # fit pipeline
    pipeline.titanic_pipe.fit(X_train, y_train)

    # save pipeline
    _logger.info(f"saving model version: {_version}")
    save_pipeline(pipeline_to_persist=pipeline.titanic_pipe)
def test_make_multiple_predictions():
    """Check that predicting on the whole JSON test set returns a result.

    NOTE(review): the prediction-count assertions (an exact count of 1451
    and a "some rows filtered out" check) are disabled in the original, so
    this test only verifies that a result object comes back.
    """
    # Given
    dataset = load_dataset(file_name='test.csv')
    multiple_test_json = dataset.to_json(orient='records')

    # When
    subject = make_prediction(input_data=multiple_test_json)

    # Then
    assert subject is not None
def test_prediction_endpoint_returns_prediction(flask_test_client):
    """Check the classification endpoint on a single test row.

    Posts the first row of the configured testing data file as a list of
    records and expects HTTP 200, a first prediction whose ceiling is 0,
    and a version equal to ``_version``.
    """
    # Given
    # Load the test data from the classification_model package
    dataset = load_dataset(file_name=model_config.TESTING_DATA_FILE)
    post_json = dataset[0:1].to_json(orient='records')
    request_body = json.loads(post_json)

    # When
    response = flask_test_client.post(
        '/v1/predict/classification',
        json=request_body,
    )

    # Then
    assert response.status_code == 200
    payload = json.loads(response.data)
    predicted, reported_version = payload['predictions'], payload['version']
    assert math.ceil(predicted[0]) == 0
    assert reported_version == _version
def capture_predictions() -> None:
    """Save the test data predictions to a CSV."""
    output_name = 'test_data_predictions.csv'

    dataset = load_dataset(file_name='test.csv')

    # we take a slice with no input validation issues
    sample = dataset[99:600]
    predictions = make_prediction(input_data=sample)

    # hack here to save the file to the classification model
    # package of the repo, not the installed package
    destination = f'{config.PACKAGE_ROOT}/{output_name}'

    # save predictions for the test dataset
    pd.DataFrame(predictions).to_csv(destination)
def test_make_single_prediction():
    """Check a single prediction made from JSON-serialised input.

    The first prediction must be a ``np.int64`` whose ceiling is 0.
    """
    # Given
    dataset = load_dataset(file_name='test.csv')
    # Serialise a single instance
    single_test_json = dataset[0:1].to_json(orient='records')

    # When
    # Call the classifier to make a prediction
    subject = make_prediction(input_data=single_test_json)

    # Then
    # The result must not be empty
    assert subject is not None
    # The prediction is expected to be an int64 class label
    assert isinstance(subject.get('predictions')[0], np.int64)
    # We know that the prediction for the first row is 0
    assert math.ceil(subject.get('predictions')[0]) == 0
def test_prediction_endpoint_validation_200(flask_test_client):
    """Check that every posted row is either predicted or reported as an error.

    Posts the whole testing dataset to the classification endpoint and
    expects HTTP 200 with ``len(predictions) + len(errors)`` equal to the
    number of rows sent.
    """
    # Given
    # Load the test data from the classification_model package.
    test_data = load_dataset(file_name=config.TESTING_DATA_FILE)
    post_json = test_data.to_json(orient='records')

    # When
    response = flask_test_client.post('/v1/predict/classification',
                                      json=json.loads(post_json))

    # Then
    assert response.status_code == 200
    response_json = json.loads(response.data)

    # Either field may be missing or null in the payload (presumably
    # 'errors' is null when every row validates - confirm against the API).
    # Treat that as empty so the count check fails on the numbers, not with
    # a TypeError from len(None).
    predictions = response_json.get('predictions') or []
    errors = response_json.get('errors') or {}

    # Check correct number of errors removed
    assert len(predictions) + len(errors) == len(test_data)
def run_training():
    """Train the model.

    Loads the training data, holds out 20% of it with a fixed seed, fits
    ``pipeline.titanic_pipe`` on the remaining rows, then persists the
    fitted pipeline twice: with ``joblib.dump`` to ``config.PIPELINE_NAME``
    and through ``save_pipeline``.
    """
    # read training data
    dataset = load_dataset(file_name=config.TRAINING_DATA_FILE)

    # separate predictors from the target column
    predictors = dataset.drop(config.TARGET, axis=1)
    target = dataset[config.TARGET]

    # divide train and test; random_state=0 fixes the seed
    X_train, X_test, y_train, y_test = train_test_split(
        predictors,
        target,
        test_size=0.2,
        random_state=0,
    )

    pipeline.titanic_pipe.fit(X_train, y_train)

    # persist the fitted pipeline
    joblib.dump(pipeline.titanic_pipe, config.PIPELINE_NAME)
    _logger.info(f"saving model version: {_version}")
    save_pipeline(pipeline_to_persist=pipeline.titanic_pipe)
def test_make_multiple_predictions():
    """Check predictions for the whole JSON test set against fixed labels.

    Exactly four predictions are expected, equal in order to
    ``[0, 1, 0, 1]``.
    """
    # Given
    dataset = load_dataset(file_name='test.csv')
    multiple_test_json = dataset.to_json(orient='records')
    expected_labels = [0, 1, 0, 1]
    print("#####################")
    print(multiple_test_json)

    # When
    subject = make_prediction(input_data=multiple_test_json)

    # Then
    assert subject is not None
    assert len(subject.get('predictions')) == 4
    # The length check above guarantees zip() pairs every prediction.
    pairs = zip(subject.get('predictions'), expected_labels)
    for position, (actual, expected) in enumerate(pairs):
        print(position)
        assert actual == expected
def test_prediction_endpoint(flask_test_client):
    """Check the classification endpoint on the first 500 test records.

    The discrete feature sets are cast to ``str`` and only
    ``config.FEATURES`` columns are posted. The response must be HTTP 200,
    contain one prediction per record sent, and report no errors.
    """
    # Given
    test_data = load_dataset(file_name=config.TESTING_DATA_FILE)
    discrete_columns = (
        config.DISCRETE_SET1_FEATURES
        + config.DISCRETE_SET2_FEATURES
        + config.DISCRETE_SET3_FEATURES
    )
    test_data[discrete_columns] = test_data[discrete_columns].astype(str)
    post_json = test_data[config.FEATURES].to_json(orient='records')
    input_data_sent_to_model = json.loads(post_json)[0:500]

    # When
    response = flask_test_client.post('/v1/predict/classification',
                                      json=input_data_sent_to_model)

    # Then
    assert response.status_code == 200
    # Parse the body once instead of once per assertion.
    response_json = json.loads(response.data)
    assert len(input_data_sent_to_model) == len(response_json['predictions'])
    # Identity comparison is the idiomatic None check (PEP 8).
    assert response_json['errors'] is None
def test_model_prediction_differential(
        *, save_file: str = 'test_data_predictions.csv'):
    """
    Compare the current model's predictions with the saved predictions
    of the previous model and require them to be sufficiently similar.
    """
    # Given
    # Previous predictions are read from the saved CSV file
    saved_path = f'{config.PACKAGE_ROOT}/{save_file}'
    previous_model_predictions = pd.read_csv(saved_path).predictions.values

    dataset = load_dataset(file_name=model_config.TESTING_DATA_FILE)
    sample = dataset[99:600]

    # When
    current_model_predictions = make_prediction(
        input_data=sample).get('predictions')

    # Then
    # Both models must produce the same number of predictions
    assert len(previous_model_predictions) == len(
        current_model_predictions)

    # Differential test, value by value
    paired = zip(previous_model_predictions, current_model_predictions)
    for old_value, new_value in paired:
        # .item() converts the numpy scalar to a plain Python number.
        #
        # rel_tol is the maximum allowed difference between the two
        # values, relative to the larger absolute value of the two;
        # e.g. rel_tol=0.05 allows a 5% difference.
        assert math.isclose(
            old_value.item(),
            new_value.item(),
            rel_tol=model_config.ACCEPTABLE_MODEL_DIFFERENCE,
        )
def test_multiple_predictions():
    """Check the number of predictions returned for the whole test set.

    The ``id`` column is dropped and the discrete feature sets are cast to
    ``str`` before predicting; exactly 127037 predictions are expected.
    """
    # Given
    dataset = load_dataset(file_name=config.TESTING_DATA_FILE)
    dataset.drop('id', axis=1, inplace=True)
    discrete_columns = (
        config.DISCRETE_SET1_FEATURES
        + config.DISCRETE_SET2_FEATURES
        + config.DISCRETE_SET3_FEATURES
    )
    dataset[discrete_columns] = dataset[discrete_columns].astype(str)

    # When
    subject = make_prediction(input_data=dataset)

    # Then
    assert subject is not None
    prediction_count = len(subject.get('predictions'))
    assert prediction_count == 127037
def run_training() -> None:
    """Fit ``pipeline.rf_pipe`` on the training data and persist it.

    Splits off 10% of the rows with a fixed seed, casts the discrete
    feature sets to ``str`` in both parts, fits on the training part, and
    saves the fitted pipeline through ``save_pipeline``.
    """
    data = load_dataset(file_name=config.TRAINING_DATA_FILE)

    X_train, X_test, y_train, y_test = train_test_split(
        data[config.FEATURES],
        data[config.TARGET],
        test_size=0.1,
        random_state=0,
    )

    # Apply the same string cast to both parts of the split.
    discrete_columns = (
        config.DISCRETE_SET1_FEATURES
        + config.DISCRETE_SET2_FEATURES
        + config.DISCRETE_SET3_FEATURES
    )
    for part in (X_train, X_test):
        part[discrete_columns] = part[discrete_columns].astype(str)

    pipeline.rf_pipe.fit(X_train[config.FEATURES], y_train)

    _logger.info(f"saving model version: {_version}")
    save_pipeline(pipeline_to_persist=pipeline.rf_pipe)
def test_prediction_endpoint(flask_test_client):
    """Check the classification endpoint on a single test record.

    The discrete feature sets are cast to ``str`` and only
    ``config.FEATURES`` columns are posted. The response must be HTTP 200,
    carry a first prediction whose ceiling is 0, and report a version equal
    to ``_version``.
    """
    # Given
    test_data = load_dataset(file_name=config.TESTING_DATA_FILE)
    discrete_columns = (
        config.DISCRETE_SET1_FEATURES
        + config.DISCRETE_SET2_FEATURES
        + config.DISCRETE_SET3_FEATURES
    )
    test_data[discrete_columns] = test_data[discrete_columns].astype(str)
    post_json = test_data[config.FEATURES].to_json(orient='records')
    input_data_sent_to_model = json.loads(post_json)[0:1]

    # When
    response = flask_test_client.post('/v1/predict/classification',
                                      json=input_data_sent_to_model)

    # Then
    # Check the status first: a failed request should be reported as a
    # wrong status code, not as an error while parsing or indexing the body.
    assert response.status_code == 200
    response_json = json.loads(response.data)
    prediction = response_json['predictions']
    response_version = response_json['version']
    assert math.ceil(prediction[0]) == 0
    assert response_version == _version