Example #1
def test_score_uncertainty():
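    # Hold out half of the training data as uncertainty_data, train a separate
    # uncertainty model on it, and check score_uncertainty against a loose floor.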
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    df_boston_train, uncertainty_data = train_test_split(df_boston_train,
                                                         test_size=0.5)

    ml_predictor = Predictor(type_of_estimator='regressor',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_boston_train,
                       perform_feature_selection=True,
                       train_uncertainty_model=True,
                       uncertainty_data=uncertainty_data)

    uncertainty_score = ml_predictor.score_uncertainty(df_boston_test,
                                                       df_boston_test.MEDV)

    print('uncertainty_score')
    print(uncertainty_score)

    assert uncertainty_score > -0.2
Example #2
def test_prediction_intervals_actually_work():
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    ml_predictor = Predictor(type_of_estimator='regressor',
                             column_descriptions=column_descriptions)

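    # Request the 5th and 95th percentile prediction intervals at training time.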
    ml_predictor.train(df_boston_train, predict_intervals=[0.05, 0.95])

    df_boston_test = df_boston_test.reset_index(drop=True)
    intervals = ml_predictor.predict_intervals(df_boston_test)
    actuals = df_boston_test.MEDV

    count_under = 0
    count_over = 0
    # print(intervals)
    for idx, row in intervals.iterrows():
        actual = actuals.iloc[idx]

        if actual < row['interval_0.05']:
            count_under += 1
        if actual > row['interval_0.95']:
            count_over += 1

    len_intervals = len(intervals)

    pct_under = count_under * 1.0 / len_intervals
    pct_over = count_over * 1.0 / len_intervals
    # There's a decent bit of noise since this is such a small dataset
    assert pct_under < 0.2
    assert pct_over < 0.1
Example #3
def test_perform_feature_selection_true_regression(model_name=None):
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    ml_predictor = Predictor(type_of_estimator='regressor',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_boston_train,
                       perform_feature_selection=True,
                       model_names=model_name)

    test_score = ml_predictor.score(df_boston_test, df_boston_test.MEDV)

    print('test_score')
    print(test_score)

    # Bumping this up since without these features our score drops
    lower_bound = -4.0
    if model_name == 'DeepLearningRegressor':
        lower_bound = -14.5
    if model_name == 'LGBMRegressor':
        lower_bound = -4.95

    assert lower_bound < test_score < -2.8
Example #4
def test_all_algos_regression():
    # With a random seed of 42, ExtraTreesRegressor gets the best CV score,
    # but that model doesn't generalize as well as GradientBoostingRegressor.
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)

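    # Train every listed regressor and let the library pick the best by CV score.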
    ml_predictor.train(
        df_boston_train,
        model_names=[
            'LinearRegression', 'RandomForestRegressor', 'Ridge', 'GradientBoostingRegressor',
            'AdaBoostRegressor', 'SGDRegressor', 'PassiveAggressiveRegressor', 'Lasso', 'LassoLars',
            'ElasticNet', 'OrthogonalMatchingPursuit', 'BayesianRidge', 'ARDRegression',
            'MiniBatchKMeans', 'DeepLearningRegressor', 'LGBMRegressor', 'XGBRegressor',
            'LinearSVR', 'CatBoostRegressor'
        ])

    test_score = ml_predictor.score(df_boston_test, df_boston_test.MEDV)

    print('test_score')
    print(test_score)

    assert -3.4 < test_score < -2.8
Example #5
def test_calibrate_uncertainty():
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    df_boston_train, uncertainty_data = train_test_split(df_boston_train,
                                                         test_size=0.5)
    uncertainty_data, uncertainty_calibration_data = train_test_split(
        uncertainty_data, test_size=0.5)

    ml_predictor = Predictor(type_of_estimator='regressor',
                             column_descriptions=column_descriptions)

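    # Bucket uncertainty estimates into 3 groups and track the 25th/50th/75th
    # percentile deltas (matching the columns asserted below).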
    uncertainty_calibration_settings = {
        'num_buckets': 3,
        'percentiles': [25, 50, 75]
    }
    ml_predictor.train(
        df_boston_train,
        perform_feature_selection=True,
        train_uncertainty_model=True,
        uncertainty_data=uncertainty_data,
        calibrate_uncertainty=True,
        uncertainty_calibration_settings=uncertainty_calibration_settings,
        uncertainty_calibration_data=uncertainty_calibration_data)

    uncertainty_score = ml_predictor.predict_uncertainty(df_boston_test)

    assert 'percentile_25_delta' in list(uncertainty_score.columns)
    assert 'percentile_50_delta' in list(uncertainty_score.columns)
    assert 'percentile_75_delta' in list(uncertainty_score.columns)
    assert 'bucket_num' in list(uncertainty_score.columns)
Example #6
def test_predict_uncertainty_true():
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    ml_predictor = Predictor(type_of_estimator='regressor',
                             column_descriptions=column_descriptions)

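    # predict_intervals=True uses the default bounds, 0.05 and 0.95
    # (per the assertions below).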
    ml_predictor.train(df_boston_train, predict_intervals=True)

    intervals = ml_predictor.predict_intervals(df_boston_test)

    assert isinstance(intervals, pd.DataFrame)
    assert intervals.shape[0] == df_boston_test.shape[0]

    result_list = ml_predictor.predict_intervals(df_boston_test,
                                                 return_type='list')

    assert isinstance(result_list, list)
    assert len(result_list) == df_boston_test.shape[0]
    for idx, row in enumerate(result_list):
        assert isinstance(row, list)
        assert len(row) == 3

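    # A single row passed as a dict should come back as a dict holding the
    # prediction and both interval bounds.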
    singles = df_boston_test.head().to_dict('records')

    for row in singles:
        result = ml_predictor.predict_intervals(row)
        assert isinstance(result, dict)
        assert 'prediction' in result
        assert 'interval_0.05' in result
        assert 'interval_0.95' in result

    for row in singles:
        result = ml_predictor.predict_intervals(row, return_type='list')
        assert isinstance(result, list)
        assert len(result) == 3

    df_intervals = ml_predictor.predict_intervals(df_boston_test,
                                                  return_type='df')
    assert isinstance(df_intervals, pd.DataFrame)

    try:
        ml_predictor.predict_intervals(df_boston_test,
                                       return_type='this will not work')
        assert False
    except ValueError:
        assert True
Example #7
def test_predict_intervals_should_fail_if_not_trained():
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    ml_predictor = Predictor(type_of_estimator='regressor',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_boston_train)

    try:
        ml_predictor.predict_intervals(df_boston_test)
        assert False
    except ValueError:
        assert True
Example #8
def test_perform_feature_scaling_true_regression(model_name=None):
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)

    ml_predictor.train(df_boston_train, perform_feature_scaling=True)

    test_score = ml_predictor.score(df_boston_test, df_boston_test.MEDV)

    print('test_score')
    print(test_score)

    assert -3.0 < test_score < -2.7
Example #9
def test_compare_all_models_regression():
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)

    ml_predictor.train(df_boston_train, compare_all_models=True)

    test_score = ml_predictor.score(df_boston_test, df_boston_test.MEDV)

    print('test_score')
    print(test_score)

    # ExtraTrees again throws this off
    assert -3.6 < test_score < -2.8
Example #10
def test_prediction_intervals_lets_the_user_specify_number_of_intervals():
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    ml_predictor = Predictor(type_of_estimator='regressor',
                             column_descriptions=column_descriptions)

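    # A single requested interval yields two values per row: the prediction and
    # that interval bound (see the assertion below).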
    ml_predictor.train(df_boston_train,
                       predict_intervals=True,
                       prediction_intervals=[.2])

    intervals = ml_predictor.predict_intervals(df_boston_test,
                                               return_type='list')

    assert len(intervals[0]) == 2
Example #11
def test_predict_uncertainty_returns_pandas_DataFrame_for_more_than_one_value():
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    df_boston_train, uncertainty_data = train_test_split(df_boston_train, test_size=0.5)

    ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)

    ml_predictor.train(
        df_boston_train,
        perform_feature_selection=True,
        train_uncertainty_model=True,
        uncertainty_data=uncertainty_data)

    uncertainties = ml_predictor.predict_uncertainty(df_boston_test)

    assert isinstance(uncertainties, pd.DataFrame)
Example #12
def test_predict_uncertainty_returns_dict_for_one_value():
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    df_boston_train, uncertainty_data = train_test_split(df_boston_train, test_size=0.5)

    ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)

    ml_predictor.train(
        df_boston_train,
        perform_feature_selection=True,
        train_uncertainty_model=True,
        uncertainty_data=uncertainty_data)

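    # Single rows passed as dicts should return a dict rather than a DataFrame.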
    test_list = df_boston_test.to_dict('records')

    for item in test_list:
        prediction = ml_predictor.predict_uncertainty(item)
        assert isinstance(prediction, dict)
Example #13
def optimize_final_model_regression(model_name=None):
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    # We just want to make sure these run, not that they're super accurate
    # (which takes more time, and is dataset-dependent)
    df_boston_train = df_boston_train.sample(frac=0.5)

    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    ml_predictor = Predictor(type_of_estimator='regressor',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_boston_train,
                       optimize_final_model=True,
                       model_names=model_name)

    test_score = ml_predictor.score(df_boston_test, df_boston_test.MEDV)

    print('test_score')
    print(test_score)

    # With this random seed, the score is about -3.21 on Python 3.5
    # There's a ton of noise here, due to small sample sizes
    lower_bound = -3.4
    if model_name == 'DeepLearningRegressor':
        lower_bound = -24
    if model_name == 'LGBMRegressor':
        lower_bound = -16
    if model_name == 'GradientBoostingRegressor':
        lower_bound = -5.1
    if model_name == 'CatBoostRegressor':
        lower_bound = -4.5
    if model_name == 'XGBRegressor':
        lower_bound = -4.8

    assert lower_bound < test_score < -2.75
Example #14
def test_input_df_unmodified():
    np.random.seed(42)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    ml_predictor = Predictor(type_of_estimator='regressor',
                             column_descriptions=column_descriptions)

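    # Record the shape before training so we can verify train() leaves the
    # input DataFrame unmodified.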
    df_shape = df_boston_train.shape
    ml_predictor.train(df_boston_train)

    training_shape = df_boston_train.shape
    assert training_shape[0] == df_shape[0]
    assert training_shape[1] == df_shape[1]

    test_score = ml_predictor.score(df_boston_test, df_boston_test.MEDV)

    print('test_score')
    print(test_score)

    assert -3.35 < test_score < -2.8
Example #15
def test_categorical_ensembling_regression(model_name=None):
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    ml_predictor = Predictor(type_of_estimator='regressor',
                             column_descriptions=column_descriptions)

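    # train_categorical_ensemble fits a separate model for each value of the
    # categorical CHAS column.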
    ml_predictor.train_categorical_ensemble(df_boston_train,
                                            perform_feature_selection=True,
                                            model_names=model_name,
                                            categorical_column='CHAS')

    test_score = ml_predictor.score(df_boston_test, df_boston_test.MEDV)

    print('test_score')
    print(test_score)

    lower_bound = -4.2

    assert lower_bound < test_score < -2.8
Example #16
def ensemble_regressor_basic_test():
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    ensemble_config = [{
        'model_name': 'LGBMRegressor'
    }, {
        'model_name': 'RandomForestRegressor'
    }]

    ml_predictor = Predictor(type_of_estimator='regressor',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_boston_train, ensemble_config=ensemble_config)

    test_score = ml_predictor.score(df_boston_test, df_boston_test.MEDV)

    print('test_score')
    print(test_score)

    assert -3.0 < test_score < -2.8
Example #17
def test_ignores_new_invalid_features():

    # One of the great unintentional features of brainless is that you can pass
    # in new features at prediction time that weren't present at training time,
    # and they're silently ignored!
    # One edge case is new features that are strange objects (lists, datetimes,
    # intervals, or anything else our default data processing pipeline can't
    # handle). Initially we just ignored them in dict_vectorizer, but we need
    # to ignore them earlier.
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    ml_predictor = Predictor(type_of_estimator='regressor',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_boston_train)

    file_name = ml_predictor.save(str(random.random()))

    saved_ml_pipeline = load_ml_model(file_name)

    os.remove(file_name)
    try:
        keras_file_name = file_name[:-5] + '_keras_deep_learning_model.h5'
        os.remove(keras_file_name)
    except OSError:
        pass

    df_boston_test_dictionaries = df_boston_test.to_dict('records')

    # 1. make sure the accuracy is the same

    predictions = []
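    # Sprinkle unseen and unprocessable features into roughly 10% of the rows;
    # the saved pipeline should silently ignore them.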
    for row in df_boston_test_dictionaries:
        if random.random() > 0.9:
            row['totally_new_feature'] = datetime.datetime.now()
            row['really_strange_feature'] = random.random
            row['we_should_really_ignore_this'] = Predictor
            row['pretty_vanilla_ignored_field'] = 8
            row['potentially_confusing_things_here'] = float('nan')
            row['potentially_confusing_things_again'] = float('inf')
            row['this_is_a_list'] = [1, 2, 3, 4, 5]
        predictions.append(saved_ml_pipeline.predict(row))

    print('predictions')
    print(predictions)
    print('predictions[0]')
    print(predictions[0])
    print('type(predictions)')
    print(type(predictions))
    first_score = utils.calculate_rmse(df_boston_test.MEDV, predictions)
    print('first_score')
    print(first_score)
    # Make sure our score is good, but not unreasonably good

    lower_bound = -3.0
    assert lower_bound < first_score < -2.7

    # 2. make sure the speed is reasonable (do it a few extra times)
    data_length = len(df_boston_test_dictionaries)
    start_time = datetime.datetime.now()
    for idx in range(1000):
        row_num = idx % data_length
        saved_ml_pipeline.predict(df_boston_test_dictionaries[row_num])
    end_time = datetime.datetime.now()
    duration = end_time - start_time

    print('duration.total_seconds()')
    print(duration.total_seconds())

    # It's very difficult to set a benchmark for speed that will work across all machines.
    # On my 2013 bottom of the line 15" MacBook Pro, this runs in about 0.8 seconds for 1000 predictions
    # That's about 1 millisecond per prediction
    # Assuming we might be running on a test box that's pretty weak, multiply by 3
    # Also make sure we're not running unreasonably quickly
    assert 0.1 < duration.total_seconds() < 15

    # 3. make sure we're not modifying the dictionaries (the score is the same after running a few experiments as it is the first time)

    predictions = []
    for row in df_boston_test_dictionaries:
        predictions.append(saved_ml_pipeline.predict(row))

    second_score = utils.calculate_rmse(df_boston_test.MEDV, predictions)
    print('second_score')
    print(second_score)
    # Make sure our score is good, but not unreasonably good

    assert lower_bound < second_score < -2.7
Example #18
def test_predict_intervals_takes_in_custom_intervals():
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    ml_predictor = Predictor(type_of_estimator='regressor',
                             column_descriptions=column_descriptions)

    # df_boston_train = pd.concat([df_boston_train, df_boston_train, df_boston_train])

    ml_predictor.train(df_boston_train, predict_intervals=[0.4, 0.6])

    custom_intervals = ml_predictor.predict_intervals(df_boston_test,
                                                      return_type='list')

    assert isinstance(custom_intervals, list)

    singles = df_boston_test.head().to_dict('records')

    acceptable_keys = {'prediction', 'interval_0.4', 'interval_0.6'}
    for row in singles:
        result = ml_predictor.predict_intervals(row)
        assert isinstance(result, dict)
        assert 'prediction' in result
        assert 'interval_0.4' in result
        assert 'interval_0.6' in result
        for key in result.keys():
            assert key in acceptable_keys

    for row in singles:
        result = ml_predictor.predict_intervals(row, return_type='list')
        assert isinstance(result, list)
        assert len(result) == 3

    df_intervals = ml_predictor.predict_intervals(df_boston_test,
                                                  return_type='df')
    assert df_intervals.shape[0] == df_boston_test.shape[0]
    assert isinstance(df_intervals, pd.DataFrame)

    # Now make sure that the interval values are actually different
    ml_predictor = Predictor(type_of_estimator='regressor',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_boston_train, predict_intervals=True)

    default_intervals = ml_predictor.predict_intervals(df_boston_test,
                                                       return_type='list')

    # This is a super flaky test: the dataset is so small that the estimated
    # distributions are noisy
    len_intervals = len(custom_intervals)
    num_failures = 0
    for idx, custom_row in enumerate(custom_intervals):
        default_row = default_intervals[idx]

        if custom_row[1] <= default_row[1]:
            num_failures += 1
            print('{} should be higher than {}'.format(custom_row[1],
                                                       default_row[1]))
        if custom_row[2] >= default_row[2]:
            print('{} should be lower than {}'.format(custom_row[2],
                                                      default_row[2]))
            num_failures += 1

    assert num_failures < 0.18 * len_intervals
Example #19
def test_feature_learning_categorical_ensembling_getting_single_predictions_regression(
        model_name=None):
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)

    # NOTE: this is bad practice to pass in our same training set as our fl_data set,
    # but we don't have enough data to do it any other way
    df_boston_train, fl_data = train_test_split(df_boston_train, test_size=0.2)
    ml_predictor.train_categorical_ensemble(
        df_boston_train,
        model_names=model_name,
        feature_learning=True,
        fl_data=fl_data,
        categorical_column='CHAS')

    # print('Score on training data')
    # ml_predictor.score(df_boston_train, df_boston_train.MEDV)

    file_name = ml_predictor.save(str(random.random()))

    from cash_ml.utils_models import load_ml_model

    saved_ml_pipeline = load_ml_model(file_name)

    # with open(file_name, 'rb') as read_file:
    #     saved_ml_pipeline = dill.load(read_file)
    os.remove(file_name)
    try:
        keras_file_name = file_name[:-5] + '_keras_deep_learning_model.h5'
        os.remove(keras_file_name)
    except OSError:
        pass

    df_boston_test_dictionaries = df_boston_test.to_dict('records')

    # 1. make sure the accuracy is the same

    predictions = []
    for row in df_boston_test_dictionaries:
        predictions.append(saved_ml_pipeline.predict(row))

    first_score = utils.calculate_rmse(df_boston_test.MEDV, predictions)
    print('first_score')
    print(first_score)
    # Make sure our score is good, but not unreasonably good

    lower_bound = -4.5

    assert lower_bound < first_score < -3.4

    # 2. make sure the speed is reasonable (do it a few extra times)
    data_length = len(df_boston_test_dictionaries)
    start_time = datetime.datetime.now()
    for idx in range(1000):
        row_num = idx % data_length
        saved_ml_pipeline.predict(df_boston_test_dictionaries[row_num])
    end_time = datetime.datetime.now()
    duration = end_time - start_time

    print('duration.total_seconds()')
    print(duration.total_seconds())

    # It's very difficult to set a benchmark for speed that will work across all machines.
    # On my 2013 bottom of the line 15" MacBook Pro,
    # this runs in about 0.8 seconds for 1000 predictions
    # That's about 1 millisecond per prediction
    # Assuming we might be running on a test box that's pretty weak, multiply by 3
    # Also make sure we're not running unreasonably quickly
    assert 0.2 < duration.total_seconds() < 15

    # 3. make sure we're not modifying the dictionaries
    # (the score is the same after running a few experiments as it is the first time)

    predictions = []
    for row in df_boston_test_dictionaries:
        predictions.append(saved_ml_pipeline.predict(row))

    second_score = utils.calculate_rmse(df_boston_test.MEDV, predictions)
    print('second_score')
    print(second_score)
    # Make sure our score is good, but not unreasonably good

    assert lower_bound < second_score < -3.4
Example #20
def getting_single_predictions_regressor_test():
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    ensemble_config = [{
        'model_name': 'LGBMRegressor'
    }, {
        'model_name': 'RandomForestRegressor'
    }]

    ml_predictor = Predictor(type_of_estimator='regressor',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_boston_train, ensemble_config=ensemble_config)

    test_score = ml_predictor.score(df_boston_test, df_boston_test.MEDV)

    print('test_score')
    print(test_score)

    assert -3.5 < test_score < -2.8

    file_name = ml_predictor.save(str(random.random()))

    saved_ml_pipeline = load_ml_model(file_name)

    os.remove(file_name)
    try:
        keras_file_name = file_name[:-5] + '_keras_deep_learning_model.h5'
        os.remove(keras_file_name)
    except OSError:
        pass

    df_boston_test_dictionaries = df_boston_test.to_dict('records')

    # 1. make sure the accuracy is the same

    predictions = []
    for row in df_boston_test_dictionaries:
        predictions.append(saved_ml_pipeline.predict(row))

    first_score = utils.calculate_rmse(df_boston_test.MEDV, predictions)
    print('first_score')
    print(first_score)
    # Make sure our score is good, but not unreasonably good

    lower_bound = -3.5

    assert lower_bound < first_score < -2.8

    # 2. make sure the speed is reasonable (do it a few extra times)
    data_length = len(df_boston_test_dictionaries)
    start_time = datetime.datetime.now()
    for idx in range(1000):
        row_num = idx % data_length
        saved_ml_pipeline.predict(df_boston_test_dictionaries[row_num])
    end_time = datetime.datetime.now()
    duration = end_time - start_time

    print('duration.total_seconds()')
    print(duration.total_seconds())

    # It's very difficult to set a benchmark for speed that will work across all machines.
    # On my 2013 bottom of the line 15" MacBook Pro, this runs in about 0.8 seconds for 1000 predictions
    # That's about 1 millisecond per prediction
    # Assuming we might be running on a test box that's pretty weak, multiply by 3
    # Also make sure we're not running unreasonably quickly
    assert 0.2 < duration.total_seconds() < 60

    # 3. make sure we're not modifying the dictionaries (the score is the same after running a few experiments as it is the first time)

    predictions = []
    for row in df_boston_test_dictionaries:
        predictions.append(saved_ml_pipeline.predict(row))

    second_score = utils.calculate_rmse(df_boston_test.MEDV, predictions)
    print('second_score')
    print(second_score)
    # Make sure our score is good, but not unreasonably good

    assert lower_bound < second_score < -2.8