Example #1
def test_saving_basic_ensemble_classifier():
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )
    ml_predictor = utils.make_titanic_ensemble(df_titanic_train)

    file_name = ml_predictor.save(str(random.random()))

    with open(file_name, 'rb') as read_file:
        saved_ml_pipeline = dill.load(read_file)
    os.remove(file_name)

    probas = saved_ml_pipeline.predict_proba(df_titanic_test)
    probas = [proba[1] for proba in probas]
    # print(probas)

    test_score = utils.calculate_brier_score_loss(df_titanic_test.survived,
                                                  probas)
    print('test_score')
    print(test_score)

    # Very rough ensembles don't do as well on this problem as a standard GradientBoostingClassifier does
    # Right now we're getting a score of -.22
    # Make sure our score is good, but not unreasonably good
    assert -0.225 < test_score < -0.17
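
# utils.calculate_brier_score_loss is a helper from this test suite; judging by the
# asserted bounds above it is assumed to return a negated Brier score, so higher is
# better and a perfect model scores 0. A minimal illustrative sketch of that
# convention using sklearn (not the helper's actual implementation):
from sklearn.metrics import brier_score_loss

def negated_brier_score(y_true, positive_class_probas):
    # brier_score_loss expects the true labels and the predicted probability of the
    # positive class; negating flips "lower is better" into "higher is better".
    return -1 * brier_score_loss(y_true, positive_class_probas)
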
def test_optimize_final_model_classification():
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    # Take a third of our test data (a tenth of our overall data) for calibration
    df_titanic_test, df_titanic_calibration = train_test_split(df_titanic_test,
                                                               test_size=0.33,
                                                               random_state=42)

    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_titanic_train,
                       calibrate_final_model=True,
                       X_test=df_titanic_calibration,
                       y_test=df_titanic_calibration.survived)

    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    assert -0.215 < test_score < -0.17
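
# calibrate_final_model=True above presumably fits a probability calibrator on the
# held-out calibration split. A rough sklearn-only analogue of that idea, purely for
# illustration (this is not how Predictor.train implements it, and the estimator
# choice here is arbitrary):
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import GradientBoostingClassifier

def calibrate_on_holdout(X_train, y_train, X_calibration, y_calibration):
    base_model = GradientBoostingClassifier().fit(X_train, y_train)
    # cv='prefit' tells sklearn the base model is already trained, so only the
    # calibration mapping is fit on the held-out data.
    calibrated = CalibratedClassifierCV(base_model, cv='prefit', method='sigmoid')
    return calibrated.fit(X_calibration, y_calibration)
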
def test_unmarked_categorical_column_throws_warning():
    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output'
        # This is the column we are "forgetting" to mark as categorical
        # , 'sex': 'categorical'
        ,
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    with warnings.catch_warnings(record=True) as caught_w:

        ml_predictor.train(df_titanic_train)
        print(
            'we should be throwing a warning for the user to give them useful feedback on the unlabeled categorical column'
        )
        assert len(caught_w) == 1

    ml_predictor.predict(df_titanic_test)

    # We want to make sure the above does not throw an error
    assert True
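
# A note on the warning-capture pattern used above: Python deduplicates repeated
# warnings by default, so tests that count caught warnings often force the "always"
# filter inside catch_warnings. Illustrative pattern only, not a change to the test:
import warnings

def emit_and_capture_warning():
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')  # make sure every warning is recorded
        warnings.warn('unlabeled categorical column', UserWarning)
        return caught  # a list of warnings.WarningMessage objects
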
Example #4
def test_verify_features_does_not_work_by_default():
    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )
    ml_predictor = utils.train_basic_binary_classifier(df_titanic_train)

    file_name = ml_predictor.save(str(random.random()))

    with open(file_name, 'rb') as read_file:
        saved_ml_pipeline = dill.load(read_file)
    os.remove(file_name)
    try:
        keras_file_name = file_name[:-5] + '_keras_deep_learning_model.h5'
        os.remove(keras_file_name)
    except OSError:
        pass

    with warnings.catch_warnings(record=True) as w:

        results = saved_ml_pipeline.named_steps['final_model'].verify_features(
            df_titanic_test)

        print('Here are the caught warnings:')
        print(w)

        assert len(w) == 1

        assert results is None
Example #5
def test_linear_model_analytics_classification(model_name=None):
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'sex': 'categorical',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_titanic_train, model_names='RidgeClassifier')

    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    # Linear models aren't super great on this dataset...
    assert -0.21 < test_score < -0.17
def test_include_bad_y_vals_train_classification():
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'sex': 'categorical',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    # Assign through a single .iloc indexer so the bad labels actually stick
    # (chained indexing like .iloc[1]['survived'] writes to a copy).
    survived_col = df_titanic_train.columns.get_loc('survived')
    df_titanic_train.iloc[1, survived_col] = None
    df_titanic_train.iloc[8, survived_col] = None
    df_titanic_train.iloc[26, survived_col] = None

    ml_predictor.train(df_titanic_train)

    test_score = ml_predictor.score(df_titanic_test.to_dict('records'),
                                    df_titanic_test.survived)

    print('test_score')
    print(test_score)

    assert -0.17 < test_score < -0.135
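
# The point of the test above is that training tolerates rows whose label is missing.
# A bare-bones sketch of the kind of cleanup that implies, dropping rows with
# NaN/None/inf labels before fitting (illustrative, not auto_ml's internal handling):
import numpy as np
import pandas as pd

def drop_rows_with_bad_labels(df, label_column):
    labels = pd.to_numeric(df[label_column], errors='coerce')
    return df[np.isfinite(labels)]
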
def test_list_of_single_model_name_classification():
    np.random.seed(0)
    model_name = 'GradientBoostingClassifier'

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'sex': 'categorical',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_titanic_train, model_names=[model_name])

    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    assert -0.16 < test_score < -0.135
def test_unexpected_datetime_column_handled_without_errors():
    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'sex': 'categorical',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_titanic_train)

    test_dict = df_titanic_test.sample(frac=0.1).to_dict('records')[0]

    test_dict['unexpected_column'] = datetime.date.today()
    test_dict['another_unexpected_column'] = datetime.datetime.today()

    ml_predictor.predict(test_dict)

    # We want to make sure the above does not throw an error
    assert True
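
# One simple way a prediction pipeline can tolerate unexpected input keys, as exercised
# above, is to drop anything it was not trained on before transforming. Illustrative
# sketch only; 'trained_columns' is a hypothetical argument, not an auto_ml attribute:
def drop_unexpected_keys(row_dict, trained_columns):
    trained_columns = set(trained_columns)
    return {key: value for key, value in row_dict.items() if key in trained_columns}
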
Example #9
def test_save_and_load_deep_learning_classifier():
    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_titanic_train,
                       perform_feature_scaling=False,
                       model_names=['DeepLearningClassifier'])

    file_name = ml_predictor.save(str(random.random()))

    saved_ml_pipeline = utils_models.load_keras_model(file_name)
    os.remove(file_name)

    test_score = saved_ml_pipeline.score(df_titanic_test,
                                         df_titanic_test.survived)

    assert -0.25 < test_score < -0.17
Example #10
def test_pass_in_list_of_dictionaries_predict_classification(model_name=None):
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_titanic_train, model_names=model_name)

    test_score = ml_predictor.score(df_titanic_test.to_dict('records'),
                                    df_titanic_test.survived)

    print('test_score')
    print(test_score)

    assert -0.215 < test_score < -0.17
Example #11
def ensemble_classifier_basic_test(model_name=None):
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()

    column_descriptions = {
        'survived': 'output'
        , 'sex': 'categorical'
        , 'embarked': 'categorical'
        , 'pclass': 'categorical'
    }

    ensemble_config = [
        {
            'model_name': 'LGBMClassifier'
        }
        , {
            'model_name': 'RandomForestClassifier'
        }

    ]

    ml_predictor = Predictor(type_of_estimator='classifier', column_descriptions=column_descriptions)


    ml_predictor.train(df_titanic_train, ensemble_config=ensemble_config)

    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    assert -0.15 < test_score < -0.131
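
# For context on ensemble_config above: the two named models are combined into a single
# ensemble. A hand-rolled analogue with sklearn's VotingClassifier, shown only to
# illustrate the idea (the estimator choices here are arbitrary and this is not what
# Predictor.train does internally):
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression

def build_soft_voting_ensemble():
    return VotingClassifier(
        estimators=[('lr', LogisticRegression()), ('rf', RandomForestClassifier())],
        voting='soft')  # 'soft' averages predicted probabilities across models
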
def test_categorical_ensemble_basic_classifier():
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'pclass': 'categorical',
        'embarked': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    ml_predictor.train_categorical_ensemble(df_titanic_train,
                                            categorical_column='pclass',
                                            optimize_final_model=False)

    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    # Small sample sizes mean there's a fair bit of noise here
    assert -0.226 < test_score < -0.17
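
# train_categorical_ensemble, as its name and the categorical_column argument suggest,
# fits a separate model per value of 'pclass'. A bare-bones sketch of that idea; fit_fn
# is a hypothetical callable mapping a sub-DataFrame to a fitted model:
def fit_one_model_per_category(df, categorical_column, fit_fn):
    return {category: fit_fn(group)
            for category, group in df.groupby(categorical_column)}
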
Example #13
def test_throws_warning_when_fl_data_equals_df_train():
    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'sex': 'categorical',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    with warnings.catch_warnings(record=True) as w:

        try:
            ml_predictor.train(df_titanic_train,
                               feature_learning=True,
                               fl_data=df_titanic_train)
        except KeyError:
            # It is fine if training errors out entirely here; the warning about
            # fl_data matching the training data is what this test checks for.
            pass

        for caught_warning in w:
            print(caught_warning)
        assert len(w) >= 1
    assert True
Example #14
def test_perform_feature_scaling_true_classification():
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_titanic_train,
                       perform_feature_scaling=True,
                       model_names=['LGBMClassifier'])

    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    assert -0.215 < test_score < -0.17
Example #15
def test_perform_feature_scaling_true_classification(model_name=None):
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_titanic_train,
                       perform_feature_scaling=True,
                       model_names=model_name)

    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    lower_bound = -0.215
    if model_name == 'DeepLearningClassifier':
        lower_bound = -0.235

    assert lower_bound < test_score < -0.17
def test_select_from_multiple_classification_models_using_X_test_and_y_test():
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_titanic_train,
                       model_names=[
                           'LogisticRegression', 'RandomForestClassifier',
                           'RidgeClassifier', 'GradientBoostingClassifier',
                           'ExtraTreesClassifier', 'AdaBoostClassifier',
                           'SGDClassifier', 'Perceptron',
                           'PassiveAggressiveClassifier'
                       ],
                       X_test=df_titanic_test,
                       y_test=df_titanic_test.survived)

    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    assert -0.215 < test_score < -0.17
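
# Passing model_names plus X_test/y_test above lets auto_ml compare candidates on the
# holdout set (per the test's name). A minimal sklearn sketch of the same idea, with an
# arbitrary candidate list (illustrative only, not auto_ml's selection logic):
from sklearn.linear_model import LogisticRegression, RidgeClassifier

def pick_best_on_holdout(X_train, y_train, X_holdout, y_holdout):
    candidates = [LogisticRegression(), RidgeClassifier()]
    scored = [(model.fit(X_train, y_train).score(X_holdout, y_holdout), model)
              for model in candidates]
    # keep the candidate with the best holdout accuracy
    return max(scored, key=lambda pair: pair[0])[1]
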
def test_X_test_and_y_test_classification():
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_titanic_train,
                       X_test=df_titanic_test,
                       y_test=df_titanic_test.survived)

    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    assert -0.215 < test_score < -0.17
Example #18
def test_optimize_entire_pipeline_classification():
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_titanic_train,
                       optimize_entire_pipeline=True,
                       model_names=['DeepLearningClassifier'])

    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    assert -0.25 < test_score < -0.17
Example #19
def test_include_bad_y_vals_predict_classification(model_name=None):
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    # .ix was removed from pandas; assign by position through a single .iloc indexer
    survived_col = df_titanic_test.columns.get_loc('survived')
    df_titanic_test.iloc[1, survived_col] = float('nan')
    df_titanic_test.iloc[8, survived_col] = float('inf')
    df_titanic_test.iloc[26, survived_col] = None

    ml_predictor.train(df_titanic_train, model_names=model_name)

    test_score = ml_predictor.score(df_titanic_test.to_dict('records'),
                                    df_titanic_test.survived)

    print('test_score')
    print(test_score)

    assert -0.215 < test_score < -0.17
Example #20
def categorical_ensembling_classification(model_name=None):
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'sex': 'categorical',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    ml_predictor.train_categorical_ensemble(df_titanic_train,
                                            model_names=model_name,
                                            categorical_column='embarked')

    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    lower_bound = -0.18

    if model_name == 'DeepLearningClassifier':
        lower_bound = -0.215
    if model_name == 'CatBoostClassifier':
        lower_bound = -0.25

    assert lower_bound < test_score < -0.145
Example #21
def test_all_algos_classification(model_name=None):
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'sex': 'categorical',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    ml_predictor.train(
        df_titanic_train,
        model_names=[
            'LogisticRegression', 'RandomForestClassifier', 'RidgeClassifier',
            'GradientBoostingClassifier', 'ExtraTreesClassifier',
            'AdaBoostClassifier', 'SGDClassifier', 'Perceptron',
            'PassiveAggressiveClassifier', 'DeepLearningClassifier',
            'XGBClassifier', 'LGBMClassifier', 'LinearSVC'
        ])

    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    # Linear models aren't super great on this dataset...
    assert -0.215 < test_score < -0.131
Example #22
def optimize_final_model_classification(model_name=None):
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_titanic_train,
                       optimize_final_model=True,
                       model_names=model_name)

    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    # Small sample sizes mean there's a fair bit of noise here
    lower_bound = -0.215

    if model_name == 'DeepLearningClassifier':
        lower_bound = -0.25

    assert lower_bound < test_score < -0.17
Example #23
def test_compare_all_models_classification(model_name=None):
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_titanic_train,
                       compare_all_models=True,
                       model_names=model_name)

    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    assert -0.215 < test_score < -0.17
Example #24
def test_verify_features_finds_no_missing_features_when_none_are_missing():
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    column_descriptions = {
        'survived': 'output',
        'sex': 'categorical',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)
    ml_predictor.train(df_titanic_train, verify_features=True)

    file_name = ml_predictor.save(str(random.random()))

    with open(file_name, 'rb') as read_file:
        saved_ml_pipeline = dill.load(read_file)
    os.remove(file_name)

    missing_features = saved_ml_pipeline.named_steps[
        'final_model'].verify_features(df_titanic_test)
    print('missing_features')
    print(missing_features)

    print("len(missing_features['prediction_not_training'])")
    print(len(missing_features['prediction_not_training']))
    print("len(missing_features['training_not_prediction'])")
    print(len(missing_features['training_not_prediction']))
    assert len(missing_features['prediction_not_training']) == 0
    assert len(missing_features['training_not_prediction']) == 0
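
# The dict returned by verify_features above reports features seen at prediction time
# but not during training, and vice versa. A plain-Python sketch of that kind of
# comparison, purely to illustrate what the two keys mean:
def diff_feature_sets(training_features, prediction_features):
    training_features = set(training_features)
    prediction_features = set(prediction_features)
    return {
        'prediction_not_training': sorted(prediction_features - training_features),
        'training_not_prediction': sorted(training_features - prediction_features),
    }
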
Example #25
def test_already_transformed_X():
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )

    # Take a third of our test data (a tenth of our overall data) for calibration
    df_titanic_test, df_titanic_calibration = train_test_split(df_titanic_test,
                                                               test_size=0.33,
                                                               random_state=42)

    column_descriptions = {
        'survived': 'output',
        'sex': 'categorical',
        'embarked': 'categorical',
        'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    # pass in return_trans_pipeline, and get the trans pipeline
    trans_pipeline = ml_predictor.train(df_titanic_train,
                                        model_names='LogisticRegression',
                                        return_transformation_pipeline=True)

    # get transformed X through transformation_only
    X_train_transformed = ml_predictor.transform_only(df_titanic_train)

    # create a new predictor
    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    # pass in trained trans pipeline, and make sure it works
    ml_predictor.train(df_titanic_train,
                       trained_transformation_pipeline=trans_pipeline)
    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    assert -0.14 < test_score < -0.12

    # pass in both a trans pipeline and a previously transformed X, and make sure that works
    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)
    ml_predictor.train(None,
                       trained_transformation_pipeline=trans_pipeline,
                       transformed_X=X_train_transformed,
                       transformed_y=df_titanic_train.survived)
    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    assert -0.14 < test_score < -0.12
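
# The reuse pattern above mirrors a common sklearn workflow: fit a preprocessing
# pipeline once, then hand the transformed matrix to any number of estimators. A small
# sketch with sklearn primitives (illustrative only, not auto_ml's internals):
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def reuse_fitted_transformer(X_train, y_train):
    preprocess = Pipeline([('scale', StandardScaler())]).fit(X_train)
    X_train_transformed = preprocess.transform(X_train)
    model = LogisticRegression().fit(X_train_transformed, y_train)
    return preprocess, model
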
Example #26
def test_binary_classification():
    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )
    ml_predictor = utils.train_basic_binary_classifier(df_titanic_train)

    test_score = ml_predictor.score(df_titanic_test,
                                    df_titanic_test.survived,
                                    verbose=0)
    # Right now we're getting a score of -.205
    # Make sure our score is good, but not unreasonably good
    assert -0.215 < test_score < -0.17
Example #27
def test_bad_val_for_type_of_estimator():
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()

    column_descriptions = {
        # 'survived': 'output'
        'embarked': 'categorical'
        , 'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='invalid_type_of_estimator', column_descriptions=column_descriptions)
Example #28
def test_basic_ensemble_classifier():
    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )
    ml_predictor = utils.make_titanic_ensemble(df_titanic_train)

    test_score = ml_predictor.score(df_titanic_test,
                                    df_titanic_test.survived,
                                    verbose=0)
    # Very rough ensembles don't do as well on this problem as a standard GradientBoostingClassifier does
    # Right now we're getting a score of -.22
    # Make sure our score is good, but not unreasonably good
    assert -0.225 < test_score < -0.17
Example #29
def test_missing_output_col_in_column_descriptions():
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()

    column_descriptions = {
        # 'survived': 'output'
        'embarked': 'categorical'
        , 'pclass': 'categorical'
    }

    ml_predictor = Predictor(type_of_estimator='classifier', column_descriptions=column_descriptions)
def test_binary_classification_predict_on_Predictor_instance():
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset(
    )
    ml_predictor = utils.train_basic_binary_classifier(df_titanic_train)

    predictions = ml_predictor.predict(df_titanic_test)
    test_score = accuracy_score(df_titanic_test.survived, predictions)
    # Make sure our score is good, but not unreasonably good
    print(test_score)
    assert .77 < test_score < .805