Example #1
0
    def test_treefoo_duh(self):
        """Scoring must be identical whether the holdout data carries labels or not."""
        bundle, datasets, stations_df = bltu.make_basic_minimal_model()
        holdout_df = datasets['holdout_df']

        # First pass: predict with labels available.
        labeled_predictions, y_test, metrics = blc.run_model_predict(
            bundle, holdout_df, stations_df, labeled=True)
        labeled_score = bltu.get_basic_proportion_correct(
            y_test, labeled_predictions)

        # Second pass: strip the labels, re-widen, and predict again.
        unlabeled_df = blc.widen_df_with_other_cols(
            blc.contract_df(holdout_df), s.ALL_COLUMNS)
        unlabeled_predictions, _, _ = blc.run_model_predict(
            bundle, unlabeled_df, stations_df, labeled=False)
        unlabeled_score = bltu.get_basic_proportion_correct(
            y_test, unlabeled_predictions)

        # Evaluation must not depend on whether labels were present.
        assert unlabeled_score == labeled_score
Example #2
0
    def test_basic(self):
        """Smoke test: a raw two-row CSV string can be hydrated, widened, and predicted on."""
        csvdata = '10/1/2015 00:00:02,W 26 St & 10 Ave,Subscriber,1973,1\n10/1/2015 00:00:02,E 39 St & 2 Ave,Subscriber,1990,1'

        hydrated_df = blc.hydrate_and_widen(csvdata)
        bundle, _, stations_df = bltu.make_basic_minimal_model()

        # Only checking that prediction completes on unlabeled input;
        # no assertions are made on the output values.
        blc.run_model_predict(bundle, hydrated_df, stations_df, labeled=False)
Example #3
0
    def test_treefoo_with_pure_input_data(self):
        """Smoke test: headerless raw CSV rows survive hydrate -> widen -> predict."""
        csvdata = '10/1/2015 00:00:02,W 26 St & 10 Ave,Subscriber,1973,1\n10/1/2015 00:00:02,E 39 St & 2 Ave,Subscriber,1990,1'

        raw_df = blc.hydrate_csv_to_df(csvdata)
        bundle, _, stations_df = bltu.make_basic_minimal_model()
        widened = blc.widen_df_with_other_cols(raw_df, s.ALL_COLUMNS)

        # Only checking that prediction runs cleanly on unlabeled input.
        blc.run_model_predict(bundle, widened, stations_df, labeled=False)
Example #4
0
    def test_predict_matches_proba(self):
        """Top-1 picks from the probability matrix must match run_model_predict's output."""
        bundle, datasets, stations_df = bltu.make_basic_minimal_model()
        df = datasets['holdout_df']
        clf = bundle['clf']

        prepared = blc.predict_prepare(bundle, df, stations_df, labeled=True)
        topk1_outputs = blmu.get_sorted_predict_proba_predictions(
            prepared['y_predict_proba'], clf.classes_, k=1)

        y_predictions, y_test, metrics = blc.run_model_predict(
            bundle, df, stations_df, labeled=True)

        # rank-k proba scores, ordered by k, must be non-decreasing.
        scores = [score
                  for _, score in sorted(metrics['rank_k_proba_scores'].items(),
                                         key=lambda kv: kv[0])]
        assert all(a <= b for a, b in zip(scores, scores[1:]))

        # The top-1 choice per row agrees with the plain predict output.
        assert [entry[0] for entry in topk1_outputs] == list(y_predictions)
Example #5
0
def make_tree_foo(datasets, stations, hyperparameters=None):
    """Train a random-forest ("tree-foo") model and return a full model bundle.

    Args:
        datasets: dict with 'trainset'/'testset' DataFrames and
            'train_fn'/'test_fn' filenames for provenance.
        stations: dict with a 'stations_df' DataFrame and its 'fn' filename.
        hyperparameters: dict; must contain 'max_depth' and 'n_estimators'
            (int() on a missing key raises TypeError). Defaults to {}.

    Returns:
        A bundle dict containing the fitted classifier, label encoders,
        feature metadata, training/test provenance, and evaluation metrics.
    """
    # Avoid the shared-mutable-default pitfall: build a fresh dict per call.
    hyperparameters = {} if hyperparameters is None else hyperparameters

    assert datasets['train_fn']
    assert stations['fn']

    cols = [
        s.NEW_START_POSTAL_CODE, s.NEW_START_BOROUGH, s.NEW_START_NEIGHBORHOOD,
        s.START_DAY, s.START_HOUR, s.AGE_COL_NAME, s.GENDER, s.USER_TYPE_COL
    ]
    feature_encoding_dict = {
        s.USER_TYPE_COL: str,
        s.AGE_COL_NAME: float,
        s.NEW_START_POSTAL_CODE: str,
        s.NEW_START_BOROUGH: str,
        s.NEW_START_NEIGHBORHOOD: str,
        s.NEW_END_NEIGHBORHOOD: str
    }

    simpledf, label_encoders = pl.make_simple_df_from_raw(
        datasets['trainset'], stations['stations_df'], feature_encoding_dict)
    train_df, validation_df = blc.simple_split(
        simpledf)  # FIXME: label encoders from full or train?

    X_train = np.array(train_df[cols])
    y_train = np.array(train_df[s.NEW_END_NEIGHBORHOOD])

    # NOTE: both 'max_depth' and 'n_estimators' are effectively required —
    # int(None) raises TypeError when either key is absent.
    clf = RandomForestClassifier(
        max_depth=int(hyperparameters.get('max_depth')),
        n_estimators=int(hyperparameters.get('n_estimators')),
        random_state=0)
    clf.fit(X_train, y_train)

    validation_output = blmu.do_validation(clf, validation_df, cols)

    bundle = {
        'train_metadata': {
            'trainset_fn': datasets['train_fn'],
            'stations_df_fn': stations['fn'],
            'hyperparameters': hyperparameters
        },
        'timestamp': pl.make_timestamp(),
        'clf': clf,
        'model_id': 'tree-foo',
        'bundle_name': 'tree-foo-bundle-' + get_random_handle(),
        'label_encoders': label_encoders,
        'features': {
            'input': cols,
            'output_label': s.NEW_END_NEIGHBORHOOD,
            'dtypes': feature_encoding_dict
        },
        'evaluation': {
            'validation_metrics': validation_output
        },
        'clf_info': {
            # Materialize the pairs: a bare zip iterator would be exhausted
            # after a single read and would not survive bundle serialization.
            'feature_importances': list(zip(cols, clf.feature_importances_))
        }
    }

    # Test...
    holdout_original_df = datasets['testset']  # FIXME properly handling this?
    holdout_df = \
            blc.widen_df_with_other_cols(holdout_original_df, s.ALL_COLUMNS)

    _, _, test_output = blc.run_model_predict(bundle,
                                              holdout_df,
                                              stations['stations_df'],
                                              labeled=True)

    # update bundle.
    bundle['evaluation'].update({'test_metrics': test_output})
    bundle.update({'test_metadata': {
        'testset_fn': datasets['test_fn'],
    }})

    return bundle