def test_treefoo_duh(self):
    """Holdout accuracy must be identical whether predictions are made on
    labeled data or on the same rows with labels stripped and re-widened."""
    bundle, datasets, stations_df = bltu.make_basic_minimal_model()
    holdout_df = datasets['holdout_df']

    # Labeled pass: ground-truth labels ride along with the predictions.
    labeled_predictions, y_test, _metrics = blc.run_model_predict(
        bundle, holdout_df, stations_df, labeled=True)
    accuracy_labeled = bltu.get_basic_proportion_correct(
        y_test, labeled_predictions)

    # Unlabeled pass: contract away the labels, widen back out, predict again.
    unlabeled_df = blc.widen_df_with_other_cols(
        blc.contract_df(holdout_df), s.ALL_COLUMNS)
    unlabeled_predictions, _, _ = blc.run_model_predict(
        bundle, unlabeled_df, stations_df, labeled=False)
    accuracy_unlabeled = bltu.get_basic_proportion_correct(
        y_test, unlabeled_predictions)

    # Presence of labels must not influence the model's outputs.
    assert accuracy_unlabeled == accuracy_labeled
def test_basic(self):
    """Smoke test: raw headerless CSV rows hydrate, widen, and predict
    without raising."""
    csvdata = ('10/1/2015 00:00:02,W 26 St & 10 Ave,Subscriber,1973,1\n'
               '10/1/2015 00:00:02,E 39 St & 2 Ave,Subscriber,1990,1')
    input_df = blc.hydrate_and_widen(csvdata)
    bundle, _datasets, stations_df = bltu.make_basic_minimal_model()
    # Only exercising the call path; the predictions themselves are unchecked.
    blc.run_model_predict(bundle, input_df, stations_df, labeled=False)
def test_treefoo_with_pure_input_data(self):
    """Smoke test: headerless raw CSV (columns: starttime, start station
    name, usertype, birth year, gender) predicts after explicit widening."""
    csvdata = ('10/1/2015 00:00:02,W 26 St & 10 Ave,Subscriber,1973,1\n'
               '10/1/2015 00:00:02,E 39 St & 2 Ave,Subscriber,1990,1')
    raw_df = blc.hydrate_csv_to_df(csvdata)
    bundle, _datasets, stations_df = bltu.make_basic_minimal_model()
    widened_df = blc.widen_df_with_other_cols(raw_df, s.ALL_COLUMNS)
    # Only exercising the call path; the predictions themselves are unchecked.
    blc.run_model_predict(bundle, widened_df, stations_df, labeled=False)
def test_predict_matches_proba(self):
    """run_model_predict's hard predictions must equal the top-1 class taken
    from predict_proba, and the rank-k proba scores must be non-decreasing
    in k (top-k accuracy can only grow as k grows)."""
    bundle, datasets, stations_df = bltu.make_basic_minimal_model()
    df = datasets['holdout_df']
    clf = bundle['clf']

    prepared = blc.predict_prepare(bundle, df, stations_df, labeled=True)
    y_topk1_outputs = blmu.get_sorted_predict_proba_predictions(
        prepared['y_predict_proba'], clf.classes_, k=1)

    y_predictions, _y_test, metrics = blc.run_model_predict(
        bundle, df, stations_df, labeled=True)

    # Scores ordered by ascending k.  (Renamed from `l`: single-letter
    # ambiguous name, PEP 8 E741.)
    scores = [score for _k, score in
              sorted(metrics['rank_k_proba_scores'].items(),
                     key=lambda item: item[0])]
    # Pairwise monotonicity check; generator avoids building a throwaway
    # list inside all() and zip replaces manual index arithmetic.
    assert all(a <= b for a, b in zip(scores, scores[1:]))
    # Each top-1 entry's first element is the predicted class label.
    assert [entry[0] for entry in y_topk1_outputs] == list(y_predictions)
def make_tree_foo(datasets, stations, hyperparameters=None):
    """Train a RandomForest classifier that predicts a trip's end
    neighborhood, then package it into a model bundle with metadata,
    feature spec, and validation/test metrics.

    Args:
        datasets: dict with 'trainset', 'train_fn', 'testset', 'test_fn'.
        stations: dict with 'stations_df' and 'fn'.
        hyperparameters: dict supplying 'max_depth' and 'n_estimators'
            (both passed through int()).
            NOTE(review): .get() yields None for a missing key, so int(None)
            raises TypeError — confirm all callers provide both keys.

    Returns:
        The bundle dict (classifier, label encoders, features, evaluation).
    """
    # FIX: the default was a shared mutable dict literal ({}); use the
    # None-sentinel idiom.  Callers passing {} see identical behavior.
    if hyperparameters is None:
        hyperparameters = {}
    assert datasets['train_fn']
    assert stations['fn']

    # Model input features; the output label is NEW_END_NEIGHBORHOOD.
    cols = [
        s.NEW_START_POSTAL_CODE, s.NEW_START_BOROUGH,
        s.NEW_START_NEIGHBORHOOD, s.START_DAY, s.START_HOUR,
        s.AGE_COL_NAME, s.GENDER, s.USER_TYPE_COL
    ]
    feature_encoding_dict = {
        s.USER_TYPE_COL: str,
        s.AGE_COL_NAME: float,
        s.NEW_START_POSTAL_CODE: str,
        s.NEW_START_BOROUGH: str,
        s.NEW_START_NEIGHBORHOOD: str,
        s.NEW_END_NEIGHBORHOOD: str
    }
    simpledf, label_encoders = pl.make_simple_df_from_raw(
        datasets['trainset'], stations['stations_df'], feature_encoding_dict)
    train_df, validation_df = blc.simple_split(
        simpledf)  # FIXME: label encoders from full or train?

    X_train = np.array(train_df[cols])
    y_train = np.array(train_df[s.NEW_END_NEIGHBORHOOD])
    clf = RandomForestClassifier(
        max_depth=int(hyperparameters.get('max_depth')),
        n_estimators=int(hyperparameters.get('n_estimators')),
        random_state=0)
    clf.fit(X_train, y_train)
    validation_output = blmu.do_validation(clf, validation_df, cols)

    bundle = {
        'train_metadata': {
            'trainset_fn': datasets['train_fn'],
            'stations_df_fn': stations['fn'],
            'hyperparameters': hyperparameters
        },
        'timestamp': pl.make_timestamp(),
        'clf': clf,
        'model_id': 'tree-foo',
        'bundle_name': 'tree-foo-bundle-' + get_random_handle(),
        'label_encoders': label_encoders,
        'features': {
            'input': cols,
            'output_label': s.NEW_END_NEIGHBORHOOD,
            'dtypes': feature_encoding_dict
        },
        'evaluation': {
            'validation_metrics': validation_output
        },
        'clf_info': {
            # FIX: materialize the pairs.  A bare zip object is a one-shot
            # iterator — exhausted after a single pass, it would read back
            # empty from the stored bundle.
            'feature_importances': list(zip(cols, clf.feature_importances_))
        }
    }

    # Evaluate on the holdout set and fold the metrics into the bundle.
    holdout_df = blc.widen_df_with_other_cols(
        datasets['testset'], s.ALL_COLUMNS)  # FIXME properly handling this?
    _, _, test_output = blc.run_model_predict(
        bundle, holdout_df, stations['stations_df'], labeled=True)
    bundle['evaluation'].update({'test_metrics': test_output})
    bundle.update({'test_metadata': {'testset_fn': datasets['test_fn']}})
    return bundle