def import_dataset():
    """Import the prostate dataset and factorize both candidate targets.

    :returns: a ``pu.ns`` with ``train`` (the full frame), ``target``
        ("CAPSULE") and ``target_alt`` ("RACE"), both converted to factors.
    """
    frame = h2o.import_file(path=pu.locate("smalldata/prostate/prostate.csv"))
    primary, alternate = "CAPSULE", "RACE"
    # Both candidate response columns must be categorical for classification.
    for column in (primary, alternate):
        frame[column] = frame[column].asfactor()
    return pu.ns(train=frame, target=primary, target_alt=alternate)
def prepare_data():
    """Import the weather dataset and describe it as a dataset namespace.

    :returns: a ``pu.ns`` with ``x`` (all column names, including the
        response), ``y`` ("RainTomorrow", factorized) and ``train``.
    """
    frame = h2o.import_file(path=pu.locate("smalldata/junit/weather.csv"))
    response = "RainTomorrow"
    frame[response] = frame[response].asfactor()
    return pu.ns(x=frame.columns, y=response, train=frame)
def compute_perf(model, test_data=None):
    """Print and return training and test performance for *model*.

    :param model: a trained H2O model.
    :param test_data: optional frame to score against; when ``None`` the
        module-level ``ds.test`` is used (original behavior — presumably
        ``ds`` is set by the test's setup; TODO confirm).
    :returns: a ``pu.ns`` with ``train`` and ``test`` performance objects.
    """
    if test_data is None:
        # Backward-compatible default: fall back to the global dataset.
        test_data = ds.test
    perf = pu.ns(
        train=model.model_performance(train=True),
        test=model.model_performance(test_data=test_data),
    )
    print("{} training performance: ".format(model.model_id))
    print(perf.train)
    print("{} test performance: ".format(model.model_id))
    print(perf.test)
    return perf
def import_dataset(seed=0, mode='binary'):
    """Import titanic_expanded and split it 80/20.

    :param seed: split seed for reproducibility.
    :param mode: one of ``binary``/``multiclass``/``regression`` — selects
        the response column (raises ``KeyError`` for anything else).
    :returns: a ``pu.ns`` with ``train``, ``test`` and ``target``.
    """
    frame = h2o.import_file(
        path=pu.locate("smalldata/titanic/titanic_expanded.csv"), header=1)
    response = dict(binary='survived',
                    multiclass='pclass',
                    regression='fare')[mode]
    parts = frame.split_frame(ratios=[.8], seed=seed)
    return pu.ns(train=parts[0], test=parts[1], target=response)
def import_dataset(seed=0, larger=False):
    """Import prostate data (optionally the larger zipped variant) and split it.

    :param seed: split seed for reproducibility.
    :param larger: when True, use ``prostate_complete.csv.zip`` instead of
        ``prostate.csv``.
    :returns: a ``pu.ns`` with ``train``/``valid``/``test`` (80/10/10 split),
        ``target`` ("CAPSULE", factorized) and ``target_idx``.
    """
    filename = "prostate_complete.csv.zip" if larger else "prostate.csv"
    frame = h2o.import_file(path=pu.locate("smalldata/prostate/{}".format(filename)))
    response = "CAPSULE"
    frame[response] = frame[response].asfactor()
    # 80% train, 10% valid, 10% test
    parts = frame.split_frame(ratios=[.8, .1], seed=seed)
    return pu.ns(train=parts[0], valid=parts[1], test=parts[2],
                 target=response, target_idx=1)
def compute_perf(model, test_data=None):
    """Print and return training and test performance for *model*.

    :param model: a trained H2O model.
    :param test_data: optional frame to score against; when ``None`` the
        module-level ``ds.test`` is used (original behavior — presumably
        ``ds`` is set by the test's setup; TODO confirm).
    :returns: a ``pu.ns`` with ``train`` and ``test`` performance objects.
    """
    if test_data is None:
        # Backward-compatible default: fall back to the global dataset.
        test_data = ds.test
    perf = pu.ns(
        train=model.model_performance(train=True),
        test=model.model_performance(test_data=test_data),
    )
    print("{} training performance: ".format(model.model_id))
    print(perf.train)
    print("{} test performance: ".format(model.model_id))
    print(perf.test)
    return perf
def import_dataset(seed=0):
    """Import prostate.csv, factorize CAPSULE, and split 80/10/10.

    :param seed: split seed for reproducibility.
    :returns: a ``pu.ns`` with ``train``/``valid``/``test``, ``target``
        ("CAPSULE") and ``target_idx``.
    """
    frame = h2o.import_file(path=pu.locate("smalldata/prostate/prostate.csv"))
    response = "CAPSULE"
    frame[response] = frame[response].asfactor()
    parts = frame.split_frame(ratios=[.8, .1], seed=seed)
    return pu.ns(train=parts[0], valid=parts[1], test=parts[2],
                 target=response, target_idx=1)
def prepare_data(blending=False):
    """Import higgs_train_5k, factorize the response, optionally split for blending.

    NOTE(review): the blending branch reads a module-level ``seed`` — confirm
    it is defined at module scope.

    :param blending: when True, split the frame 70/30 into ``train``/``blend``.
    :returns: a ``pu.ns`` with ``x``, ``y``, ``train`` (and ``blend`` when
        blending is requested).
    """
    frame = h2o.import_file(path=pu.locate("smalldata/testng/higgs_train_5k.csv"))
    response = "response"
    frame[response] = frame[response].asfactor()
    ds = pu.ns(x=frame.columns, y=response, train=frame)
    if not blending:
        return ds
    train_part, blend_part = frame.split_frame(ratios=[.7], seed=seed)
    return ds.extend(train=train_part, blend=blend_part)
def prepare_data(blending=False):
    """Import the weather dataset, factorize the response, optionally split for blending.

    NOTE(review): the blending branch reads a module-level ``seed`` — confirm
    it is defined at module scope.

    :param blending: when True, split the frame 70/30 into ``train``/``blend``.
    :returns: a ``pu.ns`` with ``x``, ``y``, ``train`` (and ``blend`` when
        blending is requested).
    """
    frame = h2o.import_file(path=pu.locate("smalldata/junit/weather.csv"))
    response = "RainTomorrow"
    frame[response] = frame[response].asfactor()
    ds = pu.ns(x=frame.columns, y=response, train=frame)
    if not blending:
        return ds
    train_part, blend_part = frame.split_frame(ratios=[.7], seed=seed)
    return ds.extend(train=train_part, blend=blend_part)
def prepare_data(blending=False):
    """Import higgs_train_5k, factorize the response, optionally split for blending.

    NOTE(review): the blending branch reads a module-level ``seed`` — confirm
    it is defined at module scope.

    :param blending: when True, split the frame 70/30 into ``train``/``blend``.
    :returns: a ``pu.ns`` with ``x``, ``y``, ``train`` (and ``blend`` when
        blending is requested).
    """
    frame = h2o.import_file(path=pu.locate("smalldata/testng/higgs_train_5k.csv"))
    response = "response"
    frame[response] = frame[response].asfactor()
    ds = pu.ns(x=frame.columns, y=response, train=frame)
    if not blending:
        return ds
    train_part, blend_part = frame.split_frame(ratios=[.7], seed=seed)
    return ds.extend(train=train_part, blend=blend_part)
def load_dataset(incl_test=False, incl_foldc=False):
    """Load titanic_expanded with target ``pclass``; optionally split and add a fold column.

    NOTE(review): both the split and the fold column use a module-level
    ``seed`` — confirm it is defined at module scope.

    :param incl_test: when True, split 80/20 into named train/test frames.
    :param incl_foldc: when True, add a 3-fold ``foldc`` column to the train frame.
    :returns: a ``pu.ns`` with ``train``, ``test`` (or None) and ``target``.
    """
    frame = h2o.import_file(pu.locate("smalldata/titanic/titanic_expanded.csv"),
                            header=1)
    target = "pclass"
    train, test = frame, None
    if incl_test:
        parts = frame.split_frame(
            ratios=[.8],
            destination_frames=["titanic_train", "titanic_test"],
            seed=seed)
        train, test = parts[0], parts[1]
    if incl_foldc:
        train["foldc"] = train.kfold_column(3, seed)
    return pu.ns(train=train, test=test, target=target)
def test_base_models_can_use_different_compatible_training_frames():
    """Base models may be trained on different feature subsets (different training_frame)."""
    ds = prepare_data(blending)
    # Each base model gets a distinct column slice plus the response column.
    gbm_train = ds.train[list(range(1, 11))].cbind(ds.train[ds.y])
    drf_train = ds.train[list(range(13, 20))].cbind(ds.train[ds.y])
    datasets = pu.ns(gbm=ds.extend(x=None, train=gbm_train),
                     drf=ds.extend(x=None, train=drf_train))
    base = train_base_models(datasets)
    ensemble = train_stacked_ensemble(ds, base)
    assert ensemble.auc() > 0
def test_se_fails_when_base_models_use_incompatible_training_frames():
    """SE must reject base models trained on frames with different row counts."""
    ds = prepare_data(blending)
    # One base model trains on the full frame, the other on only half the rows.
    half_frame = ds.train[0:ds.train.nrows//2,:]
    datasets = pu.ns(gbm=ds.extend(x=None),
                     drf=ds.extend(x=None, train=half_frame))
    base = train_base_models(datasets)
    try:
        train_stacked_ensemble(ds, base)
        assert False, "Stacked Ensembles of models with different training frame sizes should fail"
    except Exception as e:
        assert "Base models are inconsistent: they use different size (number of rows) training frames" in str(e), \
            "wrong error message: {}".format(str(e))
def test_base_models_can_use_different_x():
    """Base models may use different ``x`` subsets over the same training_frame."""
    ds = prepare_data(blending)
    datasets = pu.ns(gbm=ds.extend(x=ds.x[1:11]),
                     drf=ds.extend(x=ds.x[13:20]))
    base = train_base_models(datasets)
    with_x = train_stacked_ensemble(ds, base)
    without_x = train_stacked_ensemble(ds.extend(x=None), base)
    assert with_x.auc() > 0
    # Passing x to the SE should not change the result.
    assert with_x.auc() == without_x.auc()
def test_base_models_can_use_different_x():
    """Base models may use different ``x`` subsets over the same training_frame."""
    ds = prepare_data(blending)
    datasets = pu.ns(gbm=ds.extend(x=ds.x[1:11]),
                     drf=ds.extend(x=ds.x[13:20]))
    base = train_base_models(datasets)
    se_with_x = train_stacked_ensemble(ds, base)
    se_no_x = train_stacked_ensemble(ds.extend(x=None), base)
    assert se_with_x.auc() > 0
    # Supplying x to the SE must not change the ensemble's performance.
    assert se_with_x.auc() == se_no_x.auc()
def setup_data():
    """Prepare a small sample of MNIST (a multinomial classification problem).

    Only 5% of the full training file is used for training and 10% for test,
    to keep the test fast.

    NOTE(review): reads a module-level ``seed`` — confirm it is defined at
    module scope.

    :returns: a ``pu.ns`` with ``x`` (all but the last column), ``y`` (-1,
        the label column index), ``train``, ``test`` and the label ``domain``.
    """
    full_train = h2o.import_file(pu.locate("bigdata/laptop/mnist/train.csv.gz"))
    train, test, _ = full_train.split_frame(ratios=[.05, .1], seed=seed)
    x = train.columns[:-1]
    y = -1  # label is the last column
    train[y] = train[y].asfactor()
    domain = unique(train[y])
    print(domain)
    return pu.ns(x=x, y=y, train=train, test=test, domain=domain)
def prepare_data(blending=False):
    """Upload prostate data with explicit column types; regression target AGE.

    NOTE(review): the blending branch reads a module-level ``seed`` — confirm
    it is defined at module scope.

    :param blending: when True, split the train frame 70/30 into
        ``train``/``blend``.
    :returns: a ``pu.ns`` with ``x``, ``y`` ("AGE"), ``train``, ``test``
        (and ``blend`` when blending is requested).
    """
    col_types = ["numeric", "numeric", "numeric", "enum", "enum",
                 "numeric", "numeric", "numeric", "numeric"]
    data = h2o.upload_file(path=pu.locate("smalldata/extdata/prostate.csv"),
                           destination_frame="prostate_hex",
                           col_types=col_types)
    train, test = data.split_frame(ratios=[.8], seed=1)
    predictors = ["CAPSULE", "GLEASON", "RACE", "DPROS", "DCAPS", "PSA", "VOL"]
    ds = pu.ns(x=predictors, y="AGE", train=train, test=test)
    if not blending:
        return ds
    train_part, blend_part = train.split_frame(ratios=[.7], seed=seed)
    return ds.extend(train=train_part, blend=blend_part)
def test_base_models_can_use_different_compatible_training_frames():
    """Base models may be trained on different feature subsets (different training_frame)."""
    ds = prepare_data(blending)
    # Build two disjoint column slices, each cbound with the response column.
    slice_a = ds.train[list(range(1, 11))].cbind(ds.train[ds.y])
    slice_b = ds.train[list(range(13, 20))].cbind(ds.train[ds.y])
    datasets = pu.ns(gbm=ds.extend(x=None, train=slice_a),
                     drf=ds.extend(x=None, train=slice_b))
    base_models = train_base_models(datasets)
    ensemble = train_stacked_ensemble(ds, base_models)
    assert ensemble.auc() > 0
def test_se_fails_when_base_models_use_incompatible_training_frames():
    """SE must reject base models trained on frames with different row counts."""
    ds = prepare_data(blending)
    # Second base model trains on only the first half of the rows.
    truncated = ds.train[0:ds.train.nrows // 2, :]
    datasets = pu.ns(gbm=ds.extend(x=None),
                     drf=ds.extend(x=None, train=truncated))
    base_models = train_base_models(datasets)
    try:
        train_stacked_ensemble(ds, base_models)
        assert False, "Stacked Ensembles of models with different training frame sizes should fail"
    except Exception as e:
        assert "Base models are inconsistent: they use different size (number of rows) training frames" in str(e), \
            "wrong error message: {}".format(str(e))
def prepare_data(blending=False):
    """Upload prostate data with explicit column types; regression target AGE.

    NOTE(review): the blending branch reads a module-level ``seed`` — confirm
    it is defined at module scope.

    :param blending: when True, split the train frame 70/30 into
        ``train``/``blend``.
    :returns: a ``pu.ns`` with ``x``, ``y`` ("AGE"), ``train``, ``test``
        (and ``blend`` when blending is requested).
    """
    column_types = ["numeric", "numeric", "numeric", "enum", "enum",
                    "numeric", "numeric", "numeric", "numeric"]
    frame = h2o.upload_file(path=pu.locate("smalldata/extdata/prostate.csv"),
                            destination_frame="prostate_hex",
                            col_types=column_types)
    train, test = frame.split_frame(ratios=[.8], seed=1)
    features = ["CAPSULE", "GLEASON", "RACE", "DPROS", "DCAPS", "PSA", "VOL"]
    ds = pu.ns(x=features, y="AGE", train=train, test=test)
    if not blending:
        return ds
    train_part, blend_part = train.split_frame(ratios=[.7], seed=seed)
    return ds.extend(train=train_part, blend=blend_part)
def scores_and_preds(models, test):
    """Collect training/CV/test scores and predicted classes for one or more models.

    :param models: a single model or a list of models.
    :param test: test frame used for scoring and predictions.
    :returns: a ``pu.ns`` with ``training_scores``, ``cv_scores``,
        ``test_scores`` and ``test_pclasses``. When a single model is passed,
        each entry is that model's value; when a list is passed, each entry is
        a dict keyed by model key.
    """
    single = not isinstance(models, list)
    if single:
        models = [models]
    # For a single model, unwrap the per-model dicts to plain values.
    unwrap = (lambda d: next(iter(d.values()))) if single else (lambda d: d)
    training = {m.key: m.mean_per_class_error() for m in models}
    cross_val = {m.key: m.mean_per_class_error(xval=True) for m in models}
    on_test = {m.key: m.model_performance(test).mean_per_class_error()
               for m in models}
    predictions = {m.key: m.predict(test) for m in models}
    pred_classes = {m.key: unique(predictions[m.key]['predict'])
                    for m in models}
    return pu.ns(
        training_scores=unwrap(training),
        cv_scores=unwrap(cross_val),
        test_scores=unwrap(on_test),
        test_pclasses=unwrap(pred_classes),
    )
def import_dataset():
    """Import the australia dataset and split it 80/10/10.

    NOTE(review): the split has no seed, so the partitions are not
    reproducible across runs — confirm this is intended.

    :returns: a ``pu.ns`` with ``target`` ("runoffnew") and
        ``train``/``valid``/``test`` frames.
    """
    frame = h2o.import_file(path=pu.locate("smalldata/extdata/australia.csv"))
    parts = frame.split_frame(ratios=[.8, .1])
    return pu.ns(target="runoffnew",
                 train=parts[0], valid=parts[1], test=parts[2])