# Example no. 1 (listing artifact; score: 0)
def import_dataset():
    """Load the prostate dataset with two categorical targets.

    Returns a ``pu.ns`` namespace holding the full frame as ``train``,
    the primary target column name (``CAPSULE``) and an alternative
    target (``RACE``); both columns are converted to factors.
    """
    frame = h2o.import_file(path=pu.locate("smalldata/prostate/prostate.csv"))
    primary, alternative = "CAPSULE", "RACE"
    for column in (primary, alternative):
        frame[column] = frame[column].asfactor()
    return pu.ns(train=frame, target=primary, target_alt=alternative)
def prepare_data():
    """Load the weather dataset and return an (x, y, train) namespace.

    The ``RainTomorrow`` response is converted to a factor; ``x`` is the
    full column list of the frame (including the response).
    """
    frame = h2o.import_file(path=pu.locate("smalldata/junit/weather.csv"))
    response = "RainTomorrow"
    frame[response] = frame[response].asfactor()
    return pu.ns(x=frame.columns, y=response, train=frame)
 def compute_perf(model):
     """Print and return the model's training and test performance.

     NOTE(review): ``ds`` is a free variable expected from the enclosing
     scope and must provide a ``test`` frame — confirm before calling.
     """
     perf = pu.ns(train=model.model_performance(train=True),
                  test=model.model_performance(test_data=ds.test))
     for label, metrics in (("training", perf.train), ("test", perf.test)):
         print("{} {} performance: ".format(model.model_id, label))
         print(metrics)
     return perf
def import_dataset(seed=0, mode='binary'):
    """Load the expanded Titanic data, split 80/20, and pick a target.

    :param seed: split seed for reproducibility.
    :param mode: one of ``'binary'``/``'multiclass'``/``'regression'``,
        selecting the target column.
    """
    targets = {'binary': 'survived', 'multiclass': 'pclass', 'regression': 'fare'}
    frame = h2o.import_file(
        path=pu.locate("smalldata/titanic/titanic_expanded.csv"), header=1)
    splits = frame.split_frame(ratios=[.8], seed=seed)
    return pu.ns(train=splits[0], test=splits[1], target=targets[mode])
def import_dataset(seed=0, larger=False):
    """Load prostate data (optionally the larger variant), split 80/10/10.

    :param seed: split seed for reproducibility.
    :param larger: when True, load the zipped ``prostate_complete`` file.
    """
    name = "prostate_complete.csv.zip" if larger else "prostate.csv"
    frame = h2o.import_file(path=pu.locate("smalldata/prostate/{}".format(name)))
    target = "CAPSULE"
    frame[target] = frame[target].asfactor()
    parts = frame.split_frame(ratios=[.8, .1], seed=seed)
    return pu.ns(train=parts[0], valid=parts[1], test=parts[2],
                 target=target, target_idx=1)
 def compute_perf(model):
     """Compute, print, and return training and test performance.

     NOTE(review): relies on a free variable ``ds`` (with a ``test``
     frame) from the enclosing scope — verify it is defined.
     """
     train_perf = model.model_performance(train=True)
     test_perf = model.model_performance(test_data=ds.test)
     print("{} training performance: ".format(model.model_id))
     print(train_perf)
     print("{} test performance: ".format(model.model_id))
     print(test_perf)
     return pu.ns(train=train_perf, test=test_perf)
# Example no. 7 (listing artifact; score: 0)
def import_dataset(seed=0):
    """Load prostate data with a factor CAPSULE target, split 80/10/10."""
    frame = h2o.import_file(path=pu.locate("smalldata/prostate/prostate.csv"))
    label = "CAPSULE"
    frame[label] = frame[label].asfactor()
    parts = frame.split_frame(ratios=[.8, .1], seed=seed)
    return pu.ns(train=parts[0], valid=parts[1], test=parts[2],
                 target=label, target_idx=1)
def prepare_data(blending=False, seed=0):
    """Load the higgs training sample as an (x, y, train) namespace.

    :param blending: when True, additionally split the frame 70/30 into
        ``train``/``blend`` frames for blended stacked ensembles.
    :param seed: seed for the blending split; previously this was an
        undefined free variable (NameError when ``blending=True`` unless a
        module-level ``seed`` happened to exist), now an explicit
        backward-compatible keyword parameter.
    :returns: a ``pu.ns`` with ``x`` (columns), ``y`` (target) and frames.
    """
    fr = h2o.import_file(path=pu.locate("smalldata/testng/higgs_train_5k.csv"))
    target = "response"
    fr[target] = fr[target].asfactor()
    ds = pu.ns(x=fr.columns, y=target, train=fr)

    if blending:
        # Fix: `seed` was an undefined name here; it is now a parameter.
        train, blend = fr.split_frame(ratios=[.7], seed=seed)
        return ds.extend(train=train, blend=blend)
    else:
        return ds
def prepare_data(blending=False, seed=0):
    """Load the weather dataset as an (x, y, train) namespace.

    :param blending: when True, additionally split the frame 70/30 into
        ``train``/``blend`` frames for blended stacked ensembles.
    :param seed: seed for the blending split; previously this was an
        undefined free variable (NameError when ``blending=True`` unless a
        module-level ``seed`` happened to exist), now an explicit
        backward-compatible keyword parameter.
    :returns: a ``pu.ns`` with ``x`` (columns), ``y`` (target) and frames.
    """
    fr = h2o.import_file(path=pu.locate("smalldata/junit/weather.csv"))
    target = "RainTomorrow"
    fr[target] = fr[target].asfactor()
    ds = pu.ns(x=fr.columns, y=target, train=fr)

    if blending:
        # Fix: `seed` was an undefined name here; it is now a parameter.
        train, blend = fr.split_frame(ratios=[.7], seed=seed)
        return ds.extend(train=train, blend=blend)
    else:
        return ds
# Example no. 10 (listing artifact; score: 0)
def prepare_data(blending=False, seed=0):
    """Load the higgs training sample as an (x, y, train) namespace.

    :param blending: when True, additionally split the frame 70/30 into
        ``train``/``blend`` frames for blended stacked ensembles.
    :param seed: seed for the blending split; previously this was an
        undefined free variable (NameError when ``blending=True`` unless a
        module-level ``seed`` happened to exist), now an explicit
        backward-compatible keyword parameter.
    :returns: a ``pu.ns`` with ``x`` (columns), ``y`` (target) and frames.
    """
    fr = h2o.import_file(path=pu.locate("smalldata/testng/higgs_train_5k.csv"))
    target = "response"
    fr[target] = fr[target].asfactor()
    ds = pu.ns(x=fr.columns, y=target, train=fr)

    if blending:
        # Fix: `seed` was an undefined name here; it is now a parameter.
        train, blend = fr.split_frame(ratios=[.7], seed=seed)
        return ds.extend(train=train, blend=blend)
    else:
        return ds
def load_dataset(incl_test=False, incl_foldc=False, seed=0):
    """Load the expanded Titanic data for multiclass (``pclass``) modelling.

    :param incl_test: when True, split 80/20 into named train/test frames;
        otherwise ``test`` is None and ``train`` is the full frame.
    :param incl_foldc: when True, add a 3-fold assignment column ``foldc``
        to the training frame.
    :param seed: seed for the split and the fold column; previously this
        was an undefined free variable (NameError when either flag was
        set), now an explicit backward-compatible keyword parameter.
    """
    fr = h2o.import_file(pu.locate("smalldata/titanic/titanic_expanded.csv"), header=1)
    target = "pclass"
    train = fr
    test = None
    if incl_test:
        # Fix: `seed` was an undefined name here; it is now a parameter.
        parts = fr.split_frame(ratios=[.8],
                               destination_frames=["titanic_train", "titanic_test"],
                               seed=seed)
        train = parts[0]
        test = parts[1]
    if incl_foldc:
        train["foldc"] = train.kfold_column(3, seed)
    return pu.ns(train=train, test=test, target=target)
 def test_base_models_can_use_different_compatible_training_frames():
     """Base models trained on different feature subsets (supplied via
     different training_frame objects) should still combine into a
     working stacked ensemble.
     """
     ds = prepare_data(blending)
     gbm_train = ds.train[list(range(1, 11))].cbind(ds.train[ds.y])
     drf_train = ds.train[list(range(13, 20))].cbind(ds.train[ds.y])
     datasets = pu.ns(gbm=ds.extend(x=None, train=gbm_train),
                      drf=ds.extend(x=None, train=drf_train))
     se = train_stacked_ensemble(ds, train_base_models(datasets))
     assert se.auc() > 0
 def test_se_fails_when_base_models_use_incompatible_training_frames():
     """Stacked Ensembles must reject base models whose training frames
     have different row counts.
     """
     ds = prepare_data(blending)
     half = ds.train[0:ds.train.nrows // 2, :]
     datasets = pu.ns(gbm=ds.extend(x=None),
                      drf=ds.extend(x=None, train=half))
     bm = train_base_models(datasets)
     try:
         train_stacked_ensemble(ds, bm)
         assert False, "Stacked Ensembles of models with different training frame sizes should fail"
     except Exception as e:
         expected = "Base models are inconsistent: they use different size (number of rows) training frames"
         assert expected in str(e), "wrong error message: {}".format(str(e))
 def test_base_models_can_use_different_x():
     """Base models may use different feature subsets (same training
     frame); the resulting SE must match one built without restricting x.
     """
     ds = prepare_data(blending)
     datasets = pu.ns(gbm=ds.extend(x=ds.x[1:11]),
                      drf=ds.extend(x=ds.x[13:20]))
     bm = train_base_models(datasets)
     se_with_x = train_stacked_ensemble(ds, bm)
     se_without_x = train_stacked_ensemble(ds.extend(x=None), bm)
     assert se_with_x.auc() > 0
     assert se_with_x.auc() == se_without_x.auc()
# Example no. 15 (listing artifact; score: 0)
    def test_base_models_can_use_different_x():
        """Base models may use different feature subsets (same training
        frame); the resulting SE must match one built without restricting x.
        """
        ds = prepare_data(blending)
        datasets = pu.ns(gbm=ds.extend(x=ds.x[1:11]),
                         drf=ds.extend(x=ds.x[13:20]))
        bm = train_base_models(datasets)
        se_with_x = train_stacked_ensemble(ds, bm)
        se_without_x = train_stacked_ensemble(ds.extend(x=None), bm)
        assert se_with_x.auc() > 0
        assert se_with_x.auc() == se_without_x.auc()
 def setup_data():
     """Sample the MNIST training file into small train/test splits.

     MNIST is a multinomial classification problem; the label is the
     last column, converted to a factor on the training frame only.
     NOTE(review): ``seed`` and ``unique`` are free variables expected
     from the enclosing scope — confirm they are defined before calling.
     """
     full = h2o.import_file(pu.locate("bigdata/laptop/mnist/train.csv.gz"))
     # Small fractions (5% train, 10% test) keep the test fast.
     train, test, _ = full.split_frame(ratios=[.05, .1], seed=seed)
     x = train.columns[:-1]
     y = -1  # label is the last column
     train[y] = train[y].asfactor()
     domain = unique(train[y])
     print(domain)
     return pu.ns(x=x, y=y, train=train, test=test, domain=domain)
def prepare_data(blending=False, seed=0):
    """Load prostate data for AGE regression as train/test namespaces.

    :param blending: when True, split the training frame 70/30 into
        ``train``/``blend`` frames for blended stacked ensembles.
    :param seed: seed for the blending split; previously this was an
        undefined free variable (NameError when ``blending=True``), now an
        explicit backward-compatible keyword parameter. The initial 80/20
        split keeps its original fixed seed of 1.
    """
    col_types = ["numeric", "numeric", "numeric", "enum", "enum", "numeric", "numeric", "numeric", "numeric"]
    dat = h2o.upload_file(path=pu.locate("smalldata/extdata/prostate.csv"),
                          destination_frame="prostate_hex",
                          col_types=col_types)
    train, test = dat.split_frame(ratios=[.8], seed=1)
    x = ["CAPSULE", "GLEASON", "RACE", "DPROS", "DCAPS", "PSA", "VOL"]
    y = "AGE"
    ds = pu.ns(x=x, y=y, train=train, test=test)

    if blending:
        # Fix: `seed` was an undefined name here; it is now a parameter.
        train, blend = train.split_frame(ratios=[.7], seed=seed)
        return ds.extend(train=train, blend=blend)
    else:
        return ds
# Example no. 18 (listing artifact; score: 0)
 def test_base_models_can_use_different_compatible_training_frames():
     """Base models trained on different feature subsets (supplied via
     different training_frame objects) should still combine into a
     working stacked ensemble.
     """
     ds = prepare_data(blending)
     gbm_train = ds.train[list(range(1, 11))].cbind(ds.train[ds.y])
     drf_train = ds.train[list(range(13, 20))].cbind(ds.train[ds.y])
     datasets = pu.ns(gbm=ds.extend(x=None, train=gbm_train),
                      drf=ds.extend(x=None, train=drf_train))
     se = train_stacked_ensemble(ds, train_base_models(datasets))
     assert se.auc() > 0
# Example no. 19 (listing artifact; score: 0)
 def test_se_fails_when_base_models_use_incompatible_training_frames():
     """Stacked Ensembles must reject base models whose training frames
     have different row counts.
     """
     ds = prepare_data(blending)
     half = ds.train[0:ds.train.nrows // 2, :]
     datasets = pu.ns(gbm=ds.extend(x=None),
                      drf=ds.extend(x=None, train=half))
     bm = train_base_models(datasets)
     try:
         train_stacked_ensemble(ds, bm)
         assert False, "Stacked Ensembles of models with different training frame sizes should fail"
     except Exception as e:
         expected = "Base models are inconsistent: they use different size (number of rows) training frames"
         assert expected in str(e), "wrong error message: {}".format(str(e))
def prepare_data(blending=False, seed=0):
    """Load prostate data for AGE regression as train/test namespaces.

    :param blending: when True, split the training frame 70/30 into
        ``train``/``blend`` frames for blended stacked ensembles.
    :param seed: seed for the blending split; previously this was an
        undefined free variable (NameError when ``blending=True``), now an
        explicit backward-compatible keyword parameter. The initial 80/20
        split keeps its original fixed seed of 1.
    """
    col_types = [
        "numeric", "numeric", "numeric", "enum", "enum", "numeric", "numeric",
        "numeric", "numeric"
    ]
    dat = h2o.upload_file(path=pu.locate("smalldata/extdata/prostate.csv"),
                          destination_frame="prostate_hex",
                          col_types=col_types)
    train, test = dat.split_frame(ratios=[.8], seed=1)
    x = ["CAPSULE", "GLEASON", "RACE", "DPROS", "DCAPS", "PSA", "VOL"]
    y = "AGE"
    ds = pu.ns(x=x, y=y, train=train, test=test)

    if blending:
        # Fix: `seed` was an undefined name here; it is now a parameter.
        train, blend = train.split_frame(ratios=[.7], seed=seed)
        return ds.extend(train=train, blend=blend)
    else:
        return ds
 def scores_and_preds(models, test):
     """Collect per-model scores and predictions on *test*.

     Accepts a single model or a list of models. For a list, each result
     is a dict keyed by model key; for a single model the dicts are
     unwrapped to bare values.
     NOTE(review): ``unique`` is a free variable expected from the
     enclosing scope — confirm it is defined before calling.
     """
     single = not isinstance(models, list)
     if single:
         models = [models]
     unwrap = (lambda d: next(iter(d.values()))) if single else (lambda d: d)
     training_scores = {m.key: m.mean_per_class_error() for m in models}
     cv_scores = {m.key: m.mean_per_class_error(xval=True) for m in models}
     test_scores = {m.key: m.model_performance(test).mean_per_class_error()
                    for m in models}
     test_predictions = {m.key: m.predict(test) for m in models}
     test_pclasses = {m.key: unique(test_predictions[m.key]['predict'])
                      for m in models}
     return pu.ns(
         training_scores=unwrap(training_scores),
         cv_scores=unwrap(cv_scores),
         test_scores=unwrap(test_scores),
         test_pclasses=unwrap(test_pclasses),
     )
# Example no. 22 (listing artifact; score: 0)
def import_dataset(seed=None):
    """Load the Australia runoff data and split 80/10/10.

    :param seed: optional split seed for reproducibility, for consistency
        with the other dataset loaders. Defaults to None, which preserves
        the original unseeded (random) split behaviour.
    """
    df = h2o.import_file(path=pu.locate("smalldata/extdata/australia.csv"))
    fr = df.split_frame(ratios=[.8, .1], seed=seed)
    target = "runoffnew"
    return pu.ns(target=target, train=fr[0], valid=fr[1], test=fr[2])