def checkpoint_new_category_in_predictor():
    """Exercise GBM checkpoint restarts against a frame with new predictor levels.

    A 100-tree model is trained on the setosa/versicolor file and then
    continued to 200 trees from its checkpoint. Restarting from the same
    checkpoint on the virginica file is expected to raise
    ``EnvironmentError``. Scoring the continued model on the virginica frame
    is attempted afterwards.
    """
    base_frame = h2o.upload_file(pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv"))
    resume_frame = h2o.upload_file(pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv"))
    virginica = h2o.upload_file(pyunit_utils.locate("smalldata/iris/virginica.csv"))

    from h2o.estimators.gbm import H2OGradientBoostingEstimator

    base_model = H2OGradientBoostingEstimator(ntrees=100)
    base_model.train(x=[0, 1, 2, 4], y=3, training_frame=base_frame)

    resumed_model = H2OGradientBoostingEstimator(ntrees=200, checkpoint=base_model.model_id)
    resumed_model.train([0, 1, 2, 4], y=3, training_frame=resume_frame)

    # Continuing from the checkpoint on data whose categorical predictor has
    # an expanded domain should fail until the proper behavior is decided.
    try:
        rejected_model = H2OGradientBoostingEstimator(ntrees=200, checkpoint=base_model.model_id)
        rejected_model.train(x=[0, 1, 2, 4], y=3, training_frame=virginica)
    except EnvironmentError:
        pass
    else:
        assert False, "Expected continued model-building to fail with new categories introduced in predictor"

    # Scoring observations with the expanded predictor domain is still attempted.
    predictions = resumed_model.predict(virginica)
def shuffling_large():
    """Check that binomial GLM coefficients are stable across reruns and shuffling.

    Three GLMs with identical settings are trained: two on the original
    Arcene file and one on its shuffled counterpart. The coefficient tables
    of the second and the shuffled model are each compared against the first.
    """

    def _assert_same_coefficients(reference, candidate):
        """Assert two models expose matching coefficient tables."""
        ref_rows = reference._model_json['output']['coefficients_table'].cell_values
        cand_rows = candidate._model_json['output']['coefficients_table'].cell_values
        # zip() stops at the shorter table, so a differing row count would
        # otherwise go unnoticed.
        assert len(ref_rows) == len(cand_rows), "coefficient tables should have the same number of rows"
        for x, y in zip(ref_rows, cand_rows):
            assert (type(x[1]) == type(y[1])) and (type(x[2]) == type(y[2])), "coefficients should be the same type"
            # Only float cells are compared numerically.
            for col in (1, 2):
                if isinstance(x[col], float):
                    assert abs(x[col] - y[col]) < 5e-10, "coefficients should be equal"

    print("Reading in Arcene training data for binomial modeling.")
    train_data = h2o.upload_file(path=pyunit_utils.locate("smalldata/arcene/shuffle_test_version/arcene.csv"))
    train_data_shuffled = h2o.upload_file(path=pyunit_utils.locate("smalldata/arcene/shuffle_test_version/arcene_shuffled.csv"))

    print("Create model on original Arcene dataset.")
    h2o_model = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True, alpha=0.5)
    h2o_model.train(x=list(range(1000)), y=1000, training_frame=train_data)

    print("Create second model on original Arcene dataset.")
    h2o_model_2 = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True, alpha=0.5)
    h2o_model_2.train(x=list(range(1000)), y=1000, training_frame=train_data)

    print("Create model on shuffled Arcene dataset.")
    h2o_model_s = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True, alpha=0.5)
    h2o_model_s.train(x=list(range(1000)), y=1000, training_frame=train_data_shuffled)

    print("Assert that number of predictors remaining and their respective coefficients are equal.")
    _assert_same_coefficients(h2o_model, h2o_model_2)
    _assert_same_coefficients(h2o_model, h2o_model_s)
def checkpoint_new_category_in_predictor():
    """Exercise deep learning checkpoint restarts against new predictor levels.

    A 100-epoch model is trained on the setosa/versicolor file and then
    continued to 200 epochs from its checkpoint. Restarting from the same
    checkpoint on the virginica file is expected to raise
    ``EnvironmentError``. Scoring the continued model on the virginica frame
    is attempted afterwards. Progress markers are printed between stages.
    """
    base_frame = h2o.upload_file(pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv"))
    resume_frame = h2o.upload_file(pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv"))
    virginica = h2o.upload_file(pyunit_utils.locate("smalldata/iris/virginica.csv"))
    print("checkpoint_new_category_in_predictor-1")

    base_model = H2ODeepLearningEstimator(epochs=100)
    base_model.train(x=[0, 1, 2, 4], y=3, training_frame=base_frame)

    resumed_model = H2ODeepLearningEstimator(epochs=200, checkpoint=base_model.model_id)
    resumed_model.train(x=[0, 1, 2, 4], y=3, training_frame=resume_frame)
    print("checkpoint_new_category_in_predictor-2")

    # Continuing from the checkpoint on data whose categorical predictor has
    # an expanded domain should fail.
    try:
        rejected_model = H2ODeepLearningEstimator(epochs=200, checkpoint=base_model.model_id)
        rejected_model.train(x=[0, 1, 2, 4], y=3, training_frame=virginica)
    except EnvironmentError:
        pass
    else:
        assert False, "Expected continued model-building to fail with new categories introduced in predictor"

    print("checkpoint_new_category_in_predictor-3")

    # Scoring observations with the expanded predictor domain is still attempted.
    predictions = resumed_model.predict(virginica)
    print("checkpoint_new_category_in_predictor-4")
def shuffling_large(ip,port):
    """Check that binomial GLM coefficients are stable across reruns and shuffling.

    Connects to the H2O instance at ``ip``/``port``, then trains three GLMs
    with identical settings: two on the original Arcene file and one on its
    shuffled counterpart. The coefficient tables of the second and the
    shuffled model are each compared against the first.

    :param ip: address passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """

    def _assert_same_coefficients(reference, candidate):
        """Assert two models expose matching coefficient tables."""
        ref_rows = reference._model_json['output']['coefficients_table'].cell_values
        cand_rows = candidate._model_json['output']['coefficients_table'].cell_values
        # zip() stops at the shorter table, so a differing row count would
        # otherwise go unnoticed.
        assert len(ref_rows) == len(cand_rows), "coefficient tables should have the same number of rows"
        for x, y in zip(ref_rows, cand_rows):
            assert (type(x[1]) == type(y[1])) and (type(x[2]) == type(y[2])), "coefficients should be the same type"
            # Only float cells are compared numerically.
            for col in (1, 2):
                if isinstance(x[col], float):
                    assert abs(x[col] - y[col]) < 5e-10, "coefficients should be equal"

    # Connect to h2o
    h2o.init(ip,port)

    print("Reading in Arcene training data for binomial modeling.")
    train_data = h2o.upload_file(path=h2o.locate("smalldata/arcene/shuffle_test_version/arcene.csv"))
    train_data_shuffled = h2o.upload_file(path=h2o.locate("smalldata/arcene/shuffle_test_version/arcene_shuffled.csv"))

    print("Create model on original Arcene dataset.")
    h2o_model = h2o.glm(x=train_data[0:1000], y=train_data[1000], family="binomial", lambda_search=True, alpha=[0.5], use_all_factor_levels=True)

    print("Create second model on original Arcene dataset.")
    h2o_model_2 = h2o.glm(x=train_data[0:1000], y=train_data[1000], family="binomial", lambda_search=True, alpha=[0.5], use_all_factor_levels=True)

    print("Create model on shuffled Arcene dataset.")
    h2o_model_s = h2o.glm(x=train_data_shuffled[0:1000], y=train_data_shuffled[1000], family="binomial", lambda_search=True, alpha=[0.5], use_all_factor_levels=True)

    print("Assert that number of predictors remaining and their respective coefficients are equal.")
    _assert_same_coefficients(h2o_model, h2o_model_2)
    _assert_same_coefficients(h2o_model, h2o_model_s)
# Example #5
# 0
def milsong_checkpoint(ip,port):
    """Build a GBM, save/reload it, and continue training from the checkpoint.

    A first model is trained with randomly drawn ``ntrees``, ``max_depth`` and
    ``min_rows``. It is saved, reloaded, and the saved directory is removed.
    A second model continues from the restored model with 50 more trees, and
    a third model with the same final settings is built from scratch.

    :param ip: unused by this function.
    :param port: unused by this function.
    """
    milsong_train = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
    distribution = "gaussian"

    # build first model
    ntrees1 = random.sample(range(50,100),1)[0]
    max_depth1 = random.sample(range(2,6),1)[0]
    min_rows1 = random.sample(range(10,16),1)[0]
    # print() with a single argument behaves the same on Python 2 and 3.
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    model1 = h2o.gbm(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees1,max_depth=max_depth1, min_rows=min_rows1,
                     distribution=distribution,validation_x=milsong_valid[1:],validation_y=milsong_valid[0])

    # save the model, then load the model
    model_path = h2o.save_model(model1, name="delete_model", force=True)
    restored_model = h2o.load_model(model_path)
    shutil.rmtree("delete_model")

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model2 = h2o.gbm(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees2,max_depth=max_depth2, min_rows=min_rows2,
                     distribution=distribution,validation_x=milsong_valid[1:],validation_y=milsong_valid[0],
                     checkpoint=restored_model._id)

    # build the equivalent of model 2 in one shot
    model3 = h2o.gbm(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees2,max_depth=max_depth2, min_rows=min_rows2,
                     distribution=distribution,validation_x=milsong_valid[1:],validation_y=milsong_valid[0])
def glrm_iris():
    """Train a GLRM on iris and check the warning emitted when scoring bad column names.

    The model is trained on ``iris_wheader.csv`` and used to predict on
    ``iris_wheader_bad_cnames.csv`` while ``sys.stderr`` is captured. Every
    captured warning must mention both ``UserWarning`` and ``missing column``.
    """
    print("Importing iris_wheader.csv data...")
    irisH2O = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    irisTest = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader_bad_cnames.csv"))

    rank = 3
    gx = 0.5
    gy = 0.5
    trans = "STANDARDIZE"
    print("H2O GLRM with rank k = " + str(rank) + ", gamma_x = " + str(gx) + ", gamma_y = " + str(
        gy) + ", transform = " + trans)
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=rank, loss="Quadratic", gamma_x=gx, gamma_y=gy, transform=trans)
    glrm_h2o.train(x=irisH2O.names, training_frame=irisH2O)

    print("Impute original data from XY decomposition")  # and expect warnings
    buffer = StringIO()     # collects warning messages for later analysis
    previous_stderr = sys.stderr
    sys.stderr = buffer
    try:
        h2o_pred = glrm_h2o.predict(irisTest)
    finally:
        # Always undo the redirection, even when predict() raises.
        sys.stderr = previous_stderr

    warn_phrase = "UserWarning"
    warn_string_of_interest = "missing column"

    # Python 2's StringIO keeps each write in ``buflist``; io.StringIO has no
    # such attribute. Probing for it avoids a bare ``except`` that would also
    # swallow assertion failures.
    captured_chunks = getattr(buffer, "buflist", None)
    if captured_chunks is None:     # for python 3.
        warns = buffer.getvalue()
        print("*** captured warning message: {0}".format(warns))
        assert (warn_phrase in warns) and (warn_string_of_interest in warns)
    else:                           # for python 2.7
        for chunk in captured_chunks:
            print("*** captured warning message: {0}".format(chunk))
            assert (warn_phrase in chunk) and (warn_string_of_interest in chunk)
def milsong_checkpoint():
    """Build a GBM, save/reload it, and continue training from the checkpoint.

    A first model is trained with randomly drawn ``ntrees``, ``max_depth`` and
    ``min_rows``, saved into the located ``results`` directory and reloaded.
    A second model continues from the restored model with 50 more trees, and
    a third model with the same final settings is built from scratch.
    """
    milsong_train = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
    distribution = "gaussian"

    # build first model
    ntrees1 = random.sample(range(50,100),1)[0]
    max_depth1 = random.sample(range(2,6),1)[0]
    min_rows1 = random.sample(range(10,16),1)[0]
    # print() with a single argument behaves the same on Python 2 and 3.
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))

    # Every column except the first is a predictor; column 0 is the response.
    # list() keeps this a real list on Python 3 as well.
    predictors = list(range(1, milsong_train.ncol))

    model1 = H2OGradientBoostingEstimator(ntrees=ntrees1,
                                          max_depth=max_depth1,
                                          min_rows=min_rows1,
                                          distribution=distribution)
    model1.train(x=list(predictors),
                 y=0,
                 training_frame=milsong_train,
                 validation_frame=milsong_valid)

    # save the model, then load the model
    path = pyunit_utils.locate("results")

    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(model1, path=path, force=True)

    assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path)
    restored_model = h2o.load_model(model_path)

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model2 = H2OGradientBoostingEstimator(ntrees=ntrees2,
                                          max_depth=max_depth2,
                                          min_rows=min_rows2,
                                          distribution=distribution,
                                          checkpoint=restored_model.model_id)
    model2.train(x=list(predictors),
                 y=0,
                 training_frame=milsong_train,
                 validation_frame=milsong_valid)

    # build the equivalent of model 2 in one shot
    model3 = H2OGradientBoostingEstimator(ntrees=ntrees2,
                                          max_depth=max_depth2,
                                          min_rows=min_rows2,
                                          distribution=distribution)

    model3.train(x=list(predictors),
                 y=0,
                 training_frame=milsong_train,
                 validation_frame=milsong_valid)
def pub_444_spaces_in_filenames():
    """Upload CSV files whose names contain spaces and train a GBM on each.

    Three small files are written next to the located ``./`` directory (two
    with a header row, one without), uploaded, and used to fit a one-tree
    bernoulli GBM. The files are removed afterwards, including when a step
    fails.
    """

    def _write_rows(path, header):
        """Write an optional header followed by 30 generated data rows."""
        # ``with`` guarantees the handle is closed even if a write fails.
        with open(path, "w") as out:
            if header is not None:
                out.write(header)
            for _ in range(10):
                out.write("1, a\n")
                out.write("0, b\n")
                out.write("1, a\n" if random.randint(0,1) else "0, b\n")

    # The files are written to the current directory rather than smalldata/jira/;
    # no dedicated temporary/sandbox directory is used here.
    tempdir = "./"

    # (file name, header line or None); the last file deliberately has no header.
    specs = (
        ("foo .csv", "response, predictor\n"),
        ("b a r .csv", "response, predictor\n"),
        (" ba z.csv", None),
    )

    created = []
    try:
        # make a few files with spaces in the name
        for file_name, header in specs:
            path = pyunit_utils.locate(tempdir) + file_name
            created.append(path)
            _write_rows(path, header)

        train_data = h2o.upload_file(path=pyunit_utils.locate(tempdir + "foo .csv"))
        train_data.show()
        train_data.describe()
        train_data["response"] = train_data["response"].asfactor()
        gbm = H2OGradientBoostingEstimator(ntrees=1, distribution="bernoulli", min_rows=1)
        gbm.train(x=list(range(1,train_data.ncol)), y="response", training_frame=train_data)
        gbm.show()

        train_data = h2o.upload_file(path=pyunit_utils.locate(tempdir + "b a r .csv"))
        train_data.show()
        train_data.describe()
        train_data["response"] = train_data["response"].asfactor()

        gbm = H2OGradientBoostingEstimator(ntrees=1, distribution="bernoulli", min_rows=1)
        gbm.train(x=1, y="response", training_frame=train_data)

        gbm.show()

        train_data = h2o.upload_file(path=pyunit_utils.locate(tempdir + " ba z.csv"))
        train_data.show()
        train_data.describe()
        train_data[0]=train_data[0].asfactor()
        gbm = H2OGradientBoostingEstimator(ntrees=1, distribution="bernoulli", min_rows=1)
        gbm.train(x=1, y=0, training_frame=train_data)
        gbm.show()
    finally:
        # Clean up the generated files even when an upload or training step fails.
        for path in created:
            if os.path.exists(path):
                os.remove(path)
def deeplearning_autoencoder():
    """Use autoencoder deep features as inputs to a random forest on MNIST.

    The training file is split in two by a uniform random column: one half
    trains a Tanh autoencoder, the other half is projected through the
    autoencoder's hidden layer and used to train a random forest. The forest
    is then evaluated on the projected test file and one confusion-matrix
    cell is compared against a fixed expected value.
    """
    resp = 784        # index of the label column
    nfeatures = 20    # number of features (smallest hidden layer)

    train_hex = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/mnist/train.csv.gz"))
    train_hex[resp] = train_hex[resp].asfactor()

    test_hex = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/mnist/test.csv.gz"))
    test_hex[resp] = test_hex[resp].asfactor()

    # Uniform random column used to split the training rows into two parts.
    split_key = train_hex[0].runif(1234)

    # Rows for the unsupervised autoencoder. The return value of drop() is
    # not used; the model below only reads columns 0..resp-1 anyway.
    train_unsupervised = train_hex[split_key >= 0.5]
    train_unsupervised.drop(resp)
    train_unsupervised.describe()

    # Rows for the supervised random forest.
    train_supervised = train_hex[split_key < 0.5]
    train_supervised.describe()

    # Train the autoencoder.
    ae_model = h2o.deeplearning(
        x=train_unsupervised[0:resp],
        activation="Tanh",
        autoencoder=True,
        hidden=[nfeatures],
        epochs=1,
        reproducible=True,  # slow, turn off for real problems
        seed=1234,
    )

    # Convert train_supervised to the lower-dimensional space of hidden layer 0.
    train_supervised_features = ae_model.deepfeatures(train_supervised[0:resp], 0)

    assert train_supervised_features.ncol == nfeatures, "Dimensionality of reconstruction is wrong!"

    # Train DRF on the extracted feature space.
    drf_model = h2o.random_forest(
        x=train_supervised_features[0:nfeatures],
        y=train_supervised[resp],
        ntrees=10,
        min_rows=10,
        seed=1234,
    )

    # Evaluate the DRF model on the test set (processed through deep features).
    test_features = ae_model.deepfeatures(test_hex[0:resp], 0)
    test_features = test_features.cbind(test_hex[resp])

    cm = drf_model.confusion_matrix(test_features)
    cm.show()

    # Expected value 0.086 +/- 0.001 (presumably the overall error cell of the
    # confusion matrix -- confirm against the table layout).
    observed = cm.cell_values[10][10]
    assert abs(observed - 0.086) < 0.001, "Error. Expected 0.086, but got {0}".format(observed)
def pub_444_spaces_in_filenames(ip,port):
    """Upload CSV files whose names contain spaces and train a GBM on each.

    Three small files are written next to the located ``./`` directory (two
    with a header row, one without), uploaded, and used to fit a one-tree
    bernoulli GBM. The files are removed afterwards, including when a step
    fails.

    :param ip: unused by this function.
    :param port: unused by this function.
    """

    def _write_rows(path, header):
        """Write an optional header followed by 30 generated data rows."""
        # ``with`` guarantees the handle is closed even if a write fails.
        with open(path, "w") as out:
            if header is not None:
                out.write(header)
            for _ in range(10):
                out.write("1, a\n")
                out.write("0, b\n")
                out.write("1, a\n" if random.randint(0,1) else "0, b\n")

    # The files are written to the current directory rather than smalldata/jira/;
    # no dedicated temporary/sandbox directory is used here.
    tempdir = "./"

    # (file name, header line or None); the last file deliberately has no header.
    specs = (
        ("foo .csv", "response, predictor\n"),
        ("b a r .csv", "response, predictor\n"),
        (" ba z.csv", None),
    )

    created = []
    try:
        # make a few files with spaces in the name
        for file_name, header in specs:
            path = h2o.locate(tempdir) + file_name
            created.append(path)
            _write_rows(path, header)

        train_data = h2o.upload_file(path=h2o.locate(tempdir + "foo .csv"))
        train_data.show()
        train_data.describe()
        gbm = h2o.gbm(x=train_data[1:], y=train_data["response"].asfactor(), ntrees=1, distribution="bernoulli", min_rows=1)
        gbm.show()

        train_data = h2o.upload_file(path=h2o.locate(tempdir + "b a r .csv"))
        train_data.show()
        train_data.describe()
        gbm = h2o.gbm(x=train_data[1:], y=train_data["response"].asfactor(), ntrees=1, distribution="bernoulli", min_rows=1)
        gbm.show()

        train_data = h2o.upload_file(path=h2o.locate(tempdir + " ba z.csv"))
        train_data.show()
        train_data.describe()
        gbm = h2o.gbm(x=train_data[1:], y=train_data[0].asfactor(), ntrees=1, distribution="bernoulli", min_rows=1)
        gbm.show()
    finally:
        # Clean up the generated files even when an upload or training step fails.
        for path in created:
            if os.path.exists(path):
                os.remove(path)
def deeplearning_autoencoder():
    """Use autoencoder deep features as inputs to a random forest on MNIST.

    The training file is split in two by a uniform random column: one half
    trains a Tanh autoencoder, the other half is projected through the
    autoencoder's hidden layer, joined with its label column, and used to
    train a random forest. The forest is then evaluated on the projected test
    file and one confusion-matrix cell is compared against a fixed value.
    """
    resp = 784        # index of the label column
    nfeatures = 20    # number of features (smallest hidden layer)

    train_hex = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/mnist/train.csv.gz"))
    train_hex[resp] = train_hex[resp].asfactor()

    test_hex = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/mnist/test.csv.gz"))
    test_hex[resp] = test_hex[resp].asfactor()

    # Uniform random column used to split the training rows into two parts.
    split_key = train_hex[0].runif(0)

    # Rows for the unsupervised autoencoder, with the label column popped off.
    train_unsupervised = train_hex[split_key >= 0.5]
    train_unsupervised.pop(resp)

    # Rows for the supervised random forest.
    train_supervised = train_hex[split_key < 0.5]

    # Train the autoencoder on the first ``resp`` columns.
    ae_model = H2OAutoEncoderEstimator(
        activation="Tanh",
        hidden=[nfeatures],
        epochs=1,
        reproducible=True,
        seed=1234,
    )
    ae_model.train(list(range(resp)), training_frame=train_unsupervised)

    # Convert train_supervised to the lower-dimensional space of hidden layer 0.
    train_supervised_features = ae_model.deepfeatures(train_supervised[0:resp], 0)

    assert train_supervised_features.ncol == nfeatures, "Dimensionality of reconstruction is wrong!"

    # Append the label column so it becomes the last column of the frame.
    train_supervised_features = train_supervised_features.cbind(train_supervised[resp])

    # Train DRF on the extracted feature space; the response is the last column.
    drf_model = H2ORandomForestEstimator(ntrees=10, min_rows=10, seed=1234)
    label_index = train_supervised_features.ncol - 1
    drf_model.train(x=list(range(nfeatures)), y=label_index, training_frame=train_supervised_features)

    # Evaluate the DRF model on the test set (processed through deep features).
    test_features = ae_model.deepfeatures(test_hex[0:resp], 0)
    test_features = test_features.cbind(test_hex[resp])

    cm = drf_model.confusion_matrix(test_features)
    cm.show()

    # Expected value 0.0880 +/- 0.001; compare to
    # runit_deeplearning_autoencoder_large.py
    observed = cm.cell_values[10][10]
    assert abs(observed - 0.0880) < 0.001, "Error. Expected 0.0880, but got {0}".format(observed)
def glrm_arrests_miss():
    """Fit GLRM models on USArrests with increasing fractions of missing data.

    For each ratio from 0.1 to 0.9, a fresh copy of the data gets missing
    values inserted and a rank-4 GLRM is trained with the full data as the
    validation frame. Error metrics and non-missing counts reported by the
    model are checked, and a per-ratio summary is printed at the end.
    """
    missing_ratios = np.arange(0.1, 1, 0.1).tolist()

    print("Importing USArrests.csv data and saving for validation...")
    arrests_full = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrests_full.describe()
    totobs = arrests_full.nrow * arrests_full.ncol

    # One entry per missing ratio, filled in loop order.
    train_err = []
    valid_err = []

    for ratio in missing_ratios:
        print("Importing USArrests.csv and inserting {0}% missing entries".format(100*ratio))
        arrests_miss = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
        arrests_miss = arrests_miss.insert_missing_values(fraction=ratio)
        arrests_miss.describe()

        print("H2O GLRM with {0}% missing entries".format(100*ratio))
        arrests_glrm = H2OGeneralizedLowRankEstimator(
            k=4,
            ignore_const_cols=False,
            loss="Quadratic",
            regularization_x="None",
            regularization_y="None",
            init="PlusPlus",
            max_iterations=10,
            min_step_size=1e-6,
        )
        arrests_glrm.train(x=arrests_miss.names, training_frame=arrests_miss, validation_frame=arrests_full)
        arrests_glrm.show()

        # Check imputed data and error metrics
        output = arrests_glrm._model_json['output']
        glrm_obj = output['objective']
        train_metrics = output['training_metrics']._metric_json
        train_numerr = train_metrics['numerr']
        train_caterr = train_metrics['caterr']
        valid_metrics = output['validation_metrics']._metric_json
        valid_numerr = valid_metrics['numerr']
        valid_caterr = valid_metrics['caterr']
        assert abs(train_numerr - glrm_obj) < 1e-3, "Numeric error on training data was " + str(train_numerr) + " but should equal final objective " + str(glrm_obj)
        assert train_caterr == 0, "Categorical error on training data was " + str(train_caterr) + " but should be zero"
        assert valid_caterr == 0, "Categorical error on validation data was " + str(valid_caterr) + " but should be zero"

        train_numcnt = train_metrics['numcnt']
        valid_numcnt = valid_metrics['numcnt']
        assert valid_numcnt > train_numcnt, "Number of non-missing numerical entries in training data should be less than validation data"
        assert valid_numcnt == totobs, "Number of non-missing numerical entries in validation data was " + str(valid_numcnt) + " but should be " + str(totobs)

        train_err.append(train_numerr)
        valid_err.append(valid_numerr)
        # h2o.remove(arrests_glrm._model_json['output']['loading_key']['name'])

    for ratio, train_value, valid_value in zip(missing_ratios, train_err, valid_err):
        print("Missing ratio: {0}% --> Training error: {1}\tValidation error: {2}".format(ratio*100, train_value, valid_value))
def javapredict_smallcat():
    """Run the ``tests.javapredict`` check for deep learning on the iris subsets.

    Trains on the setosa/versicolor file and tests on the virginica file,
    using columns 0, 1, 2 and 4 as predictors and column 3 as the response.
    """
    # optional parameters
    params = {'epochs':100}
    # print() with a single argument behaves the same on Python 2 and 3.
    print("Parameter list:")
    for k, v in params.items():
        print("{0}, {1}".format(k,v))

    train = h2o.upload_file(tests.locate("smalldata/iris/setosa_versicolor.csv"))
    test = h2o.upload_file(tests.locate("smalldata/iris/virginica.csv"))
    x = [0,1,2,4]
    y = 3

    tests.javapredict("deeplearning", "numeric", train, test, x, y, **params)
def javapredict_smallcat():
    """Run the ``tests.javapredict`` check for random forest on the iris subsets.

    Trains on the setosa/versicolor file and tests on the virginica file,
    using columns 0, 1, 2 and 4 as predictors and column 3 as the response.
    """
    # optional parameters
    params = {'ntrees':100, 'max_depth':5, 'min_rows':10}
    # print() with a single argument behaves the same on Python 2 and 3.
    print("Parameter list:")
    for k, v in params.items():
        print("{0}, {1}".format(k,v))

    train = h2o.upload_file(h2o.locate("smalldata/iris/setosa_versicolor.csv"))
    test = h2o.upload_file(h2o.locate("smalldata/iris/virginica.csv"))
    x = [0,1,2,4]
    y = 3

    tests.javapredict("random_forest", "numeric", train, test, x, y, **params)
def checkpoint_new_category_in_response():
    """Check that a checkpoint restart is rejected when the response gains new levels.

    A deep learning model is trained on the setosa/versicolor file with
    column 4 as the response. Continuing from its checkpoint on the full iris
    file is expected to raise ``EnvironmentError``.
    """
    two_class = h2o.upload_file(pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv"))
    all_classes = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris.csv"))

    base_model = h2o.deeplearning(x=two_class[[0,1,2,3]], y=two_class[4], epochs=100)

    # Continuing with an expanded categorical response domain should fail.
    try:
        continued = h2o.deeplearning(
            x=all_classes[[0,1,2,3]],
            y=all_classes[4],
            epochs=200,
            checkpoint=base_model.model_id,
        )
    except EnvironmentError:
        pass
    else:
        assert False, "Expected continued model-building to fail with new categories introduced in response"
def pub_444_spaces_in_filenames(ip,port):
    """Upload CSV files whose names contain spaces and train a GBM on each.

    After connecting to H2O, three small files are written under the located
    ``smalldata/jira/`` directory (two with a header row, one without),
    uploaded, and used to fit a one-tree bernoulli GBM. The files are removed
    afterwards, including when a step fails.

    :param ip: address passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """

    def _write_rows(path, header):
        """Write an optional header followed by 30 generated data rows."""
        # ``with`` guarantees the handle is closed even if a write fails.
        with open(path, "w") as out:
            if header is not None:
                out.write(header)
            for _ in range(10):
                out.write("1, a\n")
                out.write("0, b\n")
                out.write("1, a\n" if random.randint(0,1) else "0, b\n")

    # Connect to h2o
    h2o.init(ip,port)

    # (file name, header line or None); the last file deliberately has no header.
    specs = (
        ("foo .csv", "response, predictor\n"),
        ("b a r .csv", "response, predictor\n"),
        (" ba z.csv", None),
    )

    created = []
    try:
        # make a few files with spaces in the name
        for file_name, header in specs:
            path = h2o.locate("smalldata/jira/") + file_name
            created.append(path)
            _write_rows(path, header)

        train_data = h2o.upload_file(path=h2o.locate("smalldata/jira/foo .csv"))
        train_data.show()
        train_data.describe()
        gbm = h2o.gbm(x=train_data[1:], y=train_data["response"].asfactor(), ntrees=1, distribution="bernoulli", min_rows=1)
        gbm.show()

        train_data = h2o.upload_file(path=h2o.locate("smalldata/jira/b a r .csv"))
        train_data.show()
        train_data.describe()
        gbm = h2o.gbm(x=train_data[1:], y=train_data["response"].asfactor(), ntrees=1, distribution="bernoulli", min_rows=1)
        gbm.show()

        train_data = h2o.upload_file(path=h2o.locate("smalldata/jira/ ba z.csv"))
        train_data.show()
        train_data.describe()
        gbm = h2o.gbm(x=train_data[1:], y=train_data[0].asfactor(), ntrees=1, distribution="bernoulli", min_rows=1)
        gbm.show()
    finally:
        # Clean up the generated files even when an upload or training step fails.
        for path in created:
            if os.path.exists(path):
                os.remove(path)
def colname_set_basic(ip,port):
    """Copy column names from a headered iris frame onto a headerless one.

    Both iris files are uploaded, their column names printed, and the names
    of the headered frame are applied to the other via ``setNames``. The two
    name lists must then be equal.

    :param ip: unused by this function.
    :param port: unused by this function.
    """
    # print() with a single argument behaves the same on Python 2 and 3.
    print("Uploading iris data...")

    no_headers = h2o.upload_file(h2o.locate("smalldata/iris/iris.csv"))
    headers_and = h2o.upload_file(h2o.locate("smalldata/iris/iris_header.csv"))

    print(no_headers.names)
    print(headers_and.names)

    no_headers.setNames(headers_and.names)
    assert no_headers.names == headers_and.names, "Expected the same column names but got {0} and {1}".\
        format(no_headers.names, headers_and.names)
def colname_set_basic():
    """Copy column names from a headered iris frame onto a headerless one.

    Both iris files are uploaded, their column names printed, and the names
    of the headered frame are applied to the other via ``set_names``. The two
    name lists must then be equal.
    """
    print("Uploading iris data...")

    headerless = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris.csv"))
    with_header = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_header.csv"))

    for frame in (headerless, with_header):
        print(frame.names)

    headerless.set_names(with_header.names)
    assert headerless.names == with_header.names, (
        "Expected the same column names but got {0} and {1}".format(headerless.names, with_header.names)
    )
def glrm_prostate_miss():
    """Fit GLRM models on prostate_cat with increasing fractions of missing data.

    The full data is uploaded once and its non-missing entry count computed.
    For each ratio from 0.1 to 0.9, a fresh copy gets missing values inserted
    and a rank-8 GLRM is trained with the full data as the validation frame.
    The non-missing counts reported by the model are checked, and numeric and
    categorical errors are printed per ratio at the end.
    """
    missing_ratios = np.arange(0.1, 1, 0.1).tolist()

    print("Importing prostate_cat.csv data and saving for validation...")
    prostate_full = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"), na_strings=["NA"]*8)
    prostate_full.describe()

    # Count the missing entries column by column, then derive the observed total.
    totnas = sum(prostate_full[col].isna().sum() for col in range(prostate_full.ncol))
    totobs = prostate_full.nrow * prostate_full.ncol - totnas

    # One entry per missing ratio, filled in loop order.
    train_numerr = []
    valid_numerr = []
    train_caterr = []
    valid_caterr = []

    for ratio in missing_ratios:
        print("Importing prostate_cat.csv and inserting {0}% missing entries".format(100*ratio))
        prostate_miss = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
        prostate_miss = prostate_miss.insert_missing_values(fraction=ratio)
        prostate_miss.describe()

        print("H2O GLRM with {0}% missing entries".format(100*ratio))
        prostate_glrm = h2o.glrm(x=prostate_miss, validation_frame=prostate_full, k=8, ignore_const_cols=False, loss="Quadratic", gamma_x=0.5, gamma_y=0.5, regularization_x="L1", regularization_y="L1", init="SVD", max_iterations=2000, min_step_size=1e-6)
        prostate_glrm.show()

        # Check imputed data and error metrics
        output = prostate_glrm._model_json['output']
        train_metrics = output['training_metrics']._metric_json
        valid_metrics = output['validation_metrics']._metric_json

        train_numcnt = train_metrics['numcnt']
        valid_numcnt = valid_metrics['numcnt']
        train_catcnt = train_metrics['catcnt']
        valid_catcnt = valid_metrics['catcnt']
        assert valid_numcnt >= train_numcnt, "Number of non-missing numeric entries in training data should be less than or equal to validation data"
        assert valid_catcnt >= train_catcnt, "Number of non-missing categorical entries in training data should be less than or equal to validation data"
        assert (train_numcnt + valid_numcnt) < totobs, "Total non-missing numeric entries in training and validation data was {0}, but should be less than {1}".format(train_numcnt + valid_numcnt, totobs)
        assert (valid_numcnt + valid_catcnt) == totobs, "Number of non-missing entries in validation data was {0}, but should be {1}".format(valid_numcnt + valid_catcnt, totobs)

        train_numerr.append(train_metrics['numerr'])
        valid_numerr.append(valid_metrics['numerr'])
        train_caterr.append(train_metrics['caterr'])
        valid_caterr.append(valid_metrics['caterr'])
        h2o.remove(output['representation_name'])

    for ratio, train_value, valid_value in zip(missing_ratios, train_numerr, valid_numerr):
        print("Missing ratio: {0}% --> Training numeric error: {1}\tValidation numeric error: {2}".format(ratio*100, train_value, valid_value))

    for ratio, train_value, valid_value in zip(missing_ratios, train_caterr, valid_caterr):
        print("Missing ratio: {0}% --> Training categorical error: {1}\tValidation categorical error: {2}".format(ratio*100, train_value, valid_value))
# Example #20
# 0
def milsong_checkpoint():
    """Check that a checkpoint-restarted random forest matches a one-shot build.

    Trains a random forest on the milsongs data, saves it to disk and loads it
    back, continues training from the restored model with 50 additional trees,
    and asserts that the validation MSE equals that of a model trained in one
    shot with the same final parameters and seed.
    """
    milsong_train = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))

    # Column 0 is used as the response; all remaining columns are predictors.
    # A concrete list (rather than a bare range object) matches how other tests
    # in this file pass column indices to train().
    predictors = list(range(1, milsong_train.ncol))

    # build first model
    ntrees1 = random.sample(range(50, 100), 1)[0]
    max_depth1 = random.sample(range(2, 6), 1)[0]
    min_rows1 = random.sample(range(10, 16), 1)[0]
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    model1 = H2ORandomForestEstimator(ntrees=ntrees1, max_depth=max_depth1, min_rows=min_rows1, seed=1234)
    model1.train(x=predictors, y=0, training_frame=milsong_train, validation_frame=milsong_valid)

    # save the model, then load the model
    path = pyunit_utils.locate("results")

    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(model1, path=path, force=True)

    assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path)
    restored_model = h2o.load_model(model_path)

    # continue building the model: same shape parameters, 50 more trees
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))

    model2 = H2ORandomForestEstimator(
        ntrees=ntrees2, max_depth=max_depth2, min_rows=min_rows2, checkpoint=restored_model._id, seed=1234
    )
    model2.train(x=predictors, y=0, training_frame=milsong_train, validation_frame=milsong_valid)

    # build the equivalent of model 2 in one shot
    model3 = H2ORandomForestEstimator(ntrees=ntrees2, max_depth=max_depth2, min_rows=min_rows2, seed=1234)
    model3.train(x=predictors, y=0, training_frame=milsong_train, validation_frame=milsong_valid)

    assert isinstance(model2, type(model3))
    mse_checkpointed = model2.mse(valid=True)
    mse_one_shot = model3.mse(valid=True)
    # The message names model 3, the model that is actually being compared.
    assert mse_checkpointed == mse_one_shot, \
        "Expected Model 2 MSE: {0} to be the same as Model 3 MSE: {1}".format(mse_checkpointed, mse_one_shot)
def offsets_and_distributions(ip,port):
    """Smoke-test deep learning with an offset column for several distributions.

    Builds one model per distribution on the insurance data and scores it on
    the same frame; nothing is asserted about the predictions. ``ip`` and
    ``port`` are accepted but not used in the body.
    """
    # cars: drop rows without a response, make the response categorical and
    # append a constant offset column named "x1"
    cars = h2o.upload_file(h2o.locate("smalldata/junit/cars_20mpg.csv"))
    response_present = cars["economy_20mpg"].isna() == 0
    cars = cars[response_present]
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    offset = h2o.H2OFrame(python_obj=[[.5] for _ in range(398)])
    offset.setNames(["x1"])
    cars = cars.cbind(offset)

    # insurance: the offset is the log of the "Holders" column
    insurance = h2o.import_file(h2o.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    # bernoulli - offset not supported, so the cars frame is not modelled here:
    #dl = h2o.deeplearning(x=cars[2:8], y=cars["economy_20mpg"], distribution="bernoulli", offset_column="x1",
    #                       training_frame=cars)
    #predictions = dl.predict(cars)

    # gamma, gaussian and poisson are all driven with frame-valued x / y
    for family in ("gamma", "gaussian", "poisson"):
        dl = h2o.deeplearning(x=insurance[0:3], y=insurance["Claims"], distribution=family,
                              offset_column="offset", training_frame=insurance)
        predictions = dl.predict(insurance)

    # tweedie is driven with column names instead of frames
    dl = h2o.deeplearning(x=insurance.names[0:3], y="Claims", distribution="tweedie",
                          offset_column="offset", training_frame=insurance)
    predictions = dl.predict(insurance)
def glrm_subset():
  """Fit a single-iteration GLRM on the ACS census data and print the
  first two representation columns for six selected ZCTA codes."""
  census_zip = pyunit_utils.locate("bigdata/laptop/census/ACS_13_5YR_DP02_cleaned.zip")
  column_types = ['enum'] + ['numeric']*149
  acs_orig = h2o.upload_file(path=census_zip, col_types=column_types)

  # The ZCTA5 identifier column is excluded from the model input.
  acs_full = acs_orig.drop("ZCTA5")
  glrm_settings = dict(k=10,
                       transform='STANDARDIZE',
                       loss='Quadratic',
                       regularization_x='Quadratic',
                       regularization_y='L1',
                       gamma_x=0.25,
                       gamma_y=0.5,
                       max_iterations=1)
  acs_model = H2OGeneralizedLowRankEstimator(**glrm_settings)
  acs_model.train(x=acs_full.names, training_frame=acs_full)

  representation_key = acs_model._model_json['output']['representation_name']
  zcta_arch_x = h2o.get_frame(representation_key)
  print(zcta_arch_x)

  acs_zcta_col = acs_orig["ZCTA5"].asfactor()

  # Place labels are carried over from the original comments.
  # NOTE(review): the label for 11219 looks doubtful - verify.
  selected_zctas = [
    '10065',   # Manhattan, NY (Upper East Side)
    '11219',   # Manhattan, NY (East Harlem)
    '66753',   # McCune, KS
    '84104',   # Salt Lake City, UT
    '94086',   # Sunnyvale, CA
    '95014',   # Cupertino, CA
  ]
  # OR the per-code equality masks together, left to right.
  idx = (acs_zcta_col == selected_zctas[0])
  for zcta in selected_zctas[1:]:
    idx = idx | (acs_zcta_col == zcta)

  print(zcta_arch_x[idx,[0,1]])
예제 #23
0
def test4():
    """Verify that split_frame assigns the requested destination frame ids."""
    df = h2o.upload_file(pyunit_utils.locate("smalldata/jira/pubdev_2020.csv"))
    splits = df.split_frame(ratios=[0.8], destination_frames=["myf0", "myf1"])
    # Each part must carry the id requested at the same position.
    for position, wanted_id in ((0, "myf0"), (1, "myf1")):
        part = splits[position]
        assert part.frame_id == wanted_id
def link_functions_tweedie_vpow():
  """Fit tweedie GLMs for several variance powers and compare the deviances
  against hard-coded reference values obtained from R."""
  # Load example data from HDtweedie, y = aggregate claim loss
  hdf = h2o.upload_file(pyunit_utils.locate("smalldata/glm_test/auto.csv"))
  y = "y"
  x = list(set(hdf.names) - set(["y"]))

  print("Testing for family: TWEEDIE")
  print("Create models with canonical link: TWEEDIE")
  # One row per variance power: (variance power, R residual/null ratio, R null deviance)
  r_reference = [
    (0, 0.7516627, 221051.88369951),
    (1, 0.6708826, 32296.29783702),
    (1.5, 0.7733762, 20229.47425307),
  ]
  for vpow, r_ratio, r_null_deviance in r_reference:
    print("Fit h2o.glm:")
    glm_settings = dict(family="tweedie",
                        link="tweedie",
                        tweedie_variance_power=vpow,
                        tweedie_link_power=1-vpow,
                        alpha=0.5,
                        Lambda=0)
    h2ofit = H2OGeneralizedLinearEstimator(**glm_settings)
    h2ofit.train(x=x, y=y, training_frame=hdf)

    print("Testing Tweedie variance power: {0}".format(vpow))

    print("Compare model deviances for link function tweedie")
    deviance_h2o_tweedie = old_div(h2ofit.residual_deviance(), h2ofit.null_deviance())

    # h2o may do better than R, but must not be worse by more than 0.01
    assert r_ratio - deviance_h2o_tweedie <= 0.01, \
      "h2o's residual/null deviance is more than 0.01 lower than R's. h2o: {0}, r: {1}".format(
        deviance_h2o_tweedie, r_ratio)

    print("compare null and residual deviance between R glm and h2o.glm for tweedie")
    assert abs(r_null_deviance - h2ofit.null_deviance()) < 1e-6, \
      "h2o's null deviance is not equal to R's. h2o: {0}, r: {1}".format(
        h2ofit.null_deviance(), r_null_deviance)
def glrm_set_loss_by_col():
    """Fit a GLRM with per-column losses on USArrests and check that the
    objective reported by H2O matches one recomputed locally with numpy."""
    def huber_loss(a):
        # Quadratic near zero, linear in the tails.
        return a*a/2 if abs(a) <= 1 else abs(a)-0.5

    print("Importing USArrests.csv data...")
    arrests_frame = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrests_matrix = np.array(h2o.as_list(arrests_frame))
    arrests_frame.describe()

    print("H2O GLRM with loss by column = Absolute, Quadratic, Quadratic, Huber")
    glrm_h2o = h2o.glrm(x=arrests_frame, k=3, loss="Quadratic",
                        loss_by_col=["Absolute","Huber"], loss_by_col_idx=[0,3],
                        regularization_x="None", regularization_y="None")
    glrm_h2o.show()

    model_output = glrm_h2o._model_json['output']

    # Archetypes (Y): the first cell of every row is skipped, the rest are parsed as floats.
    archetype_rows = model_output['archetypes'].cell_values
    fit_y_np = np.array([[float(cell) for cell in list(row)[1:]] for row in archetype_rows])
    # Representation (X) lives in a separate frame referenced by key.
    fit_x_np = np.array(h2o.as_list(h2o.get_frame(model_output['representation_name'])))

    print("Check final objective function value")
    residual = arrests_matrix - np.dot(fit_x_np, fit_y_np)
    # Column 0 -> absolute loss, columns 1 and 2 -> quadratic loss, column 3 -> Huber loss.
    per_row_loss = np.absolute(residual[:,0]) + np.square(residual[:,1]) + np.square(residual[:,2])
    per_row_loss = per_row_loss + np.vectorize(huber_loss)(residual[:,3])
    obj_val = np.sum(per_row_loss)

    glrm_obj = model_output['objective']
    assert abs(glrm_obj - obj_val) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(obj_val)
예제 #26
0
def grid_glrm_iris():
  """Grid-search GLRM over ``k`` and ``transform`` on iris and check that the
  resulting models correspond one-to-one to the grid combinations."""
  print("Importing iris_wheader.csv data...")
  irisH2O = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
  irisH2O.describe()

  transform_opts = ["NONE", "DEMEAN", "DESCALE", "STANDARDIZE"]
  k_opts = random.sample(list(range(1,8)),3)
  size_of_hyper_space = len(transform_opts) * len(k_opts)
  # Insertion order (k first, then transform) fixes the layout of every combination below.
  hyper_parameters = OrderedDict([("k", k_opts), ("transform", transform_opts)])

  gx = random.uniform(0,1)
  gy = random.uniform(0,1)
  print("H2O GLRM with , gamma_x = " + str(gx) + ", gamma_y = " + str(gy) +\
        ", hyperparameters = " + str(hyper_parameters))

  base_estimator = H2OGeneralizedLowRankEstimator(loss="Quadratic", gamma_x=gx, gamma_y=gy)
  gs = H2OGridSearch(base_estimator, hyper_params=hyper_parameters)
  gs.train(x=list(range(4)), y=4, training_frame=irisH2O)
  for model in gs:
    assert isinstance(model, H2OGeneralizedLowRankEstimator)
  print(gs.sort_by("mse"))

  assert len(gs) == size_of_hyper_space

  # Tick off every (k, transform) pair as its model is seen; a repeated or
  # unknown pair fails the membership assertion.
  remaining_combos = [list(pair) for pair in itertools.product(*list(hyper_parameters.values()))]
  for model in gs.models:
    combo = [model.parms[name]['actual_value'] for name in ("k", "transform")]
    assert combo in remaining_combos
    remaining_combos.remove(combo)
def offset_bernoulli_cars():
    """Train a one-tree bernoulli GBM with a constant offset column and compare
    init_f and prediction statistics against values produced by R's gbm."""
    cars = h2o.upload_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    response_present = cars["economy_20mpg"].isna() == 0
    cars = cars[response_present]
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    # Constant offset of 0.5, attached to the frame as column "x1".
    offset = h2o.H2OFrame([[.5 for _ in range(398)]])
    offset.set_names(["x1"])
    cars = cars.cbind(offset)

    gbm = h2o.gbm(x=cars[2:8], y=cars["economy_20mpg"], distribution="bernoulli", ntrees=1, max_depth=1, min_rows=1,
                  learn_rate=1, offset_column="x1", training_frame=cars)

    predictions = gbm.predict(cars)

    # Comparison result generated from R's gbm:
    #	gg = gbm(formula = economy_20mpg~cylinders+displacement+power+weight+acceleration+year+offset(rep(.5,398)),
    #            distribution = "bernoulli",data = df,n.trees = 1,interaction.depth = 1,n.minobsinnode = 1,shrinkage = 1,
    #            train.fraction = 1,bag.fraction = 1)
    #   pr = predict.gbm(object = gg,newdata = df,n.trees = 1,type = "link")
    #   pr = 1/(1+exp(-df$x1 - pr))
    r_init_f = -0.1041234
    h2o_init_f = gbm._model_json['output']['init_f']
    assert abs(r_init_f - h2o_init_f) < 1e-6, \
        "expected init_f to be {0}, but got {1}".format(r_init_f, h2o_init_f)

    r_mean = 0.577326
    h2o_mean = predictions[:,2].mean()[0]
    assert abs(r_mean - h2o_mean) < 1e-6, \
        "expected prediction mean to be {0}, but got {1}".format(r_mean, h2o_mean)

    r_min = 0.1621461
    h2o_min = predictions[:,2].min()
    assert abs(r_min - h2o_min) < 1e-6, \
        "expected prediction min to be {0}, but got {1}".format(r_min, h2o_min)

    r_max = 0.8506528
    h2o_max = predictions[:,2].max()
    assert abs(r_max - h2o_max) < 1e-6, \
        "expected prediction max to be {0}, but got {1}".format(r_max, h2o_max)
def checkCorrectSkips(originalFullFrame, csvfile, skipped_columns, uuidNames):
    """Check that parsing ``csvfile`` with ``skipped_columns`` drops exactly those columns.

    The file is parsed through both upload_file and import_file; the two
    results are compared with each other, then every non-skipped column of
    ``originalFullFrame`` is compared with the matching column of the
    import_file result. Finally, no skipped column name (taken from the
    original names extended by ``uuidNames``) may appear in the parsed frame.
    """
    uploaded = h2o.upload_file(csvfile, skipped_columns=skipped_columns)
    imported = h2o.import_file(csvfile, skipped_columns=skipped_columns)  # these two frames should be the same
    pyunit_utils.compare_frames_local(uploaded, imported, prob=0.5)

    columnTypes = originalFullFrame.types
    allNames = originalFullFrame.names
    keptPosition = 0  # index into the parsed frame, which has no slots for skipped columns
    for fullPosition, columnName in enumerate(allNames):
        if fullPosition in skipped_columns:
            continue

        columnType = columnTypes[columnName]
        if columnType == u'enum':
            pyunit_utils.compare_frames_local_onecolumn_NA_enum(
                originalFullFrame[fullPosition], imported[keptPosition],
                prob=1, tol=1e-10, returnResult=False)
        elif columnType == u'string':
            pyunit_utils.compare_frames_local_onecolumn_NA_string(
                originalFullFrame[fullPosition], imported[keptPosition],
                prob=1, returnResult=False)
        else:
            pyunit_utils.compare_frames_local_onecolumn_NA(
                originalFullFrame[fullPosition], imported[keptPosition],
                prob=1, tol=1e-10, returnResult=False)
        keptPosition += 1

    # since we cannot check uuid contents, we at least need to know that the return frame contains the correct column names
    allNames.extend(uuidNames)
    parsedNames = imported.names

    for skipIndex in skipped_columns:
        skippedName = allNames[skipIndex]
        assert skippedName not in parsedNames, \
            "This column: {0}/{1} should have been skipped but is not!".format(skippedName, skipIndex)
def link_functions_tweedie_vpow(ip,port):
    """Fit tweedie GLMs for several variance powers and compare the deviances
    against hard-coded reference values obtained from R.

    ``ip`` and ``port`` are forwarded to ``h2o.init`` to connect to the cluster.
    Progress messages use the single-argument ``print(...)`` form, which
    produces the same output under Python 2 and Python 3.
    """
    # Connect to h2o
    h2o.init(ip,port)

    # Load example data from HDtweedie, y = aggregate claim loss
    hdf = h2o.upload_file(h2o.locate("smalldata/glm_test/auto.csv"))
    y = "y"
    x = list(set(hdf.names()) - set(["y"]))

    print("Testing for family: TWEEDIE")
    print("Create models with canonical link: TWEEDIE")
    # Iterate over different variance powers for tweedie
    vpower = [0, 1, 1.5]
    r_dev = [0.7516627, 0.6708826, 0.7733762]
    r_null = [221051.88369951, 32296.29783702, 20229.47425307]
    for vpow, r_ratio, r_null_deviance in zip(vpower, r_dev, r_null):
        print("Fit h2o.glm:")
        h2ofit = h2o.glm(x=hdf[x], y=hdf[y], family="tweedie", link="tweedie", tweedie_variance_power=vpow, tweedie_link_power=1-vpow,
                         alpha=[0.5], Lambda=[0])

        print("Testing Tweedie variance power: {0}".format(vpow))

        print("Compare model deviances for link function tweedie")
        deviance_h2o_tweedie = h2ofit.residual_deviance() / h2ofit.null_deviance()

        # h2o may do better than R, but must not be worse by more than 0.01
        assert r_ratio - deviance_h2o_tweedie <= 0.01, "h2o's residual/null deviance is more than 0.01 lower than " \
                                                       "R's. h2o: {0}, r: {1}".format(deviance_h2o_tweedie, r_ratio)

        print("compare null and residual deviance between R glm and h2o.glm for tweedie")
        assert abs(r_null_deviance - h2ofit.null_deviance()) < 1e-6, "h2o's null deviance is not equal to R's. h2o: {0}, r: " \
                                                                     "{1}".format(h2ofit.null_deviance(), r_null_deviance)
예제 #30
0
def test2():
    """Check that a three-way split_frame accounts for every row and leaves no part empty."""
    df = h2o.upload_file(pyunit_utils.locate("smalldata/jira/pubdev_2020.csv"))
    splits = df.split_frame(ratios=[0.5, 0.25])
    # Two ratios are expected to yield three parts; indexing fails loudly if one is missing.
    rows_per_part = [splits[0].nrow, splits[1].nrow, splits[2].nrow]
    assert df.nrow == rows_per_part[0] + rows_per_part[1] + rows_per_part[2]
    for rows in rows_per_part:
        assert rows > 0
예제 #31
0
def cars_checkpoint():
    """Check which random forest parameters may change when resuming from a checkpoint.

    Builds a base model, continues it with changed ``ntrees``/``max_depth``/
    ``min_rows``/``r2_stopping`` (which is expected to succeed), then verifies
    that changing any parameter that is not modifiable by checkpoint makes the
    build fail with ``EnvironmentError``.
    """
    cars = h2o.upload_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    predictors = ["displacement","power","weight","acceleration","year"]
    response_col = "economy"

    # build first model
    model1 = H2ORandomForestEstimator(ntrees=10,max_depth=2, min_rows=10)
    model1.train(x=predictors,y=response_col,training_frame=cars)

    # continue building the model
    model2 = H2ORandomForestEstimator(ntrees=11,max_depth=3, min_rows=9,r2_stopping=0.8,
                                      checkpoint=model1._id)
    model2.train(x=predictors,y=response_col,training_frame=cars)

    #   erroneous, not MODIFIABLE_BY_CHECKPOINT_FIELDS
    # PUBDEV-1833
    # Each entry is tried on its own against the model1 checkpoint. The
    # nbins_cats case previously passed sample_rate=0.5 by copy-paste, so
    # nbins_cats itself was never exercised.
    non_modifiable = [
        ("mtries", 2),
        ("sample_rate", 0.5),
        ("nbins_cats", 99),
        ("nbins", 99),
        ("balance_classes", True),
        ("nfolds", 3),
    ]
    for param_name, param_value in non_modifiable:
        try:
            model = H2ORandomForestEstimator(checkpoint=model1._id, **{param_name: param_value})
            model.train(x=predictors,y=response_col,training_frame=cars)
        except EnvironmentError:
            # Expected: the build was rejected.
            continue
        assert False, "Expected model-build to fail because {0} not modifiable by checkpoint".format(param_name)
예제 #32
0
def pub_444_spaces_in_filenames():
    """Check that files whose names contain spaces can be uploaded and modelled.

    Writes three small CSV files with spaces in different positions of the
    file name (the third one without a header row), uploads each of them,
    trains a one-tree bernoulli GBM on it, and removes the files again. The
    files are removed even when an upload or model build fails.
    """
    # tempdir = "smalldata/jira/"
    # if was okay to write to smalldata, it's okay to write to the current directory
    # probably don't want to, but can't find what the standard temp directory is supposed to be. no sandbox?
    tempdir = "./"

    # (file name, whether a "response, predictor" header row is written)
    csv_specs = [("foo .csv", True), ("b a r .csv", True), (" ba z.csv", False)]

    def write_csv(name, with_header):
        # `with` closes the handle even if one of the writes raises.
        with open(tests.locate(tempdir) + name, "w") as out:
            if with_header:
                out.write("response, predictor\n")
            for _ in range(10):
                out.write("1, a\n")
                out.write("0, b\n")
                out.write("1, a\n" if random.randint(0, 1) else "0, b\n")

    def upload_and_train(name, with_header):
        train_data = h2o.upload_file(path=tests.locate(tempdir + name))
        train_data.show()
        train_data.describe()
        # Without a header row the response has to be addressed by position.
        response = train_data["response"] if with_header else train_data[0]
        gbm = h2o.gbm(x=train_data[1:],
                      y=response.asfactor(),
                      ntrees=1,
                      distribution="bernoulli",
                      min_rows=1)
        gbm.show()

    try:
        # make a few files with spaces in the name
        for name, with_header in csv_specs:
            write_csv(name, with_header)
        for name, with_header in csv_specs:
            upload_and_train(name, with_header)
    finally:
        # Clean up whatever was created, regardless of how far the test got.
        for name, _ in csv_specs:
            leftover = tests.locate(tempdir) + name
            if os.path.exists(leftover):
                os.remove(leftover)
예제 #33
0
파일: datasets.py 프로젝트: h2oai/mojoland
def iris_frame() -> h2o.H2OFrame:
    """Upload ``iris.csv`` and return the frame after checking its column names."""
    expected_columns = ["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species"]
    iris = h2o.upload_file(_file("iris.csv"))
    assert iris.names == expected_columns
    return iris
def milsong_checkpoint():
    """Check that a checkpoint-restarted random forest matches a one-shot build.

    Trains a random forest on the milsongs data, saves it and loads it back,
    continues training from the restored model with 50 additional trees, and
    asserts that the validation MSE equals that of a model trained in one shot
    with the same final parameters and seed.

    Progress messages use the single-argument ``print(...)`` form, which
    produces the same output under Python 2 and Python 3.
    """
    milsong_train = h2o.upload_file(
        pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(
        pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))

    # build first model
    ntrees1 = random.sample(range(50, 100), 1)[0]
    max_depth1 = random.sample(range(2, 6), 1)[0]
    min_rows1 = random.sample(range(10, 16), 1)[0]
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    model1 = h2o.random_forest(x=milsong_train[1:],
                               y=milsong_train[0],
                               ntrees=ntrees1,
                               max_depth=max_depth1,
                               min_rows=min_rows1,
                               validation_x=milsong_valid[1:],
                               validation_y=milsong_valid[0],
                               seed=1234)

    # save the model, then load the model
    path = pyunit_utils.locate("results")

    assert os.path.isdir(
        path), "Expected save directory {0} to exist, but it does not.".format(
            path)
    model_path = h2o.save_model(model1, path=path, force=True)

    assert os.path.isdir(
        model_path
    ), "Expected load directory {0} to exist, but it does not.".format(
        model_path)
    restored_model = h2o.load_model(model_path)

    # continue building the model: same shape parameters, 50 more trees
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model2 = h2o.random_forest(x=milsong_train[1:],
                               y=milsong_train[0],
                               ntrees=ntrees2,
                               max_depth=max_depth2,
                               min_rows=min_rows2,
                               validation_x=milsong_valid[1:],
                               validation_y=milsong_valid[0],
                               checkpoint=restored_model._id,
                               seed=1234)

    # build the equivalent of model 2 in one shot
    model3 = h2o.random_forest(x=milsong_train[1:],
                               y=milsong_train[0],
                               ntrees=ntrees2,
                               max_depth=max_depth2,
                               min_rows=min_rows2,
                               validation_x=milsong_valid[1:],
                               validation_y=milsong_valid[0],
                               seed=1234)

    assert isinstance(model2, type(model3))
    mse_checkpointed = model2.mse(valid=True)
    mse_one_shot = model3.mse(valid=True)
    # The message names model 3, the model that is actually being compared.
    assert mse_checkpointed == mse_one_shot, \
        "Expected Model 2 MSE: {0} to be the same as Model 3 MSE: {1}".format(
            mse_checkpointed, mse_one_shot)
예제 #35
0
def test_explanation_single_model_regression():
    """Exercise the explanation and plotting API of a single regression GBM.

    Trains a GBM predicting ``fare`` on the expanded titanic data and checks
    that every plotting helper returns a matplotlib figure and that
    ``explain`` / ``explain_row`` return an ``H2OExplanation``.
    """
    train = h2o.upload_file(
        pyunit_utils.locate("smalldata/titanic/titanic_expanded.csv"))
    y = "fare"
    figure_cls = matplotlib.pyplot.Figure

    # get at most one column from each type
    # (the `col == y` check only takes part once at least one column was picked)
    cols_to_test = []
    for col, typ in train.types.items():
        already_covered = any(
            typ == train.types[picked] or col == y for picked in cols_to_test)
        if not already_covered:
            cols_to_test.append(col)

    gbm = H2OGradientBoostingEstimator(seed=1234, model_id="my_awesome_model")
    gbm.train(y=y, training_frame=train)

    # test shap summary
    shap_summary = gbm.shap_summary_plot(train).figure()
    assert isinstance(shap_summary, figure_cls)
    matplotlib.pyplot.close()

    # test shap explain row
    shap_row = gbm.shap_explain_row_plot(train, 1).figure()
    assert isinstance(shap_row, figure_cls)
    matplotlib.pyplot.close()

    # test residual analysis
    residuals = gbm.residual_analysis_plot(train).figure()
    assert isinstance(residuals, figure_cls)
    matplotlib.pyplot.close()

    # test pd_plot for every selected column, then the ICE plot for every selected column
    for column_plot in (gbm.pd_plot, gbm.ice_plot):
        for col in cols_to_test:
            try:
                assert isinstance(column_plot(train, col).figure(), figure_cls)
            except ValueError:
                assert col == "name", "'name' is a string column which is not supported."
    matplotlib.pyplot.close("all")

    # test learning curve: default metric first, then each metric upper-cased
    # (by keyword) and as given (positionally)
    assert isinstance(gbm.learning_curve_plot().figure(), figure_cls)
    for metric in ["auto", "deviance", "rmse"]:
        upper_cased = gbm.learning_curve_plot(metric=metric.upper()).figure()
        assert isinstance(upper_cased, figure_cls)
        as_given = gbm.learning_curve_plot(metric).figure()
        assert isinstance(as_given, figure_cls)
    matplotlib.pyplot.close("all")

    # test explain
    assert isinstance(gbm.explain(train, render=False), H2OExplanation)

    # test explain row
    assert isinstance(gbm.explain_row(train, 1, render=False), H2OExplanation)
예제 #36
0
import h2o
import imp
import math as math
from h2o.estimators.kmeans import H2OKMeansEstimator
h2o.init()

# NOTE: hard-coded, machine-specific path; adjust before running elsewhere.
data = h2o.upload_file("/Users/mac/Downloads/versa.csv")
df = data.as_data_frame()
df.head()
data.describe()

# Columns removed from the local copy before it is sent back to H2O.
cols = ['Flow Key','Type','Rule','Source country','Destination country','User','C26']
df.drop(cols, inplace=True, axis=1)
df.info()
hf = h2o.H2OFrame(df)
hf.describe()


# Optional dependencies: record whether each one is importable instead of
# failing. Importing directly replaces the former imp.find_module() probe
# (the `imp` module is deprecated and removed in Python 3.12), so this section
# no longer needs the top-of-file `import imp`. Only ImportError is treated as
# "not available"; any other error is allowed to surface.
try:
    import pandas as pd
    can_pandas = True
except ImportError:
    can_pandas = False

try:
    import seaborn as sns
    can_seaborn = True
except ImportError:
    can_seaborn = False
예제 #37
0
import sys
sys.path.insert(1, "../../../")
import h2o

h2o.init()

# covtype: column 54 is converted to a categorical column.
covtype = h2o.upload_file(h2o.locate("smalldata/covtype/covtype.20k.data"))
covtype[54] = covtype[54].asfactor()
#dlmodel = h2o.deeplearning(x=covtype[0:54], y=covtype[54], hidden=[17,191], epochs=1, training_frame=covtype,
#                           balance_classes=False, reproducible=True, seed=1234, export_weights_and_biases=True)

# Train an autoencoder on the first 100 columns of the MNIST training data.
train = h2o.import_frame(h2o.locate("bigdata/laptop/mnist/train.csv.gz"))
# A concrete list of indices behaves the same on Python 2 (where range()
# already returns a list) and on Python 3.
predictors = list(range(100))
ae_model = h2o.deeplearning(x=train[predictors],
                            training_frame=train,
                            activation="Tanh",
                            autoencoder=True,
                            hidden=[50],
                            l1=1e-5,
                            ignore_const_cols=False,
                            epochs=1)

# Score the covtype frame with the autoencoder trained above.
foo = ae_model.anomaly(covtype)

# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(foo)
# pros = h2o.upload_file(h2o.locate("smalldata/prostate/prostate.csv.zip"))
# pros[1] = pros[1].asfactor()
# r = pros[0].runif() # a column of length pros.nrow() with values between 0 and 1
# # ~80/20 train/validation split
# pros_train = pros[r > .2]
# pros_valid = pros[r <= .2]
예제 #38
0
파일: datasets.py 프로젝트: h2oai/mojoland
def missing_frame() -> h2o.H2OFrame:
    """Upload ``missing.csv`` and return the frame after checking its shape and column names."""
    expected_shape = (40, 3)
    expected_columns = ["xCat", "xNum", "response"]
    missing = h2o.upload_file(_file("missing.csv"))
    assert missing.shape == expected_shape
    assert missing.names == expected_columns
    return missing
예제 #39
0
from io import StringIO

# Start (or connect to) H2O and set the cluster timezone.
h2o.init()
h2o.cluster().timezone = "America/Los_Angeles"

# Fetch Airlines Dataset from S3.
# The three assignments below are alternatives: each one overwrites the
# previous, so only the last (smallest) URL is left in `data_path`.
# Airlines Full Dataset 120 GB
data_path = "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears_10.csv"
# Airlines all years 1987-2008 12GB
data_path = "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears.csv"
# 2000 Row 4.5 MB
data_path = "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears2k.csv"
# The remote import is disabled, so `data_path` is not read in this section:
# df = h2o.import_file(data_path)

# Or use local version (this is the frame that is actually loaded)
df = h2o.upload_file("./datasets/airlines-allyears2k.csv")
column_names = df.names

# Or ingest from Kafka topic
DATA_TOPIC = 'airlines_stream'
consumer = KafkaConsumer(
    DATA_TOPIC,
    # No consumer group is set; the named group below is kept for reference.
    # group_id='h2o-airlines-trainer',
    group_id=None,
    auto_offset_reset='earliest',
    # Message payloads are decoded from UTF-8 bytes to str.
    value_deserializer=lambda x: x.decode('utf-8'))

# Accumulator list, filled by code outside this section.
pandas_dfs = []
# No of messages to be included in the DataFrame
n = 3000
# Counter initialised here; presumably counts consumed messages - confirm against the consuming loop.
i = 0
def random_attack(ip, port):
    """Train random forests with randomly chosen parameters on three problem types.

    Connects to H2O at ``ip``/``port``, prepares train/validation splits of the
    prostate and cars data, and runs ten randomly parameterised builds each for
    a binomial, a gaussian and a multinomial response. Nothing is asserted;
    the test passes when no build raises.

    All messages use the single-argument ``print(...)`` form, which produces
    the same output under Python 2 and Python 3.
    """
    # Connect to h2o
    h2o.init(ip, port)

    def attack(train, valid, x, y):
        """Build one random forest on columns ``x`` -> ``y`` with a random parameter set."""
        kwargs = {}

        # randomly select parameters and their corresponding values
        if random.randint(0, 1): kwargs['mtries'] = random.randint(1, len(x))
        if random.randint(0, 1): kwargs['sample_rate'] = random.random()
        if random.randint(0, 1): kwargs['build_tree_one_node'] = True
        if random.randint(0, 1): kwargs['ntrees'] = random.randint(1, 10)
        if random.randint(0, 1): kwargs['max_depth'] = random.randint(1, 5)
        if random.randint(0, 1): kwargs['min_rows'] = random.randint(1, 10)
        if random.randint(0, 1): kwargs['nbins'] = random.randint(1, 20)
        if random.randint(0, 1):
            kwargs['balance_classes'] = True
            if random.randint(0, 1):
                kwargs['max_after_balance_size'] = random.uniform(0, 10)
        if random.randint(0, 1): kwargs['seed'] = random.randint(1, 10000)
        do_validation = [True, False][random.randint(0, 1)]

        # display the parameters and their corresponding values
        print("-----------------------")
        print("x: {0}".format(x))
        print("y: {0}".format(y))
        print("validation: {0}".format(do_validation))
        for k, v in kwargs.items():
            print(k + ": {0}".format(v))
        if do_validation:
            h2o.random_forest(x=train[x],
                              y=train[y],
                              validation_x=valid[x],
                              validation_y=valid[y],
                              **kwargs)
        else:
            h2o.random_forest(x=train[x], y=train[y], **kwargs)
        print("-----------------------")

    def banner(title_line):
        """Print a blank line followed by a three-line section banner."""
        # print("") emits one empty line on both Python 2 and Python 3
        # (print() would print "()" on Python 2).
        print("")
        print("======================================================================")
        print(title_line)
        print("======================================================================")

    print("Import and data munging...")
    pros = h2o.upload_file(h2o.locate("smalldata/prostate/prostate.csv.zip"))
    for categorical_col in (1, 4, 5, 8):
        pros[categorical_col] = pros[categorical_col].asfactor()
    r = pros[0].runif(
    )  # a column of length pros.nrow() with values between 0 and 1
    # ~80/20 train/validation split
    pros_train = pros[r > .2]
    pros_valid = pros[r <= .2]

    cars = h2o.upload_file(h2o.locate("smalldata/junit/cars.csv"))
    r = cars[0].runif()
    cars_train = cars[r > .2]
    cars_valid = cars[r <= .2]

    banner("============================== Binomial ==============================")
    for _ in range(10):
        attack(pros_train, pros_valid,
               random.sample([2, 3, 4, 5, 6, 7, 8], random.randint(1, 7)), 1)

    banner("============================== Gaussian ==============================")
    for _ in range(10):
        attack(cars_train, cars_valid,
               random.sample([2, 3, 4, 5, 6, 7], random.randint(1, 6)), 1)

    banner("============================= Multinomial ============================")
    # Column 2 becomes the categorical response for the multinomial runs.
    cars_train[2] = cars_train[2].asfactor()
    cars_valid[2] = cars_valid[2].asfactor()
    for _ in range(10):
        attack(cars_train, cars_valid,
               random.sample([1, 3, 4, 5, 6, 7], random.randint(1, 6)), 2)
def glrm_pubdev_3756_arrest():
    """Train GLRM on prostate_cat.csv with enum and with numeric column types.

    The file is uploaded twice; in the second copy columns 0 and 4 are
    converted with ``asnumeric()``.  For every initialisation exercised here
    (the estimator default, then "random", "SVD" and "User") one model is
    trained and shown on each copy, using the same per-column losses.

    Only the last pair of models (init="User") is compared: their singular
    values must agree according to ``pyunit_utils.equal_two_arrays``.
    """
    print("Importing prostate.csv data...")

    # Same file, two frames: one left as parsed, one with columns 0 and 4
    # forced to numeric.
    enum_frame = h2o.upload_file(
        pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
    numeric_frame = h2o.upload_file(
        pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
    for col in (0, 4):
        numeric_frame[col] = numeric_frame[col].asnumeric()

    # One loss per column, shared by every model below.
    column_losses = [
        "Hinge", "Quadratic", "Categorical", "Categorical", "Hinge",
        "Quadratic", "Quadratic", "Quadratic"
    ]

    def fit_and_show(frame, **init_options):
        # Build, train (validating on the training frame itself) and display
        # one GLRM model; init_options carries init / user_y when given.
        model = H2OGeneralizedLowRankEstimator(k=5,
                                               loss_by_col=column_losses,
                                               recover_svd=True,
                                               transform="STANDARDIZE",
                                               seed=12345,
                                               **init_options)
        model.train(x=frame.names,
                    training_frame=frame,
                    validation_frame=frame)
        model.show()
        return model

    # No init argument is passed for this first pair.
    print("check with init = plusplus")
    fit_and_show(enum_frame)
    fit_and_show(numeric_frame)

    for banner, init_name in (("check with init = random", "random"),
                              ("check with init = SVD", "SVD")):
        print(banner)
        fit_and_show(enum_frame, init=init_name)
        fit_and_show(numeric_frame, init=init_name)

    print("check with init = user")
    # User-supplied starting Y: 5 rows (k=5) by 8 columns.
    initial_y = [
        [
            -1.27675647831893E-15, 64.87421383647799, 2.0, 1.0,
            2.0816681711721685E-16, 8.533270440251574, 9.380440251572328,
            5.886792452830188
        ],
        [
            0.7297297297297298, 66.05405405405405, 2.0, 0.0, 1.0,
            23.270270270270274, 9.589189189189193, 7.27027027027027
        ],
        [
            0.01754385964912314, 70.35087719298245, 2.0, 1.0,
            -1.3877787807814457E-17, 10.078947368421053,
            42.37543859649123, 6.157894736842105
        ],
        [0.9, 65.95, 2.0, 0.0, 0.2, 81.94500000000001, 16.375, 7.4],
        [
            0.9999999999999989, 65.48598130841121, 2.0, 3.0,
            1.3877787807814457E-16, 13.3092523364486,
            13.268411214953275, 6.747663551401869
        ],
    ]
    user_y_frame = h2o.H2OFrame(list(initial_y))
    enum_model = fit_and_show(enum_frame, init="User", user_y=user_y_frame)
    numeric_model = fit_and_show(numeric_frame,
                                 init="User",
                                 user_y=user_y_frame)

    # Singular values should not depend on whether the binary columns were
    # read as categorical or numeric.  Only the init="User" pair is checked.
    enum_singular = enum_model._model_json["output"]["singular_vals"]
    numeric_singular = numeric_model._model_json["output"]["singular_vals"]
    assert pyunit_utils.equal_two_arrays(enum_singular, numeric_singular, 1e-6, 1e-4), \
        "Singular values obtained from logistic loss with column type as enum and numeric do not agree.  Fix it now."

    sys.stdout.flush()
def stackedensemble_guassian_test():
    """Exercise H2OStackedEnsembleEstimator on a gaussian regression problem.

    What the test does:
    1) Cross-validates three base learners (a GBM, a random forest and a
       random forest with histogram_type="Random") on a split of prostate.csv
       with AGE as the response.
    2) Trains a stacked ensemble over those three models, passing the
       held-out split as validation_frame.
    3) Calls .predict() twice and checks the shape of the result each time.
    4) Calls .model_performance() twice on training and test data.
    5) Asserts the ensemble's test RMSE is lower than the best base learner's
       test RMSE (the training comparison is printed, not asserted).
    6) Asserts the validation-frame RMSE equals the test RMSE.
    """

    # Nine columns: three numeric, two enum, four numeric.
    column_types = ["numeric"] * 3 + ["enum"] * 2 + ["numeric"] * 4
    prostate = h2o.upload_file(
        path=pyunit_utils.locate("smalldata/extdata/prostate.csv"),
        destination_frame="prostate_hex",
        col_types=column_types)
    train, test = prostate.split_frame(ratios=[.8], seed=1)
    print(train.summary())

    predictors = ["CAPSULE", "GLEASON", "RACE", "DPROS", "DCAPS", "PSA", "VOL"]
    response = "AGE"

    # Cross-validation settings shared by all base learners; the fold
    # predictions are kept so the ensemble can be built on top of them.
    cv_settings = dict(nfolds=5,
                       fold_assignment="Modulo",
                       keep_cross_validation_predictions=True,
                       seed=1)

    def fit_and_report(label, learner):
        # Train one base learner, then print its training and test metrics.
        learner.train(x=predictors, y=response, training_frame=train)
        on_train = learner.model_performance(train=True)
        on_test = learner.model_performance(test_data=test)
        print(label + " training performance: ")
        print(on_train)
        print(label + " test performance: ")
        print(on_test)
        return learner, on_train, on_test

    gbm, gbm_train, gbm_test = fit_and_report(
        "GBM",
        H2OGradientBoostingEstimator(distribution="gaussian",
                                     max_depth=3,
                                     learn_rate=0.2,
                                     **cv_settings))
    rf, rf_train, rf_test = fit_and_report(
        "RF", H2ORandomForestEstimator(ntrees=30, **cv_settings))
    xrf, xrf_train, xrf_test = fit_and_report(
        "XRF",
        H2ORandomForestEstimator(ntrees=50,
                                 histogram_type="Random",
                                 **cv_settings))

    # Stack the three base learners; validation_frame is passed so that the
    # validation metrics can be compared with the test metrics at the end.
    ensemble = H2OStackedEnsembleEstimator(
        model_id="my_ensemble_guassian",
        base_models=[gbm.model_id, rf.model_id, xrf.model_id])
    ensemble.train(x=predictors,
                   y=response,
                   training_frame=train,
                   validation_frame=test)

    # Predict twice: the second pass checks predict() has no side effects.
    for _ in range(2):
        pred = ensemble.predict(test_data=test)
        assert pred.nrow == test.nrow, \
            "expected " + str(pred.nrow) + " to be equal to " + str(test.nrow)
        assert pred.ncol == 1, \
            "expected " + str(pred.ncol) + " to be equal to 1 but it was equal to " + str(pred.ncol)

    # Same idea for model_performance(): evaluate twice, keep the last result.
    for _ in range(2):
        ensemble_train_perf = ensemble.model_performance()
        ensemble_test_perf = ensemble.model_performance(test_data=test)

    # Training RMSE is reported only; it is not asserted on.
    best_train_rmse = min(gbm_train.rmse(), rf_train.rmse(), xrf_train.rmse())
    ensemble_train_rmse = ensemble_train_perf.rmse()
    print("Best Base-learner Training RMSE:  {0}".format(best_train_rmse))
    print("Ensemble Training RMSE:  {0}".format(ensemble_train_rmse))

    # The ensemble must beat (be strictly smaller than) the best base learner
    # on the test split.
    best_test_rmse = min(gbm_test.rmse(), rf_test.rmse(), xrf_test.rmse())
    ensemble_test_rmse = ensemble_test_perf.rmse()
    print("Best Base-learner Test RMSE:  {0}".format(best_test_rmse))
    print("Ensemble Test RMSE:  {0}".format(ensemble_test_rmse))
    assert ensemble_test_rmse < best_test_rmse, (
        "expected stack_rmse_test would be less than "
        " baselearner_best_rmse_test, found it wasn't  "
        "baselearner_best_rmse_test = " + str(best_test_rmse) +
        ",stack_rmse_test " " = " + str(ensemble_test_rmse))

    # `test` was also the validation_frame, so the validation metrics should
    # report the same RMSE as model_performance(test_data=test).
    validation_perf = ensemble.model_performance(valid=True)
    assert ensemble_test_rmse == validation_perf.rmse(), (
        "expected stack_rmse_test to be the same as "
        "perf_stack_validation_frame.rmse() found they were not "
        "perf_stack_validation_frame.rmse() = " +
        str(validation_perf.rmse()) +
        "stack_rmse_test was " + str(ensemble_test_rmse))
def pca_scoring_history_importance():
    """
    Check that PCA returns a scoring history and an importance table for
    every pca_method exercised here.

    A reference model is built first without passing pca_method (the test
    treats it as GramSVD).  Randomized, Power and GLRM models are then built
    on the same data with the same, randomly chosen, transform; their
    importance and eigenvector tables are compared against the reference with
    method-specific tolerances.  Finally each model's scoring history must
    contain at least one row.

    Raises AssertionError (from the table comparisons or the final checks)
    when any of these expectations fails.
    """
    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    # NOTE(review): assumes randint includes both end points (as
    # random.randint does) -- confirm against the file's import.
    transformN = transform_types[randint(0, len(transform_types) - 1)]

    print("Importing australia.csv data...\n")
    australia = h2o.upload_file(
        pyunit_utils.locate("smalldata/extdata/australia.csv"))
    col_indices = list(range(0, australia.ncol))

    print("transform is {0}.\n".format(transformN))
    # Reference model: no pca_method argument is passed.
    print("@@@@@@  Building PCA with GramSVD...\n")
    gramSVD = H2OPCA(k=3, transform=transformN)
    gramSVD.train(x=col_indices, training_frame=australia)

    def _build_and_compare(method, eigenvector_tolerance,
                           importance_tolerance=None):
        # Train a PCA model with the given pca_method and compare its
        # importance and eigenvector tables with the reference model.
        print("@@@@@@  Building PCA with {0}...\n".format(method))
        model = H2OPCA(k=3,
                       transform=transformN,
                       pca_method=method,
                       compute_metrics=True,
                       use_all_factor_levels=True)
        model.train(x=col_indices, training_frame=australia)

        # When no importance tolerance is given the comparison helper's own
        # default is used, so the keyword is omitted rather than passed.
        importance_kwargs = {}
        if importance_tolerance is not None:
            importance_kwargs["tolerance"] = importance_tolerance

        print("@@@@@@  Comparing eigenvalues between GramSVD and {0}...\n".
              format(method))
        # NOTE(review): "Cumulative Proportion" is listed twice; possibly
        # "Proportion of Variance" was intended -- confirm.
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gramSVD._model_json["output"]["importance"],
            model._model_json["output"]["importance"], [
                "Standard deviation", "Cumulative Proportion",
                "Cumulative Proportion"
            ],
            **importance_kwargs)

        print("@@@@@@  Comparing eigenvectors between GramSVD and {0}...\n".
              format(method))
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gramSVD._model_json["output"]["eigenvectors"],
            model._model_json["output"]["eigenvectors"],
            model._model_json["output"]["names"],
            tolerance=eigenvector_tolerance,
            check_sign=True)
        return model

    randomizedPCA = _build_and_compare("Randomized",
                                       5e-2,
                                       importance_tolerance=1e-3)
    powerPCA = _build_and_compare("Power", 1e-5)
    glrmPCA = _build_and_compare("GLRM", 1e-1, importance_tolerance=1e-2)

    # make sure we find the scoring history and it is not empty for all the
    # PCA modes: cell_values must hold at least one row.
    for method, model in (("GramSVD", gramSVD), ("Power", powerPCA),
                          ("Randomized", randomizedPCA), ("GLRM", glrmPCA)):
        history = model._model_json["output"]["scoring_history"]
        assert len(history.cell_values) > 0, \
            "PCA Scoring history setting pca_method to {0} is empty.".format(method)
# Example #44
# 0
def random_attack():
    """Fuzz H2OGeneralizedLinearEstimator with randomly drawn parameters.

    The prostate and cars data sets are uploaded and split into roughly 80%
    training / 20% validation rows with ``runif``.  For each of the binomial,
    gaussian, poisson and gamma families, ten models are trained on a random
    subset of predictor columns with a random selection of GLM options.
    Nothing is asserted here; the test passes when no call raises.
    """
    # Links that may be drawn for each family.
    link_choices = {
        "gaussian": ["inverse", "log", "identity"],
        "binomial": ["logit"],
        "poisson": ["log", "identity"],
        "gamma": ["inverse", "log", "identity"],
    }
    solver_choices = [
        "AUTO", "IRLSM", "L_BFGS", "COORDINATE_DESCENT_NAIVE",
        "COORDINATE_DESCENT"
    ]

    def attack(family, train, valid, x, y):
        # Train one GLM of the given family on columns x / response y with a
        # random set of options; a coin flip decides whether `valid` is used.
        params = {'family': family}

        # randomly select parameters and their corresponding values
        if random.randint(0, 1):
            params['max_iterations'] = random.randint(1, 50)
        if random.random() > 0.8:
            params['beta_epsilon'] = random.random()
        if random.randint(0, 1):
            # The index is drawn from {0, 1}, so only the first two entries
            # of solver_choices can be selected.
            params['solver'] = solver_choices[random.randint(0, 1)]
        if random.randint(0, 1):
            params['standardize'] = [True, False][random.randint(0, 1)]
        if random.randint(0, 1):
            links = link_choices.get(family)
            if links:
                params['link'] = links[random.randint(0, len(links) - 1)]
        if random.randint(0, 1):
            params['alpha'] = [random.random()]
        if family == "binomial" and random.randint(0, 1):
            params['prior'] = random.random()
        if random.randint(0, 1):
            params['lambda_search'] = [True, False][random.randint(0, 1)]
        # nlambdas may be set whenever lambda_search was given, whatever its
        # value.
        if 'lambda_search' in params and random.randint(0, 1):
            params['nlambdas'] = random.randint(2, 10)
        do_validation = [True, False][random.randint(0, 1)]

        # beta constraints: a random [lower, upper] interval per column
        if random.randint(0, 1):
            bounds = []
            for col in x:
                # NOTE(review): if isnumeric() returns a list rather than a
                # bool this condition is always truthy -- confirm.
                if train[col].isnumeric():
                    col_name = train.names[col]
                    lower = random.uniform(-1, 1)
                    upper = lower + random.random()
                    bounds.append([col_name, lower, upper])
            if bounds:
                beta_constraints = h2o.H2OFrame(bounds)
                beta_constraints.set_names(
                    ['names', 'lower_bounds', 'upper_bounds'])
                params['beta_constraints'] = beta_constraints.frame_id

        # display the parameters and their corresponding values
        print("-----------------------")
        print("x: {0}".format(x))
        print("y: {0}".format(y))
        print("validation: {0}".format(do_validation))
        for key, value in params.items():
            if key == 'beta_constraints':
                # Show the frame itself rather than just its id.
                print(key + ": ")
                beta_constraints.show()
            else:
                print(key + ": {0}".format(value))

        fit_args = dict(x=x, y=y, training_frame=train)
        if do_validation:
            fit_args['validation_frame'] = valid
        H2OGeneralizedLinearEstimator(**params).train(**fit_args)
        print("-----------------------")

    print("Import and data munging...")
    seed = random.randint(1, 10000)
    print("SEED: {0}".format(seed))
    pros = h2o.upload_file(
        pyunit_utils.locate("smalldata/prostate/prostate.csv.zip"))
    pros[1] = pros[1].asfactor()
    # runif yields one value per row; rows above .2 train, the rest validate
    split = pros[0].runif(seed=seed)
    pros_train = pros[split > .2]
    pros_valid = pros[split <= .2]

    cars = h2o.upload_file(pyunit_utils.locate("smalldata/junit/cars.csv"))
    split = cars[0].runif(seed=seed)
    cars_train = cars[split > .2]
    cars_valid = cars[split <= .2]

    rule = "======================================================================"
    # (banner, family, training frame, validation frame, candidate predictor
    # columns, response column)
    scenarios = [
        ("============================== Binomial ==============================",
         "binomial", pros_train, pros_valid, [2, 3, 4, 5, 6, 7, 8], 1),
        ("============================== Gaussian ==============================",
         "gaussian", cars_train, cars_valid, [2, 3, 4, 5, 6, 7], 1),
        ("============================== Poisson  ==============================",
         "poisson", cars_train, cars_valid, [1, 3, 4, 5, 6, 7], 2),
        ("==============================  Gamma   ==============================",
         "gamma", pros_train, pros_valid, [1, 2, 3, 5, 6, 7, 8], 4),
    ]
    for banner, family, train, valid, candidates, response in scenarios:
        print()
        print(rule)
        print(banner)
        print(rule)
        for _ in range(10):
            # Between one and all of the candidate columns, chosen at random.
            attack(family, train, valid,
                   random.sample(candidates,
                                 random.randint(1, len(candidates))),
                   response)
# Example #45
# 0
# Script: grid-search a deep-learning classifier on train.csv with 5-fold
# cross-validation, then predict on test.csv.
import h2o
h2o.init()
# Load data (paths are relative to the working directory).
train_set = h2o.upload_file("train.csv")
test_set = h2o.upload_file("test.csv")
# Define X and y: the response is "label", the predictors are every other
# column.  The set difference does not preserve column order.
y = "label"
X = list(set(train_set.col_names) - set(["label"]))
# Convert the response with asfactor(); the grid below is trained with
# distribution="multinomial".
train_set[y] = train_set[y].asfactor()
from h2o.estimators import H2ODeepLearningEstimator
from h2o.grid.grid_search import H2OGridSearch
# Grid search with k-fold cross-validation:
# 3 hidden-layer layouts x 2 l1 values = 6 combinations.
hidden_opt = [[32, 32], [32, 16, 8], [100]]
l1_opt = [1e-4, 1e-3]
hyper_parameters = {"hidden": hidden_opt, "l1": l1_opt}
model_grid = H2OGridSearch(H2ODeepLearningEstimator,
                           hyper_params=hyper_parameters)
# Every model gets the same fixed settings: up to 1000 epochs, 5 folds, and
# stopping_rounds=3 / stopping_tolerance=0.05 on the misclassification metric.
model_grid.train(x=X,
                 y=y,
                 distribution="multinomial",
                 epochs=1000,
                 training_frame=train_set,
                 nfolds=5,
                 stopping_rounds=3,
                 stopping_tolerance=0.05,
                 stopping_metric="misclassification")
# Get the best model.
# NOTE(review): `gs` is not used again in this script; the model fetched
# below is identified by a hard-coded id rather than taken from the sorted
# grid.  The id looks specific to one earlier run -- confirm before reuse.
gs = model_grid.sort_by("mse")
best = h2o.get_model(
    "Grid_DeepLearning_py_2_model_python_1459310941902_2_model_4")
# Score the test set with the selected model.
pred = best.predict(test_set)
# Example #46
# 0
def impute2(ip, port):
    """Exercise h2o.impute on two data sets and on a small hand-built frame.

    Three parts:
    1) Every (inplace, method, combine_method) combination is run on the
       "DPROS" column of prostate_missing.csv.
    2) The same combinations are run on the "TailNum" column of the airlines
       data; every method other than "mode" is expected to raise ValueError.
    3) A six-row frame with known gaps is imputed column by column and the
       filled-in values are compared with hand-computed expectations.

    :param ip: cluster address, passed straight to h2o.init.
    :param port: cluster port, passed straight to h2o.init.
    :raises AssertionError: when an imputed value differs from the expected
        one, when a non-"mode" method is accepted for "TailNum", or when
        inplace=False modifies the frame.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    methods = ["mean", "median", "mode"]
    combine_methods = ["interpolate", "average", "low", "high"]
    inplace = [False, True]
    # Same iteration order as three nested loops: inplace outermost,
    # combine_method innermost.
    combos = [(inpl, method, combine_method)
              for inpl in inplace
              for method in methods
              for combine_method in combine_methods]

    prostate = h2o.upload_file(
        h2o.locate("smalldata/logreg/prostate_missing.csv"))
    for inpl, method, combine_method in combos:
        h2o.impute(prostate,
                   "DPROS",
                   method=method,
                   combine_method=combine_method,
                   inplace=inpl)

    air = h2o.upload_file(
        h2o.locate("smalldata/airlines/allyears2k_headers.zip"))
    for inpl, method, combine_method in combos:
        if method == "mode":
            # "mode" must simply work; any exception propagates.
            h2o.impute(air,
                       "TailNum",
                       method=method,
                       combine_method=combine_method,
                       inplace=inpl)
            continue
        try:
            h2o.impute(air,
                       "TailNum",
                       method=method,
                       combine_method=combine_method,
                       inplace=inpl)
        except ValueError:
            # Expected: the method was rejected for this column.
            pass
        else:
            assert False, "only \"mode\" method allowed for categorical column, but {0} was allowed here".\
                format(method)

    # Columns are addressed as C1..C7 below; None marks a missing entry.
    data = [[None, 2, 3, 1, 'a', 1, 9], [1, None, 4, 2, 'a', 1, 9],
            [2, 3, None, None, 'b', 1, 9], [3, 4, None, None, 'b', 3, 8],
            [4, 5, 9, 5, None, 2, 8], [5, None, 10, 7, 'b', None, 8]]

    def _imputed_frame(**impute_args):
        # Build a fresh frame from `data` and run h2o.impute on it, so each
        # check starts from the original missing values.
        frame = h2o.H2OFrame(python_obj=data)
        h2o.impute(frame, **impute_args)
        return frame

    def _check_cell(row, col, expected, **impute_args):
        # Impute on a fresh frame and compare one cell with its expectation.
        actual = _imputed_frame(**impute_args)[row, col]
        assert actual == expected, \
            "Wrong value imputed. Expected imputed value of {0}, but got {1}".format(
                expected, actual)

    # mean check: C1 holds 1..5 plus one gap -> mean 3
    _check_cell(0, 0, 3, column="C1", method="mean")

    # inplace check: with inplace=False the frame must keep its single
    # missing C1 entry.
    untouched = _imputed_frame(column="C1", method="mean", inplace=False)
    assert untouched["C1"].isna().sum(
    ) == 1, "Expected imputation not to be done in place."

    # median-average: C2 holds 2, 3, 4, 5 -> 3.5
    _check_cell(1, 1, 3.5, column="C2", method="median",
                combine_method="average")

    # median-low: C3 holds 3, 4, 9, 10 -> 4
    _check_cell(2, 2, 4, column="C3", method="median", combine_method="low")

    # median-high: C4 holds 1, 2, 5, 7 -> 5
    _check_cell(2, 3, 5, column="C4", method="median", combine_method="high")

    # mode-categorical
    _check_cell(4, 4, 'b', column="C5", method="mode")

    # mode-numeric
    _check_cell(5, 5, 1, column="C6", method="mode")

    # mean-group by C7: the C7 == 9 rows hold 3 and 4 in C3, the C7 == 8 rows
    # hold 9 and 10.
    grouped = _imputed_frame(column="C3", method="mean", by=["C7"])
    imputed1 = grouped[2, 2]
    imputed2 = grouped[3, 2]
    assert imputed1 == 3.5, "Wrong value imputed. Expected imputed value of 3.5, but got {0}".format(
        imputed1)
    assert imputed2 == 9.5, "Wrong value imputed. Expected imputed value of 9.5, but got {0}".format(
        imputed2)
def glrm_arrests_miss():
    """Train GLRM on USArrests with growing fractions of missing entries.

    For every missing ratio from 0.1 to 0.9 a fresh copy of the data gets
    values knocked out, a rank-4 quadratic-loss GLRM is fitted on it with the
    complete data as validation frame, and the reported error metrics and
    entry counts are checked. The collected errors are printed at the end.
    """
    fractions = np.arange(0.1, 1, 0.1).tolist()

    print("Importing USArrests.csv data and saving for validation...")
    full_frame = h2o.upload_file(
        pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    full_frame.describe()
    totobs = full_frame.nrow * full_frame.ncol
    training_errors = []
    validation_errors = []

    for fraction in fractions:
        print("Importing USArrests.csv and inserting {0}% missing entries".
              format(100 * fraction))
        holey_frame = h2o.upload_file(
            pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
        holey_frame = holey_frame.insert_missing_values(fraction=fraction)
        holey_frame.describe()

        print("H2O GLRM with {0}% missing entries".format(100 * fraction))
        model = H2OGeneralizedLowRankEstimator(
            k=4,
            ignore_const_cols=False,
            loss="Quadratic",
            regularization_x="None",
            regularization_y="None",
            init="PlusPlus",
            max_iterations=10,
            min_step_size=1e-6)
        model.train(x=holey_frame.names,
                    training_frame=holey_frame,
                    validation_frame=full_frame)
        model.show()

        # Check imputed data and error metrics
        output = model._model_json['output']
        objective = output['objective']
        on_train = output['training_metrics']._metric_json
        on_valid = output['validation_metrics']._metric_json

        train_numerr = on_train['numerr']
        train_caterr = on_train['caterr']
        valid_numerr = on_valid['numerr']
        valid_caterr = on_valid['caterr']
        assert abs(train_numerr - objective) < 1e-3, (
            "Numeric error on training data was " + str(train_numerr) +
            " but should equal final objective " + str(objective))
        assert train_caterr == 0, (
            "Categorical error on training data was " + str(train_caterr) +
            " but should be zero")
        assert valid_caterr == 0, (
            "Categorical error on validation data was " + str(valid_caterr) +
            " but should be zero")

        train_numcnt = on_train['numcnt']
        valid_numcnt = on_valid['numcnt']
        assert valid_numcnt > train_numcnt, "Number of non-missing numerical entries in training data should be less than validation data"
        assert valid_numcnt == totobs, (
            "Number of non-missing numerical entries in validation data was " +
            str(valid_numcnt) + " but should be " + str(totobs))

        training_errors.append(train_numerr)
        validation_errors.append(valid_numerr)

    # Summary: one line per missing ratio, in the order the models were built.
    for fraction, err_train, err_valid in zip(fractions, training_errors,
                                              validation_errors):
        print(
            "Missing ratio: {0}% --> Training error: {1}\tValidation error: {2}"
            .format(fraction * 100, err_train, err_valid))
예제 #48
0
def glrm_prostate_miss():
    """Train GLRM on prostate_cat.csv with growing fractions of missing entries.

    The complete data set (with "NA" parsed as missing) serves as validation
    frame. For every missing ratio from 0.1 to 0.9 a fresh copy of the data
    gets values knocked out, a rank-8 L1-regularized GLRM is fitted, and the
    numeric/categorical entry counts reported by the model are checked. The
    numeric and categorical errors per ratio are printed at the end.
    """
    missing_ratios = np.arange(0.1, 1, 0.1).tolist()

    print("Importing prostate_cat.csv data and saving for validation...")
    prostate_full = h2o.upload_file(
        pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"),
        na_strings=["NA"] * 8)
    prostate_full.describe()

    # Count the entries that are already missing in the full data so they can
    # be excluded from the expected number of observed entries.
    totnas = 0
    for i in range(prostate_full.ncol):
        totnas = totnas + prostate_full[i].isna().sum()
    totobs = prostate_full.nrow * prostate_full.ncol - totnas

    train_numerr = [0] * len(missing_ratios)
    valid_numerr = [0] * len(missing_ratios)
    train_caterr = [0] * len(missing_ratios)
    valid_caterr = [0] * len(missing_ratios)

    for i, ratio in enumerate(missing_ratios):
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print("Importing prostate_cat.csv and inserting {0}% missing entries".format(
            100 * ratio))
        prostate_miss = h2o.upload_file(
            pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
        prostate_miss = prostate_miss.insert_missing_values(fraction=ratio)
        prostate_miss.describe()

        print("H2O GLRM with {0}% missing entries".format(100 * ratio))
        prostate_glrm = H2OGeneralizedLowRankEstimator(k=8,
                                                       ignore_const_cols=False,
                                                       loss="Quadratic",
                                                       gamma_x=0.5,
                                                       gamma_y=0.5,
                                                       regularization_x="L1",
                                                       regularization_y="L1",
                                                       init="SVD",
                                                       max_iterations=2000,
                                                       min_step_size=1e-6)
        # list(...) so the predictor indices are a real list on Python 3 too,
        # matching the other tests in this file.
        prostate_glrm.train(x=list(range(8)),
                            training_frame=prostate_miss,
                            validation_frame=prostate_full)

        prostate_glrm.show()

        # Check imputed data and error metrics
        output = prostate_glrm._model_json['output']
        train_metrics = output['training_metrics']._metric_json
        valid_metrics = output['validation_metrics']._metric_json

        train_numcnt = train_metrics['numcnt']
        valid_numcnt = valid_metrics['numcnt']
        train_catcnt = train_metrics['catcnt']
        valid_catcnt = valid_metrics['catcnt']
        assert valid_numcnt >= train_numcnt, "Number of non-missing numeric entries in training data should be less than or equal to validation data"
        assert valid_catcnt >= train_catcnt, "Number of non-missing categorical entries in training data should be less than or equal to validation data"
        assert (
            train_numcnt + valid_numcnt
        ) < totobs, "Total non-missing numeric entries in training and validation data was {0}, but should be less than {1}".format(
            train_numcnt + valid_numcnt, totobs)
        assert (
            valid_numcnt + valid_catcnt
        ) == totobs, "Number of non-missing entries in validation data was {0}, but should be {1}".format(
            valid_numcnt + valid_catcnt, totobs)

        train_numerr[i] = train_metrics['numerr']
        valid_numerr[i] = valid_metrics['numerr']
        train_caterr[i] = train_metrics['caterr']
        valid_caterr[i] = valid_metrics['caterr']
        # h2o.remove(prostate_glrm._model_json['output']['loading_key']['name'])

    for i in range(len(missing_ratios)):
        print("Missing ratio: {0}% --> Training numeric error: {1}\tValidation numeric error: {2}".format(
            missing_ratios[i] * 100, train_numerr[i], valid_numerr[i]))

    for i in range(len(missing_ratios)):
        print("Missing ratio: {0}% --> Training categorical error: {1}\tValidation categorical error: {2}".format(
            missing_ratios[i] * 100, train_caterr[i], valid_caterr[i]))
    return "../examples/smalldata/" + example_name


# Build the Spark configuration: keep an already-configured "spark.master",
# otherwise fall back to the "spark.master" environment variable and finally
# to "local[*]".
conf = SparkConf().setIfMissing("spark.master",
                                os.getenv("spark.master", "local[*]"))
# Create (or reuse) the Spark session for this example.
spark = SparkSession.builder.appName("ChicagoCrimeTest").config(
    conf=conf).getOrCreate()
# Start H2O services
h2oContext = H2OContext.getOrCreate(spark)
# Define file names
chicagoAllWeather = "chicagoAllWeather.csv"
chicagoCensus = "chicagoCensus.csv"
chicagoCrimes10k = "chicagoCrimes10k.csv"

# h2o.import_file expects cluster-relative path
# NOTE(review): the code below calls h2o.upload_file, not h2o.import_file;
# the comment above may be stale - confirm.
f_weather = h2o.upload_file(_locate(chicagoAllWeather))
f_census = h2o.upload_file(_locate(chicagoCensus))
# The "Date" column of the crimes table is parsed as a string column.
f_crimes = h2o.upload_file(_locate(chicagoCrimes10k),
                           col_types={"Date": "string"})

# Transform weather table
# Remove 1st column (date)
f_weather = f_weather[1:]

# Transform census table
# Strip surrounding whitespace and replace spaces and '+' in column names
# with underscores (they cause problems in Spark SQL)
# NOTE: on Python 3 ``map`` returns a lazy iterator, not a list.
col_names = map(lambda s: s.strip().replace(' ', '_').replace('+', '_'),
                f_census.col_names)

# Update column names in the table
# NOTE(review): col_names is derived from f_census, so the disabled assignment
# below presumably should target f_census rather than f_weather - confirm.
# f_weather.names = col_names
예제 #50
0
def gbm_demo(interactive, echo, test):
    """Run the scripted GBM demo on the bundled prostate data set.

    The demo prints a description, then alternates between echoing the next
    command(s) via ``echo_and_interact`` and executing them: connect, upload,
    summarize, split into train/test, convert the response to a factor, build
    a GBM, show it, predict and show performance metrics.

    :param interactive: forwarded unchanged to ``echo_and_interact``.
    :param echo: forwarded unchanged to ``echo_and_interact``.
    :param test: when true, ``h2o.init()`` is skipped (presumably because a
        test harness has already connected - confirm with the caller).
    """
    h2o_data_path = system_file("prostate.csv")

    demo_description = ['\n-----------------------------------------------------------------',
                        'This is a demo of H2O\'s GBM function.',
                        'It uploads a dataset to h2o, parses it, and shows a description.',
                        'Then, it divides the dataset into training and test sets, ',
                        'builds a GBM from the training set, and predicts on the test set.',
                        'Finally, default performance metrics are displayed.',
                        '-----------------------------------------------------------------']

    # The echoed commands must mirror the code executed below, and the number
    # of entries per step must match the npop passed to echo_and_interact.
    demo_commands = ['# Connect to h2o',
                     '>>> h2o.init()\n',
                     '\n# Upload the prostate dataset that comes included in the h2o python package',
                     '>>> prostate = h2o.upload_file(path = "' + h2o_data_path + '")\n',
                     '\n# Print a description of the prostate data',
                     '>>> prostate.summary()\n',
                     '\n# Randomly split the dataset into ~70/30, training/test sets',
                     '>>> r = prostate[0].runif()',
                     '>>> train = prostate[r < 0.70]',
                     '>>> test = prostate[r >= 0.70]\n',
                     '\n# Convert the response columns to factors (for binary classification problems)',
                     '>>> train["CAPSULE"] = train["CAPSULE"].asfactor()',
                     '>>> test["CAPSULE"] = test["CAPSULE"].asfactor()\n',
                     '\n# Build a (classification) GBM',
                     '>>> prostate_gbm = h2o.gbm(x=train[["AGE", "RACE", "PSA", "VOL", "GLEASON"]], '
                     'y=train["CAPSULE"], distribution="bernoulli", ntrees=10, max_depth=8, min_rows=10, '
                     'learn_rate=0.2)\n',
                     '\n# Show the model',
                     '>>> prostate_gbm.show()\n',
                     '\n# Predict on the test set and show the first ten predictions',
                     '>>> predictions = prostate_gbm.predict(test)',
                     '>>> predictions.show()\n',
                     '\n# Show default performance metrics',
                     '>>> performance = prostate_gbm.model_performance(test)',
                     '>>> performance.show()\n']

    # print(x) with one argument works identically on Python 2 and Python 3;
    # print("") is used for the blank line because a bare print() would emit
    # "()" on Python 2.
    for line in demo_description:
        print(line)
    print("")

    echo_and_interact(demo_commands, interactive, echo)
    if not test:
        h2o.init()

    echo_and_interact(demo_commands, interactive, echo)
    prostate = h2o.upload_file(path=h2o_data_path)

    echo_and_interact(demo_commands, interactive, echo)
    prostate.summary()

    echo_and_interact(demo_commands, interactive, echo, npop=4)
    r = prostate[0].runif()
    train = prostate[r < 0.70]
    # Complement of the training mask, so the two sets do not overlap.
    test = prostate[r >= 0.70]

    echo_and_interact(demo_commands, interactive, echo, npop=3)
    train["CAPSULE"] = train["CAPSULE"].asfactor()
    test["CAPSULE"] = test["CAPSULE"].asfactor()

    echo_and_interact(demo_commands, interactive, echo)
    prostate_gbm = h2o.gbm(x=train[["AGE", "RACE", "PSA", "VOL", "GLEASON"]], y=train["CAPSULE"],
                           distribution="bernoulli", ntrees=10, max_depth=8, min_rows=10, learn_rate=0.2)

    echo_and_interact(demo_commands, interactive, echo)
    prostate_gbm.show()

    echo_and_interact(demo_commands, interactive, echo, npop=3)
    predictions = prostate_gbm.predict(test)
    predictions.show()

    echo_and_interact(demo_commands, interactive, echo, npop=3)
    performance = prostate_gbm.model_performance(test)
    performance.show()
예제 #51
0
def deeplearning_demo(interactive, echo, test):
    """Run the scripted Deep Learning demo on the bundled prostate data set.

    The demo prints a description, then alternates between echoing the next
    command(s) via ``echo_and_interact`` and executing them: connect, upload,
    summarize, split into train/test, convert the response to a factor, build
    a deep learning model, show it, predict and show performance metrics.

    :param interactive: forwarded unchanged to ``echo_and_interact``.
    :param echo: forwarded unchanged to ``echo_and_interact``.
    :param test: when true, ``h2o.init()`` is skipped (presumably because a
        test harness has already connected - confirm with the caller).
    """
    h2o_data_path = system_file("prostate.csv")

    demo_description = ['\n-----------------------------------------------------------------',
                        'This is a demo of H2O\'s Deeplearning function.',
                        'It uploads a dataset to h2o, parses it, and shows a description.',
                        'Then, it divides the dataset into training and test sets, ',
                        'builds a model from the training set, and predicts on the test set.',
                        'Finally, default performance metrics are displayed.',
                        '-----------------------------------------------------------------']

    # The echoed commands must mirror the code executed below, and the number
    # of entries per step must match the npop passed to echo_and_interact.
    demo_commands = ['# Connect to h2o',
                     '>>> h2o.init()\n',
                     '\n# Upload the prostate dataset that comes included in the h2o python package',
                     '>>> prostate = h2o.upload_file(path = "' + h2o_data_path + '")\n',
                     '\n# Print a description of the prostate data',
                     '>>> prostate.summary()\n',
                     '\n# Randomly split the dataset into ~70/30, training/test sets',
                     '>>> r = prostate[0].runif()',
                     '>>> train = prostate[r < 0.70]',
                     '>>> test = prostate[r >= 0.70]\n',
                     '\n# Convert the response columns to factors (for binary classification problems)',
                     '>>> train["CAPSULE"] = train["CAPSULE"].asfactor()',
                     '>>> test["CAPSULE"] = test["CAPSULE"].asfactor()\n',
                     '\n# Build a (classification) Deeplearning model',
                     '>>> prostate_dl = h2o.deeplearning(x=train[list(set(prostate.col_names)-set(["ID","CAPSULE"]))]'
                     ', y=train["CAPSULE"], activation="Tanh", hidden=[10, 10, 10], epochs=10000)\n',
                     '\n# Show the model',
                     '>>> prostate_dl.show()\n',
                     '\n# Predict on the test set and show the first ten predictions',
                     '>>> predictions = prostate_dl.predict(test)',
                     '>>> predictions.show()\n',
                     '\n# Show default performance metrics',
                     '>>> performance = prostate_dl.model_performance(test)',
                     '>>> performance.show()\n']

    # print(x) with one argument works identically on Python 2 and Python 3;
    # print("") is used for the blank line because a bare print() would emit
    # "()" on Python 2.
    for line in demo_description:
        print(line)
    print("")

    echo_and_interact(demo_commands, interactive, echo)
    if not test:
        h2o.init()

    echo_and_interact(demo_commands, interactive, echo)
    prostate = h2o.upload_file(path=h2o_data_path)

    echo_and_interact(demo_commands, interactive, echo)
    prostate.summary()

    echo_and_interact(demo_commands, interactive, echo, npop=4)
    r = prostate[0].runif()
    train = prostate[r < 0.70]
    # Complement of the training mask, so the two sets do not overlap.
    test = prostate[r >= 0.70]

    echo_and_interact(demo_commands, interactive, echo, npop=3)
    train["CAPSULE"] = train["CAPSULE"].asfactor()
    test["CAPSULE"] = test["CAPSULE"].asfactor()

    echo_and_interact(demo_commands, interactive, echo)
    # Every column except the row ID and the response is used as a predictor.
    prostate_dl = h2o.deeplearning(x=train[list(set(prostate.col_names)-set(["ID","CAPSULE"]))], y=train["CAPSULE"],
                                   activation="Tanh", hidden=[10, 10, 10], epochs=10000)

    echo_and_interact(demo_commands, interactive, echo)
    prostate_dl.show()

    echo_and_interact(demo_commands, interactive, echo, npop=3)
    predictions = prostate_dl.predict(test)
    predictions.show()

    echo_and_interact(demo_commands, interactive, echo, npop=3)
    performance = prostate_dl.model_performance(test)
    performance.show()
def weights_and_biases():
    """Check export and re-import of Deep Learning weights and biases.

    Part one trains a two-hidden-layer model on covtype with
    ``export_weights_and_biases=True`` and asserts the dimensions of every
    exported weight and bias frame. Part two trains a model on iris, feeds
    its exported weights/biases into a second model trained for zero epochs
    and asserts that predictions and logloss agree, then builds two more
    models from partially specified and user-modified initial values.
    """
    print(
        "Test checks if Deep Learning weights and biases are accessible from R"
    )

    covtype_path = pyunit_utils.locate("smalldata/covtype/covtype.20k.data")
    covtype = h2o.upload_file(covtype_path)
    covtype[54] = covtype[54].asfactor()

    dlmodel = H2ODeepLearningEstimator(
        hidden=[17, 191],
        epochs=1,
        balance_classes=False,
        reproducible=True,
        seed=1234,
        export_weights_and_biases=True)
    dlmodel.train(x=list(range(54)), y=54, training_frame=covtype)
    print(dlmodel)

    # Fetch all weight frames first, then all bias frames.
    exported = [dlmodel.weights(layer) for layer in range(3)]
    exported += [dlmodel.biases(layer) for layer in range(3)]

    # Expected (ncol, nrow) of each frame, in the same order as ``exported``.
    expected_dims = [(52, 17), (17, 191), (191, 7),
                     (1, 17), (1, 191), (1, 7)]
    dim_message = "wrong dimensionality! expected {0}, but got {1}."
    for frame, (want_cols, want_rows) in zip(exported, expected_dims):
        got_cols = frame.ncol
        got_rows = frame.nrow
        assert got_cols == want_cols, dim_message.format(want_cols, got_cols)
        assert got_rows == want_rows, dim_message.format(want_rows, got_rows)

    df = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris.csv"))
    predictors = list(range(4))

    dl1 = H2ODeepLearningEstimator(hidden=[10, 10],
                                   export_weights_and_biases=True)
    dl1.train(x=predictors, y=4, training_frame=df)
    p1 = dl1.predict(df)
    ll1 = dl1.model_performance(df).logloss()
    print(ll1)

    ## get weights and biases
    w1, w2, w3 = [dl1.weights(layer) for layer in range(3)]
    b1, b2, b3 = [dl1.biases(layer) for layer in range(3)]

    ## make a model from given weights/biases
    dl2 = H2ODeepLearningEstimator(
        hidden=[10, 10],
        initial_weights=[w1, w2, w3],
        initial_biases=[b1, b2, b3],
        epochs=0)
    dl2.train(x=predictors, y=4, training_frame=df)
    p2 = dl2.predict(df)
    ll2 = dl2.model_performance(df).logloss()
    print(ll2)

    # h2o.download_pojo(dl2) ## fully functional pojo

    ## check consistency
    prediction_gap = abs(p1[:, 1:4] - p2[:, 1:4]).max()
    assert prediction_gap < 1e-6
    assert abs(ll2 - ll1) < 1e-6

    ## make another model with partially set weights/biases
    dl3 = H2ODeepLearningEstimator(
        hidden=[10, 10],
        initial_weights=[w1, None, w3],
        initial_biases=[b1, b2, None],
        epochs=10)
    dl3.train(x=predictors, y=4, training_frame=df)
    # Only exercised to make sure it can be computed; the value is not checked.
    dl3.model_performance(df).logloss()

    ## make another model with partially set user-modified weights/biases
    dl4 = H2ODeepLearningEstimator(
        hidden=[10, 10],
        initial_weights=[w1 * 1.1, w2 * 0.9, w3.sqrt()],
        initial_biases=[b1, b2, None],
        epochs=10)
    dl4.train(x=predictors, y=4, training_frame=df)
    # Only exercised to make sure it can be computed; the value is not checked.
    dl4.model_performance(df).logloss()
def deeplearning_autoencoder():
    """Use an autoencoder on MNIST for feature extraction and pretraining.

    Half of the training data (without the response column) trains an
    autoencoder with ``nfeatures`` hidden units. The other half is mapped
    through the autoencoder's deep features and used to train a DRF, whose
    error on the (equally transformed) test set is checked. Finally a
    supervised Deep Learning model initialized from the autoencoder must
    reach a lower validation logloss than the same model trained from
    scratch.
    """
    resp = 784
    nfeatures = 20  # number of features (smallest hidden layer)

    train_hex = h2o.upload_file(
        pyunit_utils.locate("bigdata/laptop/mnist/train.csv.gz"))
    train_hex[resp] = train_hex[resp].asfactor()

    test_hex = h2o.upload_file(
        pyunit_utils.locate("bigdata/laptop/mnist/test.csv.gz"))
    test_hex[resp] = test_hex[resp].asfactor()

    # split data into two parts
    sid = train_hex[0].runif(0)

    # unsupervised data for autoencoder
    train_unsupervised = train_hex[sid >= 0.5]
    train_unsupervised.pop(resp)

    # supervised data for drf
    train_supervised = train_hex[sid < 0.5]

    # train autoencoder
    ae_model = H2OAutoEncoderEstimator(activation="Tanh",
                                       hidden=[nfeatures],
                                       model_id="ae_model",
                                       epochs=1,
                                       ignore_const_cols=False,
                                       reproducible=True,
                                       seed=1234)

    ae_model.train(list(range(resp)), training_frame=train_unsupervised)

    # convert train_supervised with autoencoder to lower-dimensional space
    train_supervised_features = ae_model.deepfeatures(train_supervised[0:resp],
                                                      0)

    assert train_supervised_features.ncol == nfeatures, "Dimensionality of reconstruction is wrong!"

    train_supervised_features = train_supervised_features.cbind(
        train_supervised[resp])

    # Train DRF on extracted feature space. The predictors are the nfeatures
    # deep-feature columns; the response is the column appended last.
    drf_model = H2ORandomForestEstimator(ntrees=10, min_rows=10, seed=1234)
    drf_model.train(x=list(range(nfeatures)),
                    y=train_supervised_features.ncol - 1,
                    training_frame=train_supervised_features)

    # Test the DRF model on the test set (processed through deep features)
    test_features = ae_model.deepfeatures(test_hex[0:resp], 0)
    test_features = test_features.cbind(test_hex[resp])

    # Confusion Matrix and assertion
    cm = drf_model.confusion_matrix(test_features)
    cm.show()

    # 8.8% error +/- 1%
    # compare to runit_deeplearning_autoencoder_large.py
    observed_error = cm.cell_values[10][10]
    assert abs(observed_error -
               0.088) < 0.01, "Error. Expected 0.088, but got {0}".format(
                   observed_error)

    ## Another usecase: Use pretrained unsupervised autoencoder model to initialize a supervised Deep Learning model
    pretrained_model = H2ODeepLearningEstimator(
        activation="Tanh",
        hidden=[nfeatures],
        epochs=1,
        reproducible=True,
        seed=1234,
        ignore_const_cols=False,
        pretrained_autoencoder="ae_model")
    pretrained_model.train(list(range(resp)),
                           resp,
                           training_frame=train_supervised,
                           validation_frame=test_hex)
    pretrained_logloss = pretrained_model.logloss(train=False, valid=True)
    print(pretrained_logloss)

    model_from_scratch = H2ODeepLearningEstimator(activation="Tanh",
                                                  hidden=[nfeatures],
                                                  epochs=1,
                                                  reproducible=True,
                                                  seed=1234,
                                                  ignore_const_cols=False)
    model_from_scratch.train(list(range(resp)),
                             resp,
                             training_frame=train_supervised,
                             validation_frame=test_hex)
    scratch_logloss = model_from_scratch.logloss(train=False, valid=True)
    print(scratch_logloss)

    assert pretrained_logloss < scratch_logloss, "Error. Pretrained model should lead to lower logloss than training from scratch."
def glrm_set_loss_by_col_rand():
    """Check GLRM per-column loss specification on prostate_cat.csv.

    First a model is built with a randomly chosen, type-appropriate loss for
    every column. Then a random subset of columns with random losses is
    drawn, several invalid ``loss_by_col`` / ``loss_by_col_idx``
    combinations are verified to be rejected, and finally a model is built
    with the valid random subset.
    """
    NUM_LOSS = ["Quadratic", "Absolute", "Huber", "Poisson", "Periodic"]
    CAT_LOSS = ["Categorical", "Ordinal"]
    NUM_COLS = [1, 5, 6, 7]
    CAT_COLS = [0, 2, 3, 4]

    print("Importing prostate_cat.csv data...")
    prostateH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"),
        na_strings=["NA"] * 8)
    prostateH2O.describe()

    def expect_glrm_error(message, **loss_args):
        """Build a GLRM with ``loss_args`` and fail with ``message`` unless it raises."""
        try:
            h2o.glrm(x=prostateH2O, k=5, **loss_args)
        except Exception:
            return
        # Raised outside the try block so the handler above cannot swallow it
        # (the previous ``assert False`` inside ``try`` with a bare ``except``
        # could never fail the test).
        raise AssertionError(message)

    # Fully specify every column's loss function (no need for loss_by_col_idx)
    loss_all = [
        rd.sample(NUM_LOSS, k=1)[0] if x in NUM_COLS else rd.sample(CAT_LOSS,
                                                                    k=1)[0]
        for x in range(0, 8)
    ]
    print("Run GLRM with loss_by_col = [" + ', '.join(loss_all) + "]")
    glrm_h2o = h2o.glrm(x=prostateH2O, k=5, loss_by_col=loss_all)
    glrm_h2o.show()

    # Randomly set columns and loss functions
    cat_size = rd.sample(range(1, 5), 1)[0]
    num_size = rd.sample(range(1, 5), 1)[0]
    cat_idx = np.random.choice(CAT_COLS, size=cat_size, replace=False)
    num_idx = np.random.choice(NUM_COLS, size=num_size, replace=False)
    loss_by_col_cat = np.random.choice(CAT_LOSS, size=cat_size, replace=True)
    loss_by_col_num = np.random.choice(NUM_LOSS, size=num_size, replace=True)

    loss_idx_all = cat_idx.tolist() + num_idx.tolist()
    loss_all = loss_by_col_cat.tolist() + loss_by_col_num.tolist()
    # Permute losses and indices in same way for testing. The pairs are
    # materialized as a list because random.shuffle needs a mutable sequence
    # (zip returns an iterator on Python 3).
    loss_combined = list(zip(loss_all, loss_idx_all))
    rd.shuffle(loss_combined)
    loss_all[:], loss_idx_all[:] = zip(*loss_combined)

    if len(loss_all) < prostateH2O.ncol:
        expect_glrm_error(
            "Expected GLRM to throw error since column indices not specified",
            loss_by_col=loss_all)

    expect_glrm_error(
        "Expected GLRM to throw error since losses for columns not specified",
        loss_by_col_idx=loss_idx_all)

    expect_glrm_error(
        "Expected GLRM to throw error since not all column indices specified",
        loss_by_col=["Absolute", "Ordinal", "Huber"],
        loss_by_col_idx=[1, 2])

    expect_glrm_error(
        "Expected GLRM to throw error since not all losses for columns specified",
        loss_by_col=["Absolute", "Ordinal"],
        loss_by_col_idx=[1, 2, 5])

    expect_glrm_error(
        "Expected GLRM to throw error since column index 8 is out of bounds (zero indexing)",
        loss_by_col="Absolute",
        loss_by_col_idx=8)

    expect_glrm_error(
        "Expected GLRM to throw error since numeric loss cannot apply to categorical column",
        loss_by_col=rd.sample(NUM_LOSS, 1),
        loss_by_col_idx=rd.sample(CAT_COLS, 1))

    expect_glrm_error(
        "Expected GLRM to throw error since categorical loss cannot apply to numeric column",
        loss_by_col=rd.sample(CAT_LOSS, 1),
        loss_by_col_idx=rd.sample(NUM_COLS, 1))

    print("Run GLRM with loss_by_col = [" + ', '.join(
        loss_all) + "] and loss_by_col_idx = [" + ', '.join(
            [str(a) for a in loss_idx_all]) + "]")
    glrm_h2o = h2o.glrm(x=prostateH2O,
                        k=5,
                        loss_by_col=loss_all,
                        loss_by_col_idx=loss_idx_all)
    glrm_h2o.show()
예제 #55
0
def impute2():
    """Check ``H2OFrame.impute`` for mean, median and mode imputation.

    First every method / combine-method pair is run on the DPROS column of
    prostate_missing.csv as a smoke test. Then a small hand-written table
    with one or two missing values per column is used to verify the exact
    imputed values, including group-wise mean imputation.
    """
    prostate = h2o.upload_file(
        pyunit_utils.locate("smalldata/logreg/prostate_missing.csv"))
    methods = ["mean", "median", "mode"]
    combine_methods = ["interpolate", "average", "low", "high"]

    for method in methods:
        for combine_method in combine_methods:
            prostate.impute("DPROS",
                            method=method,
                            combine_method=combine_method)

    #    air = h2o.upload_file(pyunit_utils.locate("smalldata/airlines/allyears2k_headers.zip"))
    #    for inpl in inplace:
    #        for method in methods:
    #            for combine_method in combine_methods:
    #              air.impute( "TailNum", method = method, combine_method = combine_method)

    data = [[None, 2, 3, 1, 'a', 1, 9], [1, None, 4, 2, 'a', 1, 9],
            [2, 3, None, None, 'b', 1, 9], [3, 4, None, None, 'b', 3, 8],
            [4, 5, 9, 5, None, 2, 8], [5, None, 10, 7, 'b', None, 8]]

    def transposed():
        """Return ``data`` transposed, as a concrete list.

        ``zip`` is a one-shot iterator on Python 3, so it is materialized
        before being handed to H2OFrame.
        """
        return list(zip(*data))

    h2o_data = h2o.H2OFrame(transposed())

    # mean check
    h2o_data = h2o_data.impute(column="C1", method="mean")
    c1_imputed = h2o_data[0, 0]
    assert c1_imputed == 3, "Wrong value imputed. Expected imputed value of 3, but got {0}".format(
        c1_imputed)

    # inplace check
    # NOTE(review): the result of impute() is discarded here and the original
    # frame is asserted to still contain one NA, which does not obviously match
    # the assertion message - confirm the intended semantics.
    h2o_data = h2o.H2OFrame(transposed())
    h2o_data.impute(column="C1", method="mean")
    assert h2o_data["C1"].isna().sum(
    ) == 1, "Expected imputation to be done in place."

    # median-average
    h2o_data = h2o.H2OFrame(transposed())
    h2o_data = h2o_data.impute(column="C2",
                               method="median",
                               combine_method="average")
    c2_imputed = h2o_data[1, 1]
    assert c2_imputed == 3.5, "Wrong value imputed. Expected imputed value of 3.5, but got {0}".format(
        c2_imputed)

    # median-low
    h2o_data = h2o.H2OFrame(transposed())
    h2o_data = h2o_data.impute(column="C3",
                               method="median",
                               combine_method="low")
    c3_imputed = h2o_data[2, 2]
    assert c3_imputed == 4, "Wrong value imputed. Expected imputed value of 4, but got {0}".format(
        c3_imputed)

    # median-high
    h2o_data = h2o.H2OFrame(transposed())
    h2o_data = h2o_data.impute(column="C4",
                               method="median",
                               combine_method="high")
    c4_imputed = h2o_data[2, 3]
    assert c4_imputed == 5, "Wrong value imputed. Expected imputed value of 5, but got {0}".format(
        c4_imputed)

    # mode-categorical
    h2o_data = h2o.H2OFrame.from_python(transposed(), na_strings=[''])
    h2o_data = h2o_data.impute(column="C5", method="mode")
    c5_imputed = h2o_data[4, 4]
    assert c5_imputed == 'b', "Wrong value imputed. Expected imputed value of b, but got {0}".format(
        c5_imputed)

    # mode-numeric
    h2o_data = h2o.H2OFrame(transposed())
    h2o_data = h2o_data.impute(column="C6", method="mode")
    c6_imputed = h2o_data[5, 5]
    assert c6_imputed == 1, "Wrong value imputed. Expected imputed value of 1, but got {0}".format(
        c6_imputed)

    # mean-group by C7
    h2o_data = h2o.H2OFrame(transposed())
    h2o_data = h2o_data.impute(column="C3", method="mean", by="C7")
    imputed1 = h2o_data[2, 2]
    imputed2 = h2o_data[3, 2]
    assert imputed1 == 3.5, "Wrong value imputed. Expected imputed value of 3.5, but got {0}".format(
        imputed1)
    assert imputed2 == 9.5, "Wrong value imputed. Expected imputed value of 9.5, but got {0}".format(
        imputed2)
예제 #56
0
def test_explanation_list_of_models_regression():
    """Exercise the explanation API with a plain list of regression models.

    Trains an AutoML run (``max_models=5``, ``seed=1234``) on the expanded
    Titanic data with ``fare`` as the response, adds one explicitly named GBM
    to the list of leaderboard models, and asserts that every plotting helper
    yields a matplotlib ``Figure`` and that ``h2o.explain`` /
    ``h2o.explain_row`` yield an ``H2OExplanation``.
    """
    train = h2o.upload_file(
        pyunit_utils.locate("smalldata/titanic/titanic_expanded.csv"))
    y = "fare"

    # Keep at most one predictor per column type.  The response is rejected
    # before the type check so it can never be picked, even when it happens
    # to be the first column of the frame.
    cols_to_test = []
    seen_types = set()
    for col, typ in train.types.items():
        if col == y or typ in seen_types:
            continue
        seen_types.add(typ)
        cols_to_test.append(col)

    aml = H2OAutoML(seed=1234, max_models=5)
    aml.train(y=y, training_frame=train)

    # Materialise every leaderboard entry as a model object.
    models = [
        h2o.get_model(m[0])
        for m in aml.leaderboard["model_id"].as_data_frame(use_pandas=False,
                                                           header=False)
    ]

    # Test named models as well
    gbm = H2OGradientBoostingEstimator(model_id="my_awesome_model")
    gbm.train(y=y, training_frame=train)
    models += [gbm]

    # test variable importance heatmap plot
    assert isinstance(
        h2o.varimp_heatmap(models).figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    # test model correlation heatmap plot
    assert isinstance(
        h2o.model_correlation_heatmap(models, train).figure(),
        matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    # test partial dependences
    for col in cols_to_test:
        try:
            assert isinstance(
                h2o.pd_multi_plot(models, train, col).figure(),
                matplotlib.pyplot.Figure)
        except ValueError:
            # A ValueError is tolerated only for the "name" column.
            assert col == "name", "'name' is a string column which is not supported."
    matplotlib.pyplot.close("all")

    # test learning curve
    for model in models:
        assert isinstance(model.learning_curve_plot().figure(),
                          matplotlib.pyplot.Figure)
    matplotlib.pyplot.close("all")

    # test explain
    assert isinstance(h2o.explain(models, train, render=False), H2OExplanation)

    # test explain row
    assert isinstance(h2o.explain_row(models, train, 1, render=False),
                      H2OExplanation)
# Example #57
# 0
def test_explanation_automl_regression():
    """Exercise the explanation API directly on an AutoML regression run.

    Trains AutoML (``max_models=5``, ``seed=1234``) on the expanded Titanic
    data with ``fare`` as the response and the ``name`` column converted via
    ``asfactor()``.  Asserts the return types of the heatmap, correlation,
    partial-dependence, ``explain`` and ``explain_row`` entry points, checks
    that ``_shorten_model_ids`` keeps ids unique while making each one
    shorter, and repeats ``explain`` / ``explain_row`` on a leaderboard slice
    that drops ids starting with "Stacked".
    """
    train = h2o.upload_file(pyunit_utils.locate("smalldata/titanic/titanic_expanded.csv"))
    train["name"] = train["name"].asfactor()
    y = "fare"

    # Keep at most one predictor per column type.  The response is rejected
    # before the type check so it can never be picked, even when it happens
    # to be the first column of the frame.
    cols_to_test = []
    seen_types = set()
    for col, typ in train.types.items():
        if col == y or typ in seen_types:
            continue
        seen_types.add(typ)
        cols_to_test.append(col)

    aml = H2OAutoML(seed=1234, max_models=5)
    aml.train(y=y, training_frame=train)

    # test variable importance heatmap plot
    assert isinstance(aml.varimp_heatmap().figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    assert len(aml.varimp(use_pandas=False)) == 3  # numpy.ndarray, colnames, rownames
    assert isinstance(aml.varimp(use_pandas=True), pandas.DataFrame)

    # test model correlation heatmap plot
    assert isinstance(aml.model_correlation_heatmap(train).figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    assert len(aml.model_correlation(train, use_pandas=False)) == 2  # numpy.ndarray, colnames and rownames both in the same order => represented by just one vector
    assert isinstance(aml.model_correlation(train, use_pandas=True), pandas.DataFrame)

    # test partial dependences
    for col in cols_to_test:
        try:
            assert isinstance(aml.pd_multi_plot(train, col).figure(), matplotlib.pyplot.Figure)
        except ValueError:
            # A ValueError is tolerated only for the "name" column.
            assert col == "name", "'name' is a string column which is not supported."
    matplotlib.pyplot.close("all")

    # test explain
    assert isinstance(aml.explain(train, render=False), H2OExplanation)

    # test explain row
    assert isinstance(aml.explain_row(train, 1, render=False), H2OExplanation)

    # test shortening model ids work correctly
    from h2o.explanation._explain import _shorten_model_ids
    model_ids = aml.leaderboard.as_data_frame()["model_id"]
    shortened_model_ids = _shorten_model_ids(model_ids)
    assert len(set(model_ids)) == len(set(shortened_model_ids))
    # Guard the pairing below: zip would silently stop at the shorter input.
    assert len(model_ids) == len(shortened_model_ids)
    for full_id, short_id in zip(model_ids, shortened_model_ids):
        assert len(full_id) > len(short_id)

    # Leaderboard slices work
    # Build the slice once: every leaderboard row whose model_id does not
    # start with "Stacked".
    leaderboard_without_SE = aml.leaderboard[~aml.leaderboard["model_id"].grep("^Stacked", output_logical=True), :]

    # test explain
    assert isinstance(h2o.explain(leaderboard_without_SE, train, render=False), H2OExplanation)

    # test explain row
    assert isinstance(h2o.explain_row(leaderboard_without_SE, train, 1, render=False), H2OExplanation)
# Example #58
# 0
def test_explanation_automl_binomial_classification():
    """Exercise the explanation API on an AutoML binomial classification run.

    Trains AutoML (``max_models=5``, ``seed=1234``) on the prostate data with
    ``CAPSULE`` converted via ``asfactor()`` as the response.  The return
    types of the heatmap, variable-importance, model-correlation,
    partial-dependence, ``explain`` and ``explain_row`` entry points are
    asserted twice: once through the AutoML object and once through a
    leaderboard slice that drops ids starting with "Stacked".
    """
    train = h2o.upload_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    y = "CAPSULE"
    train[y] = train[y].asfactor()

    # Keep at most one predictor per column type.  The response is rejected
    # before the type check so it can never be picked, even when it happens
    # to be the first column of the frame.
    cols_to_test = []
    seen_types = set()
    for col, typ in train.types.items():
        if col == y or typ in seen_types:
            continue
        seen_types.add(typ)
        cols_to_test.append(col)

    aml = H2OAutoML(seed=1234, max_models=5)
    aml.train(y=y, training_frame=train)

    # test variable importance heatmap plot
    assert isinstance(aml.varimp_heatmap().figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    assert len(aml.varimp(use_pandas=False)) == 3  # numpy.ndarray, colnames, rownames
    assert isinstance(aml.varimp(use_pandas=True), pandas.DataFrame)

    # test model correlation heatmap plot
    assert isinstance(aml.model_correlation_heatmap(train).figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    assert len(aml.model_correlation(train, use_pandas=False)) == 2  # numpy.ndarray, colnames and rownames both in the same order => represented by just one vector
    assert isinstance(aml.model_correlation(train, use_pandas=True), pandas.DataFrame)

    # test partial dependences
    for col in cols_to_test:
        assert isinstance(aml.pd_multi_plot(train, col).figure(), matplotlib.pyplot.Figure)
        matplotlib.pyplot.close()

    # test explain
    assert isinstance(aml.explain(train, render=False), H2OExplanation)

    # test explain row
    assert isinstance(aml.explain_row(train, 1, render=False), H2OExplanation)

    # Leaderboard slices work
    # Every leaderboard row whose model_id does not start with "Stacked".
    leaderboard_without_SE = aml.leaderboard[~aml.leaderboard["model_id"].grep("^Stacked", output_logical=True), :]

    # test variable importance heatmap plot
    # Run against the slice (not the AutoML object again) so this section
    # really covers leaderboard slices.
    assert isinstance(h2o.varimp_heatmap(leaderboard_without_SE).figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    assert len(h2o.explanation.varimp(leaderboard_without_SE, use_pandas=False)) == 3  # numpy.ndarray, colnames, rownames
    assert isinstance(h2o.explanation.varimp(leaderboard_without_SE, use_pandas=True), pandas.DataFrame)

    # test model correlation heatmap plot
    assert isinstance(h2o.model_correlation_heatmap(leaderboard_without_SE, train).figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    assert len(h2o.explanation.model_correlation(leaderboard_without_SE, train, use_pandas=False)) == 2  # numpy.ndarray, colnames and rownames both in the same order => represented by just one vector
    assert isinstance(h2o.explanation.model_correlation(leaderboard_without_SE, train, use_pandas=True), pandas.DataFrame)

    # test partial dependences
    assert isinstance(h2o.pd_multi_plot(leaderboard_without_SE, train, cols_to_test[0]).figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    # test explain
    assert isinstance(h2o.explain(leaderboard_without_SE, train, render=False), H2OExplanation)

    # test explain row
    assert isinstance(h2o.explain_row(leaderboard_without_SE, train, 1, render=False), H2OExplanation)
def pca_max_k():
    """Check H2OPCA trained with ``k=-1`` across the available PCA methods.

    On the rotterdam dataset, GramSVD and Power models are trained and their
    importance tables compared; the number of components in each table must
    match the ``k`` actual value reported by the Power model.  On the smaller
    prostate_cat dataset, Randomized and Power models (with and without
    ``use_all_factor_levels``) are compared, and a GLRM model is checked
    against its own reported ``k``.

    NOTE(review): ``k=-1`` presumably requests the maximum possible number of
    components -- confirm against the H2OPCA documentation.
    """
    data = h2o.upload_file(
        pyunit_utils.locate("bigdata/laptop/jira/rotterdam.csv.zip"))
    y = {"relapse"}
    # Every column except the response is used as a predictor.
    x = list(set(data.names) - y)
    pcaGramSVD = H2OPCA(k=-1,
                        transform="STANDARDIZE",
                        pca_method="GramSVD",
                        impute_missing=True,
                        max_iterations=100)
    pcaGramSVD.train(x, training_frame=data)

    pcaPower = H2OPCA(k=-1,
                      transform="STANDARDIZE",
                      pca_method="Power",
                      impute_missing=True,
                      max_iterations=100,
                      seed=12345)
    pcaPower.train(x, training_frame=data)

    # compare singular values and stuff with GramSVD
    # NOTE(review): "Cumulative Proportion" is listed twice in the header
    # lists below -- possibly "Proportion of Variance" was intended; confirm.
    print("@@@@@@  Comparing eigenvalues between GramSVD and Power...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        pcaGramSVD._model_json["output"]["importance"],
        pcaPower._model_json["output"]["importance"], [
            "Standard deviation", "Cumulative Proportion",
            "Cumulative Proportion"
        ],
        tolerance=1)

    correctEigNum = pcaPower.full_parameters["k"]["actual_value"]
    # One cell is subtracted from the first row of each importance table;
    # presumably it is the row label rather than a component -- TODO confirm.
    gramSVDNum = len(
        pcaGramSVD._model_json["output"]["importance"].cell_values[0]) - 1
    powerNum = len(
        pcaPower._model_json["output"]["importance"].cell_values[0]) - 1
    # Messages are built with str.format: the counts are integers, and
    # concatenating them to a string would raise TypeError exactly when the
    # assertion fails, hiding the real diagnostic.
    assert correctEigNum == gramSVDNum, \
        "PCA GramSVD FAIL: expected number of eigenvalues: {0}, actual: {1}.".format(
            correctEigNum, gramSVDNum)
    assert correctEigNum == powerNum, \
        "PCA Power FAIL: expected number of eigenvalues: {0}, actual: {1}.".format(
            correctEigNum, powerNum)

    # Randomized and GLRM does not have wide dataset implementation.  Check with smaller datasets
    data = h2o.upload_file(
        pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
    x = list(set(data.names))
    pcaRandomized = H2OPCA(k=-1,
                           transform="STANDARDIZE",
                           pca_method="Randomized",
                           impute_missing=True,
                           max_iterations=100,
                           seed=12345)
    pcaRandomized.train(x, training_frame=data)
    # should still work with rank deficient dataset
    pcaRandomizedF = H2OPCA(k=-1,
                            transform="STANDARDIZE",
                            pca_method="Randomized",
                            use_all_factor_levels=True,
                            impute_missing=True,
                            max_iterations=100,
                            seed=12345)
    pcaRandomizedF.train(x, training_frame=data)

    pcaPower = H2OPCA(k=-1,
                      transform="STANDARDIZE",
                      pca_method="Power",
                      impute_missing=True,
                      max_iterations=100,
                      seed=12345)
    pcaPower.train(x, training_frame=data)
    # should still work with rank deficient dataset
    pcaPowerF = H2OPCA(k=-1,
                       transform="STANDARDIZE",
                       pca_method="Power",
                       use_all_factor_levels=True,
                       impute_missing=True,
                       max_iterations=100,
                       seed=12345)
    pcaPowerF.train(x, training_frame=data)

    # eigenvalues between the PCA and Randomize should be close, I hope...
    print(
        "@@@@@@  Comparing eigenvalues between Randomized and Power PCA...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        pcaRandomized._model_json["output"]["importance"],
        pcaPower._model_json["output"]["importance"], [
            "Standard deviation", "Cumulative Proportion",
            "Cumulative Proportion"
        ])

    # eigenvalues between the PCA and Randomize should be close with rank deficient dataset, I hope...
    print(
        "@@@@@@  Comparing eigenvalues between Randomized and Power PCA with rank deficient dataset...\n"
    )
    pyunit_utils.assert_H2OTwoDimTable_equal(
        pcaRandomizedF._model_json["output"]["importance"],
        pcaPowerF._model_json["output"]["importance"], [
            "Standard deviation", "Cumulative Proportion",
            "Cumulative Proportion"
        ])

    pcaGLRM = H2OPCA(k=-1,
                     transform="STANDARDIZE",
                     pca_method="GLRM",
                     use_all_factor_levels=True,
                     max_iterations=100,
                     seed=12345)
    pcaGLRM.train(x, training_frame=data)
    correctEigNum = pcaGLRM.full_parameters["k"]["actual_value"]
    glrmNum = len(
        pcaGLRM._model_json["output"]["importance"].cell_values[0]) - 1
    assert correctEigNum == glrmNum, \
        "PCA GLRM FAIL: expected number of eigenvalues: {0}, actual: {1}.".format(
            correctEigNum, glrmNum)
def cars_checkpoint(ip,port):

    cars = h2o.upload_file(h2o.locate("smalldata/junit/cars_20mpg.csv"))
    s = cars.runif()
    train = cars[s > .2]
    valid = cars[s <= .2]

    print "\n*** Description (chunk distribution, etc) of training frame:"
    train.describe()
    print "\n*** Description (chunk distribution, etc) of validation frame:"
    valid.describe()

    # choose the type model-building exercise (multinomial classification or regression). 0:regression, 1:binomial,
    # 2:multinomial
    problem = random.sample(range(3),1)[0]

    # pick the predictors and response column, along with the correct
    predictors = ["displacement","power","weight","acceleration","year"]
    if problem == 1   :
        response_col = "economy_20mpg"
        train[response_col] = train[response_col].asfactor()
        valid[response_col] = valid[response_col].asfactor()
    elif problem == 2 :
        response_col = "cylinders"
        train[response_col] = train[response_col].asfactor()
        valid[response_col] = valid[response_col].asfactor()
    else              :
        response_col = "economy"

    print "\n*** Response column: {0}".format(response_col)

    # build first model
    ntrees1 = 5
    max_depth1 = random.sample(range(2,6),1)[0]
    min_rows1 = random.sample(range(10,16),1)[0]
    print "\n*** Building model 1 with the following parameters:"
    print "*** ntrees model 1: {0}".format(ntrees1)
    print "*** max_depth model 1: {0}".format(max_depth1)
    print "*** min_rows model 1: {0}".format(min_rows1)
    model1 = h2o.random_forest(x=train[predictors],
                     y=train[response_col],
                     ntrees=ntrees1,
                     max_depth=max_depth1,
                     min_rows=min_rows1,
                     score_each_iteration=True,
                     validation_x=valid[predictors],
                     validation_y=valid[response_col],
                     seed=1234)

    # save the model, then load the model
    model_path = h2o.save_model(model1, name="delete_model", force=True)
    restored_model = h2o.load_model(model_path)
    shutil.rmtree("delete_model")

    # continue building the model
    ntrees2 = ntrees1 + 5
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print "\n*** Continuing to build model 1 (now called model 2) with the following parameters:"
    print "*** ntrees model 2: {0}".format(ntrees2)
    print "*** max_depth model 2: {0}".format(max_depth2)
    print "*** min_rows model 2: {0}".format(min_rows2)
    model2 = h2o.random_forest(x=train[predictors],
                     y=train[response_col],
                     ntrees=ntrees2,
                     max_depth=max_depth2,
                     min_rows=min_rows2,
                     score_each_iteration=True,
                     validation_x=valid[predictors],
                     validation_y=valid[response_col],
                     checkpoint=restored_model._id,
                     seed=1234)

    # continue building the model, but with different number of trees
    ntrees3 = ntrees2 + 50
    max_depth3 = max_depth1
    min_rows3 = min_rows1
    print "\n*** Continuing to build model 1 (now called model 3) with the following parameters:"
    print "*** ntrees model 3: {0}".format(ntrees3)
    print "*** max_depth model 3: {0}".format(max_depth3)
    print "*** min_rows model 3: {0}".format(min_rows3)
    model3 = h2o.random_forest(x=train[predictors],
                     y=train[response_col],
                     ntrees=ntrees3,
                     max_depth=max_depth3,
                     min_rows=min_rows3,
                     score_each_iteration=True,
                     validation_x=valid[predictors],
                     validation_y=valid[response_col],
                     checkpoint=restored_model._id,
                     seed=1234)

    # build the equivalent of model 2 in one shot
    print "\n*** Building the equivalent of model 2 (called model 4) in one shot:"
    model4 = h2o.random_forest(x=train[predictors],
                     y=train[response_col],
                     ntrees=ntrees2,
                     max_depth=max_depth2,
                     min_rows=min_rows2,
                     score_each_iteration=True,
                     validation_x=valid[predictors],
                     validation_y=valid[response_col],
                     seed=1234)

    print "\n*** Model Summary for model 2:"
    print model2.summary()
    print "\n*** Model Summary for model 3:"
    print model3.summary()
    print "\n*** Model Summary for model 4:"
    print model4.summary()

    print "\n*** Score History for model 2:"
    print model2.score_history()
    print "\n*** Score History for model 3:"
    print model3.score_history()
    print "\n*** Score History for model 4:"
    print model4.score_history()

    # checks
    if problem == 0:
        assert isinstance(model2,type(model4))
        assert model2.mse(valid=True)==model4.mse(valid=True), "Expected Model 2 MSE: {0} to be the same as Model 4 MSE: {1}".format(model2.mse(valid=True), model4.mse(valid=True))
        #assert model3.mse(valid=True)!=model4.mse(valid=True), "Expected Model 3 MSE: {0} to be different from Model 4 MSE: {1}".format(model3.mse(valid=True), model4.mse(valid=True))

    elif problem == 1:
        assert isinstance(model2,type(model4))
        assert model2.auc(valid=True)==model4.auc(valid=True), "Expected Model 2 AUC: {0} to be the same as Model 4 AUC: {1}".format(model2.auc(valid=True), model4.auc(valid=True))
        #assert model3.auc(valid=True)!=model4.auc(valid=True), "Expected Model 3 AUC: {0} to be different from Model 4 AUC: {1}".format(model3.auc(valid=True), model4.auc(valid=True))

        assert model2.logloss(valid=True)==model4.logloss(valid=True), "Expected Model 2 Log Loss: {0} to be the same as Model 4 Log Loss: {1}".format(model2.logloss(valid=True), model4.logloss(valid=True))
        #assert model3.logloss(valid=True)!=model4.logloss(valid=True), "Expected Model 3 Log Loss: {0} to be different from Model 4 Log Loss: {1}".format(model2.logloss(valid=True), model4.logloss(valid=True))

        assert model2.giniCoef(valid=True)==model4.giniCoef(valid=True), "Expected Model 2 Gini Coef {0} to be the same as Model 4 Gini Coef: {1}".format(model2.giniCoef(valid=True), model4.giniCoef(valid=True))
        #assert model3.giniCoef(valid=True)!=model4.giniCoef(valid=True), "Expected Model 3 Gini Coef: {0} to be different from Model 4 Gini Coef: {1}".format(model2.giniCoef(valid=True), model4.giniCoef(valid=True))

    else:
        assert isinstance(model2,type(model4))
        assert model2.mse(valid=True)==model4.mse(valid=True), "Expected Model 2 MSE: {0} to be the same as Model 4 MSE: {1}".format(model2.mse(valid=True), model4.mse(valid=True))
        #assert model3.mse(valid=True)!=model4.mse(valid=True), "Expected Model 3 MSE: {0} to be different from Model 4 MSE: {1}".format(model3.mse(valid=True), model4.mse(valid=True))

        assert model2.r2(valid=True)==model4.r2(valid=True), "Expected Model 2 R2: {0} to be the same as Model 4 R2: {1}".format(model2.r2(valid=True), model4.r2(valid=True))