Example #1
def checkpoint_new_category_in_predictor():

  sv1 = h2o.upload_file(pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv"))
  sv2 = h2o.upload_file(pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv"))
  vir = h2o.upload_file(pyunit_utils.locate("smalldata/iris/virginica.csv"))
  print("checkpoint_new_category_in_predictor-1")
  m1 = H2ODeepLearningEstimator(epochs=100)
  m1.train(x=[0,1,2,4], y=3, training_frame=sv1)

  m2 = H2ODeepLearningEstimator(epochs=200, checkpoint=m1.model_id)
  m2.train(x=[0,1,2,4], y=3, training_frame=sv2)
  print("checkpoint_new_category_in_predictor-2")

  # attempt to continue building model, but with an expanded categorical predictor domain.
  # this should fail
  try:
    m3 = H2ODeepLearningEstimator(epochs=200, checkpoint=m1.model_id)
    m3.train(x=[0,1,2,4], y=3, training_frame=vir)
    assert False, "Expected continued model-building to fail with new categories introduced in predictor"
  except EnvironmentError:
    pass
  
  print("checkpoint_new_category_in_predictor-3")

  # attempt to predict on new model, but with observations that have expanded categorical predictor domain.
  predictions = m2.predict(vir)
  print("checkpoint_new_category_in_predictor-4")
def weights_and_distributions():

  htable  = h2o.upload_file(pyunit_utils.locate("smalldata/gbm_test/moppe.csv"))
  htable["premiekl"] = htable["premiekl"].asfactor()
  htable["moptva"] = htable["moptva"].asfactor()
  htable["zon"] = htable["zon"]


  # gamma
  dl = H2ODeepLearningEstimator(distribution="gamma")
  dl.train(x=list(range(3)), y="medskad", training_frame=htable, weights_column="antskad")
  predictions = dl.predict(htable)

  # gaussian
  dl = H2ODeepLearningEstimator(distribution="gaussian")
  dl.train(x=list(range(3)), y="medskad", training_frame=htable, weights_column="antskad")
  predictions = dl.predict(htable)

  # poisson
  dl = H2ODeepLearningEstimator(distribution="poisson")
  dl.train(x=list(range(3)), y="medskad", training_frame=htable, weights_column="antskad")
  predictions = dl.predict(htable)

  # tweedie
  dl = H2ODeepLearningEstimator(distribution="tweedie")
  dl.train(x=list(range(3)), y="medskad", training_frame=htable, weights_column="antskad")
  predictions = dl.predict(htable)
Example #3
def deep_learning(xval=None, sample_size=None, nfolds=None, hparams=None, for_stacking=None):
    """
    create a deep learning algorithm estimator
    :param xval: if for cross-validation
    :param sample_size: training set sample amount
    :param nfolds: k value for k-fold cross-validation
    :param hparams: hyper parameters for grid search
    :param for_stacking: if it is used for stacking
    :return: a constructed deep learning estimator, a parameters' dict for grid search
    """

    if sample_size <= 10000:
        default_nfolds = 3
        hidden_opts = [[30, 30], [20, 20], [10, 10]]
        input_dropout_ratio_opts = [0, 0.05, 0.1]
        l1_opts = [0, 1e-4, 1e-6]
        l2_opts = [0, 1e-4, 1e-6]

    elif 10000 < sample_size <= 100000:
        default_nfolds = 3
        hidden_opts = [[20, 20], [30, 30]]
        input_dropout_ratio_opts = [0, 0.05]
        l1_opts = [0, 1e-6]
        l2_opts = [0, 1e-6]

    else:
        if sample_size > 500000:
            default_nfolds = 1
        else:
            default_nfolds = 2
        hidden_opts = [[20, 20], [10, 10]]
        input_dropout_ratio_opts = [0, 0.05]
        l1_opts = [1e-6]
        l2_opts = [1e-6]

    default_hparams = {'hidden': hidden_opts,
                       'input_dropout_ratio': input_dropout_ratio_opts,
                       'l1': l1_opts,
                       'l2': l2_opts}

    if nfolds is None:
        nfolds = default_nfolds
    if hparams is None:
        hparams = default_hparams

    if xval:
        if for_stacking:
            dl_estimator = H2ODeepLearningEstimator(nfolds=nfolds, fold_assignment="Modulo",
                                                    seed=1, keep_cross_validation_predictions=True,
                                                    shuffle_training_data=True)
        else:
            dl_estimator = H2ODeepLearningEstimator(nfolds=nfolds, shuffle_training_data=True)
    else:
        dl_estimator = H2ODeepLearningEstimator(shuffle_training_data=True)

    return dl_estimator, hparams
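Since deep_learning() returns an estimator together with a hyperparameter dict, the pair can feed straight into an H2O grid search. A minimal sketch, assuming an H2OFrame train_frame and column names x_cols/y_col that are not part of the original:

from h2o.grid.grid_search import H2OGridSearch

estimator, hparams = deep_learning(xval=True, sample_size=train_frame.nrow, for_stacking=False)
grid = H2OGridSearch(model=estimator, hyper_params=hparams)
grid.train(x=x_cols, y=y_col, training_frame=train_frame)
# pick the model with the lowest validation logloss
best_model = grid.get_grid(sort_by='logloss', decreasing=False).models[0]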
Example #4
def offsets_and_distributions():

    # cars
    cars = h2o.upload_file(
        pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    cars = cars[cars["economy_20mpg"].isna() == 0]
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    offset = h2o.H2OFrame(python_obj=[[.5] for x in range(398)])
    offset.set_name(0, "x1")
    cars = cars.cbind(offset)

    # insurance
    insurance = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    # bernoulli - offset not supported
    #dl = h2o.deeplearning(x=cars[2:8], y=cars["economy_20mpg"], distribution="bernoulli", offset_column="x1",
    #                       training_frame=cars)
    #predictions = dl.predict(cars)

    from h2o.estimators.deeplearning import H2ODeepLearningEstimator

    # gamma
    dl = H2ODeepLearningEstimator(distribution="gamma")
    dl.train(x=list(range(3)),
             y="Claims",
             training_frame=insurance,
             offset_column="offset")
    predictions = dl.predict(insurance)

    # gaussian
    dl = H2ODeepLearningEstimator(distribution="gaussian")
    dl.train(x=list(range(3)),
             y="Claims",
             training_frame=insurance,
             offset_column="offset")
    predictions = dl.predict(insurance)

    # poisson
    dl = H2ODeepLearningEstimator(distribution="poisson")
    dl.train(x=list(range(3)),
             y="Claims",
             training_frame=insurance,
             offset_column="offset")
    predictions = dl.predict(insurance)

    # tweedie
    dl = H2ODeepLearningEstimator(distribution="tweedie")
    dl.train(x=list(range(3)),
             y="Claims",
             training_frame=insurance,
             offset_column="offset")
    predictions = dl.predict(insurance)
Example #5
def pubdev_2041():
    iris = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris.csv"))
    s = iris.runif(seed=12345)
    train1 = iris[s >= 0.5]
    train2 = iris[s < 0.5]

    m1 = H2ODeepLearningEstimator(epochs=100)
    m1.train(x=list(range(4)), y=4, training_frame=train1)

    # update m1 with new training data
    m2 = H2ODeepLearningEstimator(checkpoint=m1.model_id, epochs=200)
    m2.train(x=list(range(4)), y=4, training_frame=train2)
Example #6
def imbalance():
    print "Test checks if Deep Learning works fine with an imbalanced dataset"

    covtype = h2o.upload_file(
        pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()

    from h2o.estimators.deeplearning import H2ODeepLearningEstimator

    hh_imbalanced = H2ODeepLearningEstimator(l1=1e-5,
                                             activation="Rectifier",
                                             loss="CrossEntropy",
                                             hidden=[200, 200],
                                             epochs=1,
                                             balance_classes=False,
                                             reproducible=True,
                                             seed=1234)
    hh_imbalanced.train(x=list(range(54)), y=54, training_frame=covtype)
    print(hh_imbalanced)

    hh_balanced = H2ODeepLearningEstimator(l1=1e-5,
                                           activation="Rectifier",
                                           loss="CrossEntropy",
                                           hidden=[200, 200],
                                           epochs=1,
                                           balance_classes=True,
                                           reproducible=True,
                                           seed=1234)
    hh_balanced.train(x=list(range(54)), y=54, training_frame=covtype)
    print(hh_balanced)

    # compare overall logloss
    class_6_err_imbalanced = hh_imbalanced.logloss()
    class_6_err_balanced = hh_balanced.logloss()

    if class_6_err_imbalanced < class_6_err_balanced:
        print("--------------------")
        print("FAIL, balanced error greater than imbalanced error")
        print("class_6_err_imbalanced: {0}".format(class_6_err_imbalanced))
        print("class_6_err_balanced: {0}".format(class_6_err_balanced))
        print("--------------------")

    assert class_6_err_imbalanced >= class_6_err_balanced, "balance_classes makes it worse!"
Example #7
def deeplearning_demo():
    # Training data
    train_data = h2o.import_file(
        path=tests.locate("smalldata/gbm_test/ecology_model.csv"))
    train_data = train_data.drop('Site')
    train_data['Angaus'] = train_data['Angaus'].asfactor()
    train_data.describe()
    train_data.head()

    # Testing data
    test_data = h2o.import_file(
        path=tests.locate("smalldata/gbm_test/ecology_eval.csv"))
    test_data['Angaus'] = test_data['Angaus'].asfactor()
    test_data.describe()
    test_data.head()

    # Run DeepLearning

    dl = H2ODeepLearningEstimator(loss="CrossEntropy",
                                  epochs=1000,
                                  hidden=[20, 20, 20])
    dl.train(x=list(range(1, train_data.ncol)),
             y="Angaus",
             training_frame=train_data,
             validation_frame=test_data)
    dl.show()
Example #8
def generate_dataset(family, nrow, ncol, networkStructure, activation, realFrac, intFrac, enumFrac, missingFrac, 
                     factorRange, numericRange, targetFactor):
    if family=="bernoulli":
        responseFactor = 2
    elif family == 'gaussian':
        responseFactor = 1;
    else :
        responseFactor = targetFactor
        
    trainData = random_dataset(nrow, ncol, realFrac=realFrac, intFrac=intFrac, enumFrac=enumFrac, factorR=factorRange,
                               integerR=numericRange, responseFactor=responseFactor, misFrac=missingFrac)
   
    myX = trainData.names
    myY = 'response'
    myX.remove(myY)
    m = H2ODeepLearningEstimator(distribution=family, hidden=networkStructure, activation=activation, epochs=0,
                                 initial_weight_distribution='normal')
    m.train(training_frame=trainData, x=myX, y=myY)
    f2 = m.predict(trainData)
    
    finalDataset = trainData[myX]
    finalDataset = finalDataset.cbind(f2[0])
    finalDataset.set_name(col=finalDataset.ncols-1, name='response')

    return finalDataset
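A hypothetical invocation of generate_dataset (the random_dataset helper it calls is assumed to come from the surrounding test utilities; all argument values here are illustrative only):

frame = generate_dataset(family='bernoulli', nrow=1000, ncol=10,
                         networkStructure=[8, 8], activation='Rectifier',
                         realFrac=0.4, intFrac=0.3, enumFrac=0.3,
                         missingFrac=0.1, factorRange=10, numericRange=100,
                         targetFactor=2)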
Example #9
def test_dl():
    train = import_iris2()
    ae_model = H2OAutoEncoderEstimator(
        activation="Tanh", hidden=[40, 80],
        model_id="ae_model", epochs=1,
        ignore_const_cols=False
    )
    ae_model.train(list(range(4)), training_frame=train)
    dl1 = H2ODeepLearningEstimator(hidden=[10, 10], export_weights_and_biases=True)
    dl1.train(x=list(range(4)), y=4, training_frame=train)
    w1 = dl1.weights(0)
    w3 = dl1.weights(2)
    b1 = dl1.biases(0)
    b2 = dl1.biases(1)
    params = {
        "initial_weights": [w1, None, w3], "initial_biases": [b1, b2, None],
        "pretrained_autoencoder": "ae_model",
        "hidden": [40, 80], "ignore_const_cols": False
    }
    hyper_params = {
        "epochs": [2, 4, 6, 10, 20, 50],
        "rate": [.005, .006, .007]
    }
    grid_ft_resume(
        train, "DEEP_LEARNING", params, hyper_params, dl_start, dl_resume
    )
Example #10
def deeplearning_basic():

    iris_hex = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris.csv"))
    hh = H2ODeepLearningEstimator(loss="CrossEntropy")
    hh.train(x=list(range(3)), y=4, training_frame=iris_hex)
    hh.show()
Example #11
def tweedie_weights():

  data = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/cancar_logIn.csv"))
  data["C1M3"] = ((data["Class"] == 1) & (data["Merit"] == 3)).asfactor()
  data["C3M3"] = ((data["Class"] == 3) & (data["Merit"] == 3)).asfactor()
  data["C4M3"] = ((data["Class"] == 4) & (data["Merit"] == 3)).asfactor()
  data["C1M2"] = ((data["Class"] == 1) & (data["Merit"] == 2)).asfactor()
  data["Merit"] = data["Merit"].asfactor()
  data["Class"] = data["Class"].asfactor()
  loss = data["Cost"] / data["Insured"]
  loss.set_name(0, "Loss")
  cancar = loss.cbind(data)

  # Without weights
  myX = ["Merit","Class","C1M3","C4M3"]


  dl = H2ODeepLearningEstimator(distribution="tweedie",hidden=[1],epochs=1000,
                                train_samples_per_iteration=-1,reproducible=True,
                                activation="Tanh",balance_classes=False,
                                force_load_balance=False, seed=2353123,
                                tweedie_power=1.5,score_training_samples=0,
                                score_validation_samples=0)

  dl.train(x=myX,y="Loss", training_frame=cancar)

  mean_residual_deviance = dl.mean_residual_deviance()

  # With weights
  dl.train(x=myX, y="Loss", training_frame=cancar, weights_column="Insured")
Example #12
def deep_learning_metrics_test():
    # connect to existing cluster

    df = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    df.drop("ID")  # remove ID
    df['CAPSULE'] = df['CAPSULE'].asfactor()  # make CAPSULE categorical
    vol = df['VOL']
    vol[vol == 0] = float("nan")  # 0 VOL means 'missing'

    r = vol.runif()  # random train/test split
    train = df[r < 0.8]
    test = df[r >= 0.8]

    # See that the data is ready
    train.describe()
    train.head()
    train.tail()
    test.describe()
    test.head()
    test.tail()

    # Run DeepLearning
    print("Train a Deeplearning model: ")

    dl = H2ODeepLearningEstimator(epochs=100,
                                  hidden=[10, 10, 10],
                                  loss="CrossEntropy")
    dl.train(x=list(range(2, train.ncol)), y="CAPSULE", training_frame=train)
    print("Binomial Model Metrics: ")
    print()
    dl.show()
    p = dl.model_performance(test)
    p.show()
Example #13
def pubdev_2223():
    covtype = h2o.import_file(
        pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()
    dlmodel = H2ODeepLearningEstimator(hidden=[17, 191],
                                       epochs=1,
                                       balance_classes=False,
                                       reproducible=True,
                                       seed=1234,
                                       export_weights_and_biases=True)
    dlmodel.train(x=list(range(54)), y=54, training_frame=covtype)

    print(
        "Normalization/Standardization multipliers for numeric predictors: {0}\n"
        .format(dlmodel.normmul()))
    print(
        "Normalization/Standardization offsets for numeric predictors: {0}\n".
        format(dlmodel.normsub()))
    print(
        "Normalization/Standardization multipliers for numeric response: {0}\n"
        .format(dlmodel.respmul()))
    print("Normalization/Standardization offsets for numeric response: {0}\n".
          format(dlmodel.respsub()))
    print("Categorical offsets for one-hot encoding: {0}\n".format(
        dlmodel.catoffsets()))
Example #14
def dl_demo():
    from h2o.estimators.deeplearning import H2ODeepLearningEstimator
    # assumes an H2OFrame `df` is already loaded in the surrounding scope
    df[1] = df[1].asfactor()

    # one uniform random number per row
    random = df[0].runif()

    # 60% training set
    train = df[random < 0.6]

    # 30% validation set (chained comparisons don't work on H2OFrames)
    valid = df[(random >= 0.6) & (random < 0.9)]

    # 10% test set
    test = df[random >= 0.9]

    m = H2ODeepLearningEstimator()

    print(
        'm.train_print:',
        m.train(x=train.names[2:],
                y=train.names[1],
                training_frame=train,
                validation_frame=valid))
    print('m.train_print_end')

    print('m_print:', m)

    # predict
    print('m.predict_print:\n', m.predict(test))
    print('m.predict_print_end')

    # show performance on training data
    m.model_performance()

    # show performance on validation data
    m.model_performance(valid=True)

    # score and compute fresh metrics on the test data!
    print('m.model_performance(test_data=test)_print:',
          m.model_performance(test_data=test))
    print('m.model_performance(test_data=test)_print_end')

    # MSE on training data
    m.mse()

    # MSE on validation data
    print('m.mse_print:', m.mse(valid=True))

    m.r2()
    print('m.r2_print:', m.r2(valid=True))

    print('m.confusion_matrix_print:', m.confusion_matrix())

    # confusion matrix at maximum accuracy
    m.confusion_matrix(metrics="accuracy")

    # check out the help for more!
    m.confusion_matrix("min_per_class_accuracy")
Example #15
def hyperopt_train_test(params):
    dl = H2ODeepLearningEstimator(**params)
    if 'hidden' in params:
        dl.hidden = list(params['hidden'])
    if 'hidden_dropout_ratios' in params:
        dl.hidden_dropout_ratios = list(params['hidden_dropout_ratios'])
    dl.train(x=X_vars, y=y_var, training_frame=t, validation_frame=v)
    return dl.model_performance(v).logloss()
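Because hyperopt_train_test() returns a validation logloss, it plugs directly into hyperopt's fmin as a minimization objective. A sketch under the same assumption that X_vars, y_var, t, and v are defined at module level; the search space below is purely illustrative:

from hyperopt import fmin, tpe, hp

space = {
    'hidden': hp.choice('hidden', [[32, 32], [64, 64], [128, 128]]),
    'input_dropout_ratio': hp.uniform('input_dropout_ratio', 0.0, 0.2),
    'l1': hp.loguniform('l1', -12, -4),
}
# fmin minimizes the objective, and hyperopt_train_test already returns logloss
best = fmin(fn=hyperopt_train_test, space=space, algo=tpe.suggest, max_evals=20)
print(best)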
Example #16
def deep_learning(name):
    """
    Get the Deep Learning Model
    :param name: model name, will determine filename
    :return:
    """
    params = get_params("deep_learning")
    return H2ODeepLearningEstimator(model_id=name, **params)
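get_params() is not shown in this listing; a minimal stand-in, assuming stored parameters live in a module-level dict (values illustrative only):

_STORED_PARAMS = {
    "deep_learning": {"hidden": [50, 50], "epochs": 20, "seed": 42},
}

def get_params(algo):
    # return a copy so callers can't mutate the stored defaults
    return dict(_STORED_PARAMS[algo])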
Example #17
def algo_pr_auc_test():
    '''
    This pyunit test is written to make sure we can call pr_auc() on all binomial models.
    '''

    seed = 123456789
    prostate_train = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate_train.csv"))
    prostate_train["CAPSULE"] = prostate_train["CAPSULE"].asfactor()

    # Build H2O GBM classification model:
    gbm_h2o = H2OGradientBoostingEstimator(ntrees=10, learn_rate=0.1, max_depth=4, min_rows=10,
                                           distribution="bernoulli", seed=seed)
    gbm_h2o.train(x=list(range(1, prostate_train.ncol)), y="CAPSULE", training_frame=prostate_train)
    print("***************************   Printing GBM model")
    print(gbm_h2o)
    print("pr_auc for GBM model is {0}".format(gbm_h2o.pr_auc()))

    # Build H2O GLM classification model:
    glm_h2o = H2OGeneralizedLinearEstimator(family='binomial', seed=seed)
    glm_h2o.train(x=list(range(1, prostate_train.ncol)), y="CAPSULE", training_frame=prostate_train)
    print("***************************   Printing GLM model")
    print(glm_h2o)  # glm scoring history does not contain AUC, and hence no pr_auc
    print("pr_auc for GLM model is {0}".format(glm_h2o.pr_auc()))
    
    rf_h2o = H2ORandomForestEstimator(ntrees=10, score_tree_interval=0)
    rf_h2o.train(x=list(range(1, prostate_train.ncol)), y="CAPSULE", training_frame=prostate_train)
    print("***************************   Printing random forest model")
    print(rf_h2o)
    print("pr_auc for Random Forest model is {0}".format(rf_h2o.pr_auc()))

    dl_h2o = H2ODeepLearningEstimator(distribution='bernoulli', seed=seed, hidden=[2,2])
    dl_h2o.train(x=list(range(1, prostate_train.ncol)), y="CAPSULE", training_frame=prostate_train)
    print("***************************   Printing deeplearning model")
    print(dl_h2o)
    print("pr_auc for deeplearning model is {0}".format(dl_h2o.pr_auc()))

    assert abs(gbm_h2o.pr_auc()-glm_h2o.pr_auc()) < 0.9, \
        "problem with pr_auc values"

    assert abs(rf_h2o.pr_auc()-dl_h2o.pr_auc()) < 0.9, \
        "problem with pr_auc values"

    assert abs(rf_h2o.pr_auc()-glm_h2o.pr_auc()) < 0.9, \
        "problem with pr_auc values"

    # try to call pr_auc() for regression.  Should encounter error.
    h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))
    myY = "GLEASON"
    myX = ["ID","AGE","RACE","CAPSULE","DCAPS","PSA","VOL","DPROS"]
    h2o_model = H2OGeneralizedLinearEstimator(family="gaussian", link="identity",alpha=0.5, Lambda=0)
    h2o_model.train(x=myX, y=myY, training_frame=h2o_data)
    # note: a bare `except:` here would swallow the AssertionError as well
    try:
        print(h2o_model.pr_auc())
        assert False, "pr_auc() should raise an error for regression models but did not."
    except AssertionError:
        raise
    except Exception:
        pass
Example #18
def deeplearning_multi():
  print("Test checks if Deep Learning works fine with a multiclass training and test dataset")

  prostate = h2o.import_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))

  prostate[4] = prostate[4].asfactor()

  hh = H2ODeepLearningEstimator(loss="CrossEntropy")
  hh.train(x=[0, 1], y=4, training_frame=prostate, validation_frame=prostate)
  hh.show()
Example #19
def weights_and_biases():


  print "Test checks if Deep Learning weights and biases are accessible from R"

  covtype = h2o.upload_file(pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
  covtype[54] = covtype[54].asfactor()

  from h2o.estimators.deeplearning import H2ODeepLearningEstimator
  dlmodel = H2ODeepLearningEstimator(hidden=[17,191],
                                     epochs=1,
                                     balance_classes=False,
                                     reproducible=True,
                                     seed=1234,
                                     export_weights_and_biases=True)
  dlmodel.train(x=list(range(54)), y=54, training_frame=covtype)
  print(dlmodel)

  weights1 = dlmodel.weights(0)
  weights2 = dlmodel.weights(1)
  weights3 = dlmodel.weights(2)

  biases1 = dlmodel.biases(0)
  biases2 = dlmodel.biases(1)
  biases3 = dlmodel.biases(2)

  w1c = weights1.ncol
  w1r = weights1.nrow
  assert w1c == 52, "wrong dimensionality! expected {0}, but got {1}.".format(52, w1c)
  assert w1r == 17, "wrong dimensionality! expected {0}, but got {1}.".format(17, w1r)

  w2c = weights2.ncol
  w2r = weights2.nrow
  assert w2c == 17, "wrong dimensionality! expected {0}, but got {1}.".format(17, w2c)
  assert w2r == 191, "wrong dimensionality! expected {0}, but got {1}.".format(191, w2r)

  w3c = weights3.ncol
  w3r = weights3.nrow
  assert w3c == 191, "wrong dimensionality! expected {0}, but got {1}.".format(191, w3c)
  assert w3r == 7, "wrong dimensionality! expected {0}, but got {1}.".format(7, w3r)

  b1c = biases1.ncol
  b1r = biases1.nrow
  assert b1c == 1, "wrong dimensionality! expected {0}, but got {1}.".format(1, b1c)
  assert b1r == 17, "wrong dimensionality! expected {0}, but got {1}.".format(17, b1r)

  b2c = biases2.ncol
  b2r = biases2.nrow
  assert b2c == 1, "wrong dimensionality! expected {0}, but got {1}.".format(1, b2c)
  assert b2r == 191, "wrong dimensionality! expected {0}, but got {1}.".format(191, b2r)

  b3c = biases3.ncol
  b3r = biases3.nrow
  assert b3c == 1, "wrong dimensionality! expected {0}, but got {1}.".format(1, b3c)
  assert b3r == 7, "wrong dimensionality! expected {0}, but got {1}.".format(7, b3r)
Example #20
def deep_1(
        K, dfs, dfs_collector, test,
        test_collector
):
    r = 'deep_1'

    features = on_top2  # assumes a global feature list `on_top2` is defined
    if 'target' in features:
        features.remove('target')
    val_hf = h2o.H2OFrame(test)
    ntrees = 100
    seed = 1155
    v = np.zeros(shape=[len(test)])
    for i in range(K):
        print()
        print('in model:', r, ' k-fold:', i + 1, '/', K)
        print()
        b = list(range(K))
        b.remove(i)
        c = [dfs[b[j]] for j in range(K - 1)]
        dt = pd.concat(c)
        train_hf = h2o.H2OFrame(dt)
        del dt
        dfs_i = h2o.H2OFrame(dfs[i])

        # features = list(train_hf.columns)
        # ('target' is removed once before the loop; removing it here would
        # raise ValueError on the second fold)
        print('- ' * 10)
        for c in features:
            print("'{}',".format(c))
        print('- ' * 10)
        model = H2ODeepLearningEstimator(hidden=[200,200], epochs=500)
        model.train(x=features,
                    y='target',
                    training_frame=train_hf)
        del train_hf
        p = model.predict(dfs_i)
        dfs_collector[i][r] = h2o.as_list(p, use_pandas=True).values
        print(dfs_collector[i].head())
        print(dfs_collector[i].head().dtypes)
        q = model.predict(val_hf)

        dd = h2o.as_list(q, use_pandas=True)
        v += dd['predict'].values
        print('# ' * 10)
        for show_v in range(5):
            print(v[show_v])
        print('# ' * 10)

    test_collector[r] = v / K
    print(test_collector.head())
    return dfs_collector, test_collector, r
Example #21
def checkpoint_new_category_in_response():

    sv = h2o.upload_file(
        pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv"))
    iris = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris.csv"))

    m1 = H2ODeepLearningEstimator(epochs=100)
    m1.train(x=[0, 1, 2, 3], y=4, training_frame=sv)

    # attempt to continue building model, but with an expanded categorical response domain.
    # this should fail
    try:
        m2 = H2ODeepLearningEstimator(checkpoint=m1.model_id, epochs=200)
        m2.train(x=[0, 1, 2, 3], y=4, training_frame=iris)
        assert False, "Expected continued model-building to fail with new categories introduced in response"
    except EnvironmentError:
        pass
Example #22
def deeplearning_no_hidden():
    iris_hex = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris.csv"))

    hh = H2ODeepLearningEstimator(hidden=[],
                                  loss="CrossEntropy",
                                  export_weights_and_biases=True)
    hh.train(x=list(range(4)), y=4, training_frame=iris_hex)
    hh.show()
    weights1 = hh.weights(0)
    assert weights1.shape[0] == 3
    assert weights1.shape[1] == 4
Example #23
def varimp_plot_test():
  kwargs = {}
  kwargs['server'] = True
  
  # import data set
  cars = h2o.import_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
  
  # set list of features, target, and convert target to factor
  # (convert before splitting so both splits inherit the factor type)
  predictors = ["displacement", "power", "weight", "acceleration", "year"]
  response_col = "economy_20mpg"
  cars[response_col] = cars[response_col].asfactor()

  # Constructing validation and train sets by sampling (20/80)
  s = cars[0].runif()
  cars_train = cars[s <= 0.8]
  cars_valid = cars[s > 0.8]

  # Build and train a DRF model
  cars_rf = H2ORandomForestEstimator()
  cars_rf.train(x=predictors, y=response_col, training_frame=cars_train, validation_frame=cars_valid)

  #Plot DRF Variable Importances, check that num_of_features accepts input
  cars_rf.varimp_plot(server=True)
  cars_rf.varimp_plot(num_of_features=2, server=True)

  # test saving:
  tmpdir = tempfile.mkdtemp(prefix="h2o-func")
  path="{}/plot1.png".format(tmpdir)
  test_plot_result_saving(cars_rf.varimp_plot(server=True), "{}/plot2.png".format(tmpdir), cars_rf.varimp_plot(server=True, save_plot_path=path), path)

  # Build and train a GBM model
  cars_gbm = H2OGradientBoostingEstimator()
  cars_gbm.train(x=predictors, y=response_col, training_frame=cars_train, validation_frame=cars_valid)

  # Plot GBM Variable Importances
  cars_gbm.varimp_plot(server=True)
  cars_gbm.varimp_plot(num_of_features=2, server=True)

  # Build and train a Deep Learning model
  cars_dl = H2ODeepLearningEstimator(variable_importances=True)
  cars_dl.train(x=predictors, y=response_col, training_frame=cars_train, validation_frame=cars_valid)

  # Plot Deep Learning Variable Importances
  cars_dl.varimp_plot(server=True)
  cars_dl.varimp_plot(num_of_features=2, server=True)

  # check that varimp_plot() uses std_coef_plot() for a glm
  cars_glm = H2OGeneralizedLinearEstimator()
  cars_glm.train(x=predictors, y=response_col, training_frame=cars_train, validation_frame=cars_valid)
  cars_glm.varimp_plot(server=True)
  cars_glm.varimp_plot(num_of_features=2, server=True)
Example #24
def tweedie_offset():

    insurance = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()
    insurance["Group"] = insurance["Group"].asfactor()
    insurance["Age"] = insurance["Age"].asfactor()
    insurance["District"] = insurance["District"].asfactor()

    from h2o.estimators.deeplearning import H2ODeepLearningEstimator

    # without offset
    dl = H2ODeepLearningEstimator(distribution="tweedie",
                                  hidden=[1],
                                  epochs=1000,
                                  train_samples_per_iteration=-1,
                                  reproducible=True,
                                  activation="Tanh",
                                  single_node_mode=False,
                                  balance_classes=False,
                                  force_load_balance=False,
                                  seed=23123,
                                  tweedie_power=1.5,
                                  score_training_samples=0,
                                  score_validation_samples=0)
    dl.train(x=range(3), y="Claims", training_frame=insurance)
    mean_residual_deviance = dl.mean_residual_deviance()
    assert abs(0.556 - mean_residual_deviance) < 1e-3, "Expected mean residual deviance to be 0.556, but got " \
                                                       "{0}".format(mean_residual_deviance)
    predictions = dl.predict(insurance)
    assert abs(47.61-predictions[0].mean()) < 1e-2, "Expected mean of predictions to be 47.61, but got " \
                                                    "{0}".format(predictions[0].mean())
    assert abs(1.94-predictions[0].min()) < 1e-1, "Expected min of predictions to be 1.94, but got " \
                                                  "{0}".format(predictions[0].min())
    assert abs(284.6-predictions[0].max()) < 28, "Expected max of predictions to be 284.6, but got " \
                                                 "{0}".format(predictions[0].max())

    # with offset
    dl.train(x=list(range(3)),
             y="Claims",
             training_frame=insurance,
             offset_column="offset")
    mean_residual_deviance = dl.mean_residual_deviance()
    assert abs(0.261-mean_residual_deviance) < 1e-2, "Expected mean residual deviance to be 0.261, but got " \
                                                     "{0}".format(mean_residual_deviance)
    predictions = dl.predict(insurance)
    assert abs(49.53-predictions[0].mean()) < 1e-1, "Expected mean of predictions to be 49.53, but got " \
                                                    "{0}".format(predictions[0].mean())
    assert abs(1.074-predictions[0].min()) < 1e-1, "Expected min of predictions to be 1.074, but got " \
                                                   "{0}".format(predictions[0].min())
    assert abs(397.3-predictions[0].max()) < 40, "Expected max of predictions to be 397.3, but got " \
                                                 "{0}".format(predictions[0].max())
Example #25
def missing():
    # Connect to a pre-existing cluster

    missing_ratios = [0, 0.1, 0.25, 0.5, 0.75, 0.99]
    errors = [0, 0, 0, 0, 0, 0]

    for i in range(len(missing_ratios)):
        data = h2o.upload_file(
            pyunit_utils.locate("smalldata/junit/weather.csv"))
        data[15] = data[15].asfactor()  #ChangeTempDir
        data[16] = data[16].asfactor()  #ChangeTempMag
        data[17] = data[17].asfactor()  #ChangeWindDirect
        data[18] = data[18].asfactor()  #MaxWindPeriod
        data[19] = data[19].asfactor()  #RainToday
        data[21] = data[21].asfactor()  #PressureChange
        data[23] = data[23].asfactor()  #RainTomorrow

        print "For missing {0}%".format(missing_ratios[i] * 100)

        # add missing values to the data section of the file (leave the response alone)
        if missing_ratios[i] > 0:
            resp = data[23]
            pred = data[:, list(range(23)) + list(range(24, data.ncol))]
            data_missing = pred.insert_missing_values(
                fraction=missing_ratios[i])
            data_fin = data_missing.cbind(resp)
        else:
            data_fin = data

        # split into train + test datasets
        ratio = data_fin[0].runif()
        train = data_fin[ratio <= .75]
        test = data_fin[ratio > .75]

        from h2o.estimators.deeplearning import H2ODeepLearningEstimator
        hh = H2ODeepLearningEstimator(epochs=5,
                                      reproducible=True,
                                      seed=12345,
                                      activation='RectifierWithDropout',
                                      l1=1e-5,
                                      input_dropout_ratio=0.)
        hh.train(x=list(range(2, 22)),
                 y=23,
                 training_frame=train,
                 validation_frame=test)
        errors[i] = hh.error()[0][1]

    for i in range(len(missing_ratios)):
        print("missing ratio: {0}% --> classification error: {1}".format(
            missing_ratios[i] * 100, errors[i]))

    assert sum(errors) < 2.2, "Sum of classification errors is too large!"
Example #26
def _get_mlp_model(predictor_col, response_col, train_f, val_f):
    from h2o.estimators.deeplearning import H2ODeepLearningEstimator
    mlp_model = H2ODeepLearningEstimator(activation='tanh',
                                         adaptive_rate=False,
                                         nesterov_accelerated_gradient=False,
                                         hidden=[10, 10],
                                         seed=123,
                                         epochs=10)
    mlp_model.train(x=predictor_col,
                    y=response_col,
                    training_frame=train_f,
                    validation_frame=val_f)
    return mlp_model
Example #27
def main():

    # Generate a sine-wave dataset (sine_df and the glob_* settings are sketched below)
    df = sine_df(glob_train_periods, glob_density)

    # Start h2o
    h2o.init(ip='192.168.0.41', port=65432, max_mem_size_GB=128)

    # Create H2OFrame
    column_types = ['real', 'real']
    hf = h2o.H2OFrame(df, column_types=column_types)
    train, val = hf.split_frame(ratios=[0.8])

    # Create model
    predictors = 'x'
    response = 'y'
    model = H2ODeepLearningEstimator(
        model_id='dnn_sine',
        epochs=5000,
        hidden=[800],
        activation='rectifier',
        # hidden_dropout_ratios=[0.0],
        l1=1e-4,
        l2=1e-4,
        max_w2=0.55,
        stopping_rounds=8,
        # stopping_tolerance=1e-4,
        stopping_metric='rmse',

        # Control scoring epochs
        score_interval=0,
        score_duty_cycle=1,
        shuffle_training_data=False,
        replicate_training_data=True,
        train_samples_per_iteration=int(0.5 * len(df) / 1.258),
    )
    model.train(x=predictors,
                y=response,
                training_frame=train,
                validation_frame=val)

    # Create test set with domain outside training
    test_df = sine_df(glob_test_periods, glob_density)
    test = h2o.H2OFrame(test_df, column_types=column_types)
    test_df['predict'] = model.predict(test).as_data_frame()

    # Plot results
    plt.plot(test_df['x'], test_df['y'])
    plt.plot(test_df['x'], test_df['predict'])
    plt.xlim(-glob_test_periods, glob_test_periods)
    plt.show()
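sine_df() and the glob_* constants are assumed to be defined elsewhere in the module; a minimal stand-in consistent with how main() uses them (x is measured in periods, so the plt.xlim call above lines up):

import numpy as np
import pandas as pd

glob_train_periods = 4    # illustrative values
glob_test_periods = 6
glob_density = 200        # samples per period

def sine_df(periods, density):
    # sample 2*periods*density points over [-periods, periods]
    x = np.linspace(-periods, periods, int(2 * periods * density))
    return pd.DataFrame({'x': x, 'y': np.sin(2 * np.pi * x)})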
Example #28
def offset_init_train_deeplearning():
    # Connect to a pre-existing cluster
    cars = h2o.upload_file(
        pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    cars = cars[cars["economy_20mpg"].isna() == 0]
    offset = h2o.H2OFrame([[.5]] * 398)
    offset.set_names(["x1"])
    cars = cars.cbind(offset)

    # offset_column passed in the train method
    dl_train = H2ODeepLearningEstimator(hidden=[20, 20], epochs=10)
    dl_train.train(x=list(range(2, 8)),
                   y="economy_20mpg",
                   training_frame=cars,
                   offset_column="x1")
    predictions_train = dl_train.predict(cars)

    # test offset_column passed in estimator init
    dl_init = H2ODeepLearningEstimator(hidden=[20, 20],
                                       epochs=10,
                                       offset_column="x1")
    dl_init.train(x=list(range(2, 8)), y="economy_20mpg", training_frame=cars)
    predictions_init = dl_init.predict(cars)

    # when offset_column is set in both places, the value passed to train() is the one used
    dl_init_train = H2ODeepLearningEstimator(hidden=[20, 20],
                                             epochs=10,
                                             offset_column="x1")
    dl_init_train.train(x=list(range(2, 8)),
                        y="economy_20mpg",
                        training_frame=cars,
                        offset_column="x1")
    predictions_init_train = dl_init_train.predict(cars)

    assert (predictions_train == predictions_init).all(), "Expected a model with offset_column passed to train() to give the same predictions as one with offset_column set in the constructor."
    assert (predictions_train == predictions_init_train).all(), "Expected a model with offset_column passed to train() to give the same predictions as one with offset_column set in both the constructor and train()."
Example #29
def deeplearning_multi():

  print("Test checks if Deep Learning works fine with a categorical dataset")
  prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
  prostate[1] = prostate[1].asfactor()  # CAPSULE -> Factor
  prostate[2] = prostate[2].asfactor()  #AGE -> Factor
  prostate[3] = prostate[3].asfactor()  #RACE -> Factor
  prostate[4] = prostate[4].asfactor()  #DPROS -> Factor
  prostate[5] = prostate[5].asfactor()  #DCAPS -> Factor
  prostate = prostate.drop('ID')        #remove ID
  prostate.describe()

  hh = H2ODeepLearningEstimator(loss="CrossEntropy",
                                hidden=[10,10],
                                use_all_factor_levels=False)
  hh.train(x=list(set(prostate.names) - {"CAPSULE"}), y="CAPSULE", training_frame=prostate)
  hh.show()
def deeplearning_export():
    print("###### DEEPLEARNING ######")
    frame = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    problem = random.sample(list(range(3)), 1)[0]
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == 1:
        response_col = "economy_20mpg"
        frame[response_col] = frame[response_col].asfactor()
    elif problem == 2:
        response_col = "cylinders"
        frame[response_col] = frame[response_col].asfactor()
    else:
        response_col = "economy"
    print("Response column: {0}".format(response_col))
    model = H2ODeepLearningEstimator(nfolds=random.randint(3, 10), fold_assignment="Modulo", hidden=[20, 20], epochs=10)
    model.train(x=predictors, y=response_col, training_frame=frame)
    h2o.download_pojo(model, path=RESULT_DIR)
    model.download_mojo(path=RESULT_DIR)