def link_incompatible_error(ip, port):
    """Check that GLM refuses family/link pairings it is expected to reject.

    Imports the zipped prostate data set and attempts three ``h2o.glm``
    builds (gaussian/logit, tweedie/log, binomial/inverse).  Every attempt
    has to raise ``EnvironmentError``; a build that succeeds trips an
    ``AssertionError`` instead.

    ``ip`` and ``port`` are accepted but never read by this function.
    """
    print("Reading in original prostate data.")
    data_path = h2o.locate("smalldata/prostate/prostate.csv.zip")
    prostate = h2o.import_file(path=data_path)

    print(
        "Throw error when trying to create model with incompatible logit link."
    )

    # (predictor columns, response column, family, link)
    rejected_pairings = (
        (slice(1, 8), 8, "gaussian", "logit"),
        (slice(1, 8), 8, "tweedie", "log"),
        (slice(2, 9), 1, "binomial", "inverse"),
    )
    for predictor_cols, response_col, family_name, link_name in rejected_pairings:
        try:
            # The frame is indexed inside the try so that the guarded region
            # stays the same as a hand-written call per pairing.
            h2o.model = h2o.glm(x=prostate[predictor_cols],
                                y=prostate[response_col],
                                family=family_name,
                                link=link_name)
        except EnvironmentError:
            # The rejection we were hoping for; move on to the next pairing.
            continue
        # AssertionError is not an EnvironmentError, so raising it here has
        # the same effect as raising it at the end of the try body.
        assert False, "expected an error"
def link_incompatible_error():
    """Check that GLM refuses family/link pairings it is expected to reject.

    Imports the zipped prostate data set (located via ``pyunit_utils``) and
    attempts three ``h2o.glm`` builds: gaussian/logit, tweedie/log and
    binomial/inverse.  Each one must raise ``EnvironmentError``; if a build
    succeeds the function fails with an ``AssertionError``.
    """
    print("Reading in original prostate data.")
    archive_path = pyunit_utils.locate("smalldata/prostate/prostate.csv.zip")
    prostate = h2o.import_file(path=archive_path)

    print("Throw error when trying to create model with incompatible logit link.")

    # (predictor columns, response column, family, link)
    rejected_pairings = [
        (slice(1, 8), 8, "gaussian", "logit"),
        (slice(1, 8), 8, "tweedie", "log"),
        (slice(2, 9), 1, "binomial", "inverse"),
    ]
    for predictor_cols, response_col, family_name, link_name in rejected_pairings:
        try:
            h2o.model = h2o.glm(x=prostate[predictor_cols], y=prostate[response_col],
                                family=family_name, link=link_name)
        except EnvironmentError:
            # Expected outcome: the build was rejected.
            pass
        else:
            # Reached only when the build went through without an error.
            assert False, "expected an error"
def covtype_get_model(ip,port):
    """Fit three binomial GLMs on covtype and fetch each one back by id.

    Connects to H2O at ``ip``/``port``, imports ``covtype.20k.data`` and
    overwrites column 54 with the result of comparing it to a random integer
    drawn from 1..4.  Three models are then built on columns 0-19 and 29-53
    (alpha/lambda of 0/0, 0.5/1e-4 and 1/1e-4).  Each model is shown, looked
    up again through ``h2o.get_model`` using its ``_id`` and shown once more.
    """
    # Connect to h2o
    h2o.init(ip, port)

    covtype = h2o.import_frame(path=h2o.locate("smalldata/covtype/covtype.20k.data"))

    Y = 54
    # ``range`` objects cannot be concatenated with ``+`` on Python 3, so
    # materialise both halves as lists first (a no-op copy on Python 2).
    X = list(range(0, 20)) + list(range(29, 54))

    # Set response to be indicator of a particular class
    res_class = random.randint(1, 4)
    covtype[54] = (covtype[54] == res_class)

    # L2: alpha = 0, lambda = 0
    covtype_mod1 = h2o.glm(y=covtype[Y], x=covtype[X], family="binomial", alpha=[0], Lambda=[0])
    covtype_mod1.show()
    covtype_mod1 = h2o.get_model(covtype_mod1._id)
    covtype_mod1.show()

    # Elastic: alpha = 0.5, lambda = 1e-4
    covtype_mod2 = h2o.glm(y=covtype[Y], x=covtype[X], family="binomial", alpha=[0.5], Lambda=[1e-4])
    covtype_mod2.show()
    covtype_mod2 = h2o.get_model(covtype_mod2._id)
    covtype_mod2.show()

    # L1: alpha = 1, lambda = 1e-4
    covtype_mod3 = h2o.glm(y=covtype[Y], x=covtype[X], family="binomial", alpha=[1], Lambda=[1e-4])
    covtype_mod3.show()
    covtype_mod3 = h2o.get_model(covtype_mod3._id)
    covtype_mod3.show()
Пример #4
0
def getLambdaModel():
    """Run a GLM lambda search on prostate and compare per-lambda lookups.

    A family is picked at random from gaussian/binomial (the response is
    converted with ``asfactor()`` for binomial), a lambda-search GLM is built,
    and the lambda values reported by ``model.models(1).lambda_all()`` are
    printed.  Ten times over, a lambda is sampled and two models are
    requested through ``h2o.getGLMLambdaModel``; the pair must compare equal.
    Each of the two requests draws its own random sub-model index.
    """
    print("Read data")
    prostate = h2o.import_file(path=tests.locate("smalldata/logreg/prostate.csv"))

    myX = ["AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]
    myY = "CAPSULE"
    family = random.choice(["gaussian", "binomial"])
    print(family)

    print("Do lambda search and build models")
    if family == "gaussian":
        model = h2o.glm(x=prostate[myX], y=prostate[myY], family=family, standardize=True,
                        use_all_factor_levels=True, lambda_search=True)
    else:
        model = h2o.glm(x=prostate[myX], y=prostate[myY].asfactor(), family=family, standardize=True,
                        use_all_factor_levels=True, lambda_search=True)

    print("the models were built over the following lambda values: ")
    all_lambdas = model.models(1).lambda_all()
    print(all_lambdas)

    for _ in range(10):
        # random.sample(..., 1) yields a one-element list, not a bare value.
        Lambda = random.sample(all_lambdas, 1)
        print("For Lambda we get this model:")
        # Fixed: the closing parenthesis of len() used to sit after "-1",
        # which subtracted 1 from the model collection instead of its length.
        # NOTE(review): Lambda is still passed to model.models(), exactly as
        # before; confirm it was not meant for getGLMLambdaModel() instead.
        m1 = h2o.getGLMLambdaModel(
            model.models(random.randint(0, len(model.models()) - 1), Lambda=Lambda))
        m1.show()
        print("this model should be same as the one above:")
        m2 = h2o.getGLMLambdaModel(
            model.models(random.randint(0, len(model.models()) - 1), Lambda=Lambda))
        m2.show()
        assert m1 == m2, "expected models to be equal"
Пример #5
0
def link_functions_gamma(ip, port):
    """Compare H2O and statsmodels gamma GLMs for the inverse and log links.

    Connects to H2O at ``ip``/``port`` and loads ``prostate_complete.csv``
    twice: as an H2O frame and as a plain array read with pandas out of the
    same zip archive.  For each link a model is fitted with both libraries
    and the ratio residual deviance / null deviance is compared; H2O's ratio
    may exceed the statsmodels ratio by less than 0.01.

    Raises:
        AssertionError: if H2O's deviance ratio is worse by 0.01 or more.
    """
    # Connect to h2o
    h2o.init(ip, port)

    print("Read in prostate data.")
    archive_path = h2o.locate("smalldata/prostate/prostate_complete.csv.zip")
    h2o_data = h2o.import_frame(path=archive_path)
    h2o_data.head()

    # Close the archive and the member handle deterministically, and use
    # ``.values`` because ``DataFrame.as_matrix()`` no longer exists in
    # pandas >= 1.0 (both return the same array).
    with zipfile.ZipFile(archive_path) as archive:
        with archive.open("prostate_complete.csv") as csv_member:
            sm_data = pd.read_csv(csv_member).values
    sm_data_response = sm_data[:, 5]
    sm_data_features = sm_data[:, [1, 2, 3, 4, 6, 7, 8, 9]]

    print("Testing for family: GAMMA")
    print("Set variables for h2o.")
    myY = "DPROS"
    myX = ["ID", "AGE", "RACE", "GLEASON", "DCAPS", "PSA", "VOL", "CAPSULE"]

    def h2o_deviance_ratio(h2o_model):
        # Residual deviance relative to null deviance, read from the model JSON.
        output = h2o_model._model_json['output']
        return output['residual_deviance'] / output['null_deviance']

    print("Create models with canonical link: INVERSE")
    h2o_model_in = h2o.glm(x=h2o_data[myX],
                           y=h2o_data[myY],
                           family="gamma",
                           link="inverse",
                           alpha=[0.5],
                           Lambda=[0],
                           n_folds=0)
    sm_model_in = sm.GLM(endog=sm_data_response,
                         exog=sm_data_features,
                         family=sm.families.Gamma(
                             sm.families.links.inverse_power)).fit()

    print("Compare model deviances for link function inverse")
    h2o_deviance_in = h2o_deviance_ratio(h2o_model_in)
    sm_deviance_in = sm_model_in.deviance / sm_model_in.null_deviance
    assert h2o_deviance_in - sm_deviance_in < 0.01, "expected h2o to have an equivalent or better deviance measures"

    print("Create models with canonical link: LOG")
    h2o_model_log = h2o.glm(x=h2o_data[myX],
                            y=h2o_data[myY],
                            family="gamma",
                            link="log",
                            alpha=[0.5],
                            Lambda=[0],
                            n_folds=0)
    sm_model_log = sm.GLM(endog=sm_data_response,
                          exog=sm_data_features,
                          family=sm.families.Gamma(
                              sm.families.links.log)).fit()

    print("Compare model deviances for link function log")
    h2o_deviance_log = h2o_deviance_ratio(h2o_model_log)
    sm_deviance_log = sm_model_log.deviance / sm_model_log.null_deviance
    assert h2o_deviance_log - sm_deviance_log < 0.01, "expected h2o to have an equivalent or better deviance measures"
def link_functions_gamma(ip,port):
    """Compare H2O and statsmodels gamma GLMs for the inverse and log links.

    Connects to H2O at ``ip``/``port`` and loads ``prostate_complete.csv``
    both as an H2O frame and, via pandas, as a plain array taken from the
    same zip archive.  For each link both libraries fit a model and the
    ratio residual deviance / null deviance is compared; H2O's ratio may be
    larger than the statsmodels ratio by less than 0.01.

    Raises:
        AssertionError: if H2O's deviance ratio is worse by 0.01 or more.
    """
    # Connect to h2o
    h2o.init(ip, port)

    print("Read in prostate data.")
    archive_path = h2o.locate("smalldata/prostate/prostate_complete.csv.zip")
    h2o_data = h2o.import_frame(path=archive_path)
    h2o_data.head()

    # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values`` gives
    # the same array.  The context managers make sure the archive and the
    # member handle are closed once the CSV has been parsed.
    with zipfile.ZipFile(archive_path) as archive:
        with archive.open("prostate_complete.csv") as csv_member:
            sm_data = pd.read_csv(csv_member).values
    sm_data_response = sm_data[:, 5]
    sm_data_features = sm_data[:, [1, 2, 3, 4, 6, 7, 8, 9]]

    print("Testing for family: GAMMA")
    print("Set variables for h2o.")
    myY = "DPROS"
    myX = ["ID", "AGE", "RACE", "GLEASON", "DCAPS", "PSA", "VOL", "CAPSULE"]

    def h2o_deviance_ratio(h2o_model):
        # Residual deviance relative to null deviance, read from the model JSON.
        output = h2o_model._model_json['output']
        return output['residual_deviance'] / output['null_deviance']

    print("Create models with canonical link: INVERSE")
    h2o_model_in = h2o.glm(x=h2o_data[myX], y=h2o_data[myY], family="gamma", link="inverse",
                           alpha=[0.5], Lambda=[0])
    sm_model_in = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                         family=sm.families.Gamma(sm.families.links.inverse_power)).fit()

    print("Compare model deviances for link function inverse")
    h2o_deviance_in = h2o_deviance_ratio(h2o_model_in)
    sm_deviance_in = sm_model_in.deviance / sm_model_in.null_deviance
    assert h2o_deviance_in - sm_deviance_in < 0.01, "expected h2o to have an equivalent or better deviance measures"

    print("Create models with canonical link: LOG")
    h2o_model_log = h2o.glm(x=h2o_data[myX], y=h2o_data[myY], family="gamma", link="log",
                            alpha=[0.5], Lambda=[0])
    sm_model_log = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                          family=sm.families.Gamma(sm.families.links.log)).fit()

    print("Compare model deviances for link function log")
    h2o_deviance_log = h2o_deviance_ratio(h2o_model_log)
    sm_deviance_log = sm_model_log.deviance / sm_model_log.null_deviance
    assert h2o_deviance_log - sm_deviance_log < 0.01, "expected h2o to have an equivalent or better deviance measures"
def shuffling_large(ip,port):
    """Check that GLM coefficients do not depend on row order (Arcene data).

    Connects to H2O at ``ip``/``port``, uploads the Arcene file and its
    shuffled counterpart, and builds three binomial lambda-search GLMs: two
    on the original frame and one on the shuffled frame.  The coefficient
    tables of the second and the shuffled model are then compared row by row
    against the first one; float entries in columns 1 and 2 must agree to
    within 5e-10 and the entry types must match.
    """
    # Connect to h2o
    h2o.init(ip, port)

    print("Reading in Arcene training data for binomial modeling.")
    original_path = h2o.locate("smalldata/arcene/shuffle_test_version/arcene.csv")
    train_data = h2o.upload_file(path=original_path)
    shuffled_path = h2o.locate("smalldata/arcene/shuffle_test_version/arcene_shuffled.csv")
    train_data_shuffled = h2o.upload_file(path=shuffled_path)

    def build_glm(frame):
        # Columns 0-999 are the predictors, column 1000 is the response.
        return h2o.glm(x=frame[0:1000], y=frame[1000], family="binomial", lambda_search=True,
                       alpha=[0.5], n_folds=0, use_all_factor_levels=True)

    def coefficient_rows(glm_model):
        return glm_model._model_json['output']['coefficients_table'].cell_values

    def assert_same_coefficients(reference, candidate):
        # zip() pairs rows positionally and stops at the shorter table.
        for ref_row, cand_row in zip(coefficient_rows(reference), coefficient_rows(candidate)):
            types_agree = (type(ref_row[1]) == type(cand_row[1])) and (type(ref_row[2]) == type(cand_row[2]))
            assert types_agree, "coefficients should be the same type"
            for column in (1, 2):
                if isinstance(ref_row[column], float):
                    assert abs(ref_row[column] - cand_row[column]) < 5e-10, "coefficients should be equal"

    print("Create model on original Arcene dataset.")
    h2o_model = build_glm(train_data)

    print("Create second model on original Arcene dataset.")
    h2o_model_2 = build_glm(train_data)

    print("Create model on shuffled Arcene dataset.")
    h2o_model_s = build_glm(train_data_shuffled)

    print("Assert that number of predictors remaining and their respective coefficients are equal.")

    assert_same_coefficients(h2o_model, h2o_model_2)
    assert_same_coefficients(h2o_model, h2o_model_s)
def covtype(ip,port):
    """Fit L2, elastic-net and L1 binomial GLMs on the covtype sample.

    Connects to H2O at ``ip``/``port``, imports ``covtype.20k.data`` and
    overwrites column 54 with the result of comparing it to a random integer
    drawn from 1..4.  Columns 0-53 except 20 and 28 serve as predictors.
    Three models are built with ``n_folds=0`` and alpha/lambda of 0/0,
    0.5/1e-4 and 1/1e-4; each is shown right after it is built.
    """
    # Connect to h2o
    h2o.init(ip, port)

    frame = h2o.import_frame(path=h2o.locate("smalldata/covtype/covtype.20k.data"))

    response_col = 54
    predictor_cols = [col for col in range(0, 54) if col != 20 and col != 28]

    # Set response to be indicator of a particular class
    res_class = random.randint(1, 4)
    frame[54] = (frame[54] == res_class)

    # (alpha, lambda): L2, elastic net, L1 in that order
    penalty_settings = ((0, 0), (0.5, 1e-4), (1, 1e-4))

    # Keep every model referenced until the function returns.
    fitted_models = []
    for alpha_value, lambda_value in penalty_settings:
        glm_model = h2o.glm(y=frame[response_col], x=frame[predictor_cols], family="binomial",
                            n_folds=0, alpha=[alpha_value], Lambda=[lambda_value])
        glm_model.show()
        fitted_models.append(glm_model)
def getLambdaModel(ip,port):
    """Run a GLM lambda search on prostate and compare per-lambda lookups.

    A family is picked at random from gaussian/binomial (the response is
    converted with ``asfactor()`` for binomial), a lambda-search GLM is built,
    and the lambda values reported by ``model.models(1).lambda_all()`` are
    printed.  Ten times over, a lambda is sampled and two models are
    requested through ``h2o.getGLMLambdaModel``; the pair must compare equal.
    Each of the two requests draws its own random sub-model index.

    ``ip`` and ``port`` are accepted but never read by this function.
    """
    print("Read data")
    prostate = h2o.import_file(path=h2o.locate("smalldata/logreg/prostate.csv"))

    myX = ["AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]
    myY = "CAPSULE"
    family = random.choice(["gaussian", "binomial"])
    print(family)

    print("Do lambda search and build models")
    if family == "gaussian":
        model = h2o.glm(x=prostate[myX], y=prostate[myY], family=family, standardize=True,
                        use_all_factor_levels=True, lambda_search=True)
    else:
        model = h2o.glm(x=prostate[myX], y=prostate[myY].asfactor(), family=family, standardize=True,
                        use_all_factor_levels=True, lambda_search=True)

    print("the models were built over the following lambda values: ")
    all_lambdas = model.models(1).lambda_all()
    print(all_lambdas)

    for _ in range(10):
        # random.sample(..., 1) yields a one-element list, not a bare value.
        Lambda = random.sample(all_lambdas, 1)
        print("For Lambda we get this model:")
        # Fixed: the closing parenthesis of len() used to sit after "-1",
        # which subtracted 1 from the model collection instead of its length.
        # NOTE(review): Lambda is still passed to model.models(), exactly as
        # before; confirm it was not meant for getGLMLambdaModel() instead.
        m1 = h2o.getGLMLambdaModel(
            model.models(random.randint(0, len(model.models()) - 1), Lambda=Lambda))
        m1.show()
        print("this model should be same as the one above:")
        m2 = h2o.getGLMLambdaModel(
            model.models(random.randint(0, len(model.models()) - 1), Lambda=Lambda))
        m2.show()
        assert m1 == m2, "expected models to be equal"
def lambda_search(ip,port):
    """Exercise per-lambda model retrieval with and without lambda search.

    Connects to H2O at ``ip``/``port`` and imports the prostate CSV.  A
    binomial GLM built with ``lambda_search=False`` must raise
    ``EnvironmentError`` when asked for the model at lambda 0.5.  A second
    GLM built with ``lambda_search=True`` must return, for a randomly chosen
    lambda from ``lambda_all()``, a model reporting that same lambda, and the
    model fetched for ``bestlambda()`` must equal the search model itself.
    """
    # Connect to h2o
    h2o.init(ip, port)

    prostate = h2o.import_frame(h2o.locate("smalldata/logreg/prostate.csv"))

    def build_binomial_glm(search_enabled):
        # Both builds share every argument except the lambda_search flag.
        return h2o.glm(x=prostate[2:9], y=prostate[1], training_frame=prostate.hex,
                       family="binomial", nlambdas=5, lambda_search=search_enabled, n_folds=2)

    # GLM without lambda search: nlambdas = 5, nfolds = 2
    prostate_nosearch = build_binomial_glm(False)
    # The parameters are fetched but not inspected any further.
    prostate_nosearch.params()

    try:
        prostate_nosearch.getGLMLambdaModel(0.5)
    except EnvironmentError:
        # Expected: nothing to look up when no search was run.
        pass
    else:
        assert False, "expected an error"

    # GLM with lambda search: nlambdas = 5, nfolds = 2
    prostate_search = build_binomial_glm(True)
    params_search = prostate_search.params()

    # Retrieve the model for a randomly chosen lambda.
    random_lambda = random.choice(prostate_search.lambda_all())
    random_model = prostate_search.getGLMLambdaModel(random_lambda)
    assert random_model.getLambda() == random_lambda, "expected equal lambdas"

    # Retrieve the model for the best lambda.
    best_lambda = params_search.bestlambda()
    best_model = prostate_search.getGLMLambdaModel(best_lambda)
    assert best_model.model() == prostate_search.model(), "expected models to be equal"
Пример #11
0
def covtype(ip,port):
    """Fit L2, elastic-net and L1 binomial GLMs on the covtype sample.

    Imports ``covtype.20k.data`` and overwrites column 54 with the result of
    comparing it to a random integer drawn from 1..4.  Columns 0-53 except 20
    and 28 serve as predictors.  Three models are built with alpha/lambda of
    0/0, 0.5/1e-4 and 1/1e-4; each is shown right after it is built.

    ``ip`` and ``port`` are accepted but never read by this function.
    """
    frame = h2o.import_file(path=h2o.locate("smalldata/covtype/covtype.20k.data"))

    response_col = 54
    predictor_cols = [col for col in range(0, 54) if col != 20 and col != 28]

    # Set response to be indicator of a particular class
    res_class = random.randint(1, 4)
    frame[54] = (frame[54] == res_class)

    # (alpha, lambda): L2, elastic net, L1 in that order
    penalty_settings = ((0, 0), (0.5, 1e-4), (1, 1e-4))

    # Keep every model referenced until the function returns.
    fitted_models = []
    for alpha_value, lambda_value in penalty_settings:
        glm_model = h2o.glm(y=frame[response_col], x=frame[predictor_cols], family="binomial",
                            alpha=[alpha_value], Lambda=[lambda_value])
        glm_model.show()
        fitted_models.append(glm_model)
def shuffling_large(ip,port):
    """Check that GLM coefficients do not depend on row order (Arcene data).

    Connects to H2O at ``ip``/``port``, uploads the Arcene file and its
    shuffled counterpart, and builds three binomial lambda-search GLMs: two
    on the original frame and one on the shuffled frame.  The coefficient
    tables of the second and the shuffled model are then compared row by row
    against the first one; float entries in columns 1 and 2 must agree to
    within 5e-10 and the entry types must match.
    """
    # Connect to h2o
    h2o.init(ip, port)

    print("Reading in Arcene training data for binomial modeling.")
    original_path = h2o.locate("smalldata/arcene/shuffle_test_version/arcene.csv")
    train_data = h2o.upload_file(path=original_path)
    shuffled_path = h2o.locate("smalldata/arcene/shuffle_test_version/arcene_shuffled.csv")
    train_data_shuffled = h2o.upload_file(path=shuffled_path)

    def build_glm(frame):
        # Columns 0-999 are the predictors, column 1000 is the response.
        return h2o.glm(x=frame[0:1000], y=frame[1000], family="binomial", lambda_search=True,
                       alpha=[0.5], use_all_factor_levels=True)

    def coefficient_rows(glm_model):
        return glm_model._model_json['output']['coefficients_table'].cell_values

    def assert_same_coefficients(reference, candidate):
        # zip() pairs rows positionally and stops at the shorter table.
        for ref_row, cand_row in zip(coefficient_rows(reference), coefficient_rows(candidate)):
            types_agree = (type(ref_row[1]) == type(cand_row[1])) and (type(ref_row[2]) == type(cand_row[2]))
            assert types_agree, "coefficients should be the same type"
            for column in (1, 2):
                if isinstance(ref_row[column], float):
                    assert abs(ref_row[column] - cand_row[column]) < 5e-10, "coefficients should be equal"

    print("Create model on original Arcene dataset.")
    h2o_model = build_glm(train_data)

    print("Create second model on original Arcene dataset.")
    h2o_model_2 = build_glm(train_data)

    print("Create model on shuffled Arcene dataset.")
    h2o_model_s = build_glm(train_data_shuffled)

    print("Assert that number of predictors remaining and their respective coefficients are equal.")

    assert_same_coefficients(h2o_model, h2o_model_2)
    assert_same_coefficients(h2o_model, h2o_model_s)
Пример #13
0
def covtype_get_model():
    """Fit three binomial GLMs on covtype and fetch each one back by id.

    Imports ``covtype.20k.data`` (located via ``pyunit_utils``) and
    overwrites column 54 with the result of comparing it to a random integer
    drawn from 1..4.  Using columns 0-19 and 29-53 as predictors, models are
    built with alpha/lambda of 0/0, 0.5/1e-4 and 1/1e-4.  Each model is
    shown, looked up again with ``h2o.get_model`` through its ``_id`` and
    shown a second time.
    """
    frame = h2o.import_file(path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))

    response_col = 54
    predictor_cols = list(range(0, 20)) + list(range(29, 54))

    # Set response to be indicator of a particular class
    res_class = random.randint(1, 4)
    frame[54] = (frame[54] == res_class)

    # (alpha, lambda): L2, elastic net, L1 in that order
    penalty_settings = ((0, 0), (0.5, 1e-4), (1, 1e-4))

    # Keep the re-fetched models referenced until the function returns.
    fetched_models = []
    for alpha_value, lambda_value in penalty_settings:
        glm_model = h2o.glm(y=frame[response_col], x=frame[predictor_cols], family="binomial",
                            alpha=[alpha_value], Lambda=[lambda_value])
        glm_model.show()
        # Round-trip through the cluster: retrieve the same model by its id.
        glm_model = h2o.get_model(glm_model._id)
        glm_model.show()
        fetched_models.append(glm_model)
def link_functions_poisson():
    """Compare H2O and statsmodels poisson GLMs for the log and identity links.

    Loads ``prostate_complete.csv`` both as an H2O frame and, via pandas, as
    a plain array taken from the same zip archive.  For each link a model is
    fitted with both libraries and the ratio residual deviance / null
    deviance is compared; H2O's ratio may exceed the statsmodels ratio by
    less than 0.01.

    Raises:
        AssertionError: if H2O's deviance ratio is worse by 0.01 or more.
    """
    print("Read in prostate data.")
    archive_path = pyunit_utils.locate(
        "smalldata/prostate/prostate_complete.csv.zip")
    h2o_data = h2o.import_file(path=archive_path)

    # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values`` gives
    # the same array.  The context managers make sure the archive and the
    # member handle are closed once the CSV has been parsed.
    with zipfile.ZipFile(archive_path) as archive:
        with archive.open("prostate_complete.csv") as csv_member:
            sm_data = pd.read_csv(csv_member).values
    sm_data_response = sm_data[:, 9]
    sm_data_features = sm_data[:, 1:9]

    print("Testing for family: POISSON")
    print("Set variables for h2o.")
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    print("Create h2o model with canonical link: LOG")
    h2o_model_log = h2o.glm(x=h2o_data[myX],
                            y=h2o_data[myY],
                            family="poisson",
                            link="log",
                            alpha=[0.5],
                            Lambda=[0])

    print("Create statsmodel model with canonical link: LOG")
    sm_model_log = sm.GLM(endog=sm_data_response,
                          exog=sm_data_features,
                          family=sm.families.Poisson(
                              sm.families.links.log)).fit()

    print("Compare model deviances for link function log")
    h2o_deviance_log = old_div(h2o_model_log.residual_deviance(),
                               h2o_model_log.null_deviance())
    sm_deviance_log = old_div(sm_model_log.deviance,
                              sm_model_log.null_deviance)
    assert h2o_deviance_log - sm_deviance_log < 0.01, "expected h2o to have an equivalent or better deviance measures"

    print("Create h2o models with link: IDENTITY")
    h2o_model_id = h2o.glm(x=h2o_data[myX],
                           y=h2o_data[myY],
                           family="poisson",
                           link="identity",
                           alpha=[0.5],
                           Lambda=[0])

    print("Create statsmodel models with link: IDENTITY")
    sm_model_id = sm.GLM(endog=sm_data_response,
                         exog=sm_data_features,
                         family=sm.families.Poisson(
                             sm.families.links.identity)).fit()

    print("Compare model deviances for link function identity")
    h2o_deviance_id = old_div(h2o_model_id.residual_deviance(),
                              h2o_model_id.null_deviance())
    sm_deviance_id = old_div(sm_model_id.deviance, sm_model_id.null_deviance)
    assert h2o_deviance_id - sm_deviance_id < 0.01, "expected h2o to have an equivalent or better deviance measures"
  def attack(family, train, valid, x, y):
    """Build one GLM for ``family`` with a randomly drawn parameter set.

    Optional GLM arguments (iterations, beta epsilon, solver, standardize,
    link, alpha, prior, lambda search, nlambdas, beta constraints) are each
    included or left out at random.  The chosen settings are printed, then
    ``h2o.glm`` is called on ``train[x]``/``train[y]``, with
    ``valid[x]``/``valid[y]`` as validation data when a coin flip says so.

    Status output uses single-argument ``print(...)`` calls, which parse and
    print identically on Python 2 and Python 3.
    """
    kwargs = {}
    kwargs['family'] = family
    gaussian_links = ["inverse", "log", "identity"]
    binomial_links = ["logit"]
    poisson_links = ["log", "identity"]
    gamma_links = ["inverse", "log", "identity"]

    # randomly select parameters and their corresponding values
    if random.randint(0, 1): kwargs['max_iterations'] = random.randint(1, 50)
    if random.random() > 0.8: kwargs['beta_epsilon'] = random.random()
    # NOTE(review): the index is drawn from {0, 1}, so only "AUTO" and "IRLSM"
    # can ever be selected; confirm whether the other solvers were meant to be
    # reachable before widening the draw.
    if random.randint(0, 1): kwargs['solver'] = ["AUTO", "IRLSM", "L_BFGS", "COORDINATE_DESCENT_NAIVE",
                                                 "COORDINATE_DESCENT"][random.randint(0, 1)]
    if random.randint(0, 1): kwargs['standardize'] = [True, False][random.randint(0, 1)]
    if random.randint(0, 1):
      if   family == "gaussian": kwargs['link'] = gaussian_links[random.randint(0, 2)]
      elif family == "binomial": kwargs['link'] = binomial_links[random.randint(0, 0)]
      elif family == "poisson" : kwargs['link'] = poisson_links[random.randint(0, 1)]
      elif family == "gamma"   : kwargs['link'] = gamma_links[random.randint(0, 2)]
    if random.randint(0, 1): kwargs['alpha'] = [random.random()]
    if family == "binomial":
      if random.randint(0, 1): kwargs['prior'] = random.random()
    if random.randint(0, 1): kwargs['lambda_search'] = [True, False][random.randint(0, 1)]
    if 'lambda_search' in kwargs:
      if random.randint(0, 1): kwargs['nlambdas'] = random.randint(2, 10)
    do_validation = [True, False][random.randint(0, 1)]
    # beta constraints
    if random.randint(0, 1):
      bc = []
      for n in x:
        if train[n].isnumeric():
          name = train.names[n]
          lower_bound = random.uniform(-1, 1)
          # upper bound is lower bound plus a draw from random.random()
          upper_bound = lower_bound + random.random()
          bc.append([name, lower_bound, upper_bound])
      if len(bc) > 0:
        beta_constraints = h2o.H2OFrame(bc)
        beta_constraints.set_names(['names', 'lower_bounds', 'upper_bounds'])
        kwargs['beta_constraints'] = beta_constraints.frame_id

    # display the parameters and their corresponding values
    print("-----------------------")
    print("x: {0}".format(x))
    print("y: {0}".format(y))
    print("validation: {0}".format(do_validation))
    for k, v in kwargs.items():
      if k == 'beta_constraints':
        # ``beta_constraints`` is bound whenever this key is present.
        print(k + ": ")
        beta_constraints.show()
      else:
        print(k + ": {0}".format(v))
    if do_validation:
      # NOTE(review): the estimator instance is never used afterwards; the
      # constructor call is kept in case it validates kwargs.
      H2OGeneralizedLinearEstimator(**kwargs)
      h2o.glm(x=train[x], y=train[y], validation_x=valid[x], validation_y=valid[y], **kwargs)
    else:
      h2o.glm(x=train[x], y=train[y], **kwargs)
    print("-----------------------")
def offset_1897():
    """Compare GLM residual deviances, with and without an offset, to fixed values.

    Imports the prostate CSV and fits four GLMs with ``standardize=False``
    and ``CAPSULE`` as the response: binomial and poisson, each once on the
    six base predictors and once with ``AGE`` added to the predictors and
    used as ``offset_column``.  Every residual deviance must lie within 0.1
    of the hard-coded reference value that the log output labels as R's.

    Status output uses single-argument ``print(...)`` calls, which parse and
    print identically on Python 2 and Python 3.

    Raises:
        AssertionError: if a residual deviance is 0.1 or more away from its
            reference value.
    """
    print("Checking binomial models for GLM with and without offset")
    print("Import prostate dataset into H2O and R...")
    prostate_hex = h2o.import_file(tests.locate("smalldata/prostate/prostate.csv"))

    base_predictors = ["RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]

    # (family, use AGE as offset?, reference residual deviance)
    cases = [
        ("binomial", False, 379.053509501537),
        ("binomial", True, 1515.91815848623),
        ("poisson", False, 216.339989007507),
        ("poisson", True, 2761.76218461138),
    ]

    for family, with_offset, r_deviance in cases:
        # The family is named in the message; the poisson runs used to be
        # announced as "binomial".
        print("Checking {0} model {1} offset...".format(
            family, "with" if with_offset else "without"))

        if with_offset:
            # AGE is passed as a predictor column and named as the offset.
            predictors = base_predictors + ["AGE"]
            offset_args = {"offset_column": "AGE"}
        else:
            predictors = base_predictors
            offset_args = {}

        prostate_glm_h2o = h2o.glm(
            x=prostate_hex[predictors],
            y=prostate_hex["CAPSULE"],
            training_frame=prostate_hex,
            family=family,
            standardize=False,
            **offset_args
        )
        h2o_deviance = prostate_glm_h2o.residual_deviance()
        print("h2o residual: {0}".format(h2o_deviance))
        print("r residual: {0}".format(r_deviance))
        assert abs(r_deviance - h2o_deviance) < 0.1, \
            "expected residual deviance within 0.1 of {0}, got {1}".format(r_deviance, h2o_deviance)
Пример #17
0
    def attack(family, train, valid, x, y):
        """Build one GLM for ``family`` with a randomly drawn parameter set.

        Optional GLM arguments (iterations, beta epsilon, solver,
        standardize, link, alpha, prior, lambda search, nlambdas, beta
        constraints) are each included or left out at random.  The chosen
        settings are printed, then ``h2o.glm`` is called on
        ``train[x]``/``train[y]``, with ``valid[x]``/``valid[y]`` as
        validation data when a coin flip says so.
        """
        links_by_family = {
            "gaussian": ["inverse", "log", "identity"],
            "binomial": ["logit"],
            "poisson": ["log", "identity"],
            "gamma": ["inverse", "log", "identity"],
        }
        solver_names = ["AUTO", "IRLSM", "L_BFGS", "COORDINATE_DESCENT_NAIVE",
                        "COORDINATE_DESCENT"]
        on_off = [True, False]

        kwargs = {'family': family}

        # The random draws below happen in a fixed order so that a seeded
        # run always ends up with the same parameter set.
        if random.randint(0, 1):
            kwargs['max_iterations'] = random.randint(1, 50)
        if random.random() > 0.8:
            kwargs['beta_epsilon'] = random.random()
        if random.randint(0, 1):
            # The index is drawn from {0, 1}: only the first two names are reachable.
            kwargs['solver'] = solver_names[random.randint(0, 1)]
        if random.randint(0, 1):
            kwargs['standardize'] = on_off[random.randint(0, 1)]
        if random.randint(0, 1):
            candidate_links = links_by_family.get(family)
            if candidate_links is not None:
                kwargs['link'] = candidate_links[random.randint(0, len(candidate_links) - 1)]
        if random.randint(0, 1):
            kwargs['alpha'] = [random.random()]
        if family == "binomial" and random.randint(0, 1):
            kwargs['prior'] = random.random()
        if random.randint(0, 1):
            kwargs['lambda_search'] = on_off[random.randint(0, 1)]
        if 'lambda_search' in kwargs and random.randint(0, 1):
            kwargs['nlambdas'] = random.randint(2, 10)
        do_validation = on_off[random.randint(0, 1)]

        # beta constraints: one [name, lower, upper] row per numeric predictor
        if random.randint(0, 1):
            constraint_rows = []
            for col in x:
                if not train[col].isnumeric():
                    continue
                col_name = train.names[col]
                lower_bound = random.uniform(-1, 1)
                upper_bound = lower_bound + random.random()
                constraint_rows.append([col_name, lower_bound, upper_bound])
            if constraint_rows:
                beta_constraints = h2o.H2OFrame(constraint_rows)
                beta_constraints.set_names(['names', 'lower_bounds', 'upper_bounds'])
                kwargs['beta_constraints'] = beta_constraints.frame_id

        # display the parameters and their corresponding values
        print("-----------------------")
        print("x: {0}".format(x))
        print("y: {0}".format(y))
        print("validation: {0}".format(do_validation))
        for param_name, param_value in kwargs.items():
            if param_name == 'beta_constraints':
                # Show the constraint frame itself rather than just its id.
                print(param_name + ": ")
                beta_constraints.show()
            else:
                print(param_name + ": {0}".format(param_value))

        if do_validation:
            h2o.glm(x=train[x], y=train[y], validation_x=valid[x], validation_y=valid[y], **kwargs)
        else:
            h2o.glm(x=train[x], y=train[y], **kwargs)
        print("-----------------------")
Пример #18
0
    def check_same(data1, data2):
        """Assert that GLMs on ``data1`` match weighted GLMs on ``data2``.

        Builds a regression and a binomial GLM on ``data1``, and the same
        two kinds of model on ``data2`` with ``weights_column="weights"``
        (``data2`` is sliced one column further).  The regression MSEs and
        the binomial AUCs must each agree to within 1e-6.
        """
        # All four models are built before anything is compared.
        plain_regression = h2o.glm(x=data1[2:20], y=data1[1])
        weighted_regression = h2o.glm(x=data2[2:21], y=data2[1], weights_column="weights")
        plain_binomial = h2o.glm(x=data1[1:20], y=data1[0], family="binomial")
        weighted_binomial = h2o.glm(x=data2[1:21], y=data2[0], weights_column="weights", family="binomial")

        plain_mse = plain_regression.mse()
        weighted_mse = weighted_regression.mse()
        mse_message = "Expected mse's to be the same, but got {0}, and {1}".format(plain_mse, weighted_mse)
        assert abs(plain_mse - weighted_mse) < 1e-6, mse_message

        plain_auc = plain_binomial.auc()
        weighted_auc = weighted_binomial.auc()
        auc_message = "Expected auc's to be the same, but got {0}, and {1}".format(plain_auc, weighted_auc)
        assert abs(plain_auc - weighted_auc) < 1e-6, auc_message
def offset_1897():
    """Compare GLM residual deviances, with and without an offset, to fixed values.

    Imports the prostate CSV (located via ``pyunit_utils``) and fits four
    GLMs with ``standardize=False`` and ``CAPSULE`` as the response: binomial
    and poisson, each once on the six base predictors and once with ``AGE``
    added to the predictors and used as ``offset_column``.  Every residual
    deviance must lie within 0.1 of the hard-coded reference value that the
    log output labels as R's.

    Status output uses single-argument ``print(...)`` calls, which parse and
    print identically on Python 2 and Python 3.

    Raises:
        AssertionError: if a residual deviance is 0.1 or more away from its
            reference value.
    """
    print('Checking binomial models for GLM with and without offset')
    print('Import prostate dataset into H2O and R...')
    prostate_hex = h2o.import_file(
        pyunit_utils.locate("smalldata/prostate/prostate.csv"))

    base_predictors = ["RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]

    # (family, use AGE as offset?, reference residual deviance)
    cases = [
        ("binomial", False, 379.053509501537),
        ("binomial", True, 1515.91815848623),
        ("poisson", False, 216.339989007507),
        ("poisson", True, 2761.76218461138),
    ]

    for family, with_offset, r_deviance in cases:
        # The family is named in the message; the poisson runs used to be
        # announced as "binomial".
        print("Checking {0} model {1} offset...".format(
            family, "with" if with_offset else "without"))

        if with_offset:
            # AGE is passed as a predictor column and named as the offset.
            predictors = base_predictors + ["AGE"]
            offset_args = {"offset_column": "AGE"}
        else:
            predictors = base_predictors
            offset_args = {}

        prostate_glm_h2o = h2o.glm(
            x=prostate_hex[predictors],
            y=prostate_hex["CAPSULE"],
            training_frame=prostate_hex,
            family=family,
            standardize=False,
            **offset_args
        )
        h2o_deviance = prostate_glm_h2o.residual_deviance()
        print("h2o residual: {0}".format(h2o_deviance))
        print("r residual: {0}".format(r_deviance))
        assert abs(r_deviance - h2o_deviance) < 0.1, \
            "expected residual deviance within 0.1 of {0}, got {1}".format(r_deviance, h2o_deviance)
def link_correct_default():
    """Check that omitting ``link`` gives the same GLM as passing it explicitly.

    For the gaussian, binomial, poisson and gamma families, two models are
    built on the prostate data -- one without a ``link`` argument and one
    with the link named -- and their coefficient tables must be equal.
    """
    print("Reading in original prostate data.")
    frame = h2o.upload_file(path=tests.locate("smalldata/prostate/prostate.csv.zip"))

    print("Compare models with link unspecified and canonical link specified.")
    # (banner, predictor column slice, response column index, family, link)
    cases = [
        ("GAUSSIAN: ", slice(1, 8), 8, "gaussian", "identity"),
        ("BINOMIAL: ", slice(2, 9), 1, "binomial", "logit"),
        ("POISSON: ", slice(2, 9), 1, "poisson", "log"),
        ("GAMMA: ", slice(3, 9), 2, "gamma", "inverse"),
    ]
    for banner, x_cols, y_col, family, link in cases:
        print(banner)
        implicit = h2o.glm(x=frame[x_cols], y=frame[y_col], family=family)
        explicit = h2o.glm(x=frame[x_cols], y=frame[y_col], family=family, link=link)
        explicit_coefs = explicit._model_json['output']['coefficients_table'].cell_values
        implicit_coefs = implicit._model_json['output']['coefficients_table'].cell_values
        assert explicit_coefs == implicit_coefs, "coefficient should be equal"
Пример #21
0
    def check_same(data1, data2):
        """Assert that GLMs built on ``data1`` and on weighted ``data2`` agree.

        A regression and a binomial GLM are built on each frame; the ``data2``
        models pass "weights" as ``weights_column``. The regression MSEs and
        the binomial null/residual deviances must match to within 1e-6.
        """
        tolerance = 1e-6

        plain_reg = h2o.glm(x=data1[2:20], y=data1[1])
        weighted_reg = h2o.glm(x=data2[2:21], y=data2[1],
                               weights_column="weights", training_frame=data2)
        plain_bin = h2o.glm(x=data1[1:20], y=data1[0], family="binomial")
        weighted_bin = h2o.glm(x=data2[1:21], y=data2[0], weights_column="weights",
                               family="binomial", training_frame=data2)

        mse_gap = abs(plain_reg.mse() - weighted_reg.mse())
        assert mse_gap < tolerance, \
            "Expected mse's to be the same, but got {0}, and {1}".format(
                plain_reg.mse(), weighted_reg.mse())

        null_gap = abs(plain_bin.null_deviance() - weighted_bin.null_deviance())
        assert null_gap < tolerance, \
            "Expected null deviances to be the same, but got {0}, and {1}".format(
                plain_bin.null_deviance(), weighted_bin.null_deviance())

        resid_gap = abs(plain_bin.residual_deviance() - weighted_bin.residual_deviance())
        assert resid_gap < tolerance, \
            "Expected residual deviances to be the same, but got {0}, and {1}".format(
                plain_bin.residual_deviance(), weighted_bin.residual_deviance())
    def check_same(data1, data2):
        """Compare unweighted GLMs on ``data1`` with weighted GLMs on ``data2``.

        Builds one regression and one binomial model per frame (the ``data2``
        models use ``weights_column="weights"``) and asserts that the MSE, the
        null deviance and the residual deviance each differ by less than 1e-6.
        """
        reg_a = h2o.glm(x=data1[2:20], y=data1[1])
        reg_b = h2o.glm(x=data2[2:21], y=data2[1], weights_column="weights",
                        training_frame=data2)
        bin_a = h2o.glm(x=data1[1:20], y=data1[0], family="binomial")
        bin_b = h2o.glm(x=data2[1:21], y=data2[0], weights_column="weights",
                        family="binomial", training_frame=data2)

        eps = 1e-6

        assert abs(reg_a.mse() - reg_b.mse()) < eps, (
            "Expected mse's to be the same, but got {0}, and {1}"
            .format(reg_a.mse(), reg_b.mse()))
        assert abs(bin_a.null_deviance() - bin_b.null_deviance()) < eps, (
            "Expected null deviances to be the same, but got {0}, and {1}"
            .format(bin_a.null_deviance(), bin_b.null_deviance()))
        assert abs(bin_a.residual_deviance() - bin_b.residual_deviance()) < eps, (
            "Expected residual deviances to be the same, but got {0}, and {1}"
            .format(bin_a.residual_deviance(), bin_b.residual_deviance()))
Пример #23
0
def pyunit_make_glm_model():
    """Build a gaussian GLM on the prostate data and post replacement betas.

    The coefficient names and values in ``new_betas`` are rendered as
    bracketed, comma-separated strings and sent together with the model id
    through ``h2o.H2OConnection.post_json("MakeGLMModel", ...)``. The response
    is not inspected.
    """
    # TODO: PUBDEV-1717
    pros = h2o.import_file(tests.locate("smalldata/prostate/prostate.csv"))
    model = h2o.glm(x=pros[["AGE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]],
                    y=pros["CAPSULE"],
                    family="gaussian",
                    alpha=[0])
    new_betas = {
        "AGE": 0.5,
        "DPROS": 0.5,
        "DCAPS": 0.5,
        "PSA": 0.5,
        "VOL": 0.5,
        "GLEASON": 0.5
    }

    # Walk the dict once so every name stays paired with its own value, and
    # let str.join place the separators instead of trimming a trailing comma.
    pairs = list(new_betas.items())
    names = "[" + ",".join("\"" + name + "\"" for name, _ in pairs) + "]"
    betas = "[" + ",".join(str(beta) for _, beta in pairs) + "]"

    h2o.H2OConnection.post_json("MakeGLMModel",
                                model=model._id,
                                names=names,
                                beta=betas)
def link_functions_tweedie_basic(ip, port):
    """Fit a tweedie-family, tweedie-link GLM and check its deviance ratio.

    The model's residual/null deviance ratio must not be more than 0.01 below
    the hard-coded reference ratio 0.721452.

    :param ip: not used inside this function.
    :param port: not used inside this function.
    """
    # print is written in call form so this parses on Python 2 and Python 3.
    print("Read in prostate data.")
    hdf = h2o.upload_file(
        h2o.locate("smalldata/prostate/prostate_complete.csv.zip"))

    print("Testing for family: TWEEDIE")
    print("Set variables for h2o.")
    y = "CAPSULE"
    x = ["AGE", "RACE", "DCAPS", "PSA", "VOL", "DPROS", "GLEASON"]

    print("Create models with canonical link: TWEEDIE")
    model_h2o_tweedie = h2o.glm(x=hdf[x],
                                y=hdf[y],
                                family="tweedie",
                                link="tweedie",
                                alpha=[0.5],
                                Lambda=[0])

    print("Compare model deviances for link function tweedie (using precomputed values from R)")
    deviance_h2o_tweedie = model_h2o_tweedie.residual_deviance(
    ) / model_h2o_tweedie.null_deviance()

    assert 0.721452 - deviance_h2o_tweedie <= 0.01, "h2o's residual/null deviance is more than 0.01 lower than R's. h2o: " \
                                                    "{0}, r: {1}".format(deviance_h2o_tweedie, 0.721452)
def perfectSeparation_balanced(ip, port):
    """Fit a binomial GLM on the balanced perfect-separation data.

    After fitting, every coefficient except the "Intercept" row must be
    below 50.

    :param ip: passed to ``h2o.init``.
    :param port: passed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip, port)

    print("Read in synthetic balanced dataset")
    csv_path = h2o.locate("smalldata/synthetic_perfect_separation/balanced.csv")
    data = h2o.import_frame(path=csv_path)

    print("Fit model on dataset")
    model = h2o.glm(
        x=data[["x1", "x2"]], y=data["y"], family="binomial",
        lambda_search=True, use_all_factor_levels=True,
        alpha=[0.5], Lambda=[0])

    print("Extract models' coefficients and assert reasonable values (ie. no greater than 50)")
    print("Balanced dataset")
    table_rows = model._model_json['output']['coefficients_table'].cell_values
    # Column 0 of each row is the coefficient name, column 1 its value.
    coef = [row[1] for row in table_rows if row[0] != "Intercept"]
    assert all(value < 50 for value in coef), "coefficient is too large"
def link_functions_tweedie_vpow(ip,port):
    """Fit tweedie GLMs for several variance powers and compare against R.

    For each variance power in ``vpower`` (link power is ``1 - vpow``) the
    residual/null deviance ratio must not be more than 0.01 below the
    matching ``r_dev`` entry, and the null deviance must equal the matching
    ``r_null`` entry to within 1e-6.

    :param ip: passed to ``h2o.init``.
    :param port: passed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip,port)

    # Load example data from HDtweedie, y = aggregate claim loss
    hdf = h2o.upload_file(h2o.locate("smalldata/glm_test/auto.csv"))
    y = "y"
    x = list(set(hdf.names()) - set(["y"]))

    # print is written in call form so this parses on Python 2 and Python 3.
    print("Testing for family: TWEEDIE")
    print("Create models with canonical link: TWEEDIE")
    # Iterate over different variance powers for tweedie
    vpower = [0, 1, 1.5]
    r_dev = [0.7516627, 0.6708826, 0.7733762]
    r_null = [221051.88369951, 32296.29783702, 20229.47425307]
    for ridx, vpow in enumerate(vpower):
        print("Fit h2o.glm:")
        h2ofit = h2o.glm(x=hdf[x], y=hdf[y], family="tweedie", link="tweedie", tweedie_variance_power=vpow, tweedie_link_power=1-vpow,
                         alpha=[0.5], Lambda=[0])

        print("Testing Tweedie variance power: {0}".format(vpow))

        print("Compare model deviances for link function tweedie")
        deviance_h2o_tweedie = h2ofit.residual_deviance() / h2ofit.null_deviance()

        assert r_dev[ridx] - deviance_h2o_tweedie <= 0.01, "h2o's residual/null deviance is more than 0.01 lower than " \
                                                           "R's. h2o: {0}, r: {1}".format(deviance_h2o_tweedie, r_dev[ridx])

        print("compare null and residual deviance between R glm and h2o.glm for tweedie")
        assert abs(r_null[ridx] - h2ofit.null_deviance()) < 1e-6, "h2o's null deviance is not equal to R's. h2o: {0}, r: " \
                                                                   "{1}".format(h2ofit.null_deviance(), r_null[ridx])
Пример #27
0
def pubdev_1953():
    """Build a poisson GLM on the small citibike train/test CSVs.

    "bikes" is the response and ``predictors`` names the feature columns; the
    test frame is supplied as validation data. Nothing is asserted, so the
    function only checks that the build completes without raising.
    """
    # NOTE(review): the commented-out snippet below is kept for reference; it
    # appears to show how equivalent frames were built from the full citibike
    # and weather data -- confirm before relying on it.
    # small_test = [h2o.locate("bigdata/laptop/citibike-nyc/2013-10.csv")]
    # data = h2o.import_file(path=small_test)
    # startime = data["starttime"]
    # secsPerDay=1000*60*60*24
    # data["Days"] = (startime/secsPerDay).floor()
    # grouped = data.group_by(["Days","start station name"])
    # bpd = grouped.count(name="bikes").get_frame()
    # secs = bpd["Days"]*secsPerDay
    # bpd["Month"]     = secs.month().asfactor()
    # bpd["DayOfWeek"] = secs.dayOfWeek()
    # wthr1 = h2o.import_file(path=[h2o.locate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv"), h2o.locate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv")])
    # wthr2 = wthr1[["Year Local","Month Local","Day Local","Hour Local","Dew Point (C)","Humidity Fraction","Precipitation One Hour (mm)","Temperature (C)","Weather Code 1/ Description"]]
    # wthr2.set_name(wthr2.index("Precipitation One Hour (mm)"), "Rain (mm)")
    # wthr2.set_name(wthr2.index("Weather Code 1/ Description"), "WC1")
    # wthr3 = wthr2[ wthr2["Hour Local"]==12 ]
    # wthr3["msec"] = h2o.H2OFrame.mktime(year=wthr3["Year Local"], month=wthr3["Month Local"]-1, day=wthr3["Day Local"]-1, hour=wthr3["Hour Local"])
    # secsPerDay=1000*60*60*24
    # wthr3["Days"] = (wthr3["msec"]/secsPerDay).floor()
    # wthr4 = wthr3.drop("Year Local").drop("Month Local").drop("Day Local").drop("Hour Local").drop("msec")
    # rain = wthr4["Rain (mm)"]
    # rain[ rain.isna() ] = 0
    # bpd_with_weather = bpd.merge(wthr4,allLeft=True,allRite=False)
    # r = bpd_with_weather['Days'].runif(seed=356964763)
    # train = bpd_with_weather[  r  < 0.6]
    # test  = bpd_with_weather[(0.6 <= r) & (r < 0.9)]

    response = "bikes"
    predictors = [
        'DayOfWeek', 'WC1', 'start station name', 'Temperature (C)', 'Days',
        'Month', 'Humidity Fraction', 'Rain (mm)', 'Dew Point (C)',
    ]

    train_csv = h2o.locate("smalldata/glm_test/citibike_small_train.csv")
    train = h2o.import_file(train_csv)
    test_csv = h2o.locate("smalldata/glm_test/citibike_small_test.csv")
    test = h2o.import_file(test_csv)

    glm0 = h2o.glm(
        x=train[predictors],
        y=train[response],
        validation_x=test[predictors],
        validation_y=test[response],
        family="poisson",
    )
def wide_dataset_large():
    """Train a binomial GLM on the wide Arcene data and check validation AUC.

    The model is fit on columns 1:3250 of the training frame with column 0
    (as a factor) as the response; its AUC on the validation frame must
    exceed 0.5.
    """
    def _arcene_frame(labels_file, features_file):
        # Labels equal to -1 are recoded to 0 (all others to 1) and placed in
        # front of the feature columns.
        labels = np.genfromtxt(tests.locate(labels_file), delimiter=' ')
        labels = np.where(labels == -1, 0, 1)
        features = np.genfromtxt(tests.locate(features_file), delimiter=' ')
        return h2o.H2OFrame(np.column_stack((labels, features)).tolist())

    print("Reading in Arcene training data for binomial modeling.")
    train_frame = _arcene_frame("smalldata/arcene/arcene_train_labels.labels",
                                "smalldata/arcene/arcene_train.data")

    print("Run model on 3250 columns of Arcene with strong rules off.")
    model = h2o.glm(x=train_frame[1:3250], y=train_frame[0].asfactor(),
                    family="binomial", lambda_search=False, alpha=[1])

    print("Test model on validation set.")
    valid_frame = _arcene_frame("smalldata/arcene/arcene_valid_labels.labels",
                                "smalldata/arcene/arcene_valid.data")
    prediction = model.predict(valid_frame)

    print("Check performance of predictions.")
    performance = model.model_performance(valid_frame)

    print("Check that prediction AUC better than guessing (0.5).")
    assert performance.auc() > 0.5, "predictions should be better then pure chance"
def prostate(ip, port):
    """Compare the null deviance of an h2o binomial GLM with statsmodels.

    Both models are fit on ``smalldata/logreg/prostate.csv`` with column 1 as
    the response and columns 2 onward as features; the two null deviances
    must agree to within 1e-5.

    :param ip: passed to ``h2o.init``.
    :param port: passed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip, port)

    # Log.info("Importing prostate.csv data...\n")
    h2o_data = h2o.upload_file(
        path=h2o.locate("smalldata/logreg/prostate.csv"))
    #prostate.summary()

    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same ndarray and exists in older releases too.
    sm_data = pd.read_csv(
        h2o.locate("smalldata/logreg/prostate.csv")).values
    sm_data_response = sm_data[:, 1]
    sm_data_features = sm_data[:, 2:]

    #Log.info(cat("B)H2O GLM (binomial) with parameters:\nX:", myX, "\nY:", myY, "\n"))
    h2o_glm = h2o.glm(y=h2o_data[1],
                      x=h2o_data[2:],
                      family="binomial",
                      n_folds=10,
                      alpha=[0.5])
    h2o_glm.show()

    sm_glm = sm.GLM(endog=sm_data_response,
                    exog=sm_data_features,
                    family=sm.families.Binomial()).fit()

    assert abs(sm_glm.null_deviance -
               h2o_glm._model_json['output']['null_deviance']
               ) < 1e-5, "Expected null deviances to be the same"
Пример #30
0
def pubdev_1953():
    """Build a poisson GLM on the small citibike train/test CSVs.

    The columns listed in ``predictors`` are the features and "bikes" is the
    response; the test frame is passed as validation data. There are no
    assertions, so the function passes as long as the build does not raise.
    """
    # NOTE(review): the commented-out snippet below is retained for
    # reference; it appears to describe how equivalent frames were built from
    # the full citibike and weather data -- confirm before relying on it.
    # small_test = [tests.locate("bigdata/laptop/citibike-nyc/2013-10.csv")]
    # data = h2o.import_file(path=small_test)
    # startime = data["starttime"]
    # secsPerDay=1000*60*60*24
    # data["Days"] = (startime/secsPerDay).floor()
    # grouped = data.group_by(["Days","start station name"])
    # bpd = grouped.count(name="bikes").get_frame()
    # secs = bpd["Days"]*secsPerDay
    # bpd["Month"]     = secs.month().asfactor()
    # bpd["DayOfWeek"] = secs.dayOfWeek()
    # wthr1 = h2o.import_file(path=[tests.locate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv"), tests.locate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv")])
    # wthr2 = wthr1[["Year Local","Month Local","Day Local","Hour Local","Dew Point (C)","Humidity Fraction","Precipitation One Hour (mm)","Temperature (C)","Weather Code 1/ Description"]]
    # wthr2.set_name(wthr2.index("Precipitation One Hour (mm)"), "Rain (mm)")
    # wthr2.set_name(wthr2.index("Weather Code 1/ Description"), "WC1")
    # wthr3 = wthr2[ wthr2["Hour Local"]==12 ]
    # wthr3["msec"] = h2o.H2OFrame.mktime(year=wthr3["Year Local"], month=wthr3["Month Local"]-1, day=wthr3["Day Local"]-1, hour=wthr3["Hour Local"])
    # secsPerDay=1000*60*60*24
    # wthr3["Days"] = (wthr3["msec"]/secsPerDay).floor()
    # wthr4 = wthr3.drop("Year Local").drop("Month Local").drop("Day Local").drop("Hour Local").drop("msec")
    # rain = wthr4["Rain (mm)"]
    # rain[ rain.isna() ] = 0
    # bpd_with_weather = bpd.merge(wthr4,allLeft=True,allRite=False)
    # r = bpd_with_weather['Days'].runif(seed=356964763)
    # train = bpd_with_weather[  r  < 0.6]
    # test  = bpd_with_weather[(0.6 <= r) & (r < 0.9)]

    target = "bikes"
    predictors = [
        'DayOfWeek', 'WC1', 'start station name', 'Temperature (C)', 'Days',
        'Month', 'Humidity Fraction', 'Rain (mm)', 'Dew Point (C)',
    ]

    train_path = tests.locate("smalldata/glm_test/citibike_small_train.csv")
    train = h2o.import_file(train_path)
    test_path = tests.locate("smalldata/glm_test/citibike_small_test.csv")
    test = h2o.import_file(test_path)

    glm0 = h2o.glm(
        x=train[predictors],
        y=train[target],
        validation_x=test[predictors],
        validation_y=test[target],
        family="poisson",
    )
Пример #31
0
def perfectSeparation_unbalanced():
    """Fit a binomial GLM on the unbalanced perfect-separation data.

    After fitting, every coefficient except the "Intercept" row must be
    below 50.
    """
    print("Read in synthetic unbalanced dataset")
    csv_path = tests.locate("smalldata/synthetic_perfect_separation/unbalanced.csv")
    data = h2o.import_file(csv_path)

    print("Fit model on dataset.")
    model = h2o.glm(
        x=data[["x1", "x2"]], y=data["y"], family="binomial",
        lambda_search=True, alpha=[0.5], Lambda=[0])

    print("Extract models' coefficients and assert reasonable values (ie. no greater than 50)")
    print("Unbalanced dataset")
    table_rows = model._model_json['output']['coefficients_table'].cell_values
    # Column 0 of each row is the coefficient name, column 1 its value.
    coef = [row[1] for row in table_rows if row[0] != "Intercept"]
    assert all(value < 50 for value in coef), "coefficient is too large"
Пример #32
0
def save_load_model():
    """Save a binomial GLM, load it back and check the loaded model's type.

    The model is saved under ``<directory of this file>/../results``, which
    must already exist. The path returned by ``h2o.save_model`` must be a
    directory, and the object returned by ``h2o.load_model`` must be an
    ``H2OBinomialModel``.
    """
    prostate = h2o.import_file(tests.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate_glm = h2o.glm(y=prostate["CAPSULE"],
                           x=prostate[["AGE", "RACE", "PSA", "DCAPS"]],
                           family="binomial",
                           alpha=[0.5])

    here = os.path.dirname(os.path.realpath(__file__))
    path = os.path.normpath(os.path.join(here, "..", "results"))

    assert os.path.isdir(path), \
        "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(prostate_glm, path=path, force=True)

    assert os.path.isdir(model_path), \
        "Expected load directory {0} to exist, but it does not.".format(model_path)
    the_model = h2o.load_model(model_path)

    # Message previously read "Expected and H2OBinomialModel" (typo).
    assert isinstance(the_model, H2OBinomialModel), \
        "Expected an H2OBinomialModel, but got {0}".format(the_model)
def link_functions_binomial():
    """Compare h2o and statsmodels binomial GLMs that use the logit link.

    Both models are fit on the prostate_complete data with CAPSULE as the
    response. The h2o residual/null deviance ratio must be less than 0.01
    above the statsmodels ratio.
    """
    print("Read in prostate data.")
    archive_path = tests.locate("smalldata/prostate/prostate_complete.csv.zip")
    h2o_data = h2o.import_file(path=archive_path)
    h2o_data.head()

    # Close the archive and its member once parsed; .values replaces
    # DataFrame.as_matrix(), which was removed in pandas 1.0.
    with zipfile.ZipFile(archive_path) as archive:
        with archive.open("prostate_complete.csv") as member:
            sm_data = pd.read_csv(member).values
    sm_data_response = sm_data[:, 2]
    sm_data_features = sm_data[:, [1, 3, 4, 5, 6, 7, 8, 9]]

    print("Testing for family: BINOMIAL")
    print("Set variables for h2o.")
    myY = "CAPSULE"
    myX = ["ID", "AGE", "RACE", "GLEASON", "DCAPS", "PSA", "VOL", "DPROS"]

    print("Create models with canonical link: LOGIT")
    h2o_model = h2o.glm(x=h2o_data[myX], y=h2o_data[myY].asfactor(), family="binomial",
                        link="logit", alpha=[0.5], Lambda=[0])
    sm_model = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                      family=sm.families.Binomial(sm.families.links.logit)).fit()

    print("Compare model deviances for link function logit")
    h2o_deviance = h2o_model.residual_deviance() / h2o_model.null_deviance()
    sm_deviance = sm_model.deviance / sm_model.null_deviance
    assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measures"
Пример #34
0
def benign(ip, port):
    """Check that a binomial GLM on benign.csv reports the predictors it was given.

    Column 3 (as a factor) is the response and columns 0-2 and 4-10 are the
    predictors. The names in the first ``len(X) + 1`` rows of the model's
    coefficient table, minus the first row, must equal the predictor names in
    the order they were passed.

    :param ip: passed to ``h2o.init``.
    :param port: passed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip, port)

    training_data = h2o.import_frame(h2o.locate("smalldata/logreg/benign.csv"))

    Y = 3
    # range objects cannot be joined with "+" on Python 3; build lists first
    # (same result on Python 2).
    X = list(range(3)) + list(range(4, 11))

    #Log.info("Build the model")
    model = h2o.glm(y=training_data[Y].asfactor(),
                    x=training_data[X],
                    family="binomial",
                    alpha=[0],
                    Lambda=[1e-5])

    #Log.info("Check that the columns used in the model are the ones we passed in.")
    #Log.info("===================Columns passed in: ================")
    in_names = [training_data.names()[i] for i in X]
    #Log.info("===================Columns passed out: ================")
    out_names = [
        model._model_json['output']['coefficients_table'].cell_values[c][0]
        for c in range(len(X) + 1)
    ]
    # NOTE(review): row 0 is skipped -- presumably the intercept; confirm.
    assert in_names == out_names[1:]
def link_functions_gaussian(ip,port):
    """Compare h2o and statsmodels gaussian GLMs that use the identity link.

    Both models are fit on the prostate_complete data with GLEASON as the
    response. The h2o residual/null deviance ratio must be less than 0.01
    above the statsmodels ratio.

    :param ip: not used inside this function.
    :param port: not used inside this function.
    """
    print("Read in prostate data.")
    archive_path = h2o.locate("smalldata/prostate/prostate_complete.csv.zip")
    h2o_data = h2o.import_file(path=archive_path)
    h2o_data.head()

    # Close the archive and its member once parsed; .values replaces
    # DataFrame.as_matrix(), which was removed in pandas 1.0.
    with zipfile.ZipFile(archive_path) as archive:
        with archive.open("prostate_complete.csv") as member:
            sm_data = pd.read_csv(member).values
    sm_data_response = sm_data[:,9]
    sm_data_features = sm_data[:,1:9]

    print("Testing for family: GAUSSIAN")
    print("Set variables for h2o.")
    myY = "GLEASON"
    myX = ["ID","AGE","RACE","CAPSULE","DCAPS","PSA","VOL","DPROS"]

    print("Create models with canonical link: IDENTITY")
    h2o_model = h2o.glm(x=h2o_data[myX], y=h2o_data[myY], family="gaussian", link="identity",alpha=[0.5], Lambda=[0])
    sm_model = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                      family=sm.families.Gaussian(sm.families.links.identity)).fit()

    print("Compare model deviances for link function identity")
    h2o_deviance = h2o_model.residual_deviance() / h2o_model.null_deviance()
    sm_deviance = sm_model.deviance / sm_model.null_deviance
    assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measures"
def link_functions_gaussian():
    """Compare h2o and statsmodels gaussian GLMs that use the identity link.

    Both models are fit on the prostate_complete data with GLEASON as the
    response. The h2o residual/null deviance ratio must be less than 0.01
    above the statsmodels ratio.
    """
    print("Read in prostate data.")
    archive_path = tests.locate("smalldata/prostate/prostate_complete.csv.zip")
    h2o_data = h2o.import_file(path=archive_path)
    h2o_data.head()

    # Close the archive and its member once parsed; .values replaces
    # DataFrame.as_matrix(), which was removed in pandas 1.0.
    with zipfile.ZipFile(archive_path) as archive:
        with archive.open("prostate_complete.csv") as member:
            sm_data = pd.read_csv(member).values
    sm_data_response = sm_data[:, 9]
    sm_data_features = sm_data[:, 1:9]

    print("Testing for family: GAUSSIAN")
    print("Set variables for h2o.")
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    print("Create models with canonical link: IDENTITY")
    h2o_model = h2o.glm(x=h2o_data[myX],
                        y=h2o_data[myY],
                        family="gaussian",
                        link="identity",
                        alpha=[0.5],
                        Lambda=[0])
    sm_model = sm.GLM(endog=sm_data_response,
                      exog=sm_data_features,
                      family=sm.families.Gaussian(
                          sm.families.links.identity)).fit()

    print("Compare model deviances for link function identity")
    h2o_deviance = h2o_model.residual_deviance() / h2o_model.null_deviance()
    sm_deviance = sm_model.deviance / sm_model.null_deviance
    assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measures"
Пример #37
0
def glm_solvers():
    """Build a GLM for every solver/family combination on the cars data.

    The response column depends on the family and is converted to a factor
    for binomial and to numeric otherwise. Nothing is asserted, so the
    function passes as long as every build completes without raising.
    """
    training_data = h2o.import_file(
        pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    predictors = ["displacement", "power", "weight", "acceleration", "year"]

    # Response column per family; families not listed here use "cylinders".
    response_by_family = {"binomial": "economy_20mpg", "gaussian": "economy"}

    for solver in [
            "AUTO", "IRLSM", "L_BFGS", "COORDINATE_DESCENT_NAIVE",
            "COORDINATE_DESCENT"
    ]:
        # print is written in call form so this parses on Python 2 and 3.
        print("Solver = {0}".format(solver))
        for family in ["binomial", "gaussian", "poisson", "tweedie", "gamma"]:
            response_col = response_by_family.get(family, "cylinders")
            print("Family = {0}".format(family))

            if family == 'binomial':
                training_data[response_col] = training_data[
                    response_col].asfactor()
            else:
                training_data[response_col] = training_data[
                    response_col].asnumeric()

            model = h2o.glm(x=training_data[predictors],
                            y=training_data[response_col],
                            family=family,
                            alpha=[0],
                            Lambda=[1e-5],
                            solver=solver)
Пример #38
0
def wide_dataset_large(ip,port):
    """Train a binomial GLM on the wide Arcene data and check validation AUC.

    The model is fit on columns 1:3250 of the training frame with column 0
    (as a factor) as the response; its AUC on the validation frame must
    exceed 0.5.

    :param ip: passed to ``h2o.init``.
    :param port: passed to ``h2o.init``.
    """
    def _arcene_frame(labels_file, features_file):
        # Labels equal to -1 are recoded to 0 (all others to 1) and placed in
        # front of the feature columns.
        labels = np.genfromtxt(h2o.locate(labels_file), delimiter=' ')
        labels = np.where(labels == -1, 0, 1)
        features = np.genfromtxt(h2o.locate(features_file), delimiter=' ')
        return h2o.H2OFrame(np.column_stack((labels, features)).tolist())

    # Connect to h2o
    h2o.init(ip,port)

    print("Reading in Arcene training data for binomial modeling.")
    train_frame = _arcene_frame("smalldata/arcene/arcene_train_labels.labels",
                                "smalldata/arcene/arcene_train.data")

    print("Run model on 3250 columns of Arcene with strong rules off.")
    model = h2o.glm(x=train_frame[1:3250], y=train_frame[0].asfactor(),
                    family="binomial", lambda_search=False, alpha=[1],
                    use_all_factor_levels=True)

    print("Test model on validation set.")
    valid_frame = _arcene_frame("smalldata/arcene/arcene_valid_labels.labels",
                                "smalldata/arcene/arcene_valid.data")
    prediction = model.predict(valid_frame)

    print("Check performance of predictions.")
    performance = model.model_performance(valid_frame)

    print("Check that prediction AUC better than guessing (0.5).")
    assert performance.auc() > 0.5, "predictions should be better then pure chance"
def link_functions_binomial(ip,port):
    """Compare h2o and statsmodels binomial GLMs that use the logit link.

    Both models are fit on the prostate_complete data with CAPSULE as the
    response. The h2o residual/null deviance ratio must be less than 0.01
    above the statsmodels ratio.

    :param ip: passed to ``h2o.init``.
    :param port: passed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip,port)

    print("Read in prostate data.")
    archive_path = h2o.locate("smalldata/prostate/prostate_complete.csv.zip")
    h2o_data = h2o.import_frame(path=archive_path)
    h2o_data.head()

    # Close the archive and its member once parsed; .values replaces
    # DataFrame.as_matrix(), which was removed in pandas 1.0.
    with zipfile.ZipFile(archive_path) as archive:
        with archive.open("prostate_complete.csv") as member:
            sm_data = pd.read_csv(member).values
    sm_data_response = sm_data[:,2]
    sm_data_features = sm_data[:,[1,3,4,5,6,7,8,9]]

    print("Testing for family: BINOMIAL")
    print("Set variables for h2o.")
    myY = "CAPSULE"
    myX = ["ID","AGE","RACE","GLEASON","DCAPS","PSA","VOL","DPROS"]

    print("Create models with canonical link: LOGIT")
    h2o_model = h2o.glm(x=h2o_data[myX], y=h2o_data[myY].asfactor(), family="binomial", link="logit",alpha=[0.5], Lambda=[0])
    sm_model = sm.GLM(endog=sm_data_response, exog=sm_data_features, family=sm.families.Binomial(sm.families.links.logit)).fit()

    print("Compare model deviances for link function logit")
    h2o_deviance = h2o_model._model_json['output']['residual_deviance'] / h2o_model._model_json['output']['null_deviance']
    sm_deviance = sm_model.deviance / sm_model.null_deviance
    assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measures"
def link_functions_tweedie_vpow():
    """Fit tweedie GLMs for several variance powers and compare against R.

    For each variance power in ``vpower`` (link power is ``1 - vpow``) the
    residual/null deviance ratio must not be more than 0.01 below the
    matching ``r_dev`` entry, and the null deviance must equal the matching
    ``r_null`` entry to within 1e-6.
    """
    # Load example data from HDtweedie, y = aggregate claim loss
    hdf = h2o.upload_file(pyunit_utils.locate("smalldata/glm_test/auto.csv"))
    y = "y"
    x = list(set(hdf.names) - set(["y"]))

    # print is written in call form so this parses on Python 2 and Python 3.
    print("Testing for family: TWEEDIE")
    print("Create models with canonical link: TWEEDIE")
    # Iterate over different variance powers for tweedie
    vpower = [0, 1, 1.5]
    r_dev = [0.7516627, 0.6708826, 0.7733762]
    r_null = [221051.88369951, 32296.29783702, 20229.47425307]
    for ridx, vpow in enumerate(vpower):
        print("Fit h2o.glm:")
        h2ofit = h2o.glm(x=hdf[x], y=hdf[y], family="tweedie", link="tweedie", tweedie_variance_power=vpow, tweedie_link_power=1-vpow,
                         alpha=[0.5], Lambda=[0])

        print("Testing Tweedie variance power: {0}".format(vpow))

        print("Compare model deviances for link function tweedie")
        deviance_h2o_tweedie = h2ofit.residual_deviance() / h2ofit.null_deviance()

        assert r_dev[ridx] - deviance_h2o_tweedie <= 0.01, "h2o's residual/null deviance is more than 0.01 lower than " \
                                                           "R's. h2o: {0}, r: {1}".format(deviance_h2o_tweedie, r_dev[ridx])

        print("compare null and residual deviance between R glm and h2o.glm for tweedie")
        assert abs(r_null[ridx] - h2ofit.null_deviance()) < 1e-6, "h2o's null deviance is not equal to R's. h2o: {0}, r: " \
                                                                   "{1}".format(h2ofit.null_deviance(), r_null[ridx])
def grid_lambda_search():
    """Exercise lambda-search GLMs on the prostate data and their sub-models.

    A binomial GLM is built with ``lambda_search=True``; one of its first two
    sub-models is picked at random, and the models retrieved for a randomly
    chosen lambda and for the best lambda are checked. A second search over
    two alpha values must report the same best lambda.

    NOTE(review): this relies on ``models``, ``params``, ``lambdas``,
    ``lambda_all``, ``lambda_best``, ``best_model`` and ``getGLMLambdaModel``
    (called once on the model and once on ``h2o``), none of which are defined
    in view -- confirm they exist on the objects involved.
    """
    # Log.info("Importing prostate.csv data...\n")
    csv_path = pyunit_utils.locate("smalldata/logreg/prostate.csv")
    prostate = h2o.import_file(path=csv_path)

    #prostate.summary()

    # Log.info("H2O GLM (binomial) with parameters: alpha = c(0.25, 0.5), nlambda = 20, lambda_search = TRUE, nfolds: 2\n")
    model = h2o.glm(x=prostate[2:9], y=prostate[1], family="binomial",
                    nlambdas=5, lambda_search=True, n_folds=2)
    # Use sub-model 0 or 1 with equal probability.
    model_idx = 0 if random.random() < 0.5 else 1

    model_bestlambda = model.models(model_idx)
    params_bestlambda = model.params()

    # Log.info(cat("All lambda values returned:\n", params_bestlambda.lambdas()))
    assert len(params_bestlambda.lambdas()) <= 5, "expected 5 or less lambdas"

    random_lambda = random.choice(params_bestlambda.lambdas())
    print("RANDOM LAMBDA")
    print(random_lambda)

    # Log.info(cat("Retrieving model corresponding to alpha =", params_bestlambda.alpha(), "and randomly chosen lambda", random_lambda, "\n"))
    random_model = model.getGLMLambdaModel(model_bestlambda, random_lambda)

    # Log.info("EXPECTING THESE TO BE EQUAL")
    print(random_model.Lambda())
    print(random_lambda)

    assert random_model.Lambda() == random_lambda, "expected lambdas to be equal"

    # Log.info(cat("Retrieving model corresponding to alpha =", params_bestlambda.alpha(), "and best lambda", params_bestlambda.lambdaBest(), "\n"))
    best_lambda = params_bestlambda.lambda_best()
    best_model = h2o.getGLMLambdaModel(model_bestlambda, best_lambda)
    assert best_model.model() ==  model_bestlambda.model(), "expected models to be equal"

    # Log.info("H2O GLM (binomial) with parameters: alpha = [0.25, 0.5], nlambda = 20, lambda_search = TRUE, nfolds: 2\n")
    prostate_search = h2o.glm(x=prostate[2:9], y=prostate[1], family="binomial",
                              alpha=[0.25, 0.5], nlambdas=5, lambda_search=True,
                              n_folds=2)
    model_search = prostate_search.models(model_idx)
    models_best = model_search.models(model_search.best_model())
    params_best = models_best.params()

    assert params_bestlambda.lambda_best() == params_best.lambda_best(), "expected lambdas to be equal"
    assert len(params_best.lambda_all()) <= 20, "expected 20 or fewer lambdas"
def link_functions_poisson():
    """Compare h2o and statsmodels poisson GLMs for the log and identity links.

    Both libraries fit the prostate_complete data with GLEASON as the
    response. For each link, the h2o residual/null deviance ratio must be
    less than 0.01 above the statsmodels ratio.
    """
    print("Read in prostate data.")
    archive_path = pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip")
    h2o_data = h2o.import_file(path=archive_path)

    # Close the archive and its member once parsed; .values replaces
    # DataFrame.as_matrix(), which was removed in pandas 1.0.
    with zipfile.ZipFile(archive_path) as archive:
        with archive.open("prostate_complete.csv") as member:
            sm_data = pd.read_csv(member).values
    sm_data_response = sm_data[:, 9]
    sm_data_features = sm_data[:, 1:9]

    print("Testing for family: POISSON")
    print("Set variables for h2o.")
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    print("Create h2o model with canonical link: LOG")
    h2o_model_log = h2o.glm(x=h2o_data[myX], y=h2o_data[myY], family="poisson", link="log", alpha=[0.5], Lambda=[0])

    print("Create statsmodel model with canonical link: LOG")
    sm_model_log = sm.GLM(
        endog=sm_data_response, exog=sm_data_features, family=sm.families.Poisson(sm.families.links.log)
    ).fit()

    print("Compare model deviances for link function log")
    h2o_deviance_log = old_div(h2o_model_log.residual_deviance(), h2o_model_log.null_deviance())
    sm_deviance_log = old_div(sm_model_log.deviance, sm_model_log.null_deviance)
    assert h2o_deviance_log - sm_deviance_log < 0.01, "expected h2o to have an equivalent or better deviance measures"

    print("Create h2o models with link: IDENTITY")
    h2o_model_id = h2o.glm(x=h2o_data[myX], y=h2o_data[myY], family="poisson", link="identity", alpha=[0.5], Lambda=[0])

    print("Create statsmodel models with link: IDENTITY")
    sm_model_id = sm.GLM(
        endog=sm_data_response, exog=sm_data_features, family=sm.families.Poisson(sm.families.links.identity)
    ).fit()

    print("Compare model deviances for link function identity")
    h2o_deviance_id = old_div(h2o_model_id.residual_deviance(), h2o_model_id.null_deviance())
    sm_deviance_id = old_div(sm_model_id.deviance, sm_model_id.null_deviance)
    assert h2o_deviance_id - sm_deviance_id < 0.01, "expected h2o to have an equivalent or better deviance measures"
def pubdev_1839():
    """Fit a Poisson GLM with a validation frame on the pubdev_1839 repro data.

    Both CSVs are imported, then a GLM for the "bikes" column is built.
    No assertions are made; the function finishes once ``h2o.glm`` returns.
    """
    response = "bikes"
    train_frame = h2o.import_file(pyunit_utils.locate("smalldata/jira/pubdev_1839_repro_train.csv"))
    test_frame = h2o.import_file(pyunit_utils.locate("smalldata/jira/pubdev_1839_repro_test.csv"))

    # Predictors are each frame with the response column dropped.
    poisson_glm = h2o.glm(
        x=train_frame.drop(response),
        y=train_frame[response],
        validation_x=test_frame.drop(response),
        validation_y=test_frame[response],
        family="poisson",
    )
def pubdev_1839(ip, port):
    """Fit a Poisson GLM (Lambda=1e-5) with validation data on the pubdev_1839 repro files.

    ``ip`` and ``port`` are accepted but not used by the body. There are no
    assertions; the function finishes once ``h2o.glm`` returns.
    """
    response = "bikes"
    train_frame = h2o.import_file(h2o.locate("smalldata/jira/pubdev_1839_repro_train.csv"))
    test_frame = h2o.import_file(h2o.locate("smalldata/jira/pubdev_1839_repro_test.csv"))

    # Predictors are each frame with the response column dropped.
    poisson_glm = h2o.glm(
        x=train_frame.drop(response),
        y=train_frame[response],
        validation_x=test_frame.drop(response),
        validation_y=test_frame[response],
        Lambda=[1e-5],
        family="poisson",
    )
def pubdev_1839(ip, port):
    """Build a Poisson GLM on the pubdev_1839 repro train/test CSVs.

    The ``ip`` and ``port`` arguments are not read. Nothing is asserted: the
    function only needs the ``h2o.glm`` call to return.
    """
    target = "bikes"
    training = h2o.import_file(h2o.locate("smalldata/jira/pubdev_1839_repro_train.csv"))
    holdout = h2o.import_file(h2o.locate("smalldata/jira/pubdev_1839_repro_test.csv"))

    # Keyword arguments are collected first so the call itself stays short.
    glm_args = {
        "x": training.drop(target),
        "y": training[target],
        "validation_x": holdout.drop(target),
        "validation_y": holdout[target],
        "Lambda": [1e-5],
        "family": "poisson",
    }
    fitted = h2o.glm(**glm_args)
def pubdev_1839():
    """Build a Poisson GLM on the pubdev_1839 repro train/test CSVs.

    Nothing is asserted: the function only needs the ``h2o.glm`` call to return.
    """
    target = "bikes"
    train_csv = pyunit_utils.locate("smalldata/jira/pubdev_1839_repro_train.csv")
    training = h2o.import_file(train_csv)
    test_csv = pyunit_utils.locate("smalldata/jira/pubdev_1839_repro_test.csv")
    holdout = h2o.import_file(test_csv)

    # The response column is dropped from each frame to form the predictors.
    fitted = h2o.glm(x=training.drop(target), y=training[target],
                     validation_x=holdout.drop(target), validation_y=holdout[target],
                     family="poisson")
Пример #47
0
def save_load_model(ip,port):
    """Save a binomial GLM, load it back and check the type of the loaded model.

    Connects to the H2O instance at ``ip``/``port``, trains a GLM for CAPSULE
    on the prostate data, saves it under the name "delete_model" and reloads it
    from the path returned by ``h2o.save_model``.
    """
    # Connect to h2o
    h2o.init(ip,port)

    frame = h2o.import_frame(h2o.locate("smalldata/prostate/prostate.csv"))
    frame["CAPSULE"] = frame["CAPSULE"].asfactor()

    response = frame["CAPSULE"]
    predictors = frame[["AGE","RACE","PSA","DCAPS"]]
    binomial_glm = h2o.glm(y=response, x=predictors, family="binomial", alpha=[0.5])

    saved_to = h2o.save_model(binomial_glm, name="delete_model", force=True)
    reloaded = h2o.load_model(saved_to)

    assert isinstance(reloaded, H2OBinomialModel), \
        "Expected and H2OBinomialModel, but got {0}".format(reloaded)
Пример #48
0
def save_load_model():
    """Save a binomial GLM, load it back and check the type of the loaded model.

    Trains a GLM for CAPSULE on the prostate data, saves it with
    ``h2o.save_model`` and reloads it from the returned path. The saved model
    directory is removed afterwards, whether or not loading succeeded.

    Raises:
        AssertionError: if the reloaded object is not an ``H2OBinomialModel``.
    """
    prostate = h2o.import_file(h2o.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate_glm = h2o.glm(y=prostate["CAPSULE"], x=prostate[["AGE","RACE","PSA","DCAPS"]], family = "binomial",
                           alpha = [0.5])
    model_path = h2o.save_model(prostate_glm,force=True)
    try:
        the_model = h2o.load_model(model_path)
    finally:
        # Clean up even when load_model raises, so a failed run leaves nothing behind.
        shutil.rmtree(model_path)

    assert isinstance(the_model, H2OBinomialModel), "Expected an H2OBinomialModel, but got {0}".format(the_model)
Пример #49
0
def save_load_model(ip,port):
    """Save a binomial GLM, load it back and check the type of the loaded model.

    ``ip`` and ``port`` are accepted but not used by the body. Trains a GLM for
    CAPSULE on the prostate data, saves it with ``h2o.save_model`` and reloads
    it from the returned path. The saved model directory is removed afterwards,
    whether or not loading succeeded.

    Raises:
        AssertionError: if the reloaded object is not an ``H2OBinomialModel``.
    """
    prostate = h2o.import_file(h2o.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate_glm = h2o.glm(y=prostate["CAPSULE"], x=prostate[["AGE","RACE","PSA","DCAPS"]], family = "binomial",
                           alpha = [0.5])
    model_path = h2o.save_model(prostate_glm,force=True)
    try:
        the_model = h2o.load_model(model_path)
    finally:
        # Clean up even when load_model raises, so a failed run leaves nothing behind.
        shutil.rmtree(model_path)

    assert isinstance(the_model, H2OBinomialModel), "Expected an H2OBinomialModel, but got {0}".format(the_model)
def pyunit_make_glm_model():
    """Post replacement coefficients for a trained GLM to the "MakeGLMModel" endpoint.

    A gaussian GLM is trained on the prostate data, then a names array and a
    beta array (both serialized as bracketed, comma-separated strings) are sent
    together with the model id. Nothing is asserted about the response.
    """
    # TODO: PUBDEV-1717
    pros = h2o.import_file(tests.locate("smalldata/prostate/prostate.csv"))
    model = h2o.glm(x=pros[["AGE","DPROS","DCAPS","PSA","VOL","GLEASON"]], y=pros["CAPSULE"], family="gaussian", alpha=[0])
    new_betas = {"AGE":0.5, "DPROS":0.5, "DCAPS":0.5, "PSA":0.5, "VOL":0.5, "GLEASON":0.5}

    # Walk items() once so every name stays aligned with its own beta; join()
    # also yields a well-formed "[]" if the dict is ever empty.
    pairs = list(new_betas.items())
    names = "[" + ",".join('"{0}"'.format(name) for name, _ in pairs) + "]"
    betas = "[" + ",".join(str(beta) for _, beta in pairs) + "]"

    h2o.H2OConnection.post_json("MakeGLMModel",model=model._id,names=names,beta=betas)
def prostate():
    """Compare the null deviance of an H2O binomial GLM with statsmodels.

    The same prostate CSV is loaded into H2O and into pandas; column 1 is the
    response and columns 2 onward are the features for both libraries.

    Raises:
        AssertionError: if the two null deviances differ by 1e-5 or more.
    """
    h2o_data = h2o.upload_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    h2o_data.summary()

    # DataFrame.values replaces as_matrix(), which newer pandas no longer provides.
    sm_data = pd.read_csv(pyunit_utils.locate("smalldata/logreg/prostate.csv")).values
    sm_data_response = sm_data[:, 1]
    sm_data_features = sm_data[:, 2:]

    h2o_glm = h2o.glm(y=h2o_data[1], x=h2o_data[2:], family="binomial", nfolds=10, alpha=[0.5])
    sm_glm = sm.GLM(endog=sm_data_response, exog=sm_data_features, family=sm.families.Binomial()).fit()

    # print() with a single argument behaves the same on Python 2 and Python 3.
    print("statsmodels null deviance {0}".format(sm_glm.null_deviance))
    print("h2o null deviance {0}".format(h2o_glm.null_deviance()))
    assert abs(sm_glm.null_deviance - h2o_glm.null_deviance()) < 1e-5, "Expected null deviances to be the same"
def glm_solvers():
    """Build a GLM for every solver/family combination on the cars_20mpg data.

    The data is re-imported and removed again for each combination. Nothing is
    asserted; each build only needs to return.
    """
    feature_cols = ["displacement","power","weight","acceleration","year"]
    solver_names = ("AUTO", "IRLSM", "L_BFGS", "COORDINATE_DESCENT_NAIVE", "COORDINATE_DESCENT")
    family_names = ("binomial", "gaussian", "poisson", "tweedie", "gamma")
    # Families without an entry here use "cylinders" as the response.
    response_by_family = {'binomial': "economy_20mpg", 'gaussian': "economy"}

    for solver in solver_names:
        print("Solver = {0}".format(solver))
        for family in family_names:
            response_col = response_by_family.get(family, "cylinders")
            print("Family = {0}".format(family))
            frame = h2o.import_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
            # Only the binomial response is turned into a factor; the rest are numeric.
            if family == 'binomial':
                frame[response_col] = frame[response_col].asfactor()
            else:
                frame[response_col] = frame[response_col].asnumeric()
            fitted = h2o.glm(x=frame[feature_cols], y=frame[response_col], family=family,
                             alpha=[0], Lambda=[1e-5], solver=solver)
            h2o.remove(frame)
def perfectSeparation_unbalanced():
    """Fit a binomial GLM on the unbalanced perfect-separation data and bound its coefficients.

    Every non-intercept coefficient in the model's coefficients table must be
    below 50, otherwise an AssertionError is raised.
    """
    print("Read in synthetic unbalanced dataset")
    frame = h2o.import_file(pyunit_utils.locate("smalldata/synthetic_perfect_separation/unbalanced.csv"))

    print("Fit model on dataset.")
    fitted = h2o.glm(x=frame[["x1", "x2"]], y=frame["y"], family="binomial",
                     lambda_search=True, alpha=[0.5], Lambda=[0])

    print("Extract models' coefficients and assert reasonable values (ie. no greater than 50)")
    print("Unbalanced dataset")
    table = fitted._model_json['output']['coefficients_table']
    # Each row is read as (name, value, ...); the intercept row is excluded.
    values = [row[1] for row in table.cell_values if row[0] != "Intercept"]
    # NOTE(review): only an upper bound is checked, so large negative values pass.
    for value in values:
        assert value < 50, "coefficient is too large"
Пример #54
0
def prostate():
    """Compare the null deviance of an H2O binomial GLM with statsmodels.

    The same prostate CSV is loaded into H2O and into pandas; column 1 is the
    response and columns 2 onward are the features for both libraries.

    Raises:
        AssertionError: if the two null deviances differ by 1e-5 or more.
    """
    h2o_data = h2o.upload_file(path=tests.locate("smalldata/logreg/prostate.csv"))
    h2o_data.summary()

    # DataFrame.values replaces as_matrix(), which newer pandas no longer provides.
    sm_data = pd.read_csv(tests.locate("smalldata/logreg/prostate.csv")).values
    sm_data_response = sm_data[:, 1]
    sm_data_features = sm_data[:, 2:]

    h2o_glm = h2o.glm(y=h2o_data[1], x=h2o_data[2:], family="binomial", nfolds=10, alpha=[0.5])
    sm_glm = sm.GLM(endog=sm_data_response, exog=sm_data_features, family=sm.families.Binomial()).fit()

    # print() with a single argument behaves the same on Python 2 and Python 3.
    print("statsmodels null deviance {0}".format(sm_glm.null_deviance))
    print("h2o null deviance {0}".format(h2o_glm.null_deviance()))
    assert abs(sm_glm.null_deviance - h2o_glm.null_deviance()) < 1e-5, "Expected null deviances to be the same"
Пример #55
0
def save_load_model():
    """Save a binomial GLM into the sibling "results" directory and load it back.

    Asserts that the target directory exists before saving, that the path
    returned by ``h2o.save_model`` is a directory, and that the reloaded object
    is an ``H2OBinomialModel``.
    """
    frame = h2o.import_file(h2o.locate("smalldata/prostate/prostate.csv"))
    frame["CAPSULE"] = frame["CAPSULE"].asfactor()
    response = frame["CAPSULE"]
    predictors = frame[["AGE","RACE","PSA","DCAPS"]]
    binomial_glm = h2o.glm(y=response, x=predictors, family="binomial", alpha=[0.5])

    # "results" is resolved relative to the parent of this file's directory.
    this_dir = os.path.dirname(os.path.realpath(__file__))
    path = os.path.normpath(os.path.join(this_dir, "..", "results"))
    assert os.path.isdir(path), \
        "Expected save directory {0} to exist, but it does not.".format(path)

    saved_to = h2o.save_model(binomial_glm, path=path, force=True)
    assert os.path.isdir(saved_to), \
        "Expected load directory {0} to exist, but it does not.".format(saved_to)

    reloaded = h2o.load_model(saved_to)
    assert isinstance(reloaded, H2OBinomialModel), \
        "Expected and H2OBinomialModel, but got {0}".format(reloaded)
def perfectSeparation_balanced(ip,port):
    """Fit a binomial GLM on the balanced perfect-separation data and bound its coefficients.

    Connects to the H2O instance at ``ip``/``port`` first. Every non-intercept
    coefficient in the model's coefficients table must be below 50, otherwise
    an AssertionError is raised.
    """
    # Connect to h2o
    h2o.init(ip,port)

    print("Read in synthetic balanced dataset")
    frame = h2o.import_frame(path=h2o.locate("smalldata/synthetic_perfect_separation/balanced.csv"))

    print("Fit model on dataset")
    fitted = h2o.glm(x=frame[["x1", "x2"]], y=frame["y"], family="binomial",
                     lambda_search=True, use_all_factor_levels=True,
                     alpha=[0.5], Lambda=[0])

    print("Extract models' coefficients and assert reasonable values (ie. no greater than 50)")
    print("Balanced dataset")
    table = fitted._model_json['output']['coefficients_table']
    # Each row is read as (name, value, ...); the intercept row is excluded.
    values = [row[1] for row in table.cell_values if row[0] != "Intercept"]
    # NOTE(review): only an upper bound is checked, so large negative values pass.
    for value in values:
        assert value < 50, "coefficient is too large"
def benign():
    """Check that a binomial GLM on the benign data reports the predictors it was given.

    Column 3 is the response (converted to a factor); columns 0-2 and 4-10 are
    the predictors. The first ``len(X) + 1`` names in the model's coefficients
    table are read, and all but the first must equal the predictor names.

    Raises:
        AssertionError: if the coefficient names differ from the predictor names.
    """
    training_data = h2o.import_file(tests.locate("smalldata/logreg/benign.csv"))

    Y = 3
    # list() is required on Python 3, where range objects cannot be concatenated with "+".
    X = list(range(3)) + list(range(4, 11))

    #Log.info("Build the model")
    model = h2o.glm(y=training_data[Y].asfactor(), x=training_data[X], family="binomial", alpha=[0], Lambda=[1e-5])

    #Log.info("Check that the columns used in the model are the ones we passed in.")
    #Log.info("===================Columns passed in: ================")
    in_names = [training_data.names[i] for i in X]
    #Log.info("===================Columns passed out: ================")
    coef_rows = model._model_json['output']['coefficients_table'].cell_values
    out_names = [coef_rows[c][0] for c in range(len(X)+1)]
    # The first row is skipped; presumably it is the intercept -- TODO confirm.
    assert in_names == out_names[1:]
Пример #58
0
def glm_mean_residual_deviance(ip,port):
    """Check that a GLM reports float mean residual deviances for train, valid and xval.

    ``ip`` and ``port`` are accepted but not used in the visible body. The cars
    data is split by a random uniform column (> 0.2 for training, <= 0.2 for
    validation) and a GLM for "economy" is built with 3-fold cross-validation.
    """
    cars =  h2o.import_frame(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))
    s = cars[0].runif()
    train = cars[s > 0.2]
    valid = cars[s <= 0.2]
    predictors = ["displacement","power","weight","acceleration","year"]
    response_col = "economy"
    glm = h2o.glm(x=train[predictors], y=train[response_col],
                  validation_x=valid[predictors], validation_y=valid[response_col],
                  nfolds=3)
    glm_mrd = glm.mean_residual_deviance(train=True,valid=True,xval=True)

    # One check per requested metric; the label only affects the failure message.
    for mrd_key, mrd_label in (("train", "training"),
                               ("valid", "validation"),
                               ("xval", "cross-validation")):
        assert isinstance(glm_mrd[mrd_key],float), \
            "Expected {0} mean residual deviance to be a float, but got {1}".format(
                mrd_label, type(glm_mrd[mrd_key]))