# Shared imports for the GLM test snippets below (assumed, not part of the original
# snippets): `tests` and `pyunit_utils` are h2o-3 repo-local test helpers, and the
# legacy `h2o.locate` / `h2o.glm` wrappers come from older h2o-py releases.
import os
import random
import shutil
import zipfile

import numpy as np
import pandas as pd
import statsmodels.api as sm
from past.utils import old_div

import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.model.binomial import H2OBinomialModel  # import path varies across h2o-py versions


def link_incompatible_error(ip, port):
    print("Reading in original prostate data.")
    prostate = h2o.import_file(path=h2o.locate("smalldata/prostate/prostate.csv.zip"))

    print("Throw error when trying to create model with incompatible logit link.")
    try:
        h2o.model = h2o.glm(x=prostate[1:8], y=prostate[8], family="gaussian", link="logit")
        assert False, "expected an error"
    except EnvironmentError:
        assert True

    try:
        h2o.model = h2o.glm(x=prostate[1:8], y=prostate[8], family="tweedie", link="log")
        assert False, "expected an error"
    except EnvironmentError:
        assert True

    try:
        h2o.model = h2o.glm(x=prostate[2:9], y=prostate[1], family="binomial", link="inverse")
        assert False, "expected an error"
    except EnvironmentError:
        assert True
def link_incompatible_error():
    print("Reading in original prostate data.")
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate.csv.zip"))

    print("Throw error when trying to create model with incompatible logit link.")
    try:
        h2o.model = h2o.glm(x=prostate[1:8], y=prostate[8], family="gaussian", link="logit")
        assert False, "expected an error"
    except EnvironmentError:
        assert True

    try:
        h2o.model = h2o.glm(x=prostate[1:8], y=prostate[8], family="tweedie", link="log")
        assert False, "expected an error"
    except EnvironmentError:
        assert True

    try:
        h2o.model = h2o.glm(x=prostate[2:9], y=prostate[1], family="binomial", link="inverse")
        assert False, "expected an error"
    except EnvironmentError:
        assert True
def covtype_get_model(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    # Log.info("Importing covtype.20k.data...\n")
    covtype = h2o.import_frame(path=h2o.locate("smalldata/covtype/covtype.20k.data"))

    Y = 54
    X = range(0, 20) + range(29, 54)

    # Set response to be indicator of a particular class
    res_class = random.randint(1, 4)
    # Log.info(paste("Setting response column", myY, "to be indicator of class", res_class, "\n"))
    covtype[54] = (covtype[54] == res_class)
    #covtype_data.summary()

    # L2: alpha = 0, lambda = 0
    covtype_mod1 = h2o.glm(y=covtype[Y], x=covtype[X], family="binomial", alpha=[0], Lambda=[0])
    covtype_mod1.show()
    covtype_mod1 = h2o.get_model(covtype_mod1._id)
    covtype_mod1.show()

    # Elastic: alpha = 0.5, lambda = 1e-4
    covtype_mod2 = h2o.glm(y=covtype[Y], x=covtype[X], family="binomial", alpha=[0.5], Lambda=[1e-4])
    covtype_mod2.show()
    covtype_mod2 = h2o.get_model(covtype_mod2._id)
    covtype_mod2.show()

    # L1: alpha = 1, lambda = 1e-4
    covtype_mod3 = h2o.glm(y=covtype[Y], x=covtype[X], family="binomial", alpha=[1], Lambda=[1e-4])
    covtype_mod3.show()
    covtype_mod3 = h2o.get_model(covtype_mod3._id)
    covtype_mod3.show()
def getLambdaModel():
    print("Read data")
    prostate = h2o.import_file(path=tests.locate("smalldata/logreg/prostate.csv"))
    myX = ["AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]
    myY = "CAPSULE"
    family = random.choice(["gaussian", "binomial"])
    print(family)

    print("Do lambda search and build models")
    if family == "gaussian":
        model = h2o.glm(x=prostate[myX], y=prostate[myY], family=family, standardize=True,
                        use_all_factor_levels=True, lambda_search=True)
    else:
        model = h2o.glm(x=prostate[myX], y=prostate[myY].asfactor(), family=family, standardize=True,
                        use_all_factor_levels=True, lambda_search=True)

    print("the models were built over the following lambda values: ")
    all_lambdas = model.models(1).lambda_all()
    print(all_lambdas)

    for i in range(10):
        Lambda = random.sample(all_lambdas, 1)
        print("For Lambda we get this model:")
        m1 = h2o.getGLMLambdaModel(model.models(random.randint(0, len(model.models()) - 1), Lambda=Lambda))
        m1.show()
        print("this model should be same as the one above:")
        m2 = h2o.getGLMLambdaModel(model.models(random.randint(0, len(model.models()) - 1), Lambda=Lambda))
        m2.show()
        assert m1 == m2, "expected models to be equal"
def link_functions_gamma(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    print("Read in prostate data.")
    h2o_data = h2o.import_frame(path=h2o.locate("smalldata/prostate/prostate_complete.csv.zip"))
    h2o_data.head()

    sm_data = pd.read_csv(zipfile.ZipFile(h2o.locate("smalldata/prostate/prostate_complete.csv.zip"))
                          .open("prostate_complete.csv")).as_matrix()
    sm_data_response = sm_data[:, 5]
    sm_data_features = sm_data[:, [1, 2, 3, 4, 6, 7, 8, 9]]

    print("Testing for family: GAMMA")
    print("Set variables for h2o.")
    myY = "DPROS"
    myX = ["ID", "AGE", "RACE", "GLEASON", "DCAPS", "PSA", "VOL", "CAPSULE"]

    print("Create models with canonical link: INVERSE")
    h2o_model_in = h2o.glm(x=h2o_data[myX], y=h2o_data[myY], family="gamma", link="inverse",
                           alpha=[0.5], Lambda=[0], n_folds=0)
    sm_model_in = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                         family=sm.families.Gamma(sm.families.links.inverse_power)).fit()

    print("Compare model deviances for link function inverse")
    h2o_deviance_in = h2o_model_in._model_json['output']['residual_deviance'] / \
                      h2o_model_in._model_json['output']['null_deviance']
    sm_deviance_in = sm_model_in.deviance / sm_model_in.null_deviance
    assert h2o_deviance_in - sm_deviance_in < 0.01, "expected h2o to have an equivalent or better deviance measures"

    print("Create models with canonical link: LOG")
    h2o_model_log = h2o.glm(x=h2o_data[myX], y=h2o_data[myY], family="gamma", link="log",
                            alpha=[0.5], Lambda=[0], n_folds=0)
    sm_model_log = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                          family=sm.families.Gamma(sm.families.links.log)).fit()

    print("Compare model deviances for link function log")
    h2o_deviance_log = h2o_model_log._model_json['output']['residual_deviance'] / \
                       h2o_model_log._model_json['output']['null_deviance']
    sm_deviance_log = sm_model_log.deviance / sm_model_log.null_deviance
    assert h2o_deviance_log - sm_deviance_log < 0.01, "expected h2o to have an equivalent or better deviance measures"
def link_functions_gamma(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    print("Read in prostate data.")
    h2o_data = h2o.import_frame(path=h2o.locate("smalldata/prostate/prostate_complete.csv.zip"))
    h2o_data.head()

    sm_data = pd.read_csv(zipfile.ZipFile(h2o.locate("smalldata/prostate/prostate_complete.csv.zip"))
                          .open("prostate_complete.csv")).as_matrix()
    sm_data_response = sm_data[:, 5]
    sm_data_features = sm_data[:, [1, 2, 3, 4, 6, 7, 8, 9]]

    print("Testing for family: GAMMA")
    print("Set variables for h2o.")
    myY = "DPROS"
    myX = ["ID", "AGE", "RACE", "GLEASON", "DCAPS", "PSA", "VOL", "CAPSULE"]

    print("Create models with canonical link: INVERSE")
    h2o_model_in = h2o.glm(x=h2o_data[myX], y=h2o_data[myY], family="gamma", link="inverse",
                           alpha=[0.5], Lambda=[0])
    sm_model_in = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                         family=sm.families.Gamma(sm.families.links.inverse_power)).fit()

    print("Compare model deviances for link function inverse")
    h2o_deviance_in = h2o_model_in._model_json['output']['residual_deviance'] / \
                      h2o_model_in._model_json['output']['null_deviance']
    sm_deviance_in = sm_model_in.deviance / sm_model_in.null_deviance
    assert h2o_deviance_in - sm_deviance_in < 0.01, "expected h2o to have an equivalent or better deviance measures"

    print("Create models with canonical link: LOG")
    h2o_model_log = h2o.glm(x=h2o_data[myX], y=h2o_data[myY], family="gamma", link="log",
                            alpha=[0.5], Lambda=[0])
    sm_model_log = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                          family=sm.families.Gamma(sm.families.links.log)).fit()

    print("Compare model deviances for link function log")
    h2o_deviance_log = h2o_model_log._model_json['output']['residual_deviance'] / \
                       h2o_model_log._model_json['output']['null_deviance']
    sm_deviance_log = sm_model_log.deviance / sm_model_log.null_deviance
    assert h2o_deviance_log - sm_deviance_log < 0.01, "expected h2o to have an equivalent or better deviance measures"
def shuffling_large(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    print("Reading in Arcene training data for binomial modeling.")
    train_data = h2o.upload_file(path=h2o.locate("smalldata/arcene/shuffle_test_version/arcene.csv"))
    train_data_shuffled = h2o.upload_file(path=h2o.locate("smalldata/arcene/shuffle_test_version/arcene_shuffled.csv"))

    print("Create model on original Arcene dataset.")
    h2o_model = h2o.glm(x=train_data[0:1000], y=train_data[1000], family="binomial", lambda_search=True,
                        alpha=[0.5], n_folds=0, use_all_factor_levels=True)

    print("Create second model on original Arcene dataset.")
    h2o_model_2 = h2o.glm(x=train_data[0:1000], y=train_data[1000], family="binomial", lambda_search=True,
                          alpha=[0.5], n_folds=0, use_all_factor_levels=True)

    print("Create model on shuffled Arcene dataset.")
    h2o_model_s = h2o.glm(x=train_data_shuffled[0:1000], y=train_data_shuffled[1000], family="binomial",
                          lambda_search=True, alpha=[0.5], n_folds=0, use_all_factor_levels=True)

    print("Assert that number of predictors remaining and their respective coefficients are equal.")
    for x, y in zip(h2o_model._model_json['output']['coefficients_table'].cell_values,
                    h2o_model_2._model_json['output']['coefficients_table'].cell_values):
        assert (type(x[1]) == type(y[1])) and (type(x[2]) == type(y[2])), "coefficients should be the same type"
        if isinstance(x[1], float):
            assert abs(x[1] - y[1]) < 5e-10, "coefficients should be equal"
        if isinstance(x[2], float):
            assert abs(x[2] - y[2]) < 5e-10, "coefficients should be equal"

    for x, y in zip(h2o_model._model_json['output']['coefficients_table'].cell_values,
                    h2o_model_s._model_json['output']['coefficients_table'].cell_values):
        assert (type(x[1]) == type(y[1])) and (type(x[2]) == type(y[2])), "coefficients should be the same type"
        if isinstance(x[1], float):
            assert abs(x[1] - y[1]) < 5e-10, "coefficients should be equal"
        if isinstance(x[2], float):
            assert abs(x[2] - y[2]) < 5e-10, "coefficients should be equal"
def covtype(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    # Log.info("Importing covtype.20k.data...\n")
    covtype = h2o.import_frame(path=h2o.locate("smalldata/covtype/covtype.20k.data"))

    myY = 54
    myX = [x for x in range(0, 54) if x not in [20, 28]]

    # Set response to be indicator of a particular class
    res_class = random.randint(1, 4)
    # Log.info(paste("Setting response column", myY, "to be indicator of class", res_class, "\n"))
    covtype[54] = (covtype[54] == res_class)
    #covtype.summary()

    # L2: alpha = 0, lambda = 0
    covtype_mod1 = h2o.glm(y=covtype[myY], x=covtype[myX], family="binomial", n_folds=0, alpha=[0], Lambda=[0])
    covtype_mod1.show()

    # Elastic: alpha = 0.5, lambda = 1e-4
    covtype_mod2 = h2o.glm(y=covtype[myY], x=covtype[myX], family="binomial", n_folds=0, alpha=[0.5], Lambda=[1e-4])
    covtype_mod2.show()

    # L1: alpha = 1, lambda = 1e-4
    covtype_mod3 = h2o.glm(y=covtype[myY], x=covtype[myX], family="binomial", n_folds=0, alpha=[1], Lambda=[1e-4])
    covtype_mod3.show()
def getLambdaModel(ip, port):
    print("Read data")
    prostate = h2o.import_file(path=h2o.locate("smalldata/logreg/prostate.csv"))
    myX = ["AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]
    myY = "CAPSULE"
    family = random.choice(["gaussian", "binomial"])
    print(family)

    print("Do lambda search and build models")
    if family == "gaussian":
        model = h2o.glm(x=prostate[myX], y=prostate[myY], family=family, standardize=True,
                        use_all_factor_levels=True, lambda_search=True)
    else:
        model = h2o.glm(x=prostate[myX], y=prostate[myY].asfactor(), family=family, standardize=True,
                        use_all_factor_levels=True, lambda_search=True)

    print("the models were built over the following lambda values: ")
    all_lambdas = model.models(1).lambda_all()
    print(all_lambdas)

    for i in range(10):
        Lambda = random.sample(all_lambdas, 1)
        print("For Lambda we get this model:")
        m1 = h2o.getGLMLambdaModel(model.models(random.randint(0, len(model.models()) - 1), Lambda=Lambda))
        m1.show()
        print("this model should be same as the one above:")
        m2 = h2o.getGLMLambdaModel(model.models(random.randint(0, len(model.models()) - 1), Lambda=Lambda))
        m2.show()
        assert m1 == m2, "expected models to be equal"
def lambda_search(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    #Log.info("Importing prostate.csv data...\n")
    prostate = h2o.import_frame(h2o.locate("smalldata/logreg/prostate.csv"))
    #prostate.summary()

    # GLM without lambda search, lambda is single user-provided value
    #Log.info("H2O GLM (binomial) with parameters: lambda_search = TRUE, nfolds: 2\n")
    prostate_nosearch = h2o.glm(x=prostate[2:9], y=prostate[1], training_frame=prostate.hex, family="binomial",
                                nlambdas=5, lambda_search=False, n_folds=2)
    params_nosearch = prostate_nosearch.params()

    try:
        prostate_nosearch.getGLMLambdaModel(0.5)
        assert False, "expected an error"
    except EnvironmentError:
        assert True

    # GLM with lambda search, return only model corresponding to best lambda as determined by H2O
    #Log.info("H2O GLM (binomial) with parameters: lambda_search: TRUE, nfolds: 2\n")
    prostate_search = h2o.glm(x=prostate[2:9], y=prostate[1], training_frame=prostate.hex, family="binomial",
                              nlambdas=5, lambda_search=True, n_folds=2)
    params_search = prostate_search.params()

    random_lambda = random.choice(prostate_search.lambda_all())
    #Log.info(cat("Retrieving model corresponding to randomly chosen lambda", random_lambda, "\n"))
    random_model = prostate_search.getGLMLambdaModel(random_lambda)
    assert random_model.getLambda() == random_lambda, "expected equal lambdas"

    #Log.info(cat("Retrieving model corresponding to best lambda", params.bestlambda$lambda_best, "\n"))
    best_model = prostate_search.getGLMLambdaModel(params_search.bestlambda())
    assert best_model.model() == prostate_search.model(), "expected models to be equal"
def covtype(ip, port):
    # Log.info("Importing covtype.20k.data...\n")
    covtype = h2o.import_file(path=h2o.locate("smalldata/covtype/covtype.20k.data"))

    myY = 54
    myX = [x for x in range(0, 54) if x not in [20, 28]]

    # Set response to be indicator of a particular class
    res_class = random.randint(1, 4)
    # Log.info(paste("Setting response column", myY, "to be indicator of class", res_class, "\n"))
    covtype[54] = (covtype[54] == res_class)
    #covtype.summary()

    # L2: alpha = 0, lambda = 0
    covtype_mod1 = h2o.glm(y=covtype[myY], x=covtype[myX], family="binomial", alpha=[0], Lambda=[0])
    covtype_mod1.show()

    # Elastic: alpha = 0.5, lambda = 1e-4
    covtype_mod2 = h2o.glm(y=covtype[myY], x=covtype[myX], family="binomial", alpha=[0.5], Lambda=[1e-4])
    covtype_mod2.show()

    # L1: alpha = 1, lambda = 1e-4
    covtype_mod3 = h2o.glm(y=covtype[myY], x=covtype[myX], family="binomial", alpha=[1], Lambda=[1e-4])
    covtype_mod3.show()
def shuffling_large(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    print("Reading in Arcene training data for binomial modeling.")
    train_data = h2o.upload_file(path=h2o.locate("smalldata/arcene/shuffle_test_version/arcene.csv"))
    train_data_shuffled = h2o.upload_file(path=h2o.locate("smalldata/arcene/shuffle_test_version/arcene_shuffled.csv"))

    print("Create model on original Arcene dataset.")
    h2o_model = h2o.glm(x=train_data[0:1000], y=train_data[1000], family="binomial", lambda_search=True,
                        alpha=[0.5], use_all_factor_levels=True)

    print("Create second model on original Arcene dataset.")
    h2o_model_2 = h2o.glm(x=train_data[0:1000], y=train_data[1000], family="binomial", lambda_search=True,
                          alpha=[0.5], use_all_factor_levels=True)

    print("Create model on shuffled Arcene dataset.")
    h2o_model_s = h2o.glm(x=train_data_shuffled[0:1000], y=train_data_shuffled[1000], family="binomial",
                          lambda_search=True, alpha=[0.5], use_all_factor_levels=True)

    print("Assert that number of predictors remaining and their respective coefficients are equal.")
    for x, y in zip(h2o_model._model_json['output']['coefficients_table'].cell_values,
                    h2o_model_2._model_json['output']['coefficients_table'].cell_values):
        assert (type(x[1]) == type(y[1])) and (type(x[2]) == type(y[2])), "coefficients should be the same type"
        if isinstance(x[1], float):
            assert abs(x[1] - y[1]) < 5e-10, "coefficients should be equal"
        if isinstance(x[2], float):
            assert abs(x[2] - y[2]) < 5e-10, "coefficients should be equal"

    for x, y in zip(h2o_model._model_json['output']['coefficients_table'].cell_values,
                    h2o_model_s._model_json['output']['coefficients_table'].cell_values):
        assert (type(x[1]) == type(y[1])) and (type(x[2]) == type(y[2])), "coefficients should be the same type"
        if isinstance(x[1], float):
            assert abs(x[1] - y[1]) < 5e-10, "coefficients should be equal"
        if isinstance(x[2], float):
            assert abs(x[2] - y[2]) < 5e-10, "coefficients should be equal"
def covtype_get_model():
    #Log.info("Importing covtype.20k.data...\n")
    covtype = h2o.import_file(path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))

    Y = 54
    X = list(range(0, 20)) + list(range(29, 54))

    # Set response to be indicator of a particular class
    res_class = random.randint(1, 4)
    # Log.info(paste("Setting response column", myY, "to be indicator of class", res_class, "\n"))
    covtype[54] = (covtype[54] == res_class)
    #covtype_data.summary()

    # L2: alpha = 0, lambda = 0
    covtype_mod1 = h2o.glm(y=covtype[Y], x=covtype[X], family="binomial", alpha=[0], Lambda=[0])
    covtype_mod1.show()
    covtype_mod1 = h2o.get_model(covtype_mod1._id)
    covtype_mod1.show()

    # Elastic: alpha = 0.5, lambda = 1e-4
    covtype_mod2 = h2o.glm(y=covtype[Y], x=covtype[X], family="binomial", alpha=[0.5], Lambda=[1e-4])
    covtype_mod2.show()
    covtype_mod2 = h2o.get_model(covtype_mod2._id)
    covtype_mod2.show()

    # L1: alpha = 1, lambda = 1e-4
    covtype_mod3 = h2o.glm(y=covtype[Y], x=covtype[X], family="binomial", alpha=[1], Lambda=[1e-4])
    covtype_mod3.show()
    covtype_mod3 = h2o.get_model(covtype_mod3._id)
    covtype_mod3.show()
def link_functions_poisson():
    print("Read in prostate data.")
    h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))

    sm_data = pd.read_csv(zipfile.ZipFile(pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))
                          .open("prostate_complete.csv")).as_matrix()
    sm_data_response = sm_data[:, 9]
    sm_data_features = sm_data[:, 1:9]

    print("Testing for family: POISSON")
    print("Set variables for h2o.")
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    print("Create h2o model with canonical link: LOG")
    h2o_model_log = h2o.glm(x=h2o_data[myX], y=h2o_data[myY], family="poisson", link="log",
                            alpha=[0.5], Lambda=[0])

    print("Create statsmodel model with canonical link: LOG")
    sm_model_log = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                          family=sm.families.Poisson(sm.families.links.log)).fit()

    print("Compare model deviances for link function log")
    h2o_deviance_log = old_div(h2o_model_log.residual_deviance(), h2o_model_log.null_deviance())
    sm_deviance_log = old_div(sm_model_log.deviance, sm_model_log.null_deviance)
    assert h2o_deviance_log - sm_deviance_log < 0.01, "expected h2o to have an equivalent or better deviance measures"

    print("Create h2o models with link: IDENTITY")
    h2o_model_id = h2o.glm(x=h2o_data[myX], y=h2o_data[myY], family="poisson", link="identity",
                           alpha=[0.5], Lambda=[0])

    print("Create statsmodel models with link: IDENTITY")
    sm_model_id = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                         family=sm.families.Poisson(sm.families.links.identity)).fit()

    print("Compare model deviances for link function identity")
    h2o_deviance_id = old_div(h2o_model_id.residual_deviance(), h2o_model_id.null_deviance())
    sm_deviance_id = old_div(sm_model_id.deviance, sm_model_id.null_deviance)
    assert h2o_deviance_id - sm_deviance_id < 0.01, "expected h2o to have an equivalent or better deviance measures"
def attack(family, train, valid, x, y):
    kwargs = {}
    kwargs['family'] = family
    gaussian_links = ["inverse", "log", "identity"]
    binomial_links = ["logit"]
    poisson_links = ["log", "identity"]
    gamma_links = ["inverse", "log", "identity"]

    # randomly select parameters and their corresponding values
    if random.randint(0, 1): kwargs['max_iterations'] = random.randint(1, 50)
    if random.random() > 0.8: kwargs['beta_epsilon'] = random.random()
    if random.randint(0, 1): kwargs['solver'] = ["AUTO", "IRLSM", "L_BFGS", "COORDINATE_DESCENT_NAIVE",
                                                 "COORDINATE_DESCENT"][random.randint(0, 1)]
    if random.randint(0, 1): kwargs['standardize'] = [True, False][random.randint(0, 1)]
    if random.randint(0, 1):
        if family == "gaussian": kwargs['link'] = gaussian_links[random.randint(0, 2)]
        elif family == "binomial": kwargs['link'] = binomial_links[random.randint(0, 0)]
        elif family == "poisson": kwargs['link'] = poisson_links[random.randint(0, 1)]
        elif family == "gamma": kwargs['link'] = gamma_links[random.randint(0, 2)]
    if random.randint(0, 1): kwargs['alpha'] = [random.random()]
    if family == "binomial":
        if random.randint(0, 1): kwargs['prior'] = random.random()
    if random.randint(0, 1): kwargs['lambda_search'] = [True, False][random.randint(0, 1)]
    if 'lambda_search' in kwargs.keys():
        if random.randint(0, 1): kwargs['nlambdas'] = random.randint(2, 10)
    do_validation = [True, False][random.randint(0, 1)]

    # beta constraints
    if random.randint(0, 1):
        bc = []
        for n in x:
            if train[n].isnumeric():
                name = train.names[n]
                lower_bound = random.uniform(-1, 1)
                upper_bound = lower_bound + random.random()
                bc.append([name, lower_bound, upper_bound])
        if len(bc) > 0:
            beta_constraints = h2o.H2OFrame(bc)
            beta_constraints.set_names(['names', 'lower_bounds', 'upper_bounds'])
            kwargs['beta_constraints'] = beta_constraints.frame_id

    # display the parameters and their corresponding values
    print "-----------------------"
    print "x: {0}".format(x)
    print "y: {0}".format(y)
    print "validation: {0}".format(do_validation)
    for k, v in zip(kwargs.keys(), kwargs.values()):
        if k == 'beta_constraints':
            print k + ": "
            beta_constraints.show()
        else:
            print k + ": {0}".format(v)

    if do_validation:
        m = H2OGeneralizedLinearEstimator(**kwargs)
        h2o.glm(x=train[x], y=train[y], validation_x=valid[x], validation_y=valid[y], **kwargs)
    else:
        h2o.glm(x=train[x], y=train[y], **kwargs)
    print "-----------------------"
def offset_1897():
    print "Checking binomial models for GLM with and without offset"
    print "Import prostate dataset into H2O and R..."
    prostate_hex = h2o.import_file(tests.locate("smalldata/prostate/prostate.csv"))

    print "Checking binomial model without offset..."
    prostate_glm_h2o = h2o.glm(x=prostate_hex[["RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]],
                               y=prostate_hex["CAPSULE"], training_frame=prostate_hex,
                               family="binomial", standardize=False)
    print "h2o residual: {0}".format(prostate_glm_h2o.residual_deviance())
    print "r residual: {0}".format(379.053509501537)
    assert abs(379.053509501537 - prostate_glm_h2o.residual_deviance()) < 0.1

    print "Checking binomial model with offset..."
    prostate_glm_h2o = h2o.glm(x=prostate_hex[["RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON", "AGE"]],
                               y=prostate_hex["CAPSULE"], training_frame=prostate_hex,
                               family="binomial", offset_column="AGE", standardize=False)
    print "h2o residual: {0}".format(prostate_glm_h2o.residual_deviance())
    print "r residual: {0}".format(1515.91815848623)
    assert abs(1515.91815848623 - prostate_glm_h2o.residual_deviance()) < 0.1

    print "Checking poisson model without offset..."
    prostate_glm_h2o = h2o.glm(x=prostate_hex[["RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]],
                               y=prostate_hex["CAPSULE"], training_frame=prostate_hex,
                               family="poisson", standardize=False)
    print "h2o residual: {0}".format(prostate_glm_h2o.residual_deviance())
    print "r residual: {0}".format(216.339989007507)
    assert abs(216.339989007507 - prostate_glm_h2o.residual_deviance()) < 0.1

    print "Checking poisson model with offset..."
    prostate_glm_h2o = h2o.glm(x=prostate_hex[["RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON", "AGE"]],
                               y=prostate_hex["CAPSULE"], training_frame=prostate_hex,
                               family="poisson", offset_column="AGE", standardize=False)
    print "h2o residual: {0}".format(prostate_glm_h2o.residual_deviance())
    print "r residual: {0}".format(2761.76218461138)
    assert abs(2761.76218461138 - prostate_glm_h2o.residual_deviance()) < 0.1
def attack(family, train, valid, x, y):
    kwargs = {}
    kwargs['family'] = family
    gaussian_links = ["inverse", "log", "identity"]
    binomial_links = ["logit"]
    poisson_links = ["log", "identity"]
    gamma_links = ["inverse", "log", "identity"]

    # randomly select parameters and their corresponding values
    if random.randint(0, 1): kwargs['max_iterations'] = random.randint(1, 50)
    if random.random() > 0.8: kwargs['beta_epsilon'] = random.random()
    if random.randint(0, 1): kwargs['solver'] = ["AUTO", "IRLSM", "L_BFGS", "COORDINATE_DESCENT_NAIVE",
                                                 "COORDINATE_DESCENT"][random.randint(0, 1)]
    if random.randint(0, 1): kwargs['standardize'] = [True, False][random.randint(0, 1)]
    if random.randint(0, 1):
        if family == "gaussian": kwargs['link'] = gaussian_links[random.randint(0, 2)]
        elif family == "binomial": kwargs['link'] = binomial_links[random.randint(0, 0)]
        elif family == "poisson": kwargs['link'] = poisson_links[random.randint(0, 1)]
        elif family == "gamma": kwargs['link'] = gamma_links[random.randint(0, 2)]
    if random.randint(0, 1): kwargs['alpha'] = [random.random()]
    if family == "binomial":
        if random.randint(0, 1): kwargs['prior'] = random.random()
    if random.randint(0, 1): kwargs['lambda_search'] = [True, False][random.randint(0, 1)]
    if 'lambda_search' in list(kwargs.keys()):
        if random.randint(0, 1): kwargs['nlambdas'] = random.randint(2, 10)
    do_validation = [True, False][random.randint(0, 1)]

    # beta constraints
    if random.randint(0, 1):
        bc = []
        for n in x:
            if train[n].isnumeric():
                name = train.names[n]
                lower_bound = random.uniform(-1, 1)
                upper_bound = lower_bound + random.random()
                bc.append([name, lower_bound, upper_bound])
        if len(bc) > 0:
            beta_constraints = h2o.H2OFrame(bc)
            beta_constraints.set_names(['names', 'lower_bounds', 'upper_bounds'])
            kwargs['beta_constraints'] = beta_constraints.frame_id

    # display the parameters and their corresponding values
    print("-----------------------")
    print("x: {0}".format(x))
    print("y: {0}".format(y))
    print("validation: {0}".format(do_validation))
    for k, v in zip(list(kwargs.keys()), list(kwargs.values())):
        if k == 'beta_constraints':
            print(k + ": ")
            beta_constraints.show()
        else:
            print(k + ": {0}".format(v))

    if do_validation:
        h2o.glm(x=train[x], y=train[y], validation_x=valid[x], validation_y=valid[y], **kwargs)
    else:
        h2o.glm(x=train[x], y=train[y], **kwargs)
    print("-----------------------")
def check_same(data1, data2):
    glm1_regression = h2o.glm(x=data1[2:20], y=data1[1])
    glm2_regression = h2o.glm(x=data2[2:21], y=data2[1], weights_column="weights")
    glm1_binomial = h2o.glm(x=data1[1:20], y=data1[0], family="binomial")
    glm2_binomial = h2o.glm(x=data2[1:21], y=data2[0], weights_column="weights", family="binomial")

    assert abs(glm1_regression.mse() - glm2_regression.mse()) < 1e-6, \
        "Expected mse's to be the same, but got {0}, and {1}".format(glm1_regression.mse(), glm2_regression.mse())
    assert abs(glm1_binomial.auc() - glm2_binomial.auc()) < 1e-6, \
        "Expected auc's to be the same, but got {0}, and {1}".format(glm1_binomial.auc(), glm2_binomial.auc())
def offset_1897():
    print 'Checking binomial models for GLM with and without offset'
    print 'Import prostate dataset into H2O and R...'
    prostate_hex = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))

    print "Checking binomial model without offset..."
    prostate_glm_h2o = h2o.glm(x=prostate_hex[["RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]],
                               y=prostate_hex["CAPSULE"], training_frame=prostate_hex,
                               family="binomial", standardize=False)
    print "h2o residual: {0}".format(prostate_glm_h2o.residual_deviance())
    print "r residual: {0}".format(379.053509501537)
    assert abs(379.053509501537 - prostate_glm_h2o.residual_deviance()) < 0.1

    print "Checking binomial model with offset..."
    prostate_glm_h2o = h2o.glm(x=prostate_hex[["RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON", "AGE"]],
                               y=prostate_hex["CAPSULE"], training_frame=prostate_hex,
                               family="binomial", offset_column="AGE", standardize=False)
    print "h2o residual: {0}".format(prostate_glm_h2o.residual_deviance())
    print "r residual: {0}".format(1515.91815848623)
    assert abs(1515.91815848623 - prostate_glm_h2o.residual_deviance()) < 0.1

    print "Checking poisson model without offset..."
    prostate_glm_h2o = h2o.glm(x=prostate_hex[["RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]],
                               y=prostate_hex["CAPSULE"], training_frame=prostate_hex,
                               family="poisson", standardize=False)
    print "h2o residual: {0}".format(prostate_glm_h2o.residual_deviance())
    print "r residual: {0}".format(216.339989007507)
    assert abs(216.339989007507 - prostate_glm_h2o.residual_deviance()) < 0.1

    print "Checking poisson model with offset..."
    prostate_glm_h2o = h2o.glm(x=prostate_hex[["RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON", "AGE"]],
                               y=prostate_hex["CAPSULE"], training_frame=prostate_hex,
                               family="poisson", offset_column="AGE", standardize=False)
    print "h2o residual: {0}".format(prostate_glm_h2o.residual_deviance())
    print "r residual: {0}".format(2761.76218461138)
    assert abs(2761.76218461138 - prostate_glm_h2o.residual_deviance()) < 0.1
def link_correct_default():
    print("Reading in original prostate data.")
    h2o_data = h2o.upload_file(path=tests.locate("smalldata/prostate/prostate.csv.zip"))

    print("Compare models with link unspecified and canonical link specified.")
    print("GAUSSIAN: ")
    h2o_model_unspecified = h2o.glm(x=h2o_data[1:8], y=h2o_data[8], family="gaussian")
    h2o_model_specified = h2o.glm(x=h2o_data[1:8], y=h2o_data[8], family="gaussian", link="identity")
    assert h2o_model_specified._model_json['output']['coefficients_table'].cell_values == \
           h2o_model_unspecified._model_json['output']['coefficients_table'].cell_values, "coefficient should be equal"

    print("BINOMIAL: ")
    h2o_model_unspecified = h2o.glm(x=h2o_data[2:9], y=h2o_data[1], family="binomial")
    h2o_model_specified = h2o.glm(x=h2o_data[2:9], y=h2o_data[1], family="binomial", link="logit")
    assert h2o_model_specified._model_json['output']['coefficients_table'].cell_values == \
           h2o_model_unspecified._model_json['output']['coefficients_table'].cell_values, "coefficient should be equal"

    print("POISSON: ")
    h2o_model_unspecified = h2o.glm(x=h2o_data[2:9], y=h2o_data[1], family="poisson")
    h2o_model_specified = h2o.glm(x=h2o_data[2:9], y=h2o_data[1], family="poisson", link="log")
    assert h2o_model_specified._model_json['output']['coefficients_table'].cell_values == \
           h2o_model_unspecified._model_json['output']['coefficients_table'].cell_values, "coefficient should be equal"

    print("GAMMA: ")
    h2o_model_unspecified = h2o.glm(x=h2o_data[3:9], y=h2o_data[2], family="gamma")
    h2o_model_specified = h2o.glm(x=h2o_data[3:9], y=h2o_data[2], family="gamma", link="inverse")
    assert h2o_model_specified._model_json['output']['coefficients_table'].cell_values == \
           h2o_model_unspecified._model_json['output']['coefficients_table'].cell_values, "coefficient should be equal"
def check_same(data1, data2):
    glm1_regression = h2o.glm(x=data1[2:20], y=data1[1])
    glm2_regression = h2o.glm(x=data2[2:21], y=data2[1], weights_column="weights", training_frame=data2)
    glm1_binomial = h2o.glm(x=data1[1:20], y=data1[0], family="binomial")
    glm2_binomial = h2o.glm(x=data2[1:21], y=data2[0], weights_column="weights", family="binomial",
                            training_frame=data2)

    assert abs(glm1_regression.mse() - glm2_regression.mse()) < 1e-6, \
        "Expected mse's to be the same, but got {0}, and {1}".format(glm1_regression.mse(), glm2_regression.mse())
    assert abs(glm1_binomial.null_deviance() - glm2_binomial.null_deviance()) < 1e-6, \
        "Expected null deviances to be the same, but got {0}, and {1}".format(glm1_binomial.null_deviance(),
                                                                              glm2_binomial.null_deviance())
    assert abs(glm1_binomial.residual_deviance() - glm2_binomial.residual_deviance()) < 1e-6, \
        "Expected residual deviances to be the same, but got {0}, and {1}".format(glm1_binomial.residual_deviance(),
                                                                                  glm2_binomial.residual_deviance())
def pyunit_make_glm_model():
    # TODO: PUBDEV-1717
    pros = h2o.import_file(tests.locate("smalldata/prostate/prostate.csv"))
    model = h2o.glm(x=pros[["AGE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]], y=pros["CAPSULE"],
                    family="gaussian", alpha=[0])

    new_betas = {"AGE": 0.5, "DPROS": 0.5, "DCAPS": 0.5, "PSA": 0.5, "VOL": 0.5, "GLEASON": 0.5}

    names = '['
    for n in new_betas.keys():
        names += "\"" + n + "\","
    names = names[0:len(names) - 1] + "]"

    betas = '['
    for b in new_betas.values():
        betas += str(b) + ","
    betas = betas[0:len(betas) - 1] + "]"

    res = h2o.H2OConnection.post_json("MakeGLMModel", model=model._id, names=names, beta=betas)
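# Note: a minimal sketch (not part of the original test above) showing how the bracketed
# `names` / `beta` strings that pyunit_make_glm_model builds by hand could be produced with
# the standard library instead. The "MakeGLMModel" endpoint name and post_json call are
# taken from the test; the helper function and json usage here are purely illustrative.
import json

def make_glm_model_payload(new_betas):
    """Build the bracketed name/beta strings expected by the MakeGLMModel endpoint."""
    names = json.dumps(list(new_betas.keys()))                        # e.g. ["AGE", "DPROS", ...]
    betas = "[" + ",".join(str(b) for b in new_betas.values()) + "]"  # e.g. [0.5,0.5,...]
    return names, betas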
def link_functions_tweedie_basic(ip, port):
    print "Read in prostate data."
    hdf = h2o.upload_file(h2o.locate("smalldata/prostate/prostate_complete.csv.zip"))

    print "Testing for family: TWEEDIE"
    print "Set variables for h2o."
    y = "CAPSULE"
    x = ["AGE", "RACE", "DCAPS", "PSA", "VOL", "DPROS", "GLEASON"]

    print "Create models with canonical link: TWEEDIE"
    model_h2o_tweedie = h2o.glm(x=hdf[x], y=hdf[y], family="tweedie", link="tweedie", alpha=[0.5], Lambda=[0])

    print "Compare model deviances for link function tweedie (using precomputed values from R)"
    deviance_h2o_tweedie = model_h2o_tweedie.residual_deviance() / model_h2o_tweedie.null_deviance()

    assert 0.721452 - deviance_h2o_tweedie <= 0.01, "h2o's residual/null deviance is more than 0.01 lower than R's. " \
                                                    "h2o: {0}, r: {1}".format(deviance_h2o_tweedie, 0.721452)
def perfectSeparation_balanced(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    print("Read in synthetic balanced dataset")
    data = h2o.import_frame(path=h2o.locate("smalldata/synthetic_perfect_separation/balanced.csv"))

    print("Fit model on dataset")
    model = h2o.glm(x=data[["x1", "x2"]], y=data["y"], family="binomial", lambda_search=True,
                    use_all_factor_levels=True, alpha=[0.5], Lambda=[0])

    print("Extract models' coefficients and assert reasonable values (ie. no greater than 50)")
    print("Balanced dataset")
    coef = [c[1] for c in model._model_json['output']['coefficients_table'].cell_values if c[0] != "Intercept"]
    for c in coef:
        assert c < 50, "coefficient is too large"
def link_functions_tweedie_vpow(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    # Load example data from HDtweedie, y = aggregate claim loss
    hdf = h2o.upload_file(h2o.locate("smalldata/glm_test/auto.csv"))
    y = "y"
    x = list(set(hdf.names()) - set(["y"]))

    print "Testing for family: TWEEDIE"
    print "Create models with canonical link: TWEEDIE"

    # Iterate over different variance powers for tweedie
    vpower = [0, 1, 1.5]
    r_dev = [0.7516627, 0.6708826, 0.7733762]
    r_null = [221051.88369951, 32296.29783702, 20229.47425307]
    for ridx, vpow in enumerate(vpower):
        print "Fit h2o.glm:"
        h2ofit = h2o.glm(x=hdf[x], y=hdf[y], family="tweedie", link="tweedie", tweedie_variance_power=vpow,
                         tweedie_link_power=1-vpow, alpha=[0.5], Lambda=[0])

        print "Testing Tweedie variance power: {0}".format(vpow)
        print "Compare model deviances for link function tweedie"
        deviance_h2o_tweedie = h2ofit.residual_deviance() / h2ofit.null_deviance()
        assert r_dev[ridx] - deviance_h2o_tweedie <= 0.01, "h2o's residual/null deviance is more than 0.01 lower " \
                                                           "than R's. h2o: {0}, r: {1}".format(deviance_h2o_tweedie,
                                                                                               r_dev[ridx])

        print "compare null and residual deviance between R glm and h2o.glm for tweedie"
        assert abs(r_null[ridx] - h2ofit.null_deviance()) < 1e-6, "h2o's null deviance is not equal to R's. h2o: " \
                                                                  "{0}, r: {1}".format(h2ofit.null_deviance(),
                                                                                       r_null[ridx])
def pubdev_1953():
    # small_test = [h2o.locate("bigdata/laptop/citibike-nyc/2013-10.csv")]
    # data = h2o.import_file(path=small_test)
    # startime = data["starttime"]
    # secsPerDay=1000*60*60*24
    # data["Days"] = (startime/secsPerDay).floor()
    # grouped = data.group_by(["Days","start station name"])
    # bpd = grouped.count(name="bikes").get_frame()
    # secs = bpd["Days"]*secsPerDay
    # bpd["Month"] = secs.month().asfactor()
    # bpd["DayOfWeek"] = secs.dayOfWeek()
    # wthr1 = h2o.import_file(path=[h2o.locate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv"), h2o.locate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv")])
    # wthr2 = wthr1[["Year Local","Month Local","Day Local","Hour Local","Dew Point (C)","Humidity Fraction","Precipitation One Hour (mm)","Temperature (C)","Weather Code 1/ Description"]]
    # wthr2.set_name(wthr2.index("Precipitation One Hour (mm)"), "Rain (mm)")
    # wthr2.set_name(wthr2.index("Weather Code 1/ Description"), "WC1")
    # wthr3 = wthr2[ wthr2["Hour Local"]==12 ]
    # wthr3["msec"] = h2o.H2OFrame.mktime(year=wthr3["Year Local"], month=wthr3["Month Local"]-1, day=wthr3["Day Local"]-1, hour=wthr3["Hour Local"])
    # secsPerDay=1000*60*60*24
    # wthr3["Days"] = (wthr3["msec"]/secsPerDay).floor()
    # wthr4 = wthr3.drop("Year Local").drop("Month Local").drop("Day Local").drop("Hour Local").drop("msec")
    # rain = wthr4["Rain (mm)"]
    # rain[ rain.isna() ] = 0
    # bpd_with_weather = bpd.merge(wthr4,allLeft=True,allRite=False)
    # r = bpd_with_weather['Days'].runif(seed=356964763)
    # train = bpd_with_weather[ r < 0.6]
    # test = bpd_with_weather[(0.6 <= r) & (r < 0.9)]

    predictors = ['DayOfWeek', 'WC1', 'start station name', 'Temperature (C)', 'Days', 'Month',
                  'Humidity Fraction', 'Rain (mm)', 'Dew Point (C)']

    train = h2o.import_file(h2o.locate("smalldata/glm_test/citibike_small_train.csv"))
    test = h2o.import_file(h2o.locate("smalldata/glm_test/citibike_small_test.csv"))

    glm0 = h2o.glm(x=train[predictors], y=train["bikes"], validation_x=test[predictors],
                   validation_y=test["bikes"], family="poisson")
def wide_dataset_large():
    print("Reading in Arcene training data for binomial modeling.")
    trainDataResponse = np.genfromtxt(tests.locate("smalldata/arcene/arcene_train_labels.labels"), delimiter=' ')
    trainDataResponse = np.where(trainDataResponse == -1, 0, 1)
    trainDataFeatures = np.genfromtxt(tests.locate("smalldata/arcene/arcene_train.data"), delimiter=' ')
    trainData = h2o.H2OFrame(np.column_stack((trainDataResponse, trainDataFeatures)).tolist())

    print("Run model on 3250 columns of Arcene with strong rules off.")
    model = h2o.glm(x=trainData[1:3250], y=trainData[0].asfactor(), family="binomial", lambda_search=False, alpha=[1])

    print("Test model on validation set.")
    validDataResponse = np.genfromtxt(tests.locate("smalldata/arcene/arcene_valid_labels.labels"), delimiter=' ')
    validDataResponse = np.where(validDataResponse == -1, 0, 1)
    validDataFeatures = np.genfromtxt(tests.locate("smalldata/arcene/arcene_valid.data"), delimiter=' ')
    validData = h2o.H2OFrame(np.column_stack((validDataResponse, validDataFeatures)).tolist())
    prediction = model.predict(validData)

    print("Check performance of predictions.")
    performance = model.model_performance(validData)

    print("Check that prediction AUC better than guessing (0.5).")
    assert performance.auc() > 0.5, "predictions should be better than pure chance"
def prostate(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    # Log.info("Importing prostate.csv data...\n")
    h2o_data = h2o.upload_file(path=h2o.locate("smalldata/logreg/prostate.csv"))
    #prostate.summary()

    sm_data = pd.read_csv(h2o.locate("smalldata/logreg/prostate.csv")).as_matrix()
    sm_data_response = sm_data[:, 1]
    sm_data_features = sm_data[:, 2:]

    #Log.info(cat("B)H2O GLM (binomial) with parameters:\nX:", myX, "\nY:", myY, "\n"))
    h2o_glm = h2o.glm(y=h2o_data[1], x=h2o_data[2:], family="binomial", n_folds=10, alpha=[0.5])
    h2o_glm.show()

    sm_glm = sm.GLM(endog=sm_data_response, exog=sm_data_features, family=sm.families.Binomial()).fit()

    assert abs(sm_glm.null_deviance - h2o_glm._model_json['output']['null_deviance']) < 1e-5, \
        "Expected null deviances to be the same"
def pubdev_1953():
    # small_test = [tests.locate("bigdata/laptop/citibike-nyc/2013-10.csv")]
    # data = h2o.import_file(path=small_test)
    # startime = data["starttime"]
    # secsPerDay=1000*60*60*24
    # data["Days"] = (startime/secsPerDay).floor()
    # grouped = data.group_by(["Days","start station name"])
    # bpd = grouped.count(name="bikes").get_frame()
    # secs = bpd["Days"]*secsPerDay
    # bpd["Month"] = secs.month().asfactor()
    # bpd["DayOfWeek"] = secs.dayOfWeek()
    # wthr1 = h2o.import_file(path=[tests.locate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv"), tests.locate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv")])
    # wthr2 = wthr1[["Year Local","Month Local","Day Local","Hour Local","Dew Point (C)","Humidity Fraction","Precipitation One Hour (mm)","Temperature (C)","Weather Code 1/ Description"]]
    # wthr2.set_name(wthr2.index("Precipitation One Hour (mm)"), "Rain (mm)")
    # wthr2.set_name(wthr2.index("Weather Code 1/ Description"), "WC1")
    # wthr3 = wthr2[ wthr2["Hour Local"]==12 ]
    # wthr3["msec"] = h2o.H2OFrame.mktime(year=wthr3["Year Local"], month=wthr3["Month Local"]-1, day=wthr3["Day Local"]-1, hour=wthr3["Hour Local"])
    # secsPerDay=1000*60*60*24
    # wthr3["Days"] = (wthr3["msec"]/secsPerDay).floor()
    # wthr4 = wthr3.drop("Year Local").drop("Month Local").drop("Day Local").drop("Hour Local").drop("msec")
    # rain = wthr4["Rain (mm)"]
    # rain[ rain.isna() ] = 0
    # bpd_with_weather = bpd.merge(wthr4,allLeft=True,allRite=False)
    # r = bpd_with_weather['Days'].runif(seed=356964763)
    # train = bpd_with_weather[ r < 0.6]
    # test = bpd_with_weather[(0.6 <= r) & (r < 0.9)]

    predictors = ['DayOfWeek', 'WC1', 'start station name', 'Temperature (C)', 'Days', 'Month',
                  'Humidity Fraction', 'Rain (mm)', 'Dew Point (C)']

    train = h2o.import_file(tests.locate("smalldata/glm_test/citibike_small_train.csv"))
    test = h2o.import_file(tests.locate("smalldata/glm_test/citibike_small_test.csv"))

    glm0 = h2o.glm(x=train[predictors], y=train["bikes"], validation_x=test[predictors],
                   validation_y=test["bikes"], family="poisson")
def perfectSeparation_unbalanced():
    print("Read in synthetic unbalanced dataset")
    data = h2o.import_file(tests.locate("smalldata/synthetic_perfect_separation/unbalanced.csv"))

    print("Fit model on dataset.")
    model = h2o.glm(x=data[["x1", "x2"]], y=data["y"], family="binomial", lambda_search=True,
                    alpha=[0.5], Lambda=[0])

    print("Extract models' coefficients and assert reasonable values (ie. no greater than 50)")
    print("Unbalanced dataset")
    coef = [c[1] for c in model._model_json['output']['coefficients_table'].cell_values if c[0] != "Intercept"]
    for c in coef:
        assert c < 50, "coefficient is too large"
def save_load_model():
    prostate = h2o.import_file(tests.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate_glm = h2o.glm(y=prostate["CAPSULE"], x=prostate[["AGE", "RACE", "PSA", "DCAPS"]],
                           family="binomial", alpha=[0.5])

    path = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "results"))
    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(prostate_glm, path=path, force=True)

    assert os.path.isdir(model_path), "Expected load directory {0} to exist, but it does not.".format(model_path)
    the_model = h2o.load_model(model_path)

    assert isinstance(the_model, H2OBinomialModel), "Expected an H2OBinomialModel, but got {0}".format(the_model)
def link_functions_binomial():
    print("Read in prostate data.")
    h2o_data = h2o.import_file(path=tests.locate("smalldata/prostate/prostate_complete.csv.zip"))
    h2o_data.head()

    sm_data = pd.read_csv(zipfile.ZipFile(tests.locate("smalldata/prostate/prostate_complete.csv.zip"))
                          .open("prostate_complete.csv")).as_matrix()
    sm_data_response = sm_data[:, 2]
    sm_data_features = sm_data[:, [1, 3, 4, 5, 6, 7, 8, 9]]

    print("Testing for family: BINOMIAL")
    print("Set variables for h2o.")
    myY = "CAPSULE"
    myX = ["ID", "AGE", "RACE", "GLEASON", "DCAPS", "PSA", "VOL", "DPROS"]

    print("Create models with canonical link: LOGIT")
    h2o_model = h2o.glm(x=h2o_data[myX], y=h2o_data[myY].asfactor(), family="binomial", link="logit",
                        alpha=[0.5], Lambda=[0])
    sm_model = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                      family=sm.families.Binomial(sm.families.links.logit)).fit()

    print("Compare model deviances for link function logit")
    h2o_deviance = h2o_model.residual_deviance() / h2o_model.null_deviance()
    sm_deviance = sm_model.deviance / sm_model.null_deviance
    assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measures"
def benign(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    training_data = h2o.import_frame(h2o.locate("smalldata/logreg/benign.csv"))

    Y = 3
    X = range(3) + range(4, 11)

    #Log.info("Build the model")
    model = h2o.glm(y=training_data[Y].asfactor(), x=training_data[X], family="binomial", alpha=[0], Lambda=[1e-5])

    #Log.info("Check that the columns used in the model are the ones we passed in.")
    #Log.info("===================Columns passed in: ================")
    in_names = [training_data.names()[i] for i in X]
    #Log.info("===================Columns passed out: ================")
    out_names = [model._model_json['output']['coefficients_table'].cell_values[c][0] for c in range(len(X) + 1)]
    assert in_names == out_names[1:]
def link_functions_gaussian(ip, port):
    print("Read in prostate data.")
    h2o_data = h2o.import_file(path=h2o.locate("smalldata/prostate/prostate_complete.csv.zip"))
    h2o_data.head()

    sm_data = pd.read_csv(zipfile.ZipFile(h2o.locate("smalldata/prostate/prostate_complete.csv.zip"))
                          .open("prostate_complete.csv")).as_matrix()
    sm_data_response = sm_data[:, 9]
    sm_data_features = sm_data[:, 1:9]

    print("Testing for family: GAUSSIAN")
    print("Set variables for h2o.")
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    print("Create models with canonical link: IDENTITY")
    h2o_model = h2o.glm(x=h2o_data[myX], y=h2o_data[myY], family="gaussian", link="identity",
                        alpha=[0.5], Lambda=[0])
    sm_model = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                      family=sm.families.Gaussian(sm.families.links.identity)).fit()

    print("Compare model deviances for link function identity")
    h2o_deviance = h2o_model.residual_deviance() / h2o_model.null_deviance()
    sm_deviance = sm_model.deviance / sm_model.null_deviance
    assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measures"
def link_functions_gaussian():
    print("Read in prostate data.")
    h2o_data = h2o.import_file(path=tests.locate("smalldata/prostate/prostate_complete.csv.zip"))
    h2o_data.head()

    sm_data = pd.read_csv(zipfile.ZipFile(tests.locate("smalldata/prostate/prostate_complete.csv.zip"))
                          .open("prostate_complete.csv")).as_matrix()
    sm_data_response = sm_data[:, 9]
    sm_data_features = sm_data[:, 1:9]

    print("Testing for family: GAUSSIAN")
    print("Set variables for h2o.")
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    print("Create models with canonical link: IDENTITY")
    h2o_model = h2o.glm(x=h2o_data[myX], y=h2o_data[myY], family="gaussian", link="identity",
                        alpha=[0.5], Lambda=[0])
    sm_model = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                      family=sm.families.Gaussian(sm.families.links.identity)).fit()

    print("Compare model deviances for link function identity")
    h2o_deviance = h2o_model.residual_deviance() / h2o_model.null_deviance()
    sm_deviance = sm_model.deviance / sm_model.null_deviance
    assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measures"
def glm_solvers():
    training_data = h2o.import_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    predictors = ["displacement", "power", "weight", "acceleration", "year"]

    for solver in ["AUTO", "IRLSM", "L_BFGS", "COORDINATE_DESCENT_NAIVE", "COORDINATE_DESCENT"]:
        print "Solver = {0}".format(solver)
        for family in ["binomial", "gaussian", "poisson", "tweedie", "gamma"]:
            if family == 'binomial':
                response_col = "economy_20mpg"
            elif family == 'gaussian':
                response_col = "economy"
            else:
                response_col = "cylinders"
            print "Family = {0}".format(family)
            if family == 'binomial':
                training_data[response_col] = training_data[response_col].asfactor()
            else:
                training_data[response_col] = training_data[response_col].asnumeric()
            model = h2o.glm(x=training_data[predictors], y=training_data[response_col], family=family,
                            alpha=[0], Lambda=[1e-5], solver=solver)
def wide_dataset_large(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    print("Reading in Arcene training data for binomial modeling.")
    trainDataResponse = np.genfromtxt(h2o.locate("smalldata/arcene/arcene_train_labels.labels"), delimiter=' ')
    trainDataResponse = np.where(trainDataResponse == -1, 0, 1)
    trainDataFeatures = np.genfromtxt(h2o.locate("smalldata/arcene/arcene_train.data"), delimiter=' ')
    trainData = h2o.H2OFrame(np.column_stack((trainDataResponse, trainDataFeatures)).tolist())

    print("Run model on 3250 columns of Arcene with strong rules off.")
    model = h2o.glm(x=trainData[1:3250], y=trainData[0].asfactor(), family="binomial", lambda_search=False,
                    alpha=[1], use_all_factor_levels=True)

    print("Test model on validation set.")
    validDataResponse = np.genfromtxt(h2o.locate("smalldata/arcene/arcene_valid_labels.labels"), delimiter=' ')
    validDataResponse = np.where(validDataResponse == -1, 0, 1)
    validDataFeatures = np.genfromtxt(h2o.locate("smalldata/arcene/arcene_valid.data"), delimiter=' ')
    validData = h2o.H2OFrame(np.column_stack((validDataResponse, validDataFeatures)).tolist())
    prediction = model.predict(validData)

    print("Check performance of predictions.")
    performance = model.model_performance(validData)

    print("Check that prediction AUC better than guessing (0.5).")
    assert performance.auc() > 0.5, "predictions should be better than pure chance"
def link_functions_binomial(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    print("Read in prostate data.")
    h2o_data = h2o.import_frame(path=h2o.locate("smalldata/prostate/prostate_complete.csv.zip"))
    h2o_data.head()

    sm_data = pd.read_csv(zipfile.ZipFile(h2o.locate("smalldata/prostate/prostate_complete.csv.zip"))
                          .open("prostate_complete.csv")).as_matrix()
    sm_data_response = sm_data[:, 2]
    sm_data_features = sm_data[:, [1, 3, 4, 5, 6, 7, 8, 9]]

    print("Testing for family: BINOMIAL")
    print("Set variables for h2o.")
    myY = "CAPSULE"
    myX = ["ID", "AGE", "RACE", "GLEASON", "DCAPS", "PSA", "VOL", "DPROS"]

    print("Create models with canonical link: LOGIT")
    h2o_model = h2o.glm(x=h2o_data[myX], y=h2o_data[myY].asfactor(), family="binomial", link="logit",
                        alpha=[0.5], Lambda=[0])
    sm_model = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                      family=sm.families.Binomial(sm.families.links.logit)).fit()

    print("Compare model deviances for link function logit")
    h2o_deviance = h2o_model._model_json['output']['residual_deviance'] / \
                   h2o_model._model_json['output']['null_deviance']
    sm_deviance = sm_model.deviance / sm_model.null_deviance
    assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measures"
def link_functions_tweedie_vpow():
    # Load example data from HDtweedie, y = aggregate claim loss
    hdf = h2o.upload_file(pyunit_utils.locate("smalldata/glm_test/auto.csv"))
    y = "y"
    x = list(set(hdf.names) - set(["y"]))

    print "Testing for family: TWEEDIE"
    print "Create models with canonical link: TWEEDIE"

    # Iterate over different variance powers for tweedie
    vpower = [0, 1, 1.5]
    r_dev = [0.7516627, 0.6708826, 0.7733762]
    r_null = [221051.88369951, 32296.29783702, 20229.47425307]
    for ridx, vpow in enumerate(vpower):
        print "Fit h2o.glm:"
        h2ofit = h2o.glm(x=hdf[x], y=hdf[y], family="tweedie", link="tweedie", tweedie_variance_power=vpow,
                         tweedie_link_power=1-vpow, alpha=[0.5], Lambda=[0])

        print "Testing Tweedie variance power: {0}".format(vpow)
        print "Compare model deviances for link function tweedie"
        deviance_h2o_tweedie = h2ofit.residual_deviance() / h2ofit.null_deviance()
        assert r_dev[ridx] - deviance_h2o_tweedie <= 0.01, "h2o's residual/null deviance is more than 0.01 lower " \
                                                           "than R's. h2o: {0}, r: {1}".format(deviance_h2o_tweedie,
                                                                                               r_dev[ridx])

        print "compare null and residual deviance between R glm and h2o.glm for tweedie"
        assert abs(r_null[ridx] - h2ofit.null_deviance()) < 1e-6, "h2o's null deviance is not equal to R's. h2o: " \
                                                                  "{0}, r: {1}".format(h2ofit.null_deviance(),
                                                                                       r_null[ridx])
def grid_lambda_search():
    # Log.info("Importing prostate.csv data...\n")
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    #prostate.summary()

    # Log.info("H2O GLM (binomial) with parameters: alpha = c(0.25, 0.5), nlambda = 20, lambda_search = TRUE, nfolds: 2\n")
    model = h2o.glm(x=prostate[2:9], y=prostate[1], family="binomial", nlambdas=5, lambda_search=True, n_folds=2)
    if random.random() < 0.5:
        model_idx = 0
    else:
        model_idx = 1
    model_bestlambda = model.models(model_idx)
    params_bestlambda = model.params()

    # Log.info(cat("All lambda values returned:\n", params_bestlambda.lambdas()))
    assert len(params_bestlambda.lambdas()) <= 5, "expected 5 or less lambdas"

    random_lambda = random.choice(params_bestlambda.lambdas())
    print("RANDOM LAMBDA")
    print(random_lambda)

    # Log.info(cat("Retrieving model corresponding to alpha =", params_bestlambda.alpha(), "and randomly chosen lambda", random_lambda, "\n"))
    random_model = model.getGLMLambdaModel(model_bestlambda, random_lambda)

    # Log.info("EXPECTING THESE TO BE EQUAL")
    print(random_model.Lambda())
    print(random_lambda)
    assert random_model.Lambda() == random_lambda, "expected lambdas to be equal"

    # Log.info(cat("Retrieving model corresponding to alpha =", params_bestlambda.alpha(), "and best lambda", params_bestlambda.lambdaBest(), "\n"))
    best_model = h2o.getGLMLambdaModel(model_bestlambda, params_bestlambda.lambda_best())
    assert best_model.model() == model_bestlambda.model(), "expected models to be equal"

    # Log.info("H2O GLM (binomial) with parameters: alpha = [0.25, 0.5], nlambda = 20, lambda_search = TRUE, nfolds: 2\n")
    prostate_search = h2o.glm(x=prostate[2:9], y=prostate[1], family="binomial", alpha=[0.25, 0.5], nlambdas=5,
                              lambda_search=True, n_folds=2)
    model_search = prostate_search.models(model_idx)
    models_best = model_search.models(model_search.best_model())
    params_best = models_best.params()

    assert params_bestlambda.lambda_best() == params_best.lambda_best(), "expected lambdas to be equal"
    assert len(params_best.lambda_all()) <= 20, "expected 20 or fewer lambdas"
def link_functions_poisson():
    print("Read in prostate data.")
    h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))

    sm_data = pd.read_csv(zipfile.ZipFile(pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))
                          .open("prostate_complete.csv")).as_matrix()
    sm_data_response = sm_data[:, 9]
    sm_data_features = sm_data[:, 1:9]

    print("Testing for family: POISSON")
    print("Set variables for h2o.")
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    print("Create h2o model with canonical link: LOG")
    h2o_model_log = h2o.glm(x=h2o_data[myX], y=h2o_data[myY], family="poisson", link="log",
                            alpha=[0.5], Lambda=[0])

    print("Create statsmodel model with canonical link: LOG")
    sm_model_log = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                          family=sm.families.Poisson(sm.families.links.log)).fit()

    print("Compare model deviances for link function log")
    h2o_deviance_log = old_div(h2o_model_log.residual_deviance(), h2o_model_log.null_deviance())
    sm_deviance_log = old_div(sm_model_log.deviance, sm_model_log.null_deviance)
    assert h2o_deviance_log - sm_deviance_log < 0.01, "expected h2o to have an equivalent or better deviance measures"

    print("Create h2o models with link: IDENTITY")
    h2o_model_id = h2o.glm(x=h2o_data[myX], y=h2o_data[myY], family="poisson", link="identity",
                           alpha=[0.5], Lambda=[0])

    print("Create statsmodel models with link: IDENTITY")
    sm_model_id = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                         family=sm.families.Poisson(sm.families.links.identity)).fit()

    print("Compare model deviances for link function identity")
    h2o_deviance_id = old_div(h2o_model_id.residual_deviance(), h2o_model_id.null_deviance())
    sm_deviance_id = old_div(sm_model_id.deviance, sm_model_id.null_deviance)
    assert h2o_deviance_id - sm_deviance_id < 0.01, "expected h2o to have an equivalent or better deviance measures"
def pubdev_1839():
    train = h2o.import_file(pyunit_utils.locate("smalldata/jira/pubdev_1839_repro_train.csv"))
    test = h2o.import_file(pyunit_utils.locate("smalldata/jira/pubdev_1839_repro_test.csv"))

    glm0 = h2o.glm(x=train.drop("bikes"), y=train["bikes"],
                   validation_x=test.drop("bikes"), validation_y=test["bikes"],
                   family="poisson")
def pubdev_1839(ip, port):
    train = h2o.import_file(h2o.locate("smalldata/jira/pubdev_1839_repro_train.csv"))
    test = h2o.import_file(h2o.locate("smalldata/jira/pubdev_1839_repro_test.csv"))

    glm0 = h2o.glm(x=train.drop("bikes"), y=train["bikes"],
                   validation_x=test.drop("bikes"), validation_y=test["bikes"],
                   Lambda=[1e-5], family="poisson")
def pubdev_1839():
    train = h2o.import_file(pyunit_utils.locate("smalldata/jira/pubdev_1839_repro_train.csv"))
    test = h2o.import_file(pyunit_utils.locate("smalldata/jira/pubdev_1839_repro_test.csv"))

    glm0 = h2o.glm(x=train.drop("bikes"), y=train["bikes"],
                   validation_x=test.drop("bikes"), validation_y=test["bikes"],
                   family="poisson")
def save_load_model(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    prostate = h2o.import_frame(h2o.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate_glm = h2o.glm(y=prostate["CAPSULE"], x=prostate[["AGE", "RACE", "PSA", "DCAPS"]],
                           family="binomial", alpha=[0.5])

    model_path = h2o.save_model(prostate_glm, name="delete_model", force=True)
    the_model = h2o.load_model(model_path)

    assert isinstance(the_model, H2OBinomialModel), "Expected an H2OBinomialModel, but got {0}".format(the_model)
def save_load_model():
    prostate = h2o.import_file(h2o.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate_glm = h2o.glm(y=prostate["CAPSULE"], x=prostate[["AGE", "RACE", "PSA", "DCAPS"]],
                           family="binomial", alpha=[0.5])

    model_path = h2o.save_model(prostate_glm, force=True)
    the_model = h2o.load_model(model_path)
    shutil.rmtree(model_path)

    assert isinstance(the_model, H2OBinomialModel), "Expected an H2OBinomialModel, but got {0}".format(the_model)
def save_load_model(ip, port):
    prostate = h2o.import_file(h2o.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate_glm = h2o.glm(y=prostate["CAPSULE"], x=prostate[["AGE", "RACE", "PSA", "DCAPS"]],
                           family="binomial", alpha=[0.5])

    model_path = h2o.save_model(prostate_glm, force=True)
    the_model = h2o.load_model(model_path)
    shutil.rmtree(model_path)
    assert isinstance(the_model, H2OBinomialModel), "Expected an H2OBinomialModel, but got {0}".format(the_model)
def pyunit_make_glm_model():
    # TODO: PUBDEV-1717
    pros = h2o.import_file(tests.locate("smalldata/prostate/prostate.csv"))
    model = h2o.glm(x=pros[["AGE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]], y=pros["CAPSULE"],
                    family="gaussian", alpha=[0])

    new_betas = {"AGE": 0.5, "DPROS": 0.5, "DCAPS": 0.5, "PSA": 0.5, "VOL": 0.5, "GLEASON": 0.5}

    # Build the list-style strings sent to the MakeGLMModel endpoint,
    # e.g. '["AGE","DPROS",...]' for the names and '[0.5,0.5,...]' for the betas.
    names = '['
    for n in new_betas.keys():
        names += "\"" + n + "\","
    names = names[0:len(names) - 1] + "]"

    betas = '['
    for b in new_betas.values():
        betas += str(b) + ","
    betas = betas[0:len(betas) - 1] + "]"

    res = h2o.H2OConnection.post_json("MakeGLMModel", model=model._id, names=names, beta=betas)
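The hand-built list strings above can also be produced with the standard library's json module; a hedged alternative sketch that should yield an equivalent payload for the same new_betas dict, assuming the endpoint accepts standard JSON list syntax:

import json

# Sketch: serialize the coefficient names and values as JSON arrays instead of
# concatenating the strings by hand (assumption: JSON spacing is accepted by the endpoint).
names = json.dumps(list(new_betas.keys()))    # e.g. '["AGE", "DPROS", ...]'
betas = json.dumps(list(new_betas.values()))  # e.g. '[0.5, 0.5, ...]'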
def prostate():
    h2o_data = h2o.upload_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    h2o_data.summary()

    sm_data = pd.read_csv(pyunit_utils.locate("smalldata/logreg/prostate.csv")).as_matrix()
    sm_data_response = sm_data[:, 1]
    sm_data_features = sm_data[:, 2:]

    h2o_glm = h2o.glm(y=h2o_data[1], x=h2o_data[2:], family="binomial", nfolds=10, alpha=[0.5])
    sm_glm = sm.GLM(endog=sm_data_response, exog=sm_data_features, family=sm.families.Binomial()).fit()

    print("statsmodels null deviance {0}".format(sm_glm.null_deviance))
    print("h2o null deviance {0}".format(h2o_glm.null_deviance()))
    assert abs(sm_glm.null_deviance - h2o_glm.null_deviance()) < 1e-5, "Expected null deviances to be the same"
def glm_solvers():
    predictors = ["displacement", "power", "weight", "acceleration", "year"]

    for solver in ["AUTO", "IRLSM", "L_BFGS", "COORDINATE_DESCENT_NAIVE", "COORDINATE_DESCENT"]:
        print("Solver = {0}".format(solver))
        for family in ["binomial", "gaussian", "poisson", "tweedie", "gamma"]:
            if family == 'binomial':
                response_col = "economy_20mpg"
            elif family == 'gaussian':
                response_col = "economy"
            else:
                response_col = "cylinders"
            print("Family = {0}".format(family))
            training_data = h2o.import_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
            if family == 'binomial':
                training_data[response_col] = training_data[response_col].asfactor()
            else:
                training_data[response_col] = training_data[response_col].asnumeric()
            model = h2o.glm(x=training_data[predictors], y=training_data[response_col], family=family,
                            alpha=[0], Lambda=[1e-5], solver=solver)
            h2o.remove(training_data)
def perfectSeparation_unbalanced():
    print("Read in synthetic unbalanced dataset")
    data = h2o.import_file(pyunit_utils.locate("smalldata/synthetic_perfect_separation/unbalanced.csv"))

    print("Fit model on dataset.")
    model = h2o.glm(x=data[["x1", "x2"]], y=data["y"], family="binomial", lambda_search=True, alpha=[0.5], Lambda=[0])

    print("Extract models' coefficients and assert reasonable values (i.e. no greater than 50)")
    print("Unbalanced dataset")
    coef = [c[1] for c in model._model_json['output']['coefficients_table'].cell_values if c[0] != "Intercept"]
    for c in coef:
        assert c < 50, "coefficient is too large"
def prostate():
    h2o_data = h2o.upload_file(path=tests.locate("smalldata/logreg/prostate.csv"))
    h2o_data.summary()

    sm_data = pd.read_csv(tests.locate("smalldata/logreg/prostate.csv")).as_matrix()
    sm_data_response = sm_data[:, 1]
    sm_data_features = sm_data[:, 2:]

    h2o_glm = h2o.glm(y=h2o_data[1], x=h2o_data[2:], family="binomial", nfolds=10, alpha=[0.5])
    sm_glm = sm.GLM(endog=sm_data_response, exog=sm_data_features, family=sm.families.Binomial()).fit()

    print("statsmodels null deviance {0}".format(sm_glm.null_deviance))
    print("h2o null deviance {0}".format(h2o_glm.null_deviance()))
    assert abs(sm_glm.null_deviance - h2o_glm.null_deviance()) < 1e-5, "Expected null deviances to be the same"
def save_load_model():
    prostate = h2o.import_file(h2o.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate_glm = h2o.glm(y=prostate["CAPSULE"], x=prostate[["AGE", "RACE", "PSA", "DCAPS"]],
                           family="binomial", alpha=[0.5])

    path = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "results"))
    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(prostate_glm, path=path, force=True)
    assert os.path.isdir(model_path), "Expected load directory {0} to exist, but it does not.".format(model_path)

    the_model = h2o.load_model(model_path)
    assert isinstance(the_model, H2OBinomialModel), "Expected an H2OBinomialModel, but got {0}".format(the_model)
def perfectSeparation_balanced(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    print("Read in synthetic balanced dataset")
    data = h2o.import_frame(path=h2o.locate("smalldata/synthetic_perfect_separation/balanced.csv"))

    print("Fit model on dataset")
    model = h2o.glm(x=data[["x1", "x2"]], y=data["y"], family="binomial", lambda_search=True,
                    use_all_factor_levels=True, alpha=[0.5], Lambda=[0])

    print("Extract models' coefficients and assert reasonable values (i.e. no greater than 50)")
    print("Balanced dataset")
    coef = [c[1] for c in model._model_json['output']['coefficients_table'].cell_values if c[0] != "Intercept"]
    for c in coef:
        assert c < 50, "coefficient is too large"
def benign():
    training_data = h2o.import_file(tests.locate("smalldata/logreg/benign.csv"))

    Y = 3
    X = list(range(3)) + list(range(4, 11))

    # Log.info("Build the model")
    model = h2o.glm(y=training_data[Y].asfactor(), x=training_data[X], family="binomial", alpha=[0], Lambda=[1e-5])

    # Log.info("Check that the columns used in the model are the ones we passed in.")
    # Log.info("===================Columns passed in: ================")
    in_names = [training_data.names[i] for i in X]
    # Log.info("===================Columns passed out: ================")
    out_names = [model._model_json['output']['coefficients_table'].cell_values[c][0] for c in range(len(X) + 1)]
    assert in_names == out_names[1:]
def glm_mean_residual_deviance(ip, port):
    cars = h2o.import_frame(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))
    s = cars[0].runif()
    train = cars[s > 0.2]
    valid = cars[s <= 0.2]
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    response_col = "economy"

    glm = h2o.glm(x=train[predictors], y=train[response_col],
                  validation_x=valid[predictors], validation_y=valid[response_col], nfolds=3)

    glm_mrd = glm.mean_residual_deviance(train=True, valid=True, xval=True)
    assert isinstance(glm_mrd['train'], float), \
        "Expected training mean residual deviance to be a float, but got {0}".format(type(glm_mrd['train']))
    assert isinstance(glm_mrd['valid'], float), \
        "Expected validation mean residual deviance to be a float, but got {0}".format(type(glm_mrd['valid']))
    assert isinstance(glm_mrd['xval'], float), \
        "Expected cross-validation mean residual deviance to be a float, but got {0}".format(type(glm_mrd['xval']))
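The runif-based 80/20 split above could also be expressed with H2OFrame.split_frame; a sketch under the assumption that this method is available in the client version being tested:

# Split sketch (assumption: H2OFrame.split_frame with ratios/seed is available here).
train, valid = cars.split_frame(ratios=[0.8], seed=1234)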