def checkpoint_new_category_in_predictor(): sv1 = h2o.upload_file(pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv")) sv2 = h2o.upload_file(pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv")) vir = h2o.upload_file(pyunit_utils.locate("smalldata/iris/virginica.csv")) from h2o.estimators.gbm import H2OGradientBoostingEstimator m1 = H2OGradientBoostingEstimator(ntrees=100) m1.train(x=[0,1,2,4],y=3, training_frame=sv1) m2 = H2OGradientBoostingEstimator(ntrees=200, checkpoint=m1.model_id) m2.train([0,1,2,4], y=3, training_frame=sv2) # attempt to continue building model, but with an expanded categorical predictor domain. # this should fail until we figure out proper behavior try: m3 = H2OGradientBoostingEstimator(ntrees=200, checkpoint=m1.model_id) m3.train(x=[0,1,2,4], y=3, training_frame=vir) assert False, "Expected continued model-building to fail with new categories introduced in predictor" except EnvironmentError: pass # attempt to predict on new model, but with observations that have expanded categorical predictor domain. predictions = m2.predict(vir)
def shuffling_large(): print("Reading in Arcene training data for binomial modeling.") train_data = h2o.upload_file(path=pyunit_utils.locate("smalldata/arcene/shuffle_test_version/arcene.csv")) train_data_shuffled = h2o.upload_file(path=pyunit_utils.locate("smalldata/arcene/shuffle_test_version/arcene_shuffled.csv")) print("Create model on original Arcene dataset.") h2o_model = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True, alpha=0.5) h2o_model.train(x=list(range(1000)), y=1000, training_frame=train_data) print("Create second model on original Arcene dataset.") h2o_model_2 = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True, alpha=0.5) h2o_model_2.train(x=list(range(1000)), y=1000, training_frame=train_data) print("Create model on shuffled Arcene dataset.") h2o_model_s = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True, alpha=0.5) h2o_model_s.train(x=list(range(1000)), y=1000, training_frame=train_data_shuffled) print("Assert that number of predictors remaining and their respective coefficients are equal.") for x, y in zip(h2o_model._model_json['output']['coefficients_table'].cell_values,h2o_model_2. _model_json['output']['coefficients_table'].cell_values): assert (type(x[1]) == type(y[1])) and (type(x[2]) == type(y[2])), "coefficients should be the same type" if isinstance(x[1],float): assert abs(x[1] - y[1]) < 5e-10, "coefficients should be equal" if isinstance(x[2],float): assert abs(x[2] - y[2]) < 5e-10, "coefficients should be equal" for x, y in zip(h2o_model._model_json['output']['coefficients_table'].cell_values,h2o_model_s. _model_json['output']['coefficients_table'].cell_values): assert (type(x[1]) == type(y[1])) and (type(x[2]) == type(y[2])), "coefficients should be the same type" if isinstance(x[1],float): assert abs(x[1] - y[1]) < 5e-10, "coefficients should be equal" if isinstance(x[2],float): assert abs(x[2] - y[2]) < 5e-10, "coefficients should be equal"
def checkpoint_new_category_in_predictor(): sv1 = h2o.upload_file(pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv")) sv2 = h2o.upload_file(pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv")) vir = h2o.upload_file(pyunit_utils.locate("smalldata/iris/virginica.csv")) print("checkpoint_new_category_in_predictor-1") m1 = H2ODeepLearningEstimator(epochs=100) m1.train(x=[0,1,2,4], y=3, training_frame=sv1) m2 = H2ODeepLearningEstimator(epochs=200, checkpoint=m1.model_id) m2.train(x=[0,1,2,4], y=3, training_frame=sv2) print("checkpoint_new_category_in_predictor-2") # attempt to continue building model, but with an expanded categorical predictor domain. # this should fail try: m3 = H2ODeepLearningEstimator(epochs=200, checkpoint=m1.model_id) m3.train(x=[0,1,2,4], y=3, training_frame=vir) assert False, "Expected continued model-building to fail with new categories introduced in predictor" except EnvironmentError: pass print("checkpoint_new_category_in_predictor-3") # attempt to predict on new model, but with observations that have expanded categorical predictor domain. predictions = m2.predict(vir) print("checkpoint_new_category_in_predictor-4")
def shuffling_large(ip,port): # Connect to h2o h2o.init(ip,port) print("Reading in Arcene training data for binomial modeling.") train_data = h2o.upload_file(path=h2o.locate("smalldata/arcene/shuffle_test_version/arcene.csv")) train_data_shuffled = h2o.upload_file(path=h2o.locate("smalldata/arcene/shuffle_test_version/arcene_shuffled.csv")) print("Create model on original Arcene dataset.") h2o_model = h2o.glm(x=train_data[0:1000], y=train_data[1000], family="binomial", lambda_search=True, alpha=[0.5], use_all_factor_levels=True) print("Create second model on original Arcene dataset.") h2o_model_2 = h2o.glm(x=train_data[0:1000], y=train_data[1000], family="binomial", lambda_search=True, alpha=[0.5], use_all_factor_levels=True) print("Create model on shuffled Arcene dataset.") h2o_model_s = h2o.glm(x=train_data_shuffled[0:1000], y=train_data_shuffled[1000], family="binomial", lambda_search=True, alpha=[0.5], use_all_factor_levels=True) print("Assert that number of predictors remaining and their respective coefficients are equal.") for x, y in zip(h2o_model._model_json['output']['coefficients_table'].cell_values,h2o_model_2._model_json['output']['coefficients_table'].cell_values): assert (type(x[1]) == type(y[1])) and (type(x[2]) == type(y[2])), "coefficients should be the same type" if isinstance(x[1],float): assert abs(x[1] - y[1]) < 5e-10, "coefficients should be equal" if isinstance(x[2],float): assert abs(x[2] - y[2]) < 5e-10, "coefficients should be equal" for x, y in zip(h2o_model._model_json['output']['coefficients_table'].cell_values,h2o_model_s._model_json['output']['coefficients_table'].cell_values): assert (type(x[1]) == type(y[1])) and (type(x[2]) == type(y[2])), "coefficients should be the same type" if isinstance(x[1],float): assert abs(x[1] - y[1]) < 5e-10, "coefficients should be equal" if isinstance(x[2],float): assert abs(x[2] - y[2]) < 5e-10, "coefficients should be equal"
def milsong_checkpoint(ip,port): milsong_train = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz")) milsong_valid = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz")) distribution = "gaussian" # build first model ntrees1 = random.sample(range(50,100),1)[0] max_depth1 = random.sample(range(2,6),1)[0] min_rows1 = random.sample(range(10,16),1)[0] print "ntrees model 1: {0}".format(ntrees1) print "max_depth model 1: {0}".format(max_depth1) print "min_rows model 1: {0}".format(min_rows1) model1 = h2o.gbm(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees1,max_depth=max_depth1, min_rows=min_rows1, distribution=distribution,validation_x=milsong_valid[1:],validation_y=milsong_valid[0]) # save the model, then load the model model_path = h2o.save_model(model1, name="delete_model", force=True) restored_model = h2o.load_model(model_path) shutil.rmtree("delete_model") # continue building the model ntrees2 = ntrees1 + 50 max_depth2 = max_depth1 min_rows2 = min_rows1 print "ntrees model 2: {0}".format(ntrees2) print "max_depth model 2: {0}".format(max_depth2) print "min_rows model 2: {0}".format(min_rows2) model2 = h2o.gbm(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees2,max_depth=max_depth2, min_rows=min_rows2, distribution=distribution,validation_x=milsong_valid[1:],validation_y=milsong_valid[0], checkpoint=restored_model._id) # build the equivalent of model 2 in one shot model3 = h2o.gbm(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees2,max_depth=max_depth2, min_rows=min_rows2, distribution=distribution,validation_x=milsong_valid[1:],validation_y=milsong_valid[0])
def glrm_iris(): print("Importing iris_wheader.csv data...") irisH2O = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv")) irisTest = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader_bad_cnames.csv")) rank = 3 gx = 0.5 gy = 0.5 trans = "STANDARDIZE" print("H2O GLRM with rank k = " + str(rank) + ", gamma_x = " + str(gx) + ", gamma_y = " + str( gy) + ", transform = " + trans) glrm_h2o = H2OGeneralizedLowRankEstimator(k=rank, loss="Quadratic", gamma_x=gx, gamma_y=gy, transform=trans) glrm_h2o.train(x=irisH2O.names, training_frame=irisH2O) print("Impute original data from XY decomposition") # and expect warnings buffer = StringIO() # redirect warning messages to string buffer for later analysis sys.stderr = buffer h2o_pred = glrm_h2o.predict(irisTest) warn_phrase = "UserWarning" warn_string_of_interest = "missing column" sys.stderr = sys.__stderr__ # redirect it back to stdout. try: # for python 2.7 if len(buffer.buflist) > 0: for index in range(len(buffer.buflist)): print("*** captured warning message: {0}".format(buffer.buflist[index])) assert (warn_phrase in buffer.buflist[index]) and (warn_string_of_interest in buffer.buflist[index]) except: # for python 3. warns = buffer.getvalue() print("*** captured warning message: {0}".format(warns)) assert (warn_phrase in warns) and (warn_string_of_interest in warns)
def milsong_checkpoint(): milsong_train = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz")) milsong_valid = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz")) distribution = "gaussian" # build first model ntrees1 = random.sample(range(50,100),1)[0] max_depth1 = random.sample(range(2,6),1)[0] min_rows1 = random.sample(range(10,16),1)[0] print "ntrees model 1: {0}".format(ntrees1) print "max_depth model 1: {0}".format(max_depth1) print "min_rows model 1: {0}".format(min_rows1) model1 = H2OGradientBoostingEstimator(ntrees=ntrees1, max_depth=max_depth1, min_rows=min_rows1, distribution=distribution) model1.train(x=range(1,milsong_train.ncol), y=0, training_frame=milsong_train, validation_frame=milsong_valid) # save the model, then load the model path = pyunit_utils.locate("results") assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path) model_path = h2o.save_model(model1, path=path, force=True) assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path) restored_model = h2o.load_model(model_path) # continue building the model ntrees2 = ntrees1 + 50 max_depth2 = max_depth1 min_rows2 = min_rows1 print "ntrees model 2: {0}".format(ntrees2) print "max_depth model 2: {0}".format(max_depth2) print "min_rows model 2: {0}".format(min_rows2) model2 = H2OGradientBoostingEstimator(ntrees=ntrees2, max_depth=max_depth2, min_rows=min_rows2, distribution=distribution, checkpoint=restored_model.model_id) model2.train(x=range(1,milsong_train.ncol), y=0, training_frame=milsong_train, validation_frame=milsong_valid) model3 = H2OGradientBoostingEstimator(ntrees=ntrees2, max_depth=max_depth2, min_rows=min_rows2, distribution=distribution) model3.train(x=range(1,milsong_train.ncol), y=0, training_frame=milsong_train, validation_frame=milsong_valid)
def pub_444_spaces_in_filenames(): # tempdir = "smalldata/jira/" # if was okay to write to smalldata, it's okay to write to the current directory # probably don't want to, but can't find what the standard temp directory is supposed to be. no sandbox? tempdir = "./" # make a few files with spaces in the name f1 = open(pyunit_utils.locate(tempdir) + "foo .csv", "w") f1.write("response, predictor\n") for i in range(10): f1.write("1, a\n") f1.write("0, b\n") f1.write("1, a\n" if random.randint(0,1) else "0, b\n") f1.close() f2 = open(pyunit_utils.locate(tempdir) + "b a r .csv", "w") f2.write("response, predictor\n") for i in range(10): f2.write("1, a\n") f2.write("0, b\n") f2.write("1, a\n" if random.randint(0,1) else "0, b\n") f2.close() f3 = open(pyunit_utils.locate(tempdir) + " ba z.csv", "w") for i in range(10): f3.write("1, a\n") f3.write("0, b\n") f3.write("1, a\n" if random.randint(0,1) else "0, b\n") f3.close() train_data = h2o.upload_file(path=pyunit_utils.locate(tempdir + "foo .csv")) train_data.show() train_data.describe() train_data["response"] = train_data["response"].asfactor() gbm = H2OGradientBoostingEstimator(ntrees=1, distribution="bernoulli", min_rows=1) gbm.train(x=list(range(1,train_data.ncol)), y="response", training_frame=train_data) gbm.show() train_data = h2o.upload_file(path=pyunit_utils.locate(tempdir + "b a r .csv")) train_data.show() train_data.describe() train_data["response"] = train_data["response"].asfactor() gbm = H2OGradientBoostingEstimator(ntrees=1, distribution="bernoulli", min_rows=1) gbm.train(x=1, y="response", training_frame=train_data) gbm.show() train_data = h2o.upload_file(path=pyunit_utils.locate(tempdir + " ba z.csv")) train_data.show() train_data.describe() train_data[0]=train_data[0].asfactor() gbm = H2OGradientBoostingEstimator(ntrees=1, distribution="bernoulli", min_rows=1) gbm.train(x=1, y=0, training_frame=train_data) gbm.show() os.remove(pyunit_utils.locate(tempdir) + "foo .csv") os.remove(pyunit_utils.locate(tempdir) + "b a r .csv") os.remove(pyunit_utils.locate(tempdir) + " ba z.csv")
def deeplearning_autoencoder():
    resp = 784
    nfeatures = 20  # number of features (smallest hidden layer)

    train_hex = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/mnist/train.csv.gz"))
    train_hex[resp] = train_hex[resp].asfactor()

    test_hex = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/mnist/test.csv.gz"))
    test_hex[resp] = test_hex[resp].asfactor()

    # split data into two parts
    sid = train_hex[0].runif(1234)

    # unsupervised data for autoencoder
    train_unsupervised = train_hex[sid >= 0.5]
    train_unsupervised.drop(resp)
    train_unsupervised.describe()

    # supervised data for drf
    train_supervised = train_hex[sid < 0.5]
    train_supervised.describe()

    # train autoencoder
    ae_model = h2o.deeplearning(x=train_unsupervised[0:resp],
                                activation="Tanh",
                                autoencoder=True,
                                hidden=[nfeatures],
                                epochs=1,
                                reproducible=True,  # slow, turn off for real problems
                                seed=1234)

    # convert train_supervised with autoencoder to lower-dimensional space
    train_supervised_features = ae_model.deepfeatures(train_supervised[0:resp], 0)

    assert train_supervised_features.ncol == nfeatures, "Dimensionality of reconstruction is wrong!"

    # Train DRF on extracted feature space
    drf_model = h2o.random_forest(x=train_supervised_features[0:20],
                                  y=train_supervised[resp],
                                  ntrees=10,
                                  min_rows=10,
                                  seed=1234)

    # Test the DRF model on the test set (processed through deep features)
    test_features = ae_model.deepfeatures(test_hex[0:resp], 0)
    test_features = test_features.cbind(test_hex[resp])

    # Confusion Matrix and assertion
    cm = drf_model.confusion_matrix(test_features)
    cm.show()

    # 8.6% error +/- 0.001
    assert abs(cm.cell_values[10][10] - 0.086) < 0.001, \
        "Error. Expected 0.086, but got {0}".format(cm.cell_values[10][10])
def pub_444_spaces_in_filenames(ip,port): # tempdir = "smalldata/jira/" # if was okay to write to smalldata, it's okay to write to the current directory # probably don't want to, but can't find what the standard temp directory is supposed to be. no sandbox? tempdir = "./" # make a few files with spaces in the name f1 = open(h2o.locate(tempdir) + "foo .csv", "w") f1.write("response, predictor\n") for i in range(10): f1.write("1, a\n") f1.write("0, b\n") f1.write("1, a\n" if random.randint(0,1) else "0, b\n") f1.close() f2 = open(h2o.locate(tempdir) + "b a r .csv", "w") f2.write("response, predictor\n") for i in range(10): f2.write("1, a\n") f2.write("0, b\n") f2.write("1, a\n" if random.randint(0,1) else "0, b\n") f2.close() f3 = open(h2o.locate(tempdir) + " ba z.csv", "w") for i in range(10): f3.write("1, a\n") f3.write("0, b\n") f3.write("1, a\n" if random.randint(0,1) else "0, b\n") f3.close() train_data = h2o.upload_file(path=h2o.locate(tempdir + "foo .csv")) train_data.show() train_data.describe() gbm = h2o.gbm(x=train_data[1:], y=train_data["response"].asfactor(), ntrees=1, distribution="bernoulli", min_rows=1) gbm.show() train_data = h2o.upload_file(path=h2o.locate(tempdir + "b a r .csv")) train_data.show() train_data.describe() gbm = h2o.gbm(x=train_data[1:], y=train_data["response"].asfactor(), ntrees=1, distribution="bernoulli", min_rows=1) gbm.show() train_data = h2o.upload_file(path=h2o.locate(tempdir + " ba z.csv")) train_data.show() train_data.describe() gbm = h2o.gbm(x=train_data[1:], y=train_data[0].asfactor(), ntrees=1, distribution="bernoulli", min_rows=1) gbm.show() os.remove(h2o.locate(tempdir) + "foo .csv") os.remove(h2o.locate(tempdir) + "b a r .csv") os.remove(h2o.locate(tempdir) + " ba z.csv")
def deeplearning_autoencoder(): resp = 784 nfeatures = 20 # number of features (smallest hidden layer) train_hex = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/mnist/train.csv.gz")) train_hex[resp] = train_hex[resp].asfactor() test_hex = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/mnist/test.csv.gz")) test_hex[resp] = test_hex[resp].asfactor() # split data into two parts sid = train_hex[0].runif(0) # unsupervised data for autoencoder train_unsupervised = train_hex[sid >= 0.5] train_unsupervised.pop(resp) # train_unsupervised.describe() # supervised data for drf train_supervised = train_hex[sid < 0.5] # train_supervised.describe() # train autoencoder ae_model = H2OAutoEncoderEstimator(activation="Tanh", hidden=[nfeatures], epochs=1, reproducible=True, seed=1234) ae_model.train(list(range(resp)), training_frame=train_unsupervised) # convert train_supervised with autoencoder to lower-dimensional space train_supervised_features = ae_model.deepfeatures(train_supervised[0:resp], 0) assert train_supervised_features.ncol == nfeatures, "Dimensionality of reconstruction is wrong!" train_supervised_features = train_supervised_features.cbind(train_supervised[resp]) # Train DRF on extracted feature space drf_model = H2ORandomForestEstimator(ntrees=10, min_rows=10, seed=1234) drf_model.train(x=list(range(20)), y=train_supervised_features.ncol - 1, training_frame=train_supervised_features) # Test the DRF model on the test set (processed through deep features) test_features = ae_model.deepfeatures(test_hex[0:resp], 0) test_features = test_features.cbind(test_hex[resp]) # Confusion Matrix and assertion cm = drf_model.confusion_matrix(test_features) cm.show() # 8.8% error +/- 0.001 # compare to runit_deeplearning_autoencoder_large.py assert abs(cm.cell_values[10][10] - 0.0880) < 0.001, "Error. Expected 0.0880, but got {0}".format( cm.cell_values[10][10] )
def glrm_arrests_miss(): missing_ratios = np.arange(0.1, 1, 0.1).tolist() print("Importing USArrests.csv data and saving for validation...") arrests_full = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv")) arrests_full.describe() totobs = arrests_full.nrow * arrests_full.ncol train_err = [0]*len(missing_ratios) valid_err = [0]*len(missing_ratios) for i in range(len(missing_ratios)): ratio = missing_ratios[i] print("Importing USArrests.csv and inserting {0}% missing entries".format(100*ratio)) arrests_miss = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv")) arrests_miss = arrests_miss.insert_missing_values(fraction=ratio) arrests_miss.describe() print("H2O GLRM with {0}% missing entries".format(100*ratio)) arrests_glrm = H2OGeneralizedLowRankEstimator(k=4, ignore_const_cols=False, loss="Quadratic", regularization_x="None", regularization_y="None", init="PlusPlus", max_iterations=10, min_step_size=1e-6) arrests_glrm.train(x=arrests_miss.names, training_frame=arrests_miss, validation_frame=arrests_full) arrests_glrm.show() # Check imputed data and error metrics glrm_obj = arrests_glrm._model_json['output']['objective'] train_numerr = arrests_glrm._model_json['output']['training_metrics']._metric_json['numerr'] train_caterr = arrests_glrm._model_json['output']['training_metrics']._metric_json['caterr'] valid_numerr = arrests_glrm._model_json['output']['validation_metrics']._metric_json['numerr'] valid_caterr = arrests_glrm._model_json['output']['validation_metrics']._metric_json['caterr'] assert abs(train_numerr - glrm_obj) < 1e-3, "Numeric error on training data was " + str(train_numerr) + " but should equal final objective " + str(glrm_obj) assert train_caterr == 0, "Categorical error on training data was " + str(train_caterr) + " but should be zero" assert valid_caterr == 0, "Categorical error on validation data was " + str(valid_caterr) + " but should be zero" train_numcnt = arrests_glrm._model_json['output']['training_metrics']._metric_json['numcnt'] valid_numcnt = arrests_glrm._model_json['output']['validation_metrics']._metric_json['numcnt'] assert valid_numcnt > train_numcnt, "Number of non-missing numerical entries in training data should be less than validation data" assert valid_numcnt == totobs, "Number of non-missing numerical entries in validation data was " + str(valid_numcnt) + " but should be " + str(totobs) train_err[i] = train_numerr valid_err[i] = valid_numerr # h2o.remove(arrests_glrm._model_json['output']['loading_key']['name']) for i in range(len(missing_ratios)): print("Missing ratio: {0}% --> Training error: {1}\tValidation error: {2}".format(missing_ratios[i]*100, train_err[i], valid_err[i]))
def javapredict_smallcat(): # optional parameters params = {'epochs':100} print "Parameter list:" for k,v in zip(params.keys(), params.values()): print "{0}, {1}".format(k,v) train = h2o.upload_file(tests.locate("smalldata/iris/setosa_versicolor.csv")) test = h2o.upload_file(tests.locate("smalldata/iris/virginica.csv")) x = [0,1,2,4] y = 3 tests.javapredict("deeplearning", "numeric", train, test, x, y, **params)
def javapredict_smallcat(): # optional parameters params = {'ntrees':100, 'max_depth':5, 'min_rows':10} print "Parameter list:" for k,v in zip(params.keys(), params.values()): print "{0}, {1}".format(k,v) train = h2o.upload_file(h2o.locate("smalldata/iris/setosa_versicolor.csv")) test = h2o.upload_file(h2o.locate("smalldata/iris/virginica.csv")) x = [0,1,2,4] y = 3 tests.javapredict("random_forest", "numeric", train, test, x, y, **params)
def checkpoint_new_category_in_response(): sv = h2o.upload_file(pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv")) iris = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris.csv")) m1 = h2o.deeplearning(x=sv[[0,1,2,3]], y=sv[4], epochs=100) # attempt to continue building model, but with an expanded categorical response domain. # this should fail try: m2 = h2o.deeplearning(x=iris[[0,1,2,3]], y=iris[4], epochs=200, checkpoint=m1.model_id) assert False, "Expected continued model-building to fail with new categories introduced in response" except EnvironmentError: pass
def pub_444_spaces_in_filenames(ip,port): # Connect to h2o h2o.init(ip,port) # make a few files with spaces in the name f1 = open(h2o.locate("smalldata/jira/") + "foo .csv", "w") f1.write("response, predictor\n") for i in range(10): f1.write("1, a\n") f1.write("0, b\n") f1.write("1, a\n" if random.randint(0,1) else "0, b\n") f1.close() f2 = open(h2o.locate("smalldata/jira/") + "b a r .csv", "w") f2.write("response, predictor\n") for i in range(10): f2.write("1, a\n") f2.write("0, b\n") f2.write("1, a\n" if random.randint(0,1) else "0, b\n") f2.close() f3 = open(h2o.locate("smalldata/jira/") + " ba z.csv", "w") for i in range(10): f3.write("1, a\n") f3.write("0, b\n") f3.write("1, a\n" if random.randint(0,1) else "0, b\n") f3.close() train_data = h2o.upload_file(path=h2o.locate("smalldata/jira/foo .csv")) train_data.show() train_data.describe() gbm = h2o.gbm(x=train_data[1:], y=train_data["response"].asfactor(), ntrees=1, distribution="bernoulli", min_rows=1) gbm.show() train_data = h2o.upload_file(path=h2o.locate("smalldata/jira/b a r .csv")) train_data.show() train_data.describe() gbm = h2o.gbm(x=train_data[1:], y=train_data["response"].asfactor(), ntrees=1, distribution="bernoulli", min_rows=1) gbm.show() train_data = h2o.upload_file(path=h2o.locate("smalldata/jira/ ba z.csv")) train_data.show() train_data.describe() gbm = h2o.gbm(x=train_data[1:], y=train_data[0].asfactor(), ntrees=1, distribution="bernoulli", min_rows=1) gbm.show() os.remove(h2o.locate("smalldata/jira/") + "foo .csv") os.remove(h2o.locate("smalldata/jira/") + "b a r .csv") os.remove(h2o.locate("smalldata/jira/") + " ba z.csv")
def colname_set_basic(ip,port): print "Uploading iris data..." no_headers = h2o.upload_file(h2o.locate("smalldata/iris/iris.csv")) headers_and = h2o.upload_file(h2o.locate("smalldata/iris/iris_header.csv")) print no_headers.names print headers_and.names no_headers.setNames(headers_and.names) assert no_headers.names == headers_and.names, "Expected the same column names but got {0} and {1}".\ format(no_headers.names, headers_and.names)
def colname_set_basic(): print("Uploading iris data...") no_headers = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris.csv")) headers_and = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_header.csv")) print(no_headers.names) print(headers_and.names) no_headers.set_names(headers_and.names) assert no_headers.names == headers_and.names, "Expected the same column names but got {0} and {1}".\ format(no_headers.names, headers_and.names)
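# Hedged sketch of the runner boilerplate these pyunit-style functions typically rely on
# (assumes the standard pyunit_utils helper from the h2o-py test suite is importable);
# run standalone it attaches to an H2O cluster before invoking the test function.
if __name__ == "__main__":
    pyunit_utils.standalone_test(colname_set_basic)
else:
    colname_set_basic()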
def glrm_prostate_miss(): missing_ratios = np.arange(0.1, 1, 0.1).tolist() print("Importing prostate_cat.csv data and saving for validation...") prostate_full = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"), na_strings=["NA"]*8) prostate_full.describe() totnas = 0 for i in range(prostate_full.ncol): totnas = totnas + prostate_full[i].isna().sum() totobs = prostate_full.nrow * prostate_full.ncol - totnas train_numerr = [0]*len(missing_ratios) valid_numerr = [0]*len(missing_ratios) train_caterr = [0]*len(missing_ratios) valid_caterr = [0]*len(missing_ratios) for i in range(len(missing_ratios)): ratio = missing_ratios[i] print("Importing prostate_cat.csv and inserting {0}% missing entries".format(100*ratio)) prostate_miss = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv")) prostate_miss = prostate_miss.insert_missing_values(fraction=ratio) prostate_miss.describe() print("H2O GLRM with {0}% missing entries".format(100*ratio)) prostate_glrm = h2o.glrm(x=prostate_miss, validation_frame=prostate_full, k=8, ignore_const_cols=False, loss="Quadratic", gamma_x=0.5, gamma_y=0.5, regularization_x="L1", regularization_y="L1", init="SVD", max_iterations=2000, min_step_size=1e-6) prostate_glrm.show() # Check imputed data and error metrics train_numcnt = prostate_glrm._model_json['output']['training_metrics']._metric_json['numcnt'] valid_numcnt = prostate_glrm._model_json['output']['validation_metrics']._metric_json['numcnt'] train_catcnt = prostate_glrm._model_json['output']['training_metrics']._metric_json['catcnt'] valid_catcnt = prostate_glrm._model_json['output']['validation_metrics']._metric_json['catcnt'] assert valid_numcnt >= train_numcnt, "Number of non-missing numeric entries in training data should be less than or equal to validation data" assert valid_catcnt >= train_catcnt, "Number of non-missing categorical entries in training data should be less than or equal to validation data" assert (train_numcnt + valid_numcnt) < totobs, "Total non-missing numeric entries in training and validation data was {0}, but should be less than {1}".format(train_numcnt + valid_numcnt, totobs) assert (valid_numcnt + valid_catcnt) == totobs, "Number of non-missing entries in validation data was {0}, but should be {1}".format(valid_numcnt + valid_catcnt, totobs) train_numerr[i] = prostate_glrm._model_json['output']['training_metrics']._metric_json['numerr'] valid_numerr[i] = prostate_glrm._model_json['output']['validation_metrics']._metric_json['numerr'] train_caterr[i] = prostate_glrm._model_json['output']['training_metrics']._metric_json['caterr'] valid_caterr[i] = prostate_glrm._model_json['output']['validation_metrics']._metric_json['caterr'] h2o.remove(prostate_glrm._model_json['output']['representation_name']) for i in range(len(missing_ratios)): print("Missing ratio: {0}% --> Training numeric error: {1}\tValidation numeric error: {2}".format(missing_ratios[i]*100, train_numerr[i], valid_numerr[i])) for i in range(len(missing_ratios)): print("Missing ratio: {0}% --> Training categorical error: {1}\tValidation categorical error: {2}".format(missing_ratios[i]*100, train_caterr[i], valid_caterr[i]))
def milsong_checkpoint(): milsong_train = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz")) milsong_valid = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz")) # build first model ntrees1 = random.sample(range(50, 100), 1)[0] max_depth1 = random.sample(range(2, 6), 1)[0] min_rows1 = random.sample(range(10, 16), 1)[0] print "ntrees model 1: {0}".format(ntrees1) print "max_depth model 1: {0}".format(max_depth1) print "min_rows model 1: {0}".format(min_rows1) model1 = H2ORandomForestEstimator(ntrees=ntrees1, max_depth=max_depth1, min_rows=min_rows1, seed=1234) model1.train(x=range(1, milsong_train.ncol), y=0, training_frame=milsong_train, validation_frame=milsong_valid) # save the model, then load the model path = pyunit_utils.locate("results") assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path) model_path = h2o.save_model(model1, path=path, force=True) assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path) restored_model = h2o.load_model(model_path) # continue building the model ntrees2 = ntrees1 + 50 max_depth2 = max_depth1 min_rows2 = min_rows1 print "ntrees model 2: {0}".format(ntrees2) print "max_depth model 2: {0}".format(max_depth2) print "min_rows model 2: {0}".format(min_rows2) model2 = H2ORandomForestEstimator( ntrees=ntrees2, max_depth=max_depth2, min_rows=min_rows2, checkpoint=restored_model._id, seed=1234 ) model2.train(x=range(1, milsong_train.ncol), y=0, training_frame=milsong_train, validation_frame=milsong_valid) # build the equivalent of model 2 in one shot model3 = H2ORandomForestEstimator(ntrees=ntrees2, max_depth=max_depth2, min_rows=min_rows2, seed=1234) model3.train(x=range(1, milsong_train.ncol), y=0, training_frame=milsong_train, validation_frame=milsong_valid) assert isinstance(model2, type(model3)) assert model2.mse(valid=True) == model3.mse( valid=True ), "Expected Model 2 MSE: {0} to be the same as Model 4 MSE: {1}".format( model2.mse(valid=True), model3.mse(valid=True) )
def offsets_and_distributions(ip,port): # cars cars = h2o.upload_file(h2o.locate("smalldata/junit/cars_20mpg.csv")) cars = cars[cars["economy_20mpg"].isna() == 0] cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() offset = h2o.H2OFrame(python_obj=[[.5] for x in range(398)]) offset.setNames(["x1"]) cars = cars.cbind(offset) # insurance insurance = h2o.import_file(h2o.locate("smalldata/glm_test/insurance.csv")) insurance["offset"] = insurance["Holders"].log() # bernoulli - offset not supported #dl = h2o.deeplearning(x=cars[2:8], y=cars["economy_20mpg"], distribution="bernoulli", offset_column="x1", # training_frame=cars) #predictions = dl.predict(cars) # gamma dl = h2o.deeplearning(x=insurance[0:3], y=insurance["Claims"], distribution="gamma", offset_column="offset", training_frame=insurance) predictions = dl.predict(insurance) # gaussian dl = h2o.deeplearning(x=insurance[0:3], y=insurance["Claims"], distribution="gaussian", offset_column="offset", training_frame=insurance) predictions = dl.predict(insurance) # poisson dl = h2o.deeplearning(x=insurance[0:3], y=insurance["Claims"], distribution="poisson", offset_column="offset", training_frame=insurance) predictions = dl.predict(insurance) # tweedie dl = h2o.deeplearning(x=insurance.names[0:3], y="Claims", distribution="tweedie", offset_column="offset", training_frame=insurance) predictions = dl.predict(insurance)
def glrm_subset():
    acs_orig = h2o.upload_file(path=pyunit_utils.locate("bigdata/laptop/census/ACS_13_5YR_DP02_cleaned.zip"),
                               col_types=(['enum'] + ['numeric'] * 149))
    acs_full = acs_orig.drop("ZCTA5")
    acs_model = H2OGeneralizedLowRankEstimator(k=10,
                                               transform='STANDARDIZE',
                                               loss='Quadratic',
                                               regularization_x='Quadratic',
                                               regularization_y='L1',
                                               gamma_x=0.25,
                                               gamma_y=0.5,
                                               max_iterations=1)
    acs_model.train(x=acs_full.names, training_frame=acs_full)
    zcta_arch_x = h2o.get_frame(acs_model._model_json['output']['representation_name'])
    print(zcta_arch_x)

    acs_zcta_col = acs_orig["ZCTA5"].asfactor()
    idx = ((acs_zcta_col == '10065') |  # Manhattan, NY (Upper East Side)
           (acs_zcta_col == '11219') |  # Manhattan, NY (East Harlem)
           (acs_zcta_col == '66753') |  # McCune, KS
           (acs_zcta_col == '84104') |  # Salt Lake City, UT
           (acs_zcta_col == '94086') |  # Sunnyvale, CA
           (acs_zcta_col == '95014'))   # Cupertino, CA
    print(zcta_arch_x[idx, [0, 1]])
def test4(): df = h2o.upload_file(pyunit_utils.locate("smalldata/jira/pubdev_2020.csv")) splits = df.split_frame(ratios=[0.8], destination_frames=["myf0", "myf1"]) part0 = splits[0] assert part0.frame_id == "myf0" part1 = splits[1] assert part1.frame_id == "myf1"
def link_functions_tweedie_vpow(): # Load example data from HDtweedie, y = aggregate claim loss hdf = h2o.upload_file(pyunit_utils.locate("smalldata/glm_test/auto.csv")) y = "y" x = list(set(hdf.names) - set(["y"])) print("Testing for family: TWEEDIE") print("Create models with canonical link: TWEEDIE") # Iterate over different variance powers for tweedie vpower = [0, 1, 1.5] r_dev = [0.7516627, 0.6708826, 0.7733762] r_null = [221051.88369951, 32296.29783702, 20229.47425307] for ridx, vpow in enumerate(vpower): print("Fit h2o.glm:") h2ofit = H2OGeneralizedLinearEstimator(family="tweedie", link="tweedie", tweedie_variance_power=vpow, tweedie_link_power=1-vpow, alpha=0.5, Lambda=0) h2ofit.train(x=x,y=y, training_frame=hdf) print("Testing Tweedie variance power: {0}".format(vpow)) print("Compare model deviances for link function tweedie") deviance_h2o_tweedie = old_div(h2ofit.residual_deviance(), h2ofit.null_deviance()) assert r_dev[ridx] - deviance_h2o_tweedie <= 0.01, "h2o's residual/null deviance is more than 0.01 lower than " \ "R's. h2o: {0}, r: {1}".format(deviance_h2o_tweedie, r_dev[ridx]) print("compare null and residual deviance between R glm and h2o.glm for tweedie") assert abs(r_null[ridx] - h2ofit.null_deviance()) < 1e-6, "h2o's null deviance is not equal to R's. h2o: {0}, r: " \ "{1}".format(h2ofit.null_deviance(), r_null[ridx])
def glrm_set_loss_by_col(): print("Importing USArrests.csv data...") arrestsH2O = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv")) arrestsPy = np.array(h2o.as_list(arrestsH2O)) arrestsH2O.describe() print("H2O GLRM with loss by column = Absolute, Quadratic, Quadratic, Huber") glrm_h2o = h2o.glrm(x=arrestsH2O, k=3, loss="Quadratic", loss_by_col=["Absolute","Huber"], loss_by_col_idx=[0,3], regularization_x="None", regularization_y="None") glrm_h2o.show() fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y] fit_y_np = np.array(fit_y_np) fit_x = h2o.get_frame(glrm_h2o._model_json['output']['representation_name']) fit_x_np = np.array(h2o.as_list(fit_x)) print("Check final objective function value") fit_xy = np.dot(fit_x_np, fit_y_np) fit_diff = arrestsPy.__sub__(fit_xy) obj_val = np.absolute(fit_diff[:,0]) + np.square(fit_diff[:,1]) + np.square(fit_diff[:,2]) def huber(a): return a*a/2 if abs(a) <= 1 else abs(a)-0.5 huber = np.vectorize(huber) obj_val = obj_val + huber(fit_diff[:,3]) obj_val = np.sum(obj_val) glrm_obj = glrm_h2o._model_json['output']['objective'] assert abs(glrm_obj - obj_val) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(obj_val)
def grid_glrm_iris(): print("Importing iris_wheader.csv data...") irisH2O = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv")) irisH2O.describe() transform_opts = ["NONE", "DEMEAN", "DESCALE", "STANDARDIZE"] k_opts = random.sample(list(range(1,8)),3) size_of_hyper_space = len(transform_opts) * len(k_opts) hyper_parameters = OrderedDict() hyper_parameters["k"] = k_opts hyper_parameters["transform"] = transform_opts gx = random.uniform(0,1) gy = random.uniform(0,1) print("H2O GLRM with , gamma_x = " + str(gx) + ", gamma_y = " + str(gy) +\ ", hyperparameters = " + str(hyper_parameters)) gs = H2OGridSearch(H2OGeneralizedLowRankEstimator(loss="Quadratic", gamma_x=gx, gamma_y=gy), hyper_params=hyper_parameters) gs.train(x=list(range(4)), y=4, training_frame=irisH2O) for model in gs: assert isinstance(model, H2OGeneralizedLowRankEstimator) print(gs.sort_by("mse")) #print gs.hit_ratio_table() assert len(gs) == size_of_hyper_space total_grid_space = list(map(list, itertools.product(*list(hyper_parameters.values())))) for model in gs.models: combo = [model.parms['k']['actual_value']] + [model.parms['transform']['actual_value']] assert combo in total_grid_space total_grid_space.remove(combo)
def offset_bernoulli_cars(): # Connect to a pre-existing cluster cars = h2o.upload_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv")) cars = cars[cars["economy_20mpg"].isna() == 0] cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() offset = h2o.H2OFrame([[.5 for x in range(398)]]) offset.set_names(["x1"]) cars = cars.cbind(offset) gbm = h2o.gbm(x=cars[2:8], y=cars["economy_20mpg"], distribution="bernoulli", ntrees=1, max_depth=1, min_rows=1, learn_rate=1, offset_column="x1", training_frame=cars) predictions = gbm.predict(cars) # Comparison result generated from R's gbm: # gg = gbm(formula = economy_20mpg~cylinders+displacement+power+weight+acceleration+year+offset(rep(.5,398)), # distribution = "bernoulli",data = df,n.trees = 1,interaction.depth = 1,n.minobsinnode = 1,shrinkage = 1, # train.fraction = 1,bag.fraction = 1) # pr = predict.gbm(object = gg,newdata = df,n.trees = 1,type = "link") # pr = 1/(1+exp(-df$x1 - pr)) assert abs(-0.1041234 - gbm._model_json['output']['init_f']) < 1e-6, "expected init_f to be {0}, but got {1}". \ format(-0.1041234, gbm._model_json['output']['init_f']) assert abs(0.577326 - predictions[:,2].mean()[0]) < 1e-6, "expected prediction mean to be {0}, but got {1}". \ format(0.577326, predictions[:,2].mean()[0]) assert abs(0.1621461 - predictions[:,2].min()) < 1e-6, "expected prediction min to be {0}, but got {1}". \ format(0.1621461, predictions[:,2].min()) assert abs(0.8506528 - predictions[:,2].max()) < 1e-6, "expected prediction max to be {0}, but got {1}". \ format(0.8506528, predictions[:,2].max())
def checkCorrectSkips(originalFullFrame, csvfile, skipped_columns, uuidNames): skippedFrameUF = h2o.upload_file(csvfile, skipped_columns=skipped_columns) skippedFrameIF = h2o.import_file(csvfile, skipped_columns=skipped_columns) # this two frames should be the same pyunit_utils.compare_frames_local(skippedFrameUF, skippedFrameIF, prob=0.5) skipCounter = 0 typeDict = originalFullFrame.types frameNames = originalFullFrame.names for cindex in range(len(frameNames)): if cindex not in skipped_columns: if typeDict[frameNames[cindex]] == u'enum': pyunit_utils.compare_frames_local_onecolumn_NA_enum(originalFullFrame[cindex], skippedFrameIF[skipCounter], prob=1, tol=1e-10, returnResult=False) elif typeDict[frameNames[cindex]] == u'string': pyunit_utils.compare_frames_local_onecolumn_NA_string(originalFullFrame[cindex], skippedFrameIF[skipCounter], prob=1, returnResult=False) else: pyunit_utils.compare_frames_local_onecolumn_NA(originalFullFrame[cindex], skippedFrameIF[skipCounter], prob=1, tol=1e-10, returnResult=False) skipCounter = skipCounter + 1 # since we cannot check uuid contents, we at least need to know that the return frame contains the correct column names frameNames.extend(uuidNames) skippedFrameNames = skippedFrameIF.names for skipIndex in skipped_columns: assert frameNames[skipIndex] not in skippedFrameNames, \ "This column: {0}/{1} should have been skipped but is not!".format(frameNames[skipIndex], skipIndex)
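# Hypothetical invocation of the checkCorrectSkips helper above; the file path, skipped
# column indices, and uuid column list are placeholders, not taken from the original suite.
csv_path = "path/to/dataset.csv"                       # placeholder file
full_frame = h2o.import_file(csv_path)                 # frame parsed with no columns skipped
checkCorrectSkips(full_frame, csv_path, [1, 3], [])    # skip columns 1 and 3, no uuid columns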
def link_functions_tweedie_vpow(ip,port): # Connect to h2o h2o.init(ip,port) # Load example data from HDtweedie, y = aggregate claim loss hdf = h2o.upload_file(h2o.locate("smalldata/glm_test/auto.csv")) y = "y" x = list(set(hdf.names()) - set(["y"])) print "Testing for family: TWEEDIE" print "Create models with canonical link: TWEEDIE" # Iterate over different variance powers for tweedie vpower = [0, 1, 1.5] r_dev = [0.7516627, 0.6708826, 0.7733762] r_null = [221051.88369951, 32296.29783702, 20229.47425307] for ridx, vpow in enumerate(vpower): print "Fit h2o.glm:" h2ofit = h2o.glm(x=hdf[x], y=hdf[y], family="tweedie", link="tweedie", tweedie_variance_power=vpow, tweedie_link_power=1-vpow, alpha=[0.5], Lambda=[0]) print "Testing Tweedie variance power: {0}".format(vpow) print "Compare model deviances for link function tweedie" deviance_h2o_tweedie = h2ofit.residual_deviance() / h2ofit.null_deviance() assert r_dev[ridx] - deviance_h2o_tweedie <= 0.01, "h2o's residual/null deviance is more than 0.01 lower than " \ "R's. h2o: {0}, r: {1}".format(deviance_h2o_tweedie, r_dev[ridx]) print "compare null and residual deviance between R glm and h2o.glm for tweedie" assert abs(r_null[ridx] - h2ofit.null_deviance()) < 1e-6, "h2o's null deviance is not equal to R's. h2o: {0}, r: " \ "{1}".format(h2ofit.null_deviance(), r_null[ridx])
def test2(): df = h2o.upload_file(pyunit_utils.locate("smalldata/jira/pubdev_2020.csv")) splits = df.split_frame(ratios=[0.5, 0.25]) assert df.nrow == splits[0].nrow + splits[1].nrow + splits[2].nrow assert splits[0].nrow > 0 assert splits[1].nrow > 0 assert splits[2].nrow > 0
def cars_checkpoint(): cars = h2o.upload_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv")) predictors = ["displacement","power","weight","acceleration","year"] response_col = "economy" # build first model model1 = H2ORandomForestEstimator(ntrees=10,max_depth=2, min_rows=10) model1.train(x=predictors,y=response_col,training_frame=cars) # model1 = h2o.random_forest(x=cars[predictors],y=cars[response_col],ntrees=10,max_depth=2, min_rows=10) # continue building the model model2 = H2ORandomForestEstimator(ntrees=11,max_depth=3, min_rows=9,r2_stopping=0.8, checkpoint=model1._id) model2.train(x=predictors,y=response_col,training_frame=cars) # model2 = h2o.random_forest(x=cars[predictors],y=cars[response_col],ntrees=11,max_depth=3, min_rows=9,r2_stopping=0.8, # checkpoint=model1._id) # erroneous, not MODIFIABLE_BY_CHECKPOINT_FIELDS # PUBDEV-1833 # mtries try: model = H2ORandomForestEstimator(mtries=2,checkpoint=model1._id) model.train(x=predictors,y=response_col,training_frame=cars) # model = h2o.random_forest(y=cars[response_col], x=cars[predictors],mtries=2,checkpoint=model1._id) assert False, "Expected model-build to fail because mtries not modifiable by checkpoint" except EnvironmentError: assert True # sample_rate try: model = H2ORandomForestEstimator(sample_rate=0.5,checkpoint=model1._id) model.train(x=predictors,y=response_col,training_frame=cars) # model = h2o.random_forest(y=cars[response_col], x=cars[predictors],sample_rate=0.5,checkpoint=model1._id) assert False, "Expected model-build to fail because sample_rate not modifiable by checkpoint" except EnvironmentError: assert True # nbins_cats try: model = H2ORandomForestEstimator(sample_rate=0.5,checkpoint=model1._id) model.train(x=predictors,y=response_col,training_frame=cars) # model = h2o.random_forest(y=cars[response_col], x=cars[predictors],nbins_cats=99,checkpoint=model1._id) assert False, "Expected model-build to fail because nbins_cats not modifiable by checkpoint" except EnvironmentError: assert True # nbins try: model = H2ORandomForestEstimator(nbins=99,checkpoint=model1._id) model.train(x=predictors,y=response_col,training_frame=cars) # model = h2o.random_forest(y=cars[response_col], x=cars[predictors],nbins=99,checkpoint=model1._id) assert False, "Expected model-build to fail because nbins not modifiable by checkpoint" except EnvironmentError: assert True # balance_classes try: model = H2ORandomForestEstimator(balance_classes=True,checkpoint=model1._id) model.train(x=predictors,y=response_col,training_frame=cars) # model = h2o.random_forest(y=cars[response_col], x=cars[predictors],balance_classes=True,checkpoint=model1._id) assert False, "Expected model-build to fail because balance_classes not modifiable by checkpoint" except EnvironmentError: assert True # nfolds try: model = H2ORandomForestEstimator(nfolds=3,checkpoint=model1._id) model.train(x=predictors,y=response_col,training_frame=cars) # model = h2o.random_forest(y=cars[response_col], x=cars[predictors],nfolds=3,checkpoint=model1._id) assert False, "Expected model-build to fail because nfolds not modifiable by checkpoint" except EnvironmentError: assert True
def pub_444_spaces_in_filenames(): # tempdir = "smalldata/jira/" # if was okay to write to smalldata, it's okay to write to the current directory # probably don't want to, but can't find what the standard temp directory is supposed to be. no sandbox? tempdir = "./" # make a few files with spaces in the name f1 = open(tests.locate(tempdir) + "foo .csv", "w") f1.write("response, predictor\n") for i in range(10): f1.write("1, a\n") f1.write("0, b\n") f1.write("1, a\n" if random.randint(0, 1) else "0, b\n") f1.close() f2 = open(tests.locate(tempdir) + "b a r .csv", "w") f2.write("response, predictor\n") for i in range(10): f2.write("1, a\n") f2.write("0, b\n") f2.write("1, a\n" if random.randint(0, 1) else "0, b\n") f2.close() f3 = open(tests.locate(tempdir) + " ba z.csv", "w") for i in range(10): f3.write("1, a\n") f3.write("0, b\n") f3.write("1, a\n" if random.randint(0, 1) else "0, b\n") f3.close() train_data = h2o.upload_file(path=tests.locate(tempdir + "foo .csv")) train_data.show() train_data.describe() gbm = h2o.gbm(x=train_data[1:], y=train_data["response"].asfactor(), ntrees=1, distribution="bernoulli", min_rows=1) gbm.show() train_data = h2o.upload_file(path=tests.locate(tempdir + "b a r .csv")) train_data.show() train_data.describe() gbm = h2o.gbm(x=train_data[1:], y=train_data["response"].asfactor(), ntrees=1, distribution="bernoulli", min_rows=1) gbm.show() train_data = h2o.upload_file(path=tests.locate(tempdir + " ba z.csv")) train_data.show() train_data.describe() gbm = h2o.gbm(x=train_data[1:], y=train_data[0].asfactor(), ntrees=1, distribution="bernoulli", min_rows=1) gbm.show() os.remove(tests.locate(tempdir) + "foo .csv") os.remove(tests.locate(tempdir) + "b a r .csv") os.remove(tests.locate(tempdir) + " ba z.csv")
def iris_frame() -> h2o.H2OFrame: frame = h2o.upload_file(_file("iris.csv")) assert frame.names == [ "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species" ] return frame
def milsong_checkpoint(): milsong_train = h2o.upload_file( pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz")) milsong_valid = h2o.upload_file( pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz")) # build first model ntrees1 = random.sample(range(50, 100), 1)[0] max_depth1 = random.sample(range(2, 6), 1)[0] min_rows1 = random.sample(range(10, 16), 1)[0] print "ntrees model 1: {0}".format(ntrees1) print "max_depth model 1: {0}".format(max_depth1) print "min_rows model 1: {0}".format(min_rows1) model1 = h2o.random_forest(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees1, max_depth=max_depth1, min_rows=min_rows1, validation_x=milsong_valid[1:], validation_y=milsong_valid[0], seed=1234) # save the model, then load the model path = pyunit_utils.locate("results") assert os.path.isdir( path), "Expected save directory {0} to exist, but it does not.".format( path) model_path = h2o.save_model(model1, path=path, force=True) assert os.path.isdir( model_path ), "Expected load directory {0} to exist, but it does not.".format( model_path) restored_model = h2o.load_model(model_path) # continue building the model ntrees2 = ntrees1 + 50 max_depth2 = max_depth1 min_rows2 = min_rows1 print "ntrees model 2: {0}".format(ntrees2) print "max_depth model 2: {0}".format(max_depth2) print "min_rows model 2: {0}".format(min_rows2) model2 = h2o.random_forest(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2, max_depth=max_depth2, min_rows=min_rows2, validation_x=milsong_valid[1:], validation_y=milsong_valid[0], checkpoint=restored_model._id, seed=1234) # build the equivalent of model 2 in one shot model3 = h2o.random_forest(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2, max_depth=max_depth2, min_rows=min_rows2, validation_x=milsong_valid[1:], validation_y=milsong_valid[0], seed=1234) assert isinstance(model2, type(model3)) assert model2.mse(valid=True) == model3.mse( valid=True ), "Expected Model 2 MSE: {0} to be the same as Model 4 MSE: {1}".format( model2.mse(valid=True), model3.mse(valid=True))
def test_explanation_single_model_regression(): train = h2o.upload_file( pyunit_utils.locate("smalldata/titanic/titanic_expanded.csv")) y = "fare" # get at most one column from each type cols_to_test = [] for col, typ in train.types.items(): for ctt in cols_to_test: if typ == train.types[ctt] or col == y: break else: cols_to_test.append(col) gbm = H2OGradientBoostingEstimator(seed=1234, model_id="my_awesome_model") gbm.train(y=y, training_frame=train) # test shap summary assert isinstance( gbm.shap_summary_plot(train).figure(), matplotlib.pyplot.Figure) matplotlib.pyplot.close() # test shap explain row assert isinstance( gbm.shap_explain_row_plot(train, 1).figure(), matplotlib.pyplot.Figure) matplotlib.pyplot.close() # test residual analysis assert isinstance( gbm.residual_analysis_plot(train).figure(), matplotlib.pyplot.Figure) matplotlib.pyplot.close() # test pd_plot for col in cols_to_test: try: assert isinstance( gbm.pd_plot(train, col).figure(), matplotlib.pyplot.Figure) except ValueError: assert col == "name", "'name' is a string column which is not supported." # test ICE plot for col in cols_to_test: try: assert isinstance( gbm.ice_plot(train, col).figure(), matplotlib.pyplot.Figure) except ValueError: assert col == "name", "'name' is a string column which is not supported." matplotlib.pyplot.close("all") # test learning curve assert isinstance(gbm.learning_curve_plot().figure(), matplotlib.pyplot.Figure) for metric in ["auto", "deviance", "rmse"]: assert isinstance( gbm.learning_curve_plot(metric=metric.upper()).figure(), matplotlib.pyplot.Figure) assert isinstance( gbm.learning_curve_plot(metric).figure(), matplotlib.pyplot.Figure) matplotlib.pyplot.close("all") # test explain assert isinstance(gbm.explain(train, render=False), H2OExplanation) # test explain row assert isinstance(gbm.explain_row(train, 1, render=False), H2OExplanation)
import h2o
import imp
import math
from h2o.estimators.kmeans import H2OKMeansEstimator

h2o.init()
data = h2o.upload_file("/Users/mac/Downloads/versa.csv")
df = data.as_data_frame()
df.head()
data.describe()

# drop columns that are not used for modeling
cols = ['Flow Key', 'Type', 'Rule', 'Source country', 'Destination country', 'User', 'C26']
df.drop(cols, inplace=True, axis=1)
df.info()

hf = h2o.H2OFrame(df)
hf.describe()

# optional dependencies for downstream analysis/plotting
try:
    imp.find_module('pandas')
    can_pandas = True
    import pandas as pd
except ImportError:
    can_pandas = False
try:
    imp.find_module('seaborn')
    can_seaborn = True
    import seaborn as sns
except ImportError:
    can_seaborn = False
import sys sys.path.insert(1, "../../../") import h2o h2o.init() covtype = h2o.upload_file(h2o.locate("smalldata/covtype/covtype.20k.data")) covtype[54] = covtype[54].asfactor() #dlmodel = h2o.deeplearning(x=covtype[0:54], y=covtype[54], hidden=[17,191], epochs=1, training_frame=covtype, # balance_classes=False, reproducible=True, seed=1234, export_weights_and_biases=True) train = h2o.import_frame(h2o.locate("bigdata/laptop/mnist/train.csv.gz")) predictors = range(100) ae_model = h2o.deeplearning(x=train[predictors], training_frame=train, activation="Tanh", autoencoder=True, hidden=[50], l1=1e-5, ignore_const_cols=False, epochs=1) foo = ae_model.anomaly(covtype) print foo # pros = h2o.upload_file(h2o.locate("smalldata/prostate/prostate.csv.zip")) # pros[1] = pros[1].asfactor() # r = pros[0].runif() # a column of length pros.nrow() with values between 0 and 1 # # ~80/20 train/validation split # pros_train = pros[r > .2] # pros_valid = pros[r <= .2]
def missing_frame() -> h2o.H2OFrame: frame = h2o.upload_file(_file("missing.csv")) assert frame.shape == (40, 3) assert frame.names == ["xCat", "xNum", "response"] return frame
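# Hypothetical pytest-style tests consuming the fixtures above; the assertions only restate
# what the fixtures themselves guarantee and are illustrative, not from the original file.
def test_iris_frame_has_species_column(iris_frame):
    assert "Species" in iris_frame.names       # guaranteed by the iris_frame fixture

def test_missing_frame_shape(missing_frame):
    assert missing_frame.shape == (40, 3)      # guaranteed by the missing_frame fixture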
from io import StringIO

import h2o  # needed for h2o.init() / h2o.upload_file() below
from kafka import KafkaConsumer  # kafka-python client assumed; matches the constructor used below

h2o.init()
h2o.cluster().timezone = "America/Los_Angeles"

# Fetch Airlines Dataset from S3
# Airlines Full Dataset 120 GB
data_path = "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears_10.csv"
# Airlines all years 1987-2008 12GB
data_path = "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears.csv"
# 2000 Row 4.5 MB
data_path = "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears2k.csv"

# df = h2o.import_file(data_path)
#
# Or use local version
df = h2o.upload_file("./datasets/airlines-allyears2k.csv")
column_names = df.names

# Or ingest from Kafka topic
DATA_TOPIC = 'airlines_stream'

consumer = KafkaConsumer(
    DATA_TOPIC,
    # group_id='h2o-airlines-trainer',
    group_id=None,
    auto_offset_reset='earliest',
    value_deserializer=lambda x: x.decode('utf-8'))

pandas_dfs = []

# No of messages to be included in the DataFrame
n = 3000
i = 0
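# Minimal sketch of one way the ingestion loop could continue, assuming each Kafka message
# is a single CSV row whose fields line up with column_names (this continuation is an
# illustration, not the original script).
import pandas as pd  # assumed available for row parsing

for message in consumer:
    # parse one CSV row into a one-row DataFrame and buffer it
    pandas_dfs.append(pd.read_csv(StringIO(message.value), header=None, names=column_names))
    i += 1
    if i >= n:
        break

# combine the buffered rows into a single H2OFrame
if pandas_dfs:
    stream_df = h2o.H2OFrame(pd.concat(pandas_dfs, ignore_index=True))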
def random_attack(ip, port): # Connect to h2o h2o.init(ip, port) def attack(train, valid, x, y): kwargs = {} # randomly select parameters and their corresponding values if random.randint(0, 1): kwargs['mtries'] = random.randint(1, len(x)) if random.randint(0, 1): kwargs['sample_rate'] = random.random() if random.randint(0, 1): kwargs['build_tree_one_node'] = True if random.randint(0, 1): kwargs['ntrees'] = random.randint(1, 10) if random.randint(0, 1): kwargs['max_depth'] = random.randint(1, 5) if random.randint(0, 1): kwargs['min_rows'] = random.randint(1, 10) if random.randint(0, 1): kwargs['nbins'] = random.randint(1, 20) if random.randint(0, 1): kwargs['balance_classes'] = True if random.randint(0, 1): kwargs['max_after_balance_size'] = random.uniform(0, 10) if random.randint(0, 1): kwargs['seed'] = random.randint(1, 10000) do_validation = [True, False][random.randint(0, 1)] # display the parameters and their corresponding values print "-----------------------" print "x: {0}".format(x) print "y: {0}".format(y) print "validation: {0}".format(do_validation) for k, v in zip(kwargs.keys(), kwargs.values()): print k + ": {0}".format(v) if do_validation: h2o.random_forest(x=train[x], y=train[y], validation_x=valid[x], validation_y=valid[y], **kwargs) else: h2o.random_forest(x=train[x], y=train[y], **kwargs) print "-----------------------" print "Import and data munging..." pros = h2o.upload_file(h2o.locate("smalldata/prostate/prostate.csv.zip")) pros[1] = pros[1].asfactor() pros[4] = pros[4].asfactor() pros[5] = pros[5].asfactor() pros[8] = pros[8].asfactor() r = pros[0].runif( ) # a column of length pros.nrow() with values between 0 and 1 # ~80/20 train/validation split pros_train = pros[r > .2] pros_valid = pros[r <= .2] cars = h2o.upload_file(h2o.locate("smalldata/junit/cars.csv")) r = cars[0].runif() cars_train = cars[r > .2] cars_valid = cars[r <= .2] print print "======================================================================" print "============================== Binomial ==============================" print "======================================================================" for i in range(10): attack(pros_train, pros_valid, random.sample([2, 3, 4, 5, 6, 7, 8], random.randint(1, 7)), 1) print print "======================================================================" print "============================== Gaussian ==============================" print "======================================================================" for i in range(10): attack(cars_train, cars_valid, random.sample([2, 3, 4, 5, 6, 7], random.randint(1, 6)), 1) print print "======================================================================" print "============================= Multinomial ============================" print "======================================================================" cars_train[2] = cars_train[2].asfactor() cars_valid[2] = cars_valid[2].asfactor() for i in range(10): attack(cars_train, cars_valid, random.sample([1, 3, 4, 5, 6, 7], random.randint(1, 6)), 2)
def glrm_pubdev_3756_arrest(): print("Importing prostate.csv data...") # frame binary data is read in as enums. Let's see if it runs. prostateF = h2o.upload_file( pyunit_utils.locate("smalldata/prostate/prostate_cat.csv")) prostateF_num = h2o.upload_file( pyunit_utils.locate("smalldata/prostate/prostate_cat.csv")) prostateF_num[0] = prostateF_num[0].asnumeric() prostateF_num[4] = prostateF_num[4].asnumeric() loss_all = [ "Hinge", "Quadratic", "Categorical", "Categorical", "Hinge", "Quadratic", "Quadratic", "Quadratic" ] print("check with init = plusplus") glrm_h2o = H2OGeneralizedLowRankEstimator(k=5, loss_by_col=loss_all, recover_svd=True, transform="STANDARDIZE", seed=12345) glrm_h2o.train(x=prostateF.names, training_frame=prostateF, validation_frame=prostateF) glrm_h2o.show() # exercise logistic loss with numeric columns glrm_h2o_num = H2OGeneralizedLowRankEstimator(k=5, loss_by_col=loss_all, recover_svd=True, transform="STANDARDIZE", seed=12345) glrm_h2o_num.train(x=prostateF_num.names, training_frame=prostateF_num, validation_frame=prostateF_num) glrm_h2o_num.show() print("check with init = random") glrm_h2o = H2OGeneralizedLowRankEstimator(k=5, loss_by_col=loss_all, recover_svd=True, transform="STANDARDIZE", seed=12345, init="random") glrm_h2o.train(x=prostateF.names, training_frame=prostateF, validation_frame=prostateF) glrm_h2o.show() # exercise logistic loss with numeric columns glrm_h2o_num = H2OGeneralizedLowRankEstimator(k=5, loss_by_col=loss_all, recover_svd=True, transform="STANDARDIZE", seed=12345, init="random") glrm_h2o_num.train(x=prostateF_num.names, training_frame=prostateF_num, validation_frame=prostateF_num) glrm_h2o_num.show() print("check with init = SVD") glrm_h2o = H2OGeneralizedLowRankEstimator(k=5, loss_by_col=loss_all, recover_svd=True, transform="STANDARDIZE", seed=12345, init="SVD") glrm_h2o.train(x=prostateF.names, training_frame=prostateF, validation_frame=prostateF) glrm_h2o.show() # exercise logistic loss with numeric columns glrm_h2o_num = H2OGeneralizedLowRankEstimator(k=5, loss_by_col=loss_all, recover_svd=True, transform="STANDARDIZE", seed=12345, init="SVD") glrm_h2o_num.train(x=prostateF_num.names, training_frame=prostateF_num, validation_frame=prostateF_num) glrm_h2o_num.show() print("check with init = user") initial_y = [[ -1.27675647831893E-15, 64.87421383647799, 2.0, 1.0, 2.0816681711721685E-16, 8.533270440251574, 9.380440251572328, 5.886792452830188 ], [ 0.7297297297297298, 66.05405405405405, 2.0, 0.0, 1.0, 23.270270270270274, 9.589189189189193, 7.27027027027027 ], [ 0.01754385964912314, 70.35087719298245, 2.0, 1.0, -1.3877787807814457E-17, 10.078947368421053, 42.37543859649123, 6.157894736842105 ], [0.9, 65.95, 2.0, 0.0, 0.2, 81.94500000000001, 16.375, 7.4], [ 0.9999999999999989, 65.48598130841121, 2.0, 3.0, 1.3877787807814457E-16, 13.3092523364486, 13.268411214953275, 6.747663551401869 ]] initial_y_h2o = h2o.H2OFrame(list(initial_y)) glrm_h2o = H2OGeneralizedLowRankEstimator(k=5, loss_by_col=loss_all, recover_svd=True, transform="STANDARDIZE", seed=12345, init="User", user_y=initial_y_h2o) glrm_h2o.train(x=prostateF.names, training_frame=prostateF, validation_frame=prostateF) glrm_h2o.show() # exercise logistic loss with numeric columns glrm_h2o_num = H2OGeneralizedLowRankEstimator(k=5, loss_by_col=loss_all, recover_svd=True, transform="STANDARDIZE", seed=12345, init="User", user_y=initial_y_h2o) glrm_h2o_num.train(x=prostateF_num.names, training_frame=prostateF_num, validation_frame=prostateF_num) glrm_h2o_num.show() # singular values from 
glrm models should be equal if binary columns with binary loss are read in as either # categorical or numeric. If not, something is wrong. assert pyunit_utils.equal_two_arrays(glrm_h2o._model_json["output"]["singular_vals"], glrm_h2o_num._model_json["output"]["singular_vals"], 1e-6, 1e-4), \ "Singular values obtained from logistic loss with column type as enum and as numeric do not agree. Fix it now." sys.stdout.flush()
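As an aside, the singular-value comparison above relies on the test harness helper pyunit_utils.equal_two_arrays; a minimal numpy-only sketch of the same check is below. The (abs_tol, rel_tol) semantics are an assumption about what the helper verifies, not a copy of its implementation.

import numpy as np

def singular_values_close(vals_a, vals_b, abs_tol=1e-6, rel_tol=1e-4):
    """Return True if two singular-value lists agree element-wise within tolerance."""
    a = np.asarray(vals_a, dtype=float)
    b = np.asarray(vals_b, dtype=float)
    if a.shape != b.shape:
        return False
    return bool(np.all(np.abs(a - b) <= abs_tol + rel_tol * np.abs(b)))

# usage, assuming two trained GLRM models as in the test above:
# assert singular_values_close(glrm_h2o._model_json["output"]["singular_vals"],
#                              glrm_h2o_num._model_json["output"]["singular_vals"])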
def stackedensemble_gaussian_test(): """This test checks the following (for gaussian regression): 1) That H2OStackedEnsembleEstimator executes w/o errors on a 3-model manually constructed ensemble. 2) That .predict() works on a stack. 3) That .model_performance() works on a stack. 4) That the training and test performance is better on the ensemble vs. the base learners. 5) That the validation_frame arg on H2OStackedEnsembleEstimator works correctly. """ col_types = [ "numeric", "numeric", "numeric", "enum", "enum", "numeric", "numeric", "numeric", "numeric" ] dat = h2o.upload_file( path=pyunit_utils.locate("smalldata/extdata/prostate.csv"), destination_frame="prostate_hex", col_types=col_types) train, test = dat.split_frame(ratios=[.8], seed=1) print(train.summary()) # Identify predictors and response x = ["CAPSULE", "GLEASON", "RACE", "DPROS", "DCAPS", "PSA", "VOL"] y = "AGE" # set number of folds nfolds = 5 # train and cross-validate a GBM my_gbm = H2OGradientBoostingEstimator( distribution="gaussian", max_depth=3, learn_rate=0.2, nfolds=nfolds, fold_assignment="Modulo", keep_cross_validation_predictions=True, seed=1) my_gbm.train(x=x, y=y, training_frame=train) # evaluate the performance perf_gbm_train = my_gbm.model_performance(train=True) perf_gbm_test = my_gbm.model_performance(test_data=test) print("GBM training performance: ") print(perf_gbm_train) print("GBM test performance: ") print(perf_gbm_test) # train and cross-validate a RF my_rf = H2ORandomForestEstimator(ntrees=30, nfolds=nfolds, fold_assignment="Modulo", keep_cross_validation_predictions=True, seed=1) my_rf.train(x=x, y=y, training_frame=train) # evaluate performance perf_rf_train = my_rf.model_performance(train=True) perf_rf_test = my_rf.model_performance(test_data=test) print("RF training performance: ") print(perf_rf_train) print("RF test performance: ") print(perf_rf_test) # Train and cross-validate an extremely-randomized RF my_xrf = H2ORandomForestEstimator(ntrees=50, nfolds=nfolds, histogram_type="Random", fold_assignment="Modulo", keep_cross_validation_predictions=True, seed=1) my_xrf.train(x=x, y=y, training_frame=train) # evaluate performance perf_xrf_train = my_xrf.model_performance(train=True) perf_xrf_test = my_xrf.model_performance(test_data=test) print("XRF training performance: ") print(perf_xrf_train) print("XRF test performance: ") print(perf_xrf_test) # Train a stacked ensemble using the GBM, RF and XRF above stack = H2OStackedEnsembleEstimator( model_id="my_ensemble_gaussian", base_models=[my_gbm.model_id, my_rf.model_id, my_xrf.model_id]) stack.train( x=x, y=y, training_frame=train, validation_frame=test) # also test that validation_frame is working # Check that prediction works pred = stack.predict(test_data=test) assert pred.nrow == test.nrow, "expected " + str( pred.nrow) + " to be equal to " + str(test.nrow) assert pred.ncol == 1, "expected " + str( pred.ncol) + " to be equal to 1 but it was equal to " + str(pred.ncol) # Does predict() have ugly side effects? pred = stack.predict(test_data=test) assert pred.nrow == test.nrow, "expected " + str( pred.nrow) + " to be equal to " + str(test.nrow) assert pred.ncol == 1, "expected " + str( pred.ncol) + " to be equal to 1 but it was equal to " + str(pred.ncol) # Evaluate ensemble performance perf_stack_train = stack.model_performance() perf_stack_test = stack.model_performance(test_data=test) # Does performance() have ugly side effects?
perf_stack_train = stack.model_performance() perf_stack_test = stack.model_performance(test_data=test) # Training RMSE for each base learner baselearner_best_rmse_train = min(perf_gbm_train.rmse(), perf_rf_train.rmse(), perf_xrf_train.rmse()) stack_rmse_train = perf_stack_train.rmse() print("Best Base-learner Training RMSE: {0}".format( baselearner_best_rmse_train)) print("Ensemble Training RMSE: {0}".format(stack_rmse_train)) #assert stack_rmse_train < baselearner_best_rmse_train, "expected stack_rmse_train to be less than " \ # "baselearner_best_rmse_train, but it wasn't" # Check that the stack's test performance is better (smaller RMSE) than the best base learner's: # Test RMSE for each base learner baselearner_best_rmse_test = min(perf_gbm_test.rmse(), perf_rf_test.rmse(), perf_xrf_test.rmse()) stack_rmse_test = perf_stack_test.rmse() print( "Best Base-learner Test RMSE: {0}".format(baselearner_best_rmse_test)) print("Ensemble Test RMSE: {0}".format(stack_rmse_test)) assert stack_rmse_test < baselearner_best_rmse_test, "expected stack_rmse_test to be less than " \ "baselearner_best_rmse_test, but it wasn't: " \ "baselearner_best_rmse_test = " + \ str(baselearner_best_rmse_test) + ", stack_rmse_test" \ " = " + str(stack_rmse_test) # Check that passing `test` as a validation_frame produces the same metric as stack.model_performance(test) # since the metrics object is not exactly the same, we can just test that RMSE is the same perf_stack_validation_frame = stack.model_performance(valid=True) assert stack_rmse_test == perf_stack_validation_frame.rmse(), "expected stack_rmse_test to be the same as " \ "perf_stack_validation_frame.rmse(), but they were not: " \ "perf_stack_validation_frame.rmse() = " + \ str(perf_stack_validation_frame.rmse()) + \ ", stack_rmse_test = " + str(stack_rmse_test)
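For reference, a minimal sketch of the stacking recipe this test exercises: the base learners must be cross-validated with the same folds (here nfolds plus fold_assignment="Modulo") and must keep their cross-validation predictions before they can be combined by H2OStackedEnsembleEstimator. The file path, predictors, and parameter values below are illustrative, not a replacement for the test.

import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator

h2o.init()
dat = h2o.upload_file("prostate.csv")  # illustrative path
train, test = dat.split_frame(ratios=[.8], seed=1)
x = ["CAPSULE", "GLEASON", "RACE", "DPROS", "DCAPS", "PSA", "VOL"]
y = "AGE"

# settings every base learner must share so the stack can reuse their CV predictions
common = dict(nfolds=5, fold_assignment="Modulo",
              keep_cross_validation_predictions=True, seed=1)

gbm = H2OGradientBoostingEstimator(distribution="gaussian", **common)
gbm.train(x=x, y=y, training_frame=train)
rf = H2ORandomForestEstimator(ntrees=30, **common)
rf.train(x=x, y=y, training_frame=train)

ensemble = H2OStackedEnsembleEstimator(base_models=[gbm.model_id, rf.model_id])
ensemble.train(x=x, y=y, training_frame=train, validation_frame=test)
print(ensemble.model_performance(test_data=test).rmse())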
def pca_scoring_history_importance(): """ This test aims to check and make sure PCA returns the scoring history and importance which are reported missing for certain PCA mode. Apart from changing the PCA mode, I throw in the transform type to test as well randomly. """ transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"] transformN = transform_types[randint(0, len(transform_types) - 1)] print("Importing australia.csv data...\n") australia = h2o.upload_file( pyunit_utils.locate("smalldata/extdata/australia.csv")) col_indices = list(range(0, australia.ncol)) print("transform is {0}.\n".format(transformN)) # checking out PCA with GramSVD print("@@@@@@ Building PCA with GramSVD...\n") gramSVD = H2OPCA(k=3, transform=transformN) gramSVD.train(x=col_indices, training_frame=australia) # check PCA with PCA set to Randomized print("@@@@@@ Building PCA with Randomized...\n") randomizedPCA = H2OPCA(k=3, transform=transformN, pca_method="Randomized", compute_metrics=True, use_all_factor_levels=True) randomizedPCA.train(x=col_indices, training_frame=australia) # compare singular values and stuff with GramSVD print("@@@@@@ Comparing eigenvalues between GramSVD and Randomized...\n") pyunit_utils.assert_H2OTwoDimTable_equal( gramSVD._model_json["output"]["importance"], randomizedPCA._model_json["output"]["importance"], [ "Standard deviation", "Cumulative Proportion", "Cumulative Proportion" ], tolerance=1e-3) print("@@@@@@ Comparing eigenvectors between GramSVD and Randomized...\n") # compare singular vectors pyunit_utils.assert_H2OTwoDimTable_equal( gramSVD._model_json["output"]["eigenvectors"], randomizedPCA._model_json["output"]["eigenvectors"], randomizedPCA._model_json["output"]["names"], tolerance=5e-2, check_sign=True) # check PCA with PCA set to Power print("@@@@@@ Building PCA with Power...\n") powerPCA = H2OPCA(k=3, transform=transformN, pca_method="Power", compute_metrics=True, use_all_factor_levels=True) powerPCA.train(x=col_indices, training_frame=australia) # compare singular values and stuff with GramSVD print("@@@@@@ Comparing eigenvalues between GramSVD and Power...\n") pyunit_utils.assert_H2OTwoDimTable_equal( gramSVD._model_json["output"]["importance"], powerPCA._model_json["output"]["importance"], [ "Standard deviation", "Cumulative Proportion", "Cumulative Proportion" ]) print("@@@@@@ Comparing eigenvectors between GramSVD and Power...\n") # compare singular vectors pyunit_utils.assert_H2OTwoDimTable_equal( gramSVD._model_json["output"]["eigenvectors"], powerPCA._model_json["output"]["eigenvectors"], powerPCA._model_json["output"]["names"], tolerance=1e-5, check_sign=True) # check PCA with PCA set to GLRM print("@@@@@@ Building PCA with GLRM...\n") glrmPCA = H2OPCA(k=3, transform=transformN, pca_method="GLRM", compute_metrics=True, use_all_factor_levels=True) glrmPCA.train(x=col_indices, training_frame=australia) # compare singular values and stuff with GramSVD print("@@@@@@ Comparing eigenvalues between GramSVD and GLRM...\n") pyunit_utils.assert_H2OTwoDimTable_equal( gramSVD._model_json["output"]["importance"], glrmPCA._model_json["output"]["importance"], [ "Standard deviation", "Cumulative Proportion", "Cumulative Proportion" ], tolerance=1e-2) print("@@@@@@ Comparing eigenvectors between GramSVD and GLRM...\n") # compare singular vectors pyunit_utils.assert_H2OTwoDimTable_equal( gramSVD._model_json["output"]["eigenvectors"], glrmPCA._model_json["output"]["eigenvectors"], glrmPCA._model_json["output"]["names"], tolerance=1e-1, check_sign=True) # make sure we 
find the scoring history and it is not empty for all the PCA modes # just check and make sure the number of cell_values exceeds 0 assert len(gramSVD._model_json["output"]["scoring_history"].cell_values) > 0, "PCA scoring history with " \ "pca_method set to GramSVD is empty." assert len(powerPCA._model_json["output"]["scoring_history"].cell_values) > 0, "PCA scoring history with " \ "pca_method set to Power is empty." assert len(randomizedPCA._model_json["output"]["scoring_history"].cell_values) > 0, "PCA scoring history with " \ "pca_method set to Randomized is " \ "empty." assert len(glrmPCA._model_json["output"]["scoring_history"].cell_values) > 0, "PCA scoring history with " \ "pca_method set to GLRM is empty."
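A stripped-down sketch of the same scoring-history check for a single PCA mode follows. The file path is illustrative (any numeric dataset works), and the import path is the one commonly used by the h2o pyunit tests; the _model_json access pattern mirrors the test above.

import h2o
from h2o.transforms.decomposition import H2OPCA

h2o.init()
frame = h2o.upload_file("australia.csv")  # illustrative path
pca = H2OPCA(k=3, transform="STANDARDIZE", pca_method="Power",
             compute_metrics=True, use_all_factor_levels=True)
pca.train(x=list(range(frame.ncol)), training_frame=frame)

# scoring history should contain at least one scoring event
history = pca._model_json["output"]["scoring_history"]
assert len(history.cell_values) > 0, "PCA scoring history should not be empty"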
def random_attack(): def attack(family, train, valid, x, y): kwargs = {} kwargs['family'] = family gaussian_links = ["inverse", "log", "identity"] binomial_links = ["logit"] poisson_links = ["log", "identity"] gamma_links = ["inverse", "log", "identity"] # randomly select parameters and their corresponding values if random.randint(0, 1): kwargs['max_iterations'] = random.randint(1, 50) if random.random() > 0.8: kwargs['beta_epsilon'] = random.random() if random.randint(0, 1): kwargs['solver'] = [ "AUTO", "IRLSM", "L_BFGS", "COORDINATE_DESCENT_NAIVE", "COORDINATE_DESCENT" ][random.randint(0, 1)] if random.randint(0, 1): kwargs['standardize'] = [True, False][random.randint(0, 1)] if random.randint(0, 1): if family == "gaussian": kwargs['link'] = gaussian_links[random.randint(0, 2)] elif family == "binomial": kwargs['link'] = binomial_links[random.randint(0, 0)] elif family == "poisson": kwargs['link'] = poisson_links[random.randint(0, 1)] elif family == "gamma": kwargs['link'] = gamma_links[random.randint(0, 2)] if random.randint(0, 1): kwargs['alpha'] = [random.random()] if family == "binomial": if random.randint(0, 1): kwargs['prior'] = random.random() if random.randint(0, 1): kwargs['lambda_search'] = [True, False][random.randint(0, 1)] if 'lambda_search' in list(kwargs.keys()): if random.randint(0, 1): kwargs['nlambdas'] = random.randint(2, 10) do_validation = [True, False][random.randint(0, 1)] # beta constraints if random.randint(0, 1): bc = [] for n in x: if train[n].isnumeric(): name = train.names[n] lower_bound = random.uniform(-1, 1) upper_bound = lower_bound + random.random() bc.append([name, lower_bound, upper_bound]) if len(bc) > 0: beta_constraints = h2o.H2OFrame(bc) beta_constraints.set_names( ['names', 'lower_bounds', 'upper_bounds']) kwargs['beta_constraints'] = beta_constraints.frame_id # display the parameters and their corresponding values print("-----------------------") print("x: {0}".format(x)) print("y: {0}".format(y)) print("validation: {0}".format(do_validation)) for k, v in zip(list(kwargs.keys()), list(kwargs.values())): if k == 'beta_constraints': print(k + ": ") beta_constraints.show() else: print(k + ": {0}".format(v)) if do_validation: # h2o.glm(x=train[x], y=train[y], validation_x=valid[x], validation_y=valid[y], **kwargs) H2OGeneralizedLinearEstimator(**kwargs).train( x=x, y=y, training_frame=train, validation_frame=valid) else: # h2o.glm(x=train[x], y=train[y], **kwargs) H2OGeneralizedLinearEstimator(**kwargs).train(x=x, y=y, training_frame=train) print("-----------------------") print("Import and data munging...") seed = random.randint(1, 10000) print("SEED: {0}".format(seed)) pros = h2o.upload_file( pyunit_utils.locate("smalldata/prostate/prostate.csv.zip")) pros[1] = pros[1].asfactor() r = pros[0].runif( seed=seed) # a column of length pros.nrow with values between 0 and 1 # ~80/20 train/validation split pros_train = pros[r > .2] pros_valid = pros[r <= .2] cars = h2o.upload_file(pyunit_utils.locate("smalldata/junit/cars.csv")) r = cars[0].runif(seed=seed) cars_train = cars[r > .2] cars_valid = cars[r <= .2] print() print( "======================================================================" ) print( "============================== Binomial ==============================" ) print( "======================================================================" ) for i in range(10): attack("binomial", pros_train, pros_valid, random.sample([2, 3, 4, 5, 6, 7, 8], random.randint(1, 7)), 1) print() print( 
"======================================================================" ) print( "============================== Gaussian ==============================" ) print( "======================================================================" ) for i in range(10): attack("gaussian", cars_train, cars_valid, random.sample([2, 3, 4, 5, 6, 7], random.randint(1, 6)), 1) print() print( "======================================================================" ) print( "============================== Poisson ==============================" ) print( "======================================================================" ) for i in range(10): attack("poisson", cars_train, cars_valid, random.sample([1, 3, 4, 5, 6, 7], random.randint(1, 6)), 2) print() print( "======================================================================" ) print( "============================== Gamma ==============================" ) print( "======================================================================" ) for i in range(10): attack("gamma", pros_train, pros_valid, random.sample([1, 2, 3, 5, 6, 7, 8], random.randint(1, 7)), 4)
import h2o h2o.init() # load data train_set = h2o.upload_file("train.csv") test_set = h2o.upload_file("test.csv") # define X and y y = "label" X = list(set(train_set.col_names) - set(["label"])) train_set[y] = train_set[y].asfactor() from h2o.estimators import H2ODeepLearningEstimator from h2o.grid.grid_search import H2OGridSearch # grid search with k-fold cross-validation hidden_opt = [[32, 32], [32, 16, 8], [100]] l1_opt = [1e-4, 1e-3] hyper_parameters = {"hidden": hidden_opt, "l1": l1_opt} model_grid = H2OGridSearch(H2ODeepLearningEstimator, hyper_params=hyper_parameters) model_grid.train(x=X, y=y, distribution="multinomial", epochs=1000, training_frame=train_set, nfolds=5, stopping_rounds=3, stopping_tolerance=0.05, stopping_metric="misclassification") # get the best model; the model id below comes from one specific run (see the portable lookup in the sketch after this snippet) gs = model_grid.sort_by("mse") best = h2o.get_model( "Grid_DeepLearning_py_2_model_python_1459310941902_2_model_4") pred = best.predict(test_set)
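Because the hard-coded model id above only exists in the run that produced it, a portable way to pick the best model is to sort the grid by a metric and take the first entry. This sketch assumes model_grid and test_set from the snippet above are still in scope.

# sort the grid by MSE (ascending) and take the best model
sorted_grid = model_grid.get_grid(sort_by="mse", decreasing=False)
best = sorted_grid.models[0]
print(best.model_id)
pred = best.predict(test_set)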
def impute2(ip, port): # Connect to a pre-existing cluster h2o.init(ip, port) prostate = h2o.upload_file( h2o.locate("smalldata/logreg/prostate_missing.csv")) methods = ["mean", "median", "mode"] combine_methods = ["interpolate", "average", "low", "high"] inplace = [False, True] for inpl in inplace: for method in methods: for combine_method in combine_methods: h2o.impute(prostate, "DPROS", method=method, combine_method=combine_method, inplace=inpl) air = h2o.upload_file( h2o.locate("smalldata/airlines/allyears2k_headers.zip")) for inpl in inplace: for method in methods: for combine_method in combine_methods: if method == "mode": h2o.impute(air, "TailNum", method=method, combine_method=combine_method, inplace=inpl) else: try: h2o.impute(air, "TailNum", method=method, combine_method=combine_method, inplace=inpl) assert False, "only \"mode\" method allowed for categorical column, but {0} was allowed here".\ format(method) except ValueError: assert True data = [[None, 2, 3, 1, 'a', 1, 9], [1, None, 4, 2, 'a', 1, 9], [2, 3, None, None, 'b', 1, 9], [3, 4, None, None, 'b', 3, 8], [4, 5, 9, 5, None, 2, 8], [5, None, 10, 7, 'b', None, 8]] h2o_data = h2o.H2OFrame(python_obj=data) # mean check h2o.impute(h2o_data, column="C1", method="mean") c1_imputed = h2o_data[0, 0] assert c1_imputed == 3, "Wrong value imputed. Expected imputed value of 3, but got {0}".format( c1_imputed) # inplace check h2o_data = h2o.H2OFrame(python_obj=data) h2o.impute(h2o_data, column="C1", method="mean", inplace=False) assert h2o_data["C1"].isna().sum( ) == 1, "Expected imputation to be done in place." # median-average h2o_data = h2o.H2OFrame(python_obj=data) h2o.impute(h2o_data, column="C2", method="median", combine_method="average") c2_imputed = h2o_data[1, 1] assert c2_imputed == 3.5, "Wrong value imputed. Expected imputed value of 3.5, but got {0}".format( c2_imputed) # median-low h2o_data = h2o.H2OFrame(python_obj=data) h2o.impute(h2o_data, column="C3", method="median", combine_method="low") c3_imputed = h2o_data[2, 2] assert c3_imputed == 4, "Wrong value imputed. Expected imputed value of 4, but got {0}".format( c3_imputed) # median-high h2o_data = h2o.H2OFrame(python_obj=data) h2o.impute(h2o_data, column="C4", method="median", combine_method="high") c4_imputed = h2o_data[2, 3] assert c4_imputed == 5, "Wrong value imputed. Expected imputed value of 5, but got {0}".format( c4_imputed) # mode-categorical h2o_data = h2o.H2OFrame(python_obj=data) h2o.impute(h2o_data, column="C5", method="mode") c5_imputed = h2o_data[4, 4] assert c5_imputed == 'b', "Wrong value imputed. Expected imputed value of b, but got {0}".format( c5_imputed) # mode-numeric h2o_data = h2o.H2OFrame(python_obj=data) h2o.impute(h2o_data, column="C6", method="mode") c6_imputed = h2o_data[5, 5] assert c6_imputed == 1, "Wrong value imputed. Expected imputed value of 1, but got {0}".format( c6_imputed) # mean-group by C7 h2o_data = h2o.H2OFrame(python_obj=data) h2o.impute(h2o_data, column="C3", method="mean", by=["C7"]) imputed1 = h2o_data[2, 2] imputed2 = h2o_data[3, 2] assert imputed1 == 3.5, "Wrong value imputed. Expected imputed value of 3.5, but got {0}".format( imputed1) assert imputed2 == 9.5, "Wrong value imputed. Expected imputed value of 9.5, but got {0}".format( imputed2)
def glrm_arrests_miss(): missing_ratios = np.arange(0.1, 1, 0.1).tolist() print("Importing USArrests.csv data and saving for validation...") arrests_full = h2o.upload_file( pyunit_utils.locate("smalldata/pca_test/USArrests.csv")) arrests_full.describe() totobs = arrests_full.nrow * arrests_full.ncol train_err = [0] * len(missing_ratios) valid_err = [0] * len(missing_ratios) for i in range(len(missing_ratios)): ratio = missing_ratios[i] print("Importing USArrests.csv and inserting {0}% missing entries". format(100 * ratio)) arrests_miss = h2o.upload_file( pyunit_utils.locate("smalldata/pca_test/USArrests.csv")) arrests_miss = arrests_miss.insert_missing_values(fraction=ratio) arrests_miss.describe() print("H2O GLRM with {0}% missing entries".format(100 * ratio)) arrests_glrm = H2OGeneralizedLowRankEstimator(k=4, ignore_const_cols=False, loss="Quadratic", regularization_x="None", regularization_y="None", init="PlusPlus", max_iterations=10, min_step_size=1e-6) arrests_glrm.train(x=arrests_miss.names, training_frame=arrests_miss, validation_frame=arrests_full) arrests_glrm.show() # Check imputed data and error metrics glrm_obj = arrests_glrm._model_json['output']['objective'] train_numerr = arrests_glrm._model_json['output'][ 'training_metrics']._metric_json['numerr'] train_caterr = arrests_glrm._model_json['output'][ 'training_metrics']._metric_json['caterr'] valid_numerr = arrests_glrm._model_json['output'][ 'validation_metrics']._metric_json['numerr'] valid_caterr = arrests_glrm._model_json['output'][ 'validation_metrics']._metric_json['caterr'] assert abs(train_numerr - glrm_obj ) < 1e-3, "Numeric error on training data was " + str( train_numerr ) + " but should equal final objective " + str(glrm_obj) assert train_caterr == 0, "Categorical error on training data was " + str( train_caterr) + " but should be zero" assert valid_caterr == 0, "Categorical error on validation data was " + str( valid_caterr) + " but should be zero" train_numcnt = arrests_glrm._model_json['output'][ 'training_metrics']._metric_json['numcnt'] valid_numcnt = arrests_glrm._model_json['output'][ 'validation_metrics']._metric_json['numcnt'] assert valid_numcnt > train_numcnt, "Number of non-missing numerical entries in training data should be less than validation data" assert valid_numcnt == totobs, "Number of non-missing numerical entries in validation data was " + str( valid_numcnt) + " but should be " + str(totobs) train_err[i] = train_numerr valid_err[i] = valid_numerr # h2o.remove(arrests_glrm._model_json['output']['loading_key']['name']) for i in range(len(missing_ratios)): print( "Missing ratio: {0}% --> Training error: {1}\tValidation error: {2}" .format(missing_ratios[i] * 100, train_err[i], valid_err[i]))
def glrm_prostate_miss(): missing_ratios = np.arange(0.1, 1, 0.1).tolist() print "Importing prostate_cat.csv data and saving for validation..." prostate_full = h2o.upload_file( pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"), na_strings=["NA"] * 8) prostate_full.describe() totnas = 0 for i in range(prostate_full.ncol): totnas = totnas + prostate_full[i].isna().sum() totobs = prostate_full.nrow * prostate_full.ncol - totnas train_numerr = [0] * len(missing_ratios) valid_numerr = [0] * len(missing_ratios) train_caterr = [0] * len(missing_ratios) valid_caterr = [0] * len(missing_ratios) for i in range(len(missing_ratios)): ratio = missing_ratios[i] print "Importing prostate_cat.csv and inserting {0}% missing entries".format( 100 * ratio) prostate_miss = h2o.upload_file( pyunit_utils.locate("smalldata/prostate/prostate_cat.csv")) prostate_miss = prostate_miss.insert_missing_values(fraction=ratio) prostate_miss.describe() print "H2O GLRM with {0}% missing entries".format(100 * ratio) prostate_glrm = H2OGeneralizedLowRankEstimator(k=8, ignore_const_cols=False, loss="Quadratic", gamma_x=0.5, gamma_y=0.5, regularization_x="L1", regularization_y="L1", init="SVD", max_iterations=2000, min_step_size=1e-6) prostate_glrm.train(x=range(8), training_frame=prostate_miss, validation_frame=prostate_full) prostate_glrm.show() # Check imputed data and error metrics train_numcnt = prostate_glrm._model_json['output'][ 'training_metrics']._metric_json['numcnt'] valid_numcnt = prostate_glrm._model_json['output'][ 'validation_metrics']._metric_json['numcnt'] train_catcnt = prostate_glrm._model_json['output'][ 'training_metrics']._metric_json['catcnt'] valid_catcnt = prostate_glrm._model_json['output'][ 'validation_metrics']._metric_json['catcnt'] assert valid_numcnt >= train_numcnt, "Number of non-missing numeric entries in training data should be less than or equal to validation data" assert valid_catcnt >= train_catcnt, "Number of non-missing categorical entries in training data should be less than or equal to validation data" assert ( train_numcnt + valid_numcnt ) < totobs, "Total non-missing numeric entries in training and validation data was {0}, but should be less than {1}".format( train_numcnt + valid_numcnt, totobs) assert ( valid_numcnt + valid_catcnt ) == totobs, "Number of non-missing entries in validation data was {0}, but should be {1}".format( valid_numcnt + valid_catcnt, totobs) train_numerr[i] = prostate_glrm._model_json['output'][ 'training_metrics']._metric_json['numerr'] valid_numerr[i] = prostate_glrm._model_json['output'][ 'validation_metrics']._metric_json['numerr'] train_caterr[i] = prostate_glrm._model_json['output'][ 'training_metrics']._metric_json['caterr'] valid_caterr[i] = prostate_glrm._model_json['output'][ 'validation_metrics']._metric_json['caterr'] # h2o.remove(prostate_glrm._model_json['output']['loading_key']['name']) for i in range(len(missing_ratios)): print "Missing ratio: {0}% --> Training numeric error: {1}\tValidation numeric error: {2}".format( missing_ratios[i] * 100, train_numerr[i], valid_numerr[i]) for i in range(len(missing_ratios)): print "Missing ratio: {0}% --> Training categorical error: {1}\tValidation categorical error: {2}".format( missing_ratios[i] * 100, train_caterr[i], valid_caterr[i])
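Both GLRM missing-data tests above follow the same pattern: corrupt a copy of the data with insert_missing_values() and validate the fit against the intact frame passed as validation_frame. A minimal sketch of that pattern, with an illustrative path and parameters, is below.

import h2o
from h2o.estimators.glrm import H2OGeneralizedLowRankEstimator

h2o.init()
full = h2o.upload_file("USArrests.csv")   # intact copy, used for validation (illustrative path)
miss = h2o.upload_file("USArrests.csv")
miss = miss.insert_missing_values(fraction=0.2, seed=1234)  # corrupt 20% of the entries

glrm = H2OGeneralizedLowRankEstimator(k=4, loss="Quadratic", max_iterations=10)
glrm.train(x=miss.names, training_frame=miss, validation_frame=full)

# numeric reconstruction error on the held-out (intact) frame
print(glrm._model_json["output"]["validation_metrics"]._metric_json["numerr"])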
return "../examples/smalldata/" + example_name conf = SparkConf().setIfMissing("spark.master", os.getenv("spark.master", "local[*]")) spark = SparkSession.builder.appName("ChicagoCrimeTest").config( conf=conf).getOrCreate() # Start H2O services h2oContext = H2OContext.getOrCreate(spark) # Define file names chicagoAllWeather = "chicagoAllWeather.csv" chicagoCensus = "chicagoCensus.csv" chicagoCrimes10k = "chicagoCrimes10k.csv" # h2o.import_file expects cluster-relative path f_weather = h2o.upload_file(_locate(chicagoAllWeather)) f_census = h2o.upload_file(_locate(chicagoCensus)) f_crimes = h2o.upload_file(_locate(chicagoCrimes10k), col_types={"Date": "string"}) # Transform weather table # Remove 1st column (date) f_weather = f_weather[1:] # Transform census table # Remove all spaces from column names (causing problems in Spark SQL) col_names = map(lambda s: s.strip().replace(' ', '_').replace('+', '_'), f_census.col_names) # Update column names in the table # f_weather.names = col_names
def gbm_demo(interactive, echo, test): h2o_data_path = system_file("prostate.csv") demo_description = ['\n-----------------------------------------------------------------', 'This is a demo of H2O\'s GBM function.', 'It uploads a dataset to h2o, parses it, and shows a description.', 'Then, it divides the dataset into training and test sets, ', 'builds a GBM from the training set, and predicts on the test set.', 'Finally, default performance metrics are displayed.', '-----------------------------------------------------------------'] demo_commands = ['# Connect to h2o', '>>> h2o.init()\n', '\n# Upload the prostate dataset that comes included in the h2o python package', '>>> prostate = h2o.upload_file(path = ' + h2o_data_path + '))\n', '\n# Print a description of the prostate data', '>>> prostate.summary()\n', '\n# Randomly split the dataset into ~70/30, training/test sets', '>>> r = prostate[0].runif()', '>>> train = prostate[r < 0.70]', '>>> valid = prostate[r >= 0.30]\n', '\n# Convert the response columns to factors (for binary classification problems)', '>>> train["CAPSULE"] = train["CAPSULE"].asfactor()', '>>> test["CAPSULE"] = test["CAPSULE"].asfactor()\n', '\n# Build a (classification) GBM', '>>> prostate_gbm = h2o.gbm(x=train[["AGE", "RACE", "PSA", "VOL", "GLEASON"]], ' 'y=train["CAPSULE"], distribution="bernoulli", ntrees=10, max_depth=8, min_rows=10, ' 'learn_rate=0.2)\n', '\n# Show the model', '>>> prostate_gbm.show()\n', '\n# Predict on the test set and show the first ten predictions', '>>> predictions = prostate_gbm.predict(test)', '>>> predictions.show()\n', '\n# Show default performance metrics', '>>> performance = prostate_gbm.model_performance(test)', '>>> performance.show()\n'] for line in demo_description: print line print echo_and_interact(demo_commands, interactive, echo) if not test: h2o.init() echo_and_interact(demo_commands, interactive, echo) prostate = h2o.upload_file(path = h2o_data_path) echo_and_interact(demo_commands, interactive, echo) prostate.summary() echo_and_interact(demo_commands, interactive, echo, npop=4) r = prostate[0].runif() train = prostate[r < 0.70] test = prostate[r >= 0.30] echo_and_interact(demo_commands, interactive, echo, npop=3) train["CAPSULE"] = train["CAPSULE"].asfactor() test["CAPSULE"] = test["CAPSULE"].asfactor() echo_and_interact(demo_commands, interactive, echo) prostate_gbm = h2o.gbm(x=train[["AGE", "RACE", "PSA", "VOL", "GLEASON"]], y=train["CAPSULE"], distribution="bernoulli", ntrees=10, max_depth=8, min_rows=10, learn_rate=0.2) echo_and_interact(demo_commands, interactive, echo) prostate_gbm.show() echo_and_interact(demo_commands, interactive, echo, npop=3) predictions = prostate_gbm.predict(test) predictions.show() echo_and_interact(demo_commands, interactive, echo, npop=3) performance = prostate_gbm.model_performance(test) performance.show()
def deeplearning_demo(interactive, echo, test): h2o_data_path = system_file("prostate.csv") demo_description = ['\n-----------------------------------------------------------------', 'This is a demo of H2O\'s Deeplearning function.', 'It uploads a dataset to h2o, parses it, and shows a description.', 'Then, it divides the dataset into training and test sets, ', 'builds a model from the training set, and predicts on the test set.', 'Finally, default performance metrics are displayed.', '-----------------------------------------------------------------'] demo_commands = ['# Connect to h2o', '>>> h2o.init()\n', '\n# Upload the prostate dataset that comes included in the h2o python package', '>>> prostate = h2o.upload_file(path = ' + h2o_data_path + '))\n', '\n# Print a description of the prostate data', '>>> prostate.summary()\n', '\n# Randomly split the dataset into ~70/30, training/test sets', '>>> r = prostate[0].runif()', '>>> train = prostate[r < 0.70]', '>>> valid = prostate[r >= 0.30]\n', '\n# Convert the response columns to factors (for binary classification problems)', '>>> train["CAPSULE"] = train["CAPSULE"].asfactor()', '>>> test["CAPSULE"] = test["CAPSULE"].asfactor()\n', '\n# Build a (classification) Deeplearning model', '>>> prostate_dl = h2o.deeplearning(x=train[list(set(prostate.col_names)-set(["ID","CAPSULE"]))]' ', y=train["CAPSULE"], activation="Tanh", hidden=[10, 10, 10], epochs=10000)\n', '\n# Show the model', '>>> prostate_dl.show()\n', '\n# Predict on the test set and show the first ten predictions', '>>> predictions = prostate_dl.predict(test)', '>>> predictions.show()\n', '\n# Show default performance metrics', '>>> performance = prostate_dl.model_performance(test)', '>>> performance.show()\n'] for line in demo_description: print line print echo_and_interact(demo_commands, interactive, echo) if not test: h2o.init() echo_and_interact(demo_commands, interactive, echo) prostate = h2o.upload_file(path = h2o_data_path) echo_and_interact(demo_commands, interactive, echo) prostate.summary() echo_and_interact(demo_commands, interactive, echo, npop=4) r = prostate[0].runif() train = prostate[r < 0.70] test = prostate[r >= 0.30] echo_and_interact(demo_commands, interactive, echo, npop=3) train["CAPSULE"] = train["CAPSULE"].asfactor() test["CAPSULE"] = test["CAPSULE"].asfactor() echo_and_interact(demo_commands, interactive, echo) prostate_dl = h2o.deeplearning(x=train[list(set(prostate.col_names)-set(["ID","CAPSULE"]))], y=train["CAPSULE"], activation="Tanh", hidden=[10, 10, 10], epochs=10000) echo_and_interact(demo_commands, interactive, echo) prostate_dl.show() echo_and_interact(demo_commands, interactive, echo, npop=3) predictions = prostate_dl.predict(test) predictions.show() echo_and_interact(demo_commands, interactive, echo, npop=3) performance = prostate_dl.model_performance(test) performance.show()
def weights_and_biases(): print( "Test checks if Deep Learning weights and biases are accessible from R" ) covtype = h2o.upload_file( pyunit_utils.locate("smalldata/covtype/covtype.20k.data")) covtype[54] = covtype[54].asfactor() dlmodel = H2ODeepLearningEstimator(hidden=[17, 191], epochs=1, balance_classes=False, reproducible=True, seed=1234, export_weights_and_biases=True) dlmodel.train(x=list(range(54)), y=54, training_frame=covtype) print(dlmodel) weights1 = dlmodel.weights(0) weights2 = dlmodel.weights(1) weights3 = dlmodel.weights(2) biases1 = dlmodel.biases(0) biases2 = dlmodel.biases(1) biases3 = dlmodel.biases(2) w1c = weights1.ncol w1r = weights1.nrow assert w1c == 52, "wrong dimensionality! expected {0}, but got {1}.".format( 52, w1c) assert w1r == 17, "wrong dimensionality! expected {0}, but got {1}.".format( 17, w1r) w2c = weights2.ncol w2r = weights2.nrow assert w2c == 17, "wrong dimensionality! expected {0}, but got {1}.".format( 17, w2c) assert w2r == 191, "wrong dimensionality! expected {0}, but got {1}.".format( 191, w2r) w3c = weights3.ncol w3r = weights3.nrow assert w3c == 191, "wrong dimensionality! expected {0}, but got {1}.".format( 191, w3c) assert w3r == 7, "wrong dimensionality! expected {0}, but got {1}.".format( 7, w3r) b1c = biases1.ncol b1r = biases1.nrow assert b1c == 1, "wrong dimensionality! expected {0}, but got {1}.".format( 1, b1c) assert b1r == 17, "wrong dimensionality! expected {0}, but got {1}.".format( 17, b1r) b2c = biases2.ncol b2r = biases2.nrow assert b2c == 1, "wrong dimensionality! expected {0}, but got {1}.".format( 1, b2c) assert b2r == 191, "wrong dimensionality! expected {0}, but got {1}.".format( 191, b2r) b3c = biases3.ncol b3r = biases3.nrow assert b3c == 1, "wrong dimensionality! expected {0}, but got {1}.".format( 1, b3c) assert b3r == 7, "wrong dimensionality! expected {0}, but got {1}.".format( 7, b3r) df = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris.csv")) dl1 = H2ODeepLearningEstimator(hidden=[10, 10], export_weights_and_biases=True) dl1.train(x=list(range(4)), y=4, training_frame=df) p1 = dl1.predict(df) ll1 = dl1.model_performance(df).logloss() print(ll1) ## get weights and biases w1 = dl1.weights(0) w2 = dl1.weights(1) w3 = dl1.weights(2) b1 = dl1.biases(0) b2 = dl1.biases(1) b3 = dl1.biases(2) ## make a model from given weights/biases dl2 = H2ODeepLearningEstimator(hidden=[10, 10], initial_weights=[w1, w2, w3], initial_biases=[b1, b2, b3], epochs=0) dl2.train(x=list(range(4)), y=4, training_frame=df) p2 = dl2.predict(df) ll2 = dl2.model_performance(df).logloss() print(ll2) # h2o.download_pojo(dl2) ## fully functional pojo ## check consistency assert abs(p1[:, 1:4] - p2[:, 1:4]).max() < 1e-6 assert abs(ll2 - ll1) < 1e-6 ## make another model with partially set weights/biases dl3 = H2ODeepLearningEstimator(hidden=[10, 10], initial_weights=[w1, None, w3], initial_biases=[b1, b2, None], epochs=10) dl3.train(x=list(range(4)), y=4, training_frame=df) ll3 = dl3.model_performance(df).logloss() ## make another model with partially set user-modified weights/biases dl4 = H2ODeepLearningEstimator( hidden=[10, 10], initial_weights=[w1 * 1.1, w2 * 0.9, w3.sqrt()], initial_biases=[b1, b2, None], epochs=10) dl4.train(x=list(range(4)), y=4, training_frame=df) ll4 = dl4.model_performance(df).logloss()
def deeplearning_autoencoder(): resp = 784 nfeatures = 20 # number of features (smallest hidden layer) train_hex = h2o.upload_file( pyunit_utils.locate("bigdata/laptop/mnist/train.csv.gz")) train_hex[resp] = train_hex[resp].asfactor() test_hex = h2o.upload_file( pyunit_utils.locate("bigdata/laptop/mnist/test.csv.gz")) test_hex[resp] = test_hex[resp].asfactor() # split data into two parts sid = train_hex[0].runif(0) # unsupervised data for autoencoder train_unsupervised = train_hex[sid >= 0.5] train_unsupervised.pop(resp) #train_unsupervised.describe() # supervised data for drf train_supervised = train_hex[sid < 0.5] #train_supervised.describe() # train autoencoder ae_model = H2OAutoEncoderEstimator(activation="Tanh", hidden=[nfeatures], model_id="ae_model", epochs=1, ignore_const_cols=False, reproducible=True, seed=1234) ae_model.train(list(range(resp)), training_frame=train_unsupervised) # convert train_supervised with autoencoder to lower-dimensional space train_supervised_features = ae_model.deepfeatures(train_supervised[0:resp], 0) assert train_supervised_features.ncol == nfeatures, "Dimensionality of reconstruction is wrong!" train_supervised_features = train_supervised_features.cbind( train_supervised[resp]) # Train DRF on extracted feature space drf_model = H2ORandomForestEstimator(ntrees=10, min_rows=10, seed=1234) drf_model.train(x=list(range(20)), y=train_supervised_features.ncol - 1, training_frame=train_supervised_features) # Test the DRF model on the test set (processed through deep features) test_features = ae_model.deepfeatures(test_hex[0:resp], 0) test_features = test_features.cbind(test_hex[resp]) # Confusion Matrix and assertion cm = drf_model.confusion_matrix(test_features) cm.show() # 8.8% error +/- 1% #compare to runit_deeplearning_autoencoder_large.py assert abs(cm.cell_values[10][10] - 0.088) < 0.01, "Error. Expected 0.088, but got {0}".format( cm.cell_values[10][10]) ## Another usecase: Use pretrained unsupervised autoencoder model to initialize a supervised Deep Learning model pretrained_model = H2ODeepLearningEstimator( activation="Tanh", hidden=[nfeatures], epochs=1, reproducible=True, seed=1234, ignore_const_cols=False, pretrained_autoencoder="ae_model") pretrained_model.train(list(range(resp)), resp, training_frame=train_supervised, validation_frame=test_hex) print(pretrained_model.logloss(train=False, valid=True)) model_from_scratch = H2ODeepLearningEstimator(activation="Tanh", hidden=[nfeatures], epochs=1, reproducible=True, seed=1234, ignore_const_cols=False) model_from_scratch.train(list(range(resp)), resp, training_frame=train_supervised, validation_frame=test_hex) print(model_from_scratch.logloss(train=False, valid=True)) assert pretrained_model.logloss( train=False, valid=True ) < model_from_scratch.logloss( train=False, valid=True ), "Error. Pretrained model should lead to lower logloss than training from scratch."
def glrm_set_loss_by_col_rand(): NUM_LOSS = ["Quadratic", "Absolute", "Huber", "Poisson", "Periodic"] CAT_LOSS = ["Categorical", "Ordinal"] NUM_COLS = [1, 5, 6, 7] CAT_COLS = [0, 2, 3, 4] print "Importing prostate_cat.csv data..." prostateH2O = h2o.upload_file( pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"), na_strings=["NA"] * 8) prostateH2O.describe() # Fully specify every column's loss function (no need for loss_by_col_idx) loss_all = [ rd.sample(NUM_LOSS, k=1)[0] if x in NUM_COLS else rd.sample(CAT_LOSS, k=1)[0] for x in xrange(0, 8) ] print "Run GLRM with loss_by_col = [" + ', '.join(loss_all) + "]" glrm_h2o = h2o.glrm(x=prostateH2O, k=5, loss_by_col=loss_all) glrm_h2o.show() # Randomly set columns and loss functions cat_size = rd.sample(xrange(1, 5), 1) num_size = rd.sample(xrange(1, 5), 1) cat_idx = np.random.choice(CAT_COLS, size=cat_size, replace=False) num_idx = np.random.choice(NUM_COLS, size=num_size, replace=False) loss_by_col_cat = np.random.choice(CAT_LOSS, size=cat_size, replace=True) loss_by_col_num = np.random.choice(NUM_LOSS, size=num_size, replace=True) loss_idx_all = cat_idx.tolist() + num_idx.tolist() loss_all = loss_by_col_cat.tolist() + loss_by_col_num.tolist() loss_combined = zip( loss_all, loss_idx_all) # Permute losses and indices in same way for testing rd.shuffle(loss_combined) loss_all[:], loss_idx_all[:] = zip(*loss_combined) if (len(loss_all) < prostateH2O.ncol): try: m = H2OGeneralizedLowRankEstimator h2o.glrm(x=prostateH2O, k=5, loss_by_col=loss_all) assert False, "Expected GLRM to throw error since column indices not specified" except: pass try: h2o.glrm(x=prostateH2O, k=5, loss_by_col_idx=loss_idx_all) assert False, "Expected GLRM to throw error since losses for columns not specified" except: pass try: h2o.glrm(x=prostateH2O, k=5, loss_by_col=["Absolute", "Ordinal", "Huber"], loss_by_col_idx=[1, 2]) assert False, "Expected GLRM to throw error since not all column indices specified" except: pass try: h2o.glrm(x=prostateH2O, k=5, loss_by_col=["Absolute", "Ordinal"], loss_by_col_idx=[1, 2, 5]) assert False, "Expected GLRM to throw error since not all losses for columns specified" except: pass try: h2o.glrm(x=prostateH2O, k=5, loss_by_col="Absolute", loss_by_col_idx=8) assert False, "Expected GLRM to throw error since column index 8 is out of bounds (zero indexing)" except: pass try: h2o.glrm(x=prostateH2O, k=5, loss_by_col=rd.sample(NUM_LOSS, 1), loss_by_col_idx=rd.sample(CAT_COLS, 1)) assert False, "Expected GLRM to throw error since numeric loss cannot apply to categorical column" except: pass try: h2o.glrm(x=prostateH2O, k=5, loss_by_col=rd.sample(CAT_LOSS, 1), loss_by_col_idx=rd.sample(NUM_COLS, 1)) assert False, "Expected GLRM to throw error since categorical loss cannot apply to numeric column" except: pass print "Run GLRM with loss_by_col = [" + ', '.join( loss_all) + "] and loss_by_col_idx = [" + ', '.join( [str(a) for a in loss_idx_all]) + "]" glrm_h2o = h2o.glrm(x=prostateH2O, k=5, loss_by_col=loss_all, loss_by_col_idx=loss_idx_all) glrm_h2o.show()
def impute2(): # Connect to a pre-existing cluster prostate = h2o.upload_file( pyunit_utils.locate("smalldata/logreg/prostate_missing.csv")) methods = ["mean", "median", "mode"] combine_methods = ["interpolate", "average", "low", "high"] inplace = [False, True] for method in methods: for combine_method in combine_methods: prostate.impute("DPROS", method=method, combine_method=combine_method) # air = h2o.upload_file(pyunit_utils.locate("smalldata/airlines/allyears2k_headers.zip")) # for inpl in inplace: # for method in methods: # for combine_method in combine_methods: # air.impute( "TailNum", method = method, combine_method = combine_method) data = [[None, 2, 3, 1, 'a', 1, 9], [1, None, 4, 2, 'a', 1, 9], [2, 3, None, None, 'b', 1, 9], [3, 4, None, None, 'b', 3, 8], [4, 5, 9, 5, None, 2, 8], [5, None, 10, 7, 'b', None, 8]] h2o_data = h2o.H2OFrame(zip(*data)) # mean check h2o_data = h2o_data.impute(column="C1", method="mean") c1_imputed = h2o_data[0, 0] assert c1_imputed == 3, "Wrong value imputed. Expected imputed value of 3, but got {0}".format( c1_imputed) # inplace check h2o_data = h2o.H2OFrame(zip(*data)) h2o_data.impute(column="C1", method="mean") assert h2o_data["C1"].isna().sum( ) == 1, "Expected imputation to be done in place." # median-average h2o_data = h2o.H2OFrame(zip(*data)) h2o_data = h2o_data.impute(column="C2", method="median", combine_method="average") c2_imputed = h2o_data[1, 1] assert c2_imputed == 3.5, "Wrong value imputed. Expected imputed value of 3.5, but got {0}".format( c2_imputed) # median-low h2o_data = h2o.H2OFrame(zip(*data)) h2o_data = h2o_data.impute(column="C3", method="median", combine_method="low") c3_imputed = h2o_data[2, 2] assert c3_imputed == 4, "Wrong value imputed. Expected imputed value of 4, but got {0}".format( c3_imputed) # median-high h2o_data = h2o.H2OFrame(zip(*data)) h2o_data = h2o_data.impute(column="C4", method="median", combine_method="high") c4_imputed = h2o_data[2, 3] assert c4_imputed == 5, "Wrong value imputed. Expected imputed value of 5, but got {0}".format( c4_imputed) # mode-categorical h2o_data = h2o.H2OFrame.from_python(zip(*data), na_strings=['']) h2o_data = h2o_data.impute(column="C5", method="mode") c5_imputed = h2o_data[4, 4] assert c5_imputed == 'b', "Wrong value imputed. Expected imputed value of b, but got {0}".format( c5_imputed) # mode-numeric h2o_data = h2o.H2OFrame(zip(*data)) h2o_data = h2o_data.impute(column="C6", method="mode") c6_imputed = h2o_data[5, 5] assert c6_imputed == 1, "Wrong value imputed. Expected imputed value of 1, but got {0}".format( c6_imputed) # mean-group by C7 h2o_data = h2o.H2OFrame(zip(*data)) h2o_data = h2o_data.impute(column="C3", method="mean", by="C7") imputed1 = h2o_data[2, 2] imputed2 = h2o_data[3, 2] assert imputed1 == 3.5, "Wrong value imputed. Expected imputed value of 3.5, but got {0}".format( imputed1) assert imputed2 == 9.5, "Wrong value imputed. Expected imputed value of 9.5, but got {0}".format( imputed2)
def test_explanation_list_of_models_regression(): train = h2o.upload_file( pyunit_utils.locate("smalldata/titanic/titanic_expanded.csv")) y = "fare" # get at most one column from each type cols_to_test = [] for col, typ in train.types.items(): for ctt in cols_to_test: if typ == train.types[ctt] or col == y: break else: cols_to_test.append(col) aml = H2OAutoML(seed=1234, max_models=5) aml.train(y=y, training_frame=train) models = [ h2o.get_model(m[0]) for m in aml.leaderboard["model_id"].as_data_frame(use_pandas=False, header=False) ] # Test named models as well gbm = H2OGradientBoostingEstimator(model_id="my_awesome_model") gbm.train(y=y, training_frame=train) models += [gbm] # test variable importance heatmap plot assert isinstance( h2o.varimp_heatmap(models).figure(), matplotlib.pyplot.Figure) matplotlib.pyplot.close() # test model correlation heatmap plot assert isinstance( h2o.model_correlation_heatmap(models, train).figure(), matplotlib.pyplot.Figure) matplotlib.pyplot.close() # test partial dependences for col in cols_to_test: try: assert isinstance( h2o.pd_multi_plot(models, train, col).figure(), matplotlib.pyplot.Figure) except ValueError: assert col == "name", "'name' is a string column which is not supported." matplotlib.pyplot.close("all") # test learning curve for model in models: assert isinstance(model.learning_curve_plot().figure(), matplotlib.pyplot.Figure) matplotlib.pyplot.close("all") # test explain assert isinstance(h2o.explain(models, train, render=False), H2OExplanation) # test explain row assert isinstance(h2o.explain_row(models, train, 1, render=False), H2OExplanation)
def test_explanation_automl_regression(): train = h2o.upload_file(pyunit_utils.locate("smalldata/titanic/titanic_expanded.csv")) train["name"] = train["name"].asfactor() y = "fare" # get at most one column from each type cols_to_test = [] for col, typ in train.types.items(): for ctt in cols_to_test: if typ == train.types[ctt] or col == y: break else: cols_to_test.append(col) aml = H2OAutoML(seed=1234, max_models=5) aml.train(y=y, training_frame=train) # test variable importance heatmap plot assert isinstance(aml.varimp_heatmap().figure(), matplotlib.pyplot.Figure) matplotlib.pyplot.close() assert len(aml.varimp(use_pandas=False)) == 3 # numpy.ndarray, colnames, rownames assert isinstance(aml.varimp(use_pandas=True), pandas.DataFrame) # test model correlation heatmap plot assert isinstance(aml.model_correlation_heatmap(train).figure(), matplotlib.pyplot.Figure) matplotlib.pyplot.close() assert len(aml.model_correlation(train, use_pandas=False)) == 2 # numpy.ndarray, colnames and rownames both in the same order => represented by just one vector assert isinstance(aml.model_correlation(train, use_pandas=True), pandas.DataFrame) # test partial dependences for col in cols_to_test: try: assert isinstance(aml.pd_multi_plot(train, col).figure(), matplotlib.pyplot.Figure) except ValueError: assert col == "name", "'name' is a string column which is not supported." matplotlib.pyplot.close("all") # test explain assert isinstance(aml.explain(train, render=False), H2OExplanation) # test explain row assert isinstance(aml.explain_row(train, 1, render=False), H2OExplanation) # test shortening model ids work correctly from h2o.explanation._explain import _shorten_model_ids model_ids = aml.leaderboard.as_data_frame()["model_id"] shortened_model_ids = _shorten_model_ids(model_ids) assert len(set(model_ids)) == len(set(shortened_model_ids)) for i in range(len(model_ids)): assert len(model_ids[i]) > len(shortened_model_ids[i]) # Leaderboard slices work # test explain assert isinstance(h2o.explain(aml.leaderboard[~aml.leaderboard["model_id"].grep("^Stacked", output_logical=True), :], train, render=False), H2OExplanation) # test explain row assert isinstance(h2o.explain_row(aml.leaderboard[~aml.leaderboard["model_id"].grep("^Stacked", output_logical=True), :], train, 1, render=False), H2OExplanation)
def test_explanation_automl_binomial_classification(): train = h2o.upload_file(pyunit_utils.locate("smalldata/logreg/prostate.csv")) y = "CAPSULE" train[y] = train[y].asfactor() # get at most one column from each type cols_to_test = [] for col, typ in train.types.items(): for ctt in cols_to_test: if typ == train.types[ctt] or col == y: break else: cols_to_test.append(col) aml = H2OAutoML(seed=1234, max_models=5) aml.train(y=y, training_frame=train) # test variable importance heatmap plot assert isinstance(aml.varimp_heatmap().figure(), matplotlib.pyplot.Figure) matplotlib.pyplot.close() assert len(aml.varimp(use_pandas=False)) == 3 # numpy.ndarray, colnames, rownames assert isinstance(aml.varimp(use_pandas=True), pandas.DataFrame) # test model correlation heatmap plot assert isinstance(aml.model_correlation_heatmap(train).figure(), matplotlib.pyplot.Figure) matplotlib.pyplot.close() assert len(aml.model_correlation(train, use_pandas=False)) == 2 # numpy.ndarray, colnames and rownames both in the same order => represented by just one vector assert isinstance(aml.model_correlation(train, use_pandas=True), pandas.DataFrame) # test partial dependences for col in cols_to_test: assert isinstance(aml.pd_multi_plot(train, col).figure(), matplotlib.pyplot.Figure) matplotlib.pyplot.close() # test explain assert isinstance(aml.explain(train, render=False), H2OExplanation) # test explain row assert isinstance(aml.explain_row(train, 1, render=False), H2OExplanation) # Leaderboard slices work # test variable importance heatmap plot assert isinstance(aml.varimp_heatmap().figure(), matplotlib.pyplot.Figure) matplotlib.pyplot.close() leaderboard_without_SE = aml.leaderboard[~aml.leaderboard["model_id"].grep("^Stacked", output_logical=True), :] assert len(h2o.explanation.varimp(leaderboard_without_SE, use_pandas=False)) == 3 # numpy.ndarray, colnames, rownames assert isinstance(h2o.explanation.varimp(leaderboard_without_SE, use_pandas=True), pandas.DataFrame) # test model correlation heatmap plot assert isinstance(h2o.model_correlation_heatmap(leaderboard_without_SE, train).figure(), matplotlib.pyplot.Figure) matplotlib.pyplot.close() assert len(h2o.explanation.model_correlation(leaderboard_without_SE, train, use_pandas=False)) == 2 # numpy.ndarray, colnames and rownames both in the same order => represented by just one vector assert isinstance(h2o.explanation.model_correlation(leaderboard_without_SE, train, use_pandas=True), pandas.DataFrame) # test partial dependences assert isinstance(h2o.pd_multi_plot(leaderboard_without_SE, train, cols_to_test[0]).figure(), matplotlib.pyplot.Figure) matplotlib.pyplot.close() # test explain assert isinstance(h2o.explain(leaderboard_without_SE, train, render=False), H2OExplanation) # test explain row assert isinstance(h2o.explain_row(leaderboard_without_SE, train, 1, render=False), H2OExplanation)
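The three explanation tests above drive the API through AutoML objects and model lists; the same entry points also accept a single model. A minimal hedged sketch (illustrative path and parameters) is below.

import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator

h2o.init()
train = h2o.upload_file("prostate.csv")  # illustrative path
train["CAPSULE"] = train["CAPSULE"].asfactor()

gbm = H2OGradientBoostingEstimator(ntrees=20, seed=1234)
gbm.train(y="CAPSULE", training_frame=train)

# global and row-level explanations; render=False returns an H2OExplanation object
exp_global = gbm and h2o.explain(gbm, train, render=False)
exp_row = h2o.explain_row(gbm, train, row_index=1, render=False)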
def pca_max_k(): data = h2o.upload_file( pyunit_utils.locate("bigdata/laptop/jira/rotterdam.csv.zip")) y = set(["relapse"]) x = list(set(data.names) - y) pcaGramSVD = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="GramSVD", impute_missing=True, max_iterations=100) pcaGramSVD.train(x, training_frame=data) pcaPower = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Power", impute_missing=True, max_iterations=100, seed=12345) pcaPower.train(x, training_frame=data) # compare eigenvalues with GramSVD print("@@@@@@ Comparing eigenvalues between GramSVD and Power...\n") pyunit_utils.assert_H2OTwoDimTable_equal( pcaGramSVD._model_json["output"]["importance"], pcaPower._model_json["output"]["importance"], [ "Standard deviation", "Cumulative Proportion", "Cumulative Proportion" ], tolerance=1) correctEigNum = pcaPower.full_parameters["k"]["actual_value"] gramSVDNum = len( pcaGramSVD._model_json["output"]["importance"].cell_values[0]) - 1 powerNum = len( pcaPower._model_json["output"]["importance"].cell_values[0]) - 1 assert correctEigNum == gramSVDNum, "PCA GramSVD FAIL: expected number of eigenvalues: " + str(correctEigNum) + \ ", actual: " + str(gramSVDNum) + "." assert correctEigNum == powerNum, "PCA Power FAIL: expected number of eigenvalues: " + str(correctEigNum) + \ ", actual: " + str(powerNum) + "." # Randomized and GLRM do not have a wide-dataset implementation. Check with smaller datasets data = h2o.upload_file( pyunit_utils.locate("smalldata/prostate/prostate_cat.csv")) x = list(set(data.names)) pcaRandomized = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Randomized", impute_missing=True, max_iterations=100, seed=12345) pcaRandomized.train(x, training_frame=data) # should still work with a rank-deficient dataset pcaRandomizedF = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Randomized", use_all_factor_levels=True, impute_missing=True, max_iterations=100, seed=12345) pcaRandomizedF.train(x, training_frame=data) pcaPower = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Power", impute_missing=True, max_iterations=100, seed=12345) pcaPower.train(x, training_frame=data) # should still work with a rank-deficient dataset pcaPowerF = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Power", use_all_factor_levels=True, impute_missing=True, max_iterations=100, seed=12345) pcaPowerF.train(x, training_frame=data) # eigenvalues from the Randomized and Power PCA should be close print( "@@@@@@ Comparing eigenvalues between Randomized and Power PCA...\n") pyunit_utils.assert_H2OTwoDimTable_equal( pcaRandomized._model_json["output"]["importance"], pcaPower._model_json["output"]["importance"], [ "Standard deviation", "Cumulative Proportion", "Cumulative Proportion" ]) # eigenvalues from Randomized and Power PCA should also be close on the rank-deficient dataset 
print( "@@@@@@ Comparing eigenvalues between Randomized and Power PCA with rank deficient dataset...\n" ) pyunit_utils.assert_H2OTwoDimTable_equal( pcaRandomizedF._model_json["output"]["importance"], pcaPowerF._model_json["output"]["importance"], [ "Standard deviation", "Cumulative Proportion", "Cumulative Proportion" ]) pcaGLRM = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="GLRM", use_all_factor_levels=True, max_iterations=100, seed=12345) pcaGLRM.train(x, training_frame=data) correctEigNum = pcaGLRM.full_parameters["k"]["actual_value"] glrmNum = len( pcaGLRM._model_json["output"]["importance"].cell_values[0]) - 1 assert correctEigNum == glrmNum, "PCA GLRM FAIL: expected number of eigenvalues: " + correctEigNum + \ ", actual: " + glrmNum + "."
def cars_checkpoint(ip,port): cars = h2o.upload_file(h2o.locate("smalldata/junit/cars_20mpg.csv")) s = cars.runif() train = cars[s > .2] valid = cars[s <= .2] print "\n*** Description (chunk distribution, etc) of training frame:" train.describe() print "\n*** Description (chunk distribution, etc) of validation frame:" valid.describe() # choose the type model-building exercise (multinomial classification or regression). 0:regression, 1:binomial, # 2:multinomial problem = random.sample(range(3),1)[0] # pick the predictors and response column, along with the correct predictors = ["displacement","power","weight","acceleration","year"] if problem == 1 : response_col = "economy_20mpg" train[response_col] = train[response_col].asfactor() valid[response_col] = valid[response_col].asfactor() elif problem == 2 : response_col = "cylinders" train[response_col] = train[response_col].asfactor() valid[response_col] = valid[response_col].asfactor() else : response_col = "economy" print "\n*** Response column: {0}".format(response_col) # build first model ntrees1 = 5 max_depth1 = random.sample(range(2,6),1)[0] min_rows1 = random.sample(range(10,16),1)[0] print "\n*** Building model 1 with the following parameters:" print "*** ntrees model 1: {0}".format(ntrees1) print "*** max_depth model 1: {0}".format(max_depth1) print "*** min_rows model 1: {0}".format(min_rows1) model1 = h2o.random_forest(x=train[predictors], y=train[response_col], ntrees=ntrees1, max_depth=max_depth1, min_rows=min_rows1, score_each_iteration=True, validation_x=valid[predictors], validation_y=valid[response_col], seed=1234) # save the model, then load the model model_path = h2o.save_model(model1, name="delete_model", force=True) restored_model = h2o.load_model(model_path) shutil.rmtree("delete_model") # continue building the model ntrees2 = ntrees1 + 5 max_depth2 = max_depth1 min_rows2 = min_rows1 print "\n*** Continuing to build model 1 (now called model 2) with the following parameters:" print "*** ntrees model 2: {0}".format(ntrees2) print "*** max_depth model 2: {0}".format(max_depth2) print "*** min_rows model 2: {0}".format(min_rows2) model2 = h2o.random_forest(x=train[predictors], y=train[response_col], ntrees=ntrees2, max_depth=max_depth2, min_rows=min_rows2, score_each_iteration=True, validation_x=valid[predictors], validation_y=valid[response_col], checkpoint=restored_model._id, seed=1234) # continue building the model, but with different number of trees ntrees3 = ntrees2 + 50 max_depth3 = max_depth1 min_rows3 = min_rows1 print "\n*** Continuing to build model 1 (now called model 3) with the following parameters:" print "*** ntrees model 3: {0}".format(ntrees3) print "*** max_depth model 3: {0}".format(max_depth3) print "*** min_rows model 3: {0}".format(min_rows3) model3 = h2o.random_forest(x=train[predictors], y=train[response_col], ntrees=ntrees3, max_depth=max_depth3, min_rows=min_rows3, score_each_iteration=True, validation_x=valid[predictors], validation_y=valid[response_col], checkpoint=restored_model._id, seed=1234) # build the equivalent of model 2 in one shot print "\n*** Building the equivalent of model 2 (called model 4) in one shot:" model4 = h2o.random_forest(x=train[predictors], y=train[response_col], ntrees=ntrees2, max_depth=max_depth2, min_rows=min_rows2, score_each_iteration=True, validation_x=valid[predictors], validation_y=valid[response_col], seed=1234) print "\n*** Model Summary for model 2:" print model2.summary() print "\n*** Model Summary for model 3:" print model3.summary() print "\n*** Model 
Summary for model 4:" print model4.summary() print "\n*** Score History for model 2:" print model2.score_history() print "\n*** Score History for model 3:" print model3.score_history() print "\n*** Score History for model 4:" print model4.score_history() # checks if problem == 0: assert isinstance(model2,type(model4)) assert model2.mse(valid=True)==model4.mse(valid=True), "Expected Model 2 MSE: {0} to be the same as Model 4 MSE: {1}".format(model2.mse(valid=True), model4.mse(valid=True)) #assert model3.mse(valid=True)!=model4.mse(valid=True), "Expected Model 3 MSE: {0} to be different from Model 4 MSE: {1}".format(model3.mse(valid=True), model4.mse(valid=True)) elif problem == 1: assert isinstance(model2,type(model4)) assert model2.auc(valid=True)==model4.auc(valid=True), "Expected Model 2 AUC: {0} to be the same as Model 4 AUC: {1}".format(model2.auc(valid=True), model4.auc(valid=True)) #assert model3.auc(valid=True)!=model4.auc(valid=True), "Expected Model 3 AUC: {0} to be different from Model 4 AUC: {1}".format(model3.auc(valid=True), model4.auc(valid=True)) assert model2.logloss(valid=True)==model4.logloss(valid=True), "Expected Model 2 Log Loss: {0} to be the same as Model 4 Log Loss: {1}".format(model2.logloss(valid=True), model4.logloss(valid=True)) #assert model3.logloss(valid=True)!=model4.logloss(valid=True), "Expected Model 3 Log Loss: {0} to be different from Model 4 Log Loss: {1}".format(model2.logloss(valid=True), model4.logloss(valid=True)) assert model2.giniCoef(valid=True)==model4.giniCoef(valid=True), "Expected Model 2 Gini Coef {0} to be the same as Model 4 Gini Coef: {1}".format(model2.giniCoef(valid=True), model4.giniCoef(valid=True)) #assert model3.giniCoef(valid=True)!=model4.giniCoef(valid=True), "Expected Model 3 Gini Coef: {0} to be different from Model 4 Gini Coef: {1}".format(model2.giniCoef(valid=True), model4.giniCoef(valid=True)) else: assert isinstance(model2,type(model4)) assert model2.mse(valid=True)==model4.mse(valid=True), "Expected Model 2 MSE: {0} to be the same as Model 4 MSE: {1}".format(model2.mse(valid=True), model4.mse(valid=True)) #assert model3.mse(valid=True)!=model4.mse(valid=True), "Expected Model 3 MSE: {0} to be different from Model 4 MSE: {1}".format(model3.mse(valid=True), model4.mse(valid=True)) assert model2.r2(valid=True)==model4.r2(valid=True), "Expected Model 2 R2: {0} to be the same as Model 4 R2: {1}".format(model2.r2(valid=True), model4.r2(valid=True))