def milsong_checkpoint(ip, port):
    milsong_train = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))

    # build first model
    ntrees1 = random.sample(range(50, 100), 1)[0]
    max_depth1 = random.sample(range(2, 6), 1)[0]
    min_rows1 = random.sample(range(10, 16), 1)[0]
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    model1 = h2o.random_forest(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees1,
                               max_depth=max_depth1, min_rows=min_rows1,
                               validation_x=milsong_valid[1:], validation_y=milsong_valid[0], seed=1234)

    # save the model, then load the model
    model_path = h2o.save_model(model1, name="delete_model", force=True)
    restored_model = h2o.load_model(model_path)
    shutil.rmtree("delete_model")

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model2 = h2o.random_forest(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2,
                               max_depth=max_depth2, min_rows=min_rows2,
                               validation_x=milsong_valid[1:], validation_y=milsong_valid[0],
                               checkpoint=restored_model._id, seed=1234)

    # build the equivalent of model 2 in one shot
    model3 = h2o.random_forest(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2,
                               max_depth=max_depth2, min_rows=min_rows2,
                               validation_x=milsong_valid[1:], validation_y=milsong_valid[0], seed=1234)

    assert isinstance(model2, type(model3))
    assert model2.mse(valid=True) == model3.mse(valid=True), \
        "Expected Model 2 MSE: {0} to be the same as Model 3 MSE: {1}".format(model2.mse(valid=True),
                                                                              model3.mse(valid=True))
def attack(train, valid, x, y):
    kwargs = {}

    # randomly select parameters and their corresponding values
    if random.randint(0, 1): kwargs['mtries'] = random.randint(1, len(x))
    if random.randint(0, 1): kwargs['sample_rate'] = random.random()
    if random.randint(0, 1): kwargs['build_tree_one_node'] = True
    if random.randint(0, 1): kwargs['ntrees'] = random.randint(1, 10)
    if random.randint(0, 1): kwargs['max_depth'] = random.randint(1, 5)
    if random.randint(0, 1): kwargs['min_rows'] = random.randint(1, 10)
    if random.randint(0, 1): kwargs['nbins'] = random.randint(1, 20)
    if random.randint(0, 1): kwargs['balance_classes'] = True
    if random.randint(0, 1): kwargs['max_after_balance_size'] = random.uniform(0, 10)
    if random.randint(0, 1): kwargs['seed'] = random.randint(1, 10000)
    do_validation = [True, False][random.randint(0, 1)]

    # display the parameters and their corresponding values
    print("-----------------------")
    print("x: {0}".format(x))
    print("y: {0}".format(y))
    print("validation: {0}".format(do_validation))
    for k, v in kwargs.items():
        print("{0}: {1}".format(k, v))

    if do_validation:
        h2o.random_forest(x=train[x], y=train[y], validation_x=valid[x], validation_y=valid[y], **kwargs)
    else:
        h2o.random_forest(x=train[x], y=train[y], **kwargs)
    print("-----------------------")
def imbalanced():
    covtype = h2o.import_file(path=h2o.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()

    imbalanced = h2o.random_forest(x=covtype[0:54], y=covtype[54], ntrees=10,
                                   balance_classes=False, nfolds=3)
    imbalanced_perf = imbalanced.model_performance(covtype)
    imbalanced_perf.show()

    balanced = h2o.random_forest(x=covtype[0:54], y=covtype[54], ntrees=10,
                                 balance_classes=True, seed=123, nfolds=3)
    balanced_perf = balanced.model_performance(covtype)
    balanced_perf.show()

    ## compare error for class 6 (difficult minority)
    class_6_err_imbalanced = imbalanced_perf.confusion_matrix().cell_values[5][7]
    class_6_err_balanced = balanced_perf.confusion_matrix().cell_values[5][7]

    print("--------------------")
    print("")
    print("class_6_err_imbalanced")
    print(class_6_err_imbalanced)
    print("")
    print("class_6_err_balanced")
    print(class_6_err_balanced)
    print("")
    print("--------------------")

    assert class_6_err_imbalanced >= 0.9 * class_6_err_balanced, "balance_classes makes it at least 10% worse!"
def swpredsRF(ip, port):
    # Training set has two predictor columns
    # X1: 10 categorical levels, 100 observations per level; X2: Unif(0,1) noise
    # Ratio of y = 1 per Level: cat01 = 1.0 (strong predictor), cat02 to cat10 = 0.5 (weak predictors)

    #Log.info("Importing swpreds_1000x3.csv data...\n")
    swpreds = h2o.import_file(path=h2o.locate("smalldata/gbm_test/swpreds_1000x3.csv"))
    swpreds["y"] = swpreds["y"].asfactor()
    #Log.info("Summary of swpreds_1000x3.csv from H2O:\n")
    #swpreds.summary()

    # Train H2O DRF without Noise Column
    #Log.info("Distributed Random Forest with only Predictor Column")
    model1 = h2o.random_forest(x=swpreds[["X1"]], y=swpreds["y"], ntrees=50, max_depth=20, nbins=500)
    model1.show()
    perf1 = model1.model_performance(swpreds)
    print(perf1.auc())

    # Train H2O DRF Model including Noise Column:
    #Log.info("Distributed Random Forest including Noise Column")
    model2 = h2o.random_forest(x=swpreds[["X1", "X2"]], y=swpreds["y"], ntrees=50, max_depth=20, nbins=500)
    model2.show()
    perf2 = model2.model_performance(swpreds)
    print(perf2.auc())
def swpredsRF():
    # Training set has two predictor columns
    # X1: 10 categorical levels, 100 observations per level; X2: Unif(0,1) noise
    # Ratio of y = 1 per Level: cat01 = 1.0 (strong predictor), cat02 to cat10 = 0.5 (weak predictors)

    #Log.info("Importing swpreds_1000x3.csv data...\n")
    swpreds = h2o.import_file(path=tests.locate("smalldata/gbm_test/swpreds_1000x3.csv"))
    swpreds["y"] = swpreds["y"].asfactor()
    #Log.info("Summary of swpreds_1000x3.csv from H2O:\n")
    #swpreds.summary()

    # Train H2O DRF without Noise Column
    #Log.info("Distributed Random Forest with only Predictor Column")
    model1 = h2o.random_forest(x=swpreds[["X1"]], y=swpreds["y"], ntrees=50, max_depth=20, nbins=500)
    model1.show()
    perf1 = model1.model_performance(swpreds)
    print(perf1.auc())

    # Train H2O DRF Model including Noise Column:
    #Log.info("Distributed Random Forest including Noise Column")
    model2 = h2o.random_forest(x=swpreds[["X1", "X2"]], y=swpreds["y"], ntrees=50, max_depth=20, nbins=500)
    model2.show()
    perf2 = model2.model_performance(swpreds)
    print(perf2.auc())
def check_same(data1, data2):
    rf1_regression = h2o.random_forest(x=data1[2:20], y=data1[1])
    rf2_regression = h2o.random_forest(x=data2[2:21], y=data2[1], weights_column="weights")
    rf1_binomial = h2o.random_forest(x=data1[1:20], y=data1[0])
    rf2_binomial = h2o.random_forest(x=data2[1:21], y=data2[0], weights_column="weights")

    assert abs(rf1_regression.mse() - rf2_regression.mse()) < 1e-6, \
        "Expected mse's to be the same, but got {0}, and {1}".format(rf1_regression.mse(), rf2_regression.mse())
    assert abs(rf1_binomial.auc() - rf2_binomial.auc()) < 1e-6, \
        "Expected auc's to be the same, but got {0}, and {1}".format(rf1_binomial.auc(), rf2_binomial.auc())
def iris_nfolds():
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))

    model = h2o.random_forest(y=iris[4], x=iris[0:4], ntrees=50, nfolds=5)
    model.show()

    # Can specify both nfolds >= 2 and validation = H2OParsedData at once
    try:
        h2o.random_forest(y=iris[4], x=iris[0:4], validation_y=iris[4], validation_x=iris[0:4],
                          ntrees=50, nfolds=5)
        assert True
    except EnvironmentError:
        assert False, "did not expect an error"
def iris_nfolds(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))

    model = h2o.random_forest(y=iris[4], x=iris[0:4], ntrees=50, nfolds=5)
    model.show()

    # Can't specify both nfolds >= 2 and validation = H2OParsedData at once
    try:
        h2o.random_forest(y=iris[4], x=iris[0:4], validation_y=iris[4], validation_x=iris[0:4],
                          ntrees=50, nfolds=5)
        assert False, "expected an error"
    except EnvironmentError:
        assert True
def iris_nfolds(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))

    model = h2o.random_forest(y=iris[4], x=iris[0:4], ntrees=50, nfolds=5)
    model.show()

    # Can specify both nfolds >= 2 and validation = H2OParsedData at once
    try:
        h2o.random_forest(y=iris[4], x=iris[0:4], validation_y=iris[4], validation_x=iris[0:4],
                          ntrees=50, nfolds=5)
        assert True
    except EnvironmentError:
        assert False, "did not expect an error"
def iris_ignore():
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris2.csv"))

    for maxx in range(4):
        model = h2o.random_forest(y=iris[4], x=iris[range(maxx + 1)], ntrees=50, max_depth=100)
        model.show()
def iris_nfolds():
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))

    model = h2o.random_forest(y=iris[4], x=iris[0:4], ntrees=50, nfolds=5)
    model.show()

    # Can specify both nfolds >= 2 and validation = H2OParsedData at once
    try:
        H2ORandomForestEstimator(ntrees=50, nfolds=5).train(y=4, x=list(range(4)), validation_frame=iris)
        assert True
    except EnvironmentError:
        assert False, "did not expect an error"


if __name__ == "__main__":
    pyunit_utils.standalone_test(iris_nfolds)
else:
    iris_nfolds()
def deeplearning_autoencoder():
    resp = 784
    nfeatures = 20  # number of features (smallest hidden layer)

    train_hex = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/mnist/train.csv.gz"))
    train_hex[resp] = train_hex[resp].asfactor()
    test_hex = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/mnist/test.csv.gz"))
    test_hex[resp] = test_hex[resp].asfactor()

    # split data into two parts
    sid = train_hex[0].runif(1234)

    # unsupervised data for autoencoder
    train_unsupervised = train_hex[sid >= 0.5]
    train_unsupervised.drop(resp)
    train_unsupervised.describe()

    # supervised data for drf
    train_supervised = train_hex[sid < 0.5]
    train_supervised.describe()

    # train autoencoder
    ae_model = h2o.deeplearning(x=train_unsupervised[0:resp],
                                activation="Tanh",
                                autoencoder=True,
                                hidden=[nfeatures],
                                epochs=1,
                                reproducible=True,  # slow, turn off for real problems
                                seed=1234)

    # convert train_supervised with autoencoder to lower-dimensional space
    train_supervised_features = ae_model.deepfeatures(train_supervised[0:resp]._frame(), 0)
    assert train_supervised_features.ncol == nfeatures, "Dimensionality of reconstruction is wrong!"

    # Train DRF on extracted feature space
    drf_model = h2o.random_forest(x=train_supervised_features[0:20],
                                  y=train_supervised[resp],
                                  ntrees=10, min_rows=10, seed=1234)

    # Test the DRF model on the test set (processed through deep features)
    test_features = ae_model.deepfeatures(test_hex[0:resp]._frame(), 0)
    test_features = test_features.cbind(test_hex[resp])._frame()

    # Confusion Matrix and assertion
    cm = drf_model.confusion_matrix(test_features)
    cm.show()

    # 8.1% error +/- 0.001
    assert abs(cm.cell_values[10][10] - 0.081) < 0.001, \
        "Error. Expected 0.081, but got {0}".format(cm.cell_values[10][10])
def fiftycatRF(ip, port):
    # Training set has only 45 categories cat1 through cat45
    # Log.info("Importing 50_cattest_train.csv data...\n")
    train = h2o.import_file(path=h2o.locate("smalldata/gbm_test/50_cattest_train.csv"))
    train["y"] = train["y"].asfactor()
    # Log.info("Summary of 50_cattest_train.csv from H2O:\n")
    # train.summary()

    # Train H2O DRF Model:
    # Log.info(paste("H2O DRF with parameters:\nclassification = TRUE, ntree = 50, depth = 20, nbins = 500\n", sep = ""))
    model = h2o.random_forest(x=train[["x1", "x2"]], y=train["y"], ntrees=50, max_depth=20, nbins=500)

    # Test dataset has all 50 categories cat1 through cat50
    # Log.info("Importing 50_cattest_test.csv data...\n")
    test = h2o.import_file(path=h2o.locate("smalldata/gbm_test/50_cattest_test.csv"))
    # Log.info("Summary of 50_cattest_test.csv from H2O:\n")
    # test.summary()

    # Predict on test dataset with DRF model:
    # Log.info("Performing predictions on test dataset...\n")
    preds = model.predict(test)
    preds.head()

    # Get the confusion matrix and AUC
    # Log.info("Confusion matrix of predictions (max accuracy):\n")
    perf = model.model_performance(test)
    perf.show()
    cm = perf.confusion_matrix()
    print(cm)
def vi_toy_test(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    toy_data = h2o.import_frame(path=h2o.locate("smalldata/gbm_test/toy_data_RF.csv"))
    #toy_data.summary()

    toy_data[6] = toy_data[6].asfactor()
    toy_data.show()

    rf = h2o.random_forest(x=toy_data[[0, 1, 2, 3, 4, 5]], y=toy_data[6],
                           ntrees=500, max_depth=20, nbins=100, seed=0)

    ranking = [rf._model_json['output']['variable_importances'].cell_values[v][0]
               for v in range(toy_data.ncol() - 1)]
    print(ranking)
    assert tuple(ranking) == tuple(["V3", "V2", "V6", "V5", "V1", "V4"]), \
        "expected specific variable importance ranking"
def bigcatRF(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    # Training set has 100 categories from cat001 to cat100
    # Categories cat001, cat003, ... are perfect predictors of y = 1
    # Categories cat002, cat004, ... are perfect predictors of y = 0

    #Log.info("Importing bigcat_5000x2.csv data...\n")
    bigcat = h2o.import_frame(path=h2o.locate("smalldata/gbm_test/bigcat_5000x2.csv"))
    bigcat["y"] = bigcat["y"].asfactor()
    #Log.info("Summary of bigcat_5000x2.csv from H2O:\n")
    #bigcat.summary()

    # Train H2O DRF Model:
    #Log.info("H2O DRF (Naive Split) with parameters:\nclassification = TRUE, ntree = 1, depth = 1, nbins = 100, nbins_cats=10\n")
    model = h2o.random_forest(x=bigcat[["X"]], y=bigcat["y"], ntrees=1, max_depth=1,
                              nbins=100, nbins_cats=10)
    model.show()
def cars_checkpoint():
    cars = h2o.upload_file(h2o.locate("smalldata/junit/cars_20mpg.csv"))
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    response_col = "economy"

    # build first model
    model1 = h2o.random_forest(x=cars[predictors], y=cars[response_col], ntrees=10,
                               max_depth=2, min_rows=10)

    # continue building the model
    model2 = h2o.random_forest(x=cars[predictors], y=cars[response_col], ntrees=11,
                               max_depth=3, min_rows=9, r2_stopping=0.8,
                               checkpoint=model1._id)

    # erroneous, not MODIFIABLE_BY_CHECKPOINT_FIELDS
    # PUBDEV-1833

    # mtries
    try:
        model = h2o.random_forest(y=cars[response_col], x=cars[predictors], mtries=2,
                                  checkpoint=model1._id)
        assert False, "Expected model-build to fail because mtries not modifiable by checkpoint"
    except EnvironmentError:
        assert True

    # sample_rate
    try:
        model = h2o.random_forest(y=cars[response_col], x=cars[predictors], sample_rate=0.5,
                                  checkpoint=model1._id)
        assert False, "Expected model-build to fail because sample_rate not modifiable by checkpoint"
    except EnvironmentError:
        assert True

    # nbins_cats
    try:
        model = h2o.random_forest(y=cars[response_col], x=cars[predictors], nbins_cats=99,
                                  checkpoint=model1._id)
        assert False, "Expected model-build to fail because nbins_cats not modifiable by checkpoint"
    except EnvironmentError:
        assert True

    # nbins
    try:
        model = h2o.random_forest(y=cars[response_col], x=cars[predictors], nbins=99,
                                  checkpoint=model1._id)
        assert False, "Expected model-build to fail because nbins not modifiable by checkpoint"
    except EnvironmentError:
        assert True

    # balance_classes
    try:
        model = h2o.random_forest(y=cars[response_col], x=cars[predictors], balance_classes=True,
                                  checkpoint=model1._id)
        assert False, "Expected model-build to fail because balance_classes not modifiable by checkpoint"
    except EnvironmentError:
        assert True

    # nfolds
    try:
        model = h2o.random_forest(y=cars[response_col], x=cars[predictors], nfolds=3,
                                  checkpoint=model1._id)
        assert False, "Expected model-build to fail because nfolds not modifiable by checkpoint"
    except EnvironmentError:
        assert True
def imbalanced(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    covtype = h2o.import_frame(path=h2o.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()

    imbalanced = h2o.random_forest(x=covtype[0:54], y=covtype[54], ntrees=50,
                                   balance_classes=False, nfolds=10)
    imbalanced_perf = imbalanced.model_performance(covtype)
    imbalanced_perf.show()

    balanced = h2o.random_forest(x=covtype[0:54], y=covtype[54], ntrees=50,
                                 balance_classes=True, nfolds=10)
    balanced_perf = balanced.model_performance(covtype)
    balanced_perf.show()

    ## compare error for class 6 (difficult minority)
    ## confusion_matrix element at position A,P for N classes is at: model$confusion[P*(N+1)-(N-A+1)]
    ## Here, A=6 P=8, N=7 -> need element 8*(7+1)-(7-6+1) = 62
    class_6_err_imbalanced = imbalanced_perf.error()[6]
    class_6_err_balanced = balanced_perf.error()[6]

    if class_6_err_imbalanced < class_6_err_balanced:
        print("--------------------")
        print("")
        print("FAIL, balanced error greater than imbalanced error")
        print("")
        print("")
        print("class_6_err_imbalanced")
        print(class_6_err_imbalanced)
        print("")
        print("class_6_err_balanced")
        print(class_6_err_balanced)
        print("")
        print("--------------------")

    assert class_6_err_imbalanced >= 0.9 * class_6_err_balanced, "balance_classes makes it at least 10% worse!"
def train(self, x, y):
    self.model = h2o.random_forest(x=self.trainData.drop('score diff'),
                                   y=self.trainData['score diff'],
                                   validation_x=self.valData.drop('score diff'),
                                   validation_y=self.valData['score diff'],
                                   ntrees=self.params[2],
                                   max_depth=self.params[3],
                                   nfolds=self.params[4])
def iris_all(ip, port):
    iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris2.csv"))

    model = h2o.random_forest(y=iris[4], x=iris[0:4], ntrees=50, max_depth=100)
    model.show()
def iris_all(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris2.csv"))

    model = h2o.random_forest(y=iris[4], x=iris[0:4], ntrees=50, max_depth=100)
    model.show()
def checkpoint_new_category_in_predictor():
    sv1 = h2o.upload_file(tests.locate("smalldata/iris/setosa_versicolor.csv"))
    sv2 = h2o.upload_file(tests.locate("smalldata/iris/setosa_versicolor.csv"))
    vir = h2o.upload_file(tests.locate("smalldata/iris/virginica.csv"))

    m1 = h2o.random_forest(x=sv1[[0, 1, 2, 4]], y=sv1[3], ntrees=100)
    m2 = h2o.random_forest(x=sv2[[0, 1, 2, 4]], y=sv2[3], ntrees=200, checkpoint=m1.id)

    # attempt to continue building model, but with an expanded categorical predictor domain.
    # this should fail until we figure out proper behavior
    try:
        m3 = h2o.random_forest(x=vir[[0, 1, 2, 4]], y=vir[3], ntrees=200, checkpoint=m1.id)
        assert False, "Expected continued model-building to fail with new categories introduced in predictor"
    except EnvironmentError:
        pass
def hexdev_422():
    fr = h2o.import_file(h2o.locate("bigdata/laptop/jira/z_repro.csv.gz"))
    fr[0] = fr[0].asfactor()

    rf = h2o.random_forest(x=fr[1:fr.ncol], y=fr[0], min_rows=1, ntrees=25, max_depth=45)
    h2o.download_pojo(rf)
def iris_all():
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris2.csv"))

    model = h2o.random_forest(y=iris[4], x=iris[0:4], ntrees=50, max_depth=100)
    model.show()
def ntrain():
    h2o.init(ip="zurich.h2o.ai", strict_version_check=False)
    weather = load_weather()
    training = load_training()
    X = assemble_X(training, weather)
    mean, std = normalize(X)
    y = assemble_y(training)
    xd = []
    for l in X:
        xd.append(l.tolist())
    y = np.asarray(y, dtype='bool_')

    xtr = H2OFrame(python_obj=xd)
    ytr = H2OFrame(python_obj=y.tolist())
    ytr["C1"]._name = "C40"  # Rename the default column

    gb = h2o.gbm(x=xtr[1:39], y=ytr['C40'],
                 distribution="bernoulli",
                 ntrees=1000,  # 500 works well
                 max_depth=12,
                 learn_rate=0.01)

    dl = h2o.deeplearning(x=xtr[1:39], y=ytr['C40'],
                          variable_importances=True, balance_classes=True,
                          input_dropout_ratio=0.2, rho=0.899,
                          hidden_dropout_ratios=[0.4, 0.4, 0.4, 0.4],
                          activation="Tanh", hidden=[39, 325, 325, 1], epochs=100)

    rf = h2o.random_forest(x=xtr[1:39], y=ytr['C40'],
                           seed=1234, ntrees=600, max_depth=20, balance_classes=False)

    testing = load_testing()
    X_test = assemble_X(testing, weather)
    normalize(X_test, mean, std)
    xd = []
    for l in X_test:
        xd.append(l.tolist())
    xts = H2OFrame(python_obj=xd)

    # gp = gb.predict(xts)
    dp = dl.predict(xts)
    rp = rf.predict(xts)
    gbp = gb.predict(xts)

    # blend the three model predictions
    gp = dp * 0.35 + rp * 0.3 + gbp * 0.35

    gph = h2o.as_list(gp)
    Id = np.arange(gp.nrow() + 1)[1:].reshape(gp.nrow(), 1)
    df = pd.DataFrame(Id)
    df_concat = pd.concat([df, gph["True"]], axis=1)  # gph.True is invalid syntax; select the "True" column by name
    df_concat.columns = ['Id', 'WnvPresent']
    df_concat.to_csv("wnvh.csv", index=False)
def iris_ignore(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris2.csv"))

    for maxx in range(4):
        model = h2o.random_forest(y=iris[4], x=iris[range(maxx + 1)], ntrees=50, max_depth=100)
        model.show()
def iris_get_model():
    iris = h2o.import_file(path=tests.locate("smalldata/iris/iris.csv"))

    model = h2o.random_forest(y=iris[4], x=iris[0:4], ntrees=50)
    model.show()

    model = h2o.get_model(model._id)
    model.show()
def deeplearning_autoencoder():
    resp = 784
    nfeatures = 20  # number of features (smallest hidden layer)

    train_hex = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/mnist/train.csv.gz"))
    train_hex[resp] = train_hex[resp].asfactor()
    test_hex = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/mnist/test.csv.gz"))
    test_hex[resp] = test_hex[resp].asfactor()

    # split data into two parts
    sid = train_hex[0].runif(1234)

    # unsupervised data for autoencoder
    train_unsupervised = train_hex[sid >= 0.5]
    train_unsupervised.drop(resp)
    train_unsupervised.describe()

    # supervised data for drf
    train_supervised = train_hex[sid < 0.5]
    train_supervised.describe()

    # train autoencoder
    ae_model = h2o.deeplearning(x=train_unsupervised[0:resp],
                                activation="Tanh",
                                autoencoder=True,
                                hidden=[nfeatures],
                                epochs=1,
                                reproducible=True,  # slow, turn off for real problems
                                seed=1234)

    # convert train_supervised with autoencoder to lower-dimensional space
    train_supervised_features = ae_model.deepfeatures(train_supervised[0:resp], 0)
    assert train_supervised_features.ncol == nfeatures, "Dimensionality of reconstruction is wrong!"

    # Train DRF on extracted feature space
    drf_model = h2o.random_forest(x=train_supervised_features[0:20],
                                  y=train_supervised[resp],
                                  ntrees=10, min_rows=10, seed=1234)

    # Test the DRF model on the test set (processed through deep features)
    test_features = ae_model.deepfeatures(test_hex[0:resp], 0)
    test_features = test_features.cbind(test_hex[resp])

    # Confusion Matrix and assertion
    cm = drf_model.confusion_matrix(test_features)
    cm.show()

    # 8.6% error +/- 0.001
    assert abs(cm.cell_values[10][10] - 0.086) < 0.001, \
        "Error. Expected 0.086, but got {0}".format(cm.cell_values[10][10])
def iris_ignore(ip, port):
    iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris2.csv"))

    for maxx in range(4):
        model = h2o.random_forest(y=iris[4], x=iris[range(maxx + 1)], ntrees=50, max_depth=100)
        model.show()
def deeplearning_autoencoder(ip, port):
    h2o.init(ip, port)

    resp = 784
    nfeatures = 20  # number of features (smallest hidden layer)

    train_hex = h2o.import_frame(h2o.locate("bigdata/laptop/mnist/train.csv.gz"))
    test_hex = h2o.import_frame(h2o.locate("bigdata/laptop/mnist/test.csv.gz"))

    # split data into two parts
    sid = train_hex[1].runif(1234)

    # unsupervised data for autoencoder
    train_unsupervised = train_hex[sid >= 0.5]
    train_unsupervised.describe()

    # supervised data for drf
    train_supervised = train_hex[sid < 0.5]
    train_supervised.describe()

    # train autoencoder
    ae_model = h2o.deeplearning(x=train_unsupervised.drop(resp),
                                y=train_unsupervised[resp],  # ignored (pick any non-constant)
                                activation="Tanh",
                                autoencoder=True,
                                hidden=[nfeatures],
                                epochs=1,
                                reproducible=True,  # slow, turn off for real problems
                                seed=1234)

    # convert train_supervised with autoencoder to lower-dimensional space
    train_supervised_features = ae_model.deepfeatures(train_supervised, 0)
    train_supervised_features.describe()
    assert train_supervised_features.ncol() == nfeatures, "Dimensionality of reconstruction is wrong!"

    # Train DRF on extracted feature space
    drf_model = h2o.random_forest(x=train_supervised_features,
                                  y=train_supervised[resp].asfactor(),
                                  ntrees=10, seed=1234)

    # Test the DRF model on the test set (processed through deep features)
    test_features = ae_model.deepfeatures(test_hex.drop(resp), 0)
    test_features.cbind(test_hex[resp])

    # Confusion Matrix and assertion
    cm = drf_model.confusion_matrix(test_features)
    cm.show()

    # 10% error +/- 0.001
    assert abs(cm["Totals", "Error"] - 0.1038) < 0.001, "Error not as expected"
def javapredict(algo, train, test, x, y, **kwargs):
    print("Creating model in H2O")
    if algo == "gbm":
        model = h2o.gbm(x=train[x], y=train[y], **kwargs)
    elif algo == "random_forest":
        model = h2o.random_forest(x=train[x], y=train[y], **kwargs)
    else:
        raise ValueError("algo {0} is not supported".format(algo))
    print(model)

    print("Downloading Java prediction model code from H2O")
    tmpdir = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "results", model._id))
    os.makedirs(tmpdir)
    h2o.download_pojo(model, path=tmpdir)

    print("Predicting in H2O")
    predictions = model.predict(test)
    predictions.summary()
    predictions.head()
    h2o.download_csv(predictions, os.path.join(tmpdir, "out_h2o.csv"))

    print("Setting up for Java POJO")
    h2o.download_csv(test[x], os.path.join(tmpdir, "in.csv"))
    # hack: the PredictCsv driver can't handle quoted strings, so remove them
    f = open(os.path.join(tmpdir, "in.csv"), 'r+')
    in_csv = f.read()
    in_csv = re.sub('\"', '', in_csv)
    f.seek(0)
    f.write(in_csv)
    f.truncate()
    f.close()

    subprocess.call(["javac", "-cp", os.path.join(tmpdir, "h2o-genmodel.jar"), "-J-Xmx4g",
                     "-J-XX:MaxPermSize=256m", os.path.join(tmpdir, model._id + ".java")],
                    stderr=subprocess.STDOUT)
    subprocess.call(["java", "-ea", "-cp", os.path.join(tmpdir, "h2o-genmodel.jar") + ":{0}".format(tmpdir),
                     "-Xmx4g", "-XX:MaxPermSize=256m", "-XX:ReservedCodeCacheSize=256m",
                     "hex.genmodel.tools.PredictCsv", "--header", "--model", model._id,
                     "--input", os.path.join(tmpdir, "in.csv"), "--output", os.path.join(tmpdir, "out_pojo.csv")],
                    stderr=subprocess.STDOUT)
    predictions2 = h2o.import_file(os.path.join(tmpdir, "out_pojo.csv"))

    print("Comparing predictions between H2O and Java POJO")
    # Dimensions
    hr, hc = predictions.dim
    pr, pc = predictions2.dim
    assert hr == pr, "Expected the same number of rows, but got {0} and {1}".format(hr, pr)
    assert hc == pc, "Expected the same number of cols, but got {0} and {1}".format(hc, pc)

    # Value
    for r in range(hr):
        hp = predictions[r, 0]
        if algo == "gbm":
            pp = float.fromhex(predictions2[r, 0])
            assert abs(hp - pp) < 1e-4, \
                "Expected predictions to be the same (within 1e-4) for row {0}, but got {1} and {2}".format(r, hp, pp)
        elif algo == "random_forest":
            pp = predictions2[r, 0]
            assert hp == pp, "Expected predictions to be the same for row {0}, but got {1} and {2}".format(r, hp, pp)
        else:
            raise ValueError("algo {0} is not supported".format(algo))
def iris_nfolds_getModel(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))

    model = h2o.random_forest(y=iris[4], x=iris[0:4], ntrees=50, nfolds=5)
    model.show()

    model = h2o.getModel(model._key)
    model.show()
def milsong_checkpoint():
    milsong_train = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))

    # build first model
    ntrees1 = random.sample(list(range(50, 100)), 1)[0]
    max_depth1 = random.sample(list(range(2, 6)), 1)[0]
    min_rows1 = random.sample(list(range(10, 16)), 1)[0]
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    model1 = h2o.random_forest(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees1,
                               max_depth=max_depth1, min_rows=min_rows1,
                               validation_x=milsong_valid[1:], validation_y=milsong_valid[0], seed=1234)

    # save the model, then load the model
    path = pyunit_utils.locate("results")
    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(model1, path=path, force=True)
    assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path)
    restored_model = h2o.load_model(model_path)

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model2 = h2o.random_forest(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2,
                               max_depth=max_depth2, min_rows=min_rows2,
                               validation_x=milsong_valid[1:], validation_y=milsong_valid[0],
                               checkpoint=restored_model._id, seed=1234)

    # build the equivalent of model 2 in one shot
    model3 = h2o.random_forest(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2,
                               max_depth=max_depth2, min_rows=min_rows2,
                               validation_x=milsong_valid[1:], validation_y=milsong_valid[0], seed=1234)

    assert isinstance(model2, type(model3))
    assert model2.mse(valid=True) == model3.mse(valid=True), \
        "Expected Model 2 MSE: {0} to be the same as Model 3 MSE: {1}".format(model2.mse(valid=True),
                                                                              model3.mse(valid=True))
def deeplearning_autoencoder(ip, port):
    h2o.init(ip, port)

    resp = 784
    nfeatures = 20  # number of features (smallest hidden layer)

    train_hex = h2o.import_frame(h2o.locate("bigdata/laptop/mnist/train.csv.gz"))
    test_hex = h2o.import_frame(h2o.locate("bigdata/laptop/mnist/test.csv.gz"))

    # split data into two parts
    sid = train_hex[1].runif(1234)

    # unsupervised data for autoencoder
    train_unsupervised = train_hex[sid >= 0.5]
    train_unsupervised.describe()

    # supervised data for drf
    train_supervised = train_hex[sid < 0.5]
    train_supervised.describe()

    # train autoencoder
    ae_model = h2o.deeplearning(x=train_unsupervised.drop(resp),
                                y=train_unsupervised[resp],  # ignored (pick any non-constant)
                                activation="Tanh",
                                autoencoder=True,
                                hidden=[nfeatures],
                                epochs=1,
                                reproducible=True,  # slow, turn off for real problems
                                seed=1234)

    # convert train_supervised with autoencoder to lower-dimensional space
    train_supervised_features = ae_model.deepfeatures(train_supervised, 0)
    train_supervised_features.describe()
    assert train_supervised_features.ncol() == nfeatures, "Dimensionality of reconstruction is wrong!"

    # Train DRF on extracted feature space
    drf_model = h2o.random_forest(x=train_supervised_features,
                                  y=train_supervised[resp].asfactor(),
                                  ntrees=10, seed=1234)

    # Test the DRF model on the test set (processed through deep features)
    test_features = ae_model.deepfeatures(test_hex.drop(resp), 0)
    test_features.cbind(test_hex[resp])

    # Confusion Matrix and assertion
    cm = drf_model.confusion_matrix(test_features)
    cm.show()

    # 10% error +/- 0.001
    assert abs(cm["Totals", "Error"] - 0.1038) < 0.001, "Error not as expected"
def vi_reg():
    data = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/BostonHousing.csv"))
    #data.summary()

    rf = h2o.random_forest(x=data[0:13], y=data[13], ntrees=100, max_depth=20, nbins=100, seed=0)

    ranking = [rf._model_json['output']['variable_importances'].cell_values[v][0]
               for v in range(data.ncol - 1)]
    print(ranking)
    assert tuple([ranking[0], ranking[1]]) == tuple(["rm", "lstat"]), \
        "expected specific variable importance ranking"
def vi_reg(ip, port):
    data = h2o.import_file(path=h2o.locate("smalldata/gbm_test/BostonHousing.csv"))
    #data.summary()

    rf = h2o.random_forest(x=data[0:13], y=data[13], ntrees=100, max_depth=20, nbins=100, seed=0)

    ranking = [rf._model_json['output']['variable_importances'].cell_values[v][0]
               for v in range(data.ncol - 1)]
    print(ranking)
    assert tuple([ranking[0], ranking[1]]) == tuple(["rm", "lstat"]), \
        "expected specific variable importance ranking"
def imbalanced(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    covtype = h2o.import_frame(path=h2o.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()

    imbalanced = h2o.random_forest(x=covtype[0:54], y=covtype[54], ntrees=10,
                                   balance_classes=False, nfolds=3)
    imbalanced_perf = imbalanced.model_performance(covtype)
    imbalanced_perf.show()

    balanced = h2o.random_forest(x=covtype[0:54], y=covtype[54], ntrees=10,
                                 balance_classes=True, nfolds=3)
    balanced_perf = balanced.model_performance(covtype)
    balanced_perf.show()

    ## compare error for class 6 (difficult minority)
    class_6_err_imbalanced = imbalanced_perf.confusion_matrix().cell_values[5][7]
    class_6_err_balanced = balanced_perf.confusion_matrix().cell_values[5][7]

    print("--------------------")
    print("")
    print("class_6_err_imbalanced")
    print(class_6_err_imbalanced)
    print("")
    print("class_6_err_balanced")
    print(class_6_err_balanced)
    print("")
    print("--------------------")

    assert class_6_err_imbalanced >= 0.9 * class_6_err_balanced, "balance_classes makes it at least 10% worse!"
def czechboardRF(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    # Training set has checkerboard pattern
    board = h2o.import_frame(path=h2o.locate("smalldata/gbm_test/czechboard_300x300.csv"))
    board["C3"] = board["C3"].asfactor()
    board.summary()

    # Train H2O DRF Model:
    model = h2o.random_forest(x=board[["C1", "C2"]], y=board["C3"], ntrees=50, max_depth=20, nbins=500)
    model.show()
def czechboardRF(ip, port):
    # Training set has checkerboard pattern
    board = h2o.import_file(path=h2o.locate("smalldata/gbm_test/czechboard_300x300.csv"))
    board["C3"] = board["C3"].asfactor()
    board.summary()

    # Train H2O DRF Model:
    model = h2o.random_forest(x=board[["C1", "C2"]], y=board["C3"], ntrees=50, max_depth=20, nbins=500)
    model.show()
def smallcatRF(ip, port):
    # Training set has 26 categories from A to Z
    # Categories A, C, E, G, ... are perfect predictors of y = 1
    # Categories B, D, F, H, ... are perfect predictors of y = 0

    # Connect to h2o
    h2o.init(ip, port)

    #Log.info("Importing alphabet_cattest.csv data...\n")
    alphabet = h2o.import_frame(path=h2o.locate("smalldata/gbm_test/alphabet_cattest.csv"))
    alphabet["y"] = alphabet["y"].asfactor()
    #Log.info("Summary of alphabet_cattest.csv from H2O:\n")
    #alphabet.summary()

    # Prepare data for scikit use
    trainData = np.loadtxt(h2o.locate("smalldata/gbm_test/alphabet_cattest.csv"),
                           delimiter=',', skiprows=1,
                           converters={0: lambda s: ord(s.split("\"")[1])})
    trainDataResponse = trainData[:, 1]
    trainDataFeatures = trainData[:, 0]

    # Train H2O DRF Model:
    #Log.info("H2O DRF (Naive Split) with parameters:\nntrees = 1, max_depth = 1, nbins = 100\n")
    rf_h2o = h2o.random_forest(x=alphabet[['X']], y=alphabet["y"], ntrees=1, max_depth=1, nbins=100)

    # Train scikit RF Model:
    # Log.info("scikit RF with same parameters:")
    rf_sci = ensemble.RandomForestClassifier(n_estimators=1, criterion='entropy', max_depth=1)
    rf_sci.fit(trainDataFeatures[:, np.newaxis], trainDataResponse)

    # h2o
    rf_perf = rf_h2o.model_performance(alphabet)
    auc_h2o = rf_perf.auc()

    # scikit
    auc_sci = roc_auc_score(trainDataResponse,
                            rf_sci.predict_proba(trainDataFeatures[:, np.newaxis])[:, 1])

    #Log.info(paste("scikit AUC:", auc_sci, "\tH2O AUC:", auc_h2o))
    assert auc_h2o >= auc_sci, "h2o (auc) performance degradation, with respect to scikit"
def vi_toy_test(ip, port):
    toy_data = h2o.import_file(path=h2o.locate("smalldata/gbm_test/toy_data_RF.csv"))
    #toy_data.summary()

    toy_data[6] = toy_data[6].asfactor()
    toy_data.show()

    rf = h2o.random_forest(x=toy_data[[0, 1, 2, 3, 4, 5]], y=toy_data[6],
                           ntrees=500, max_depth=20, nbins=100, seed=0)

    ranking = [rf._model_json['output']['variable_importances'].cell_values[v][0]
               for v in range(toy_data.ncol - 1)]
    print(ranking)
    assert tuple(ranking) == tuple(["V3", "V2", "V6", "V5", "V1", "V4"]), \
        "expected specific variable importance ranking"
def milsong_checkpoint(ip, port):
    milsong_train = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))

    # build first model
    ntrees1 = random.sample(range(50, 100), 1)[0]
    max_depth1 = random.sample(range(2, 6), 1)[0]
    min_rows1 = random.sample(range(10, 16), 1)[0]
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    model1 = h2o.random_forest(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees1,
                               max_depth=max_depth1, min_rows=min_rows1,
                               validation_x=milsong_valid[1:], validation_y=milsong_valid[0], seed=1234)

    # save the model, then load the model
    model_path = h2o.save_model(model1, force=True)
    restored_model = h2o.load_model(model_path)
    shutil.rmtree(model_path)

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model2 = h2o.random_forest(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2,
                               max_depth=max_depth2, min_rows=min_rows2,
                               validation_x=milsong_valid[1:], validation_y=milsong_valid[0],
                               checkpoint=restored_model._id, seed=1234)

    # build the equivalent of model 2 in one shot
    model3 = h2o.random_forest(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2,
                               max_depth=max_depth2, min_rows=min_rows2,
                               validation_x=milsong_valid[1:], validation_y=milsong_valid[0], seed=1234)

    assert isinstance(model2, type(model3))
    assert model2.mse(valid=True) == model3.mse(valid=True), \
        "Expected Model 2 MSE: {0} to be the same as Model 3 MSE: {1}".format(model2.mse(valid=True),
                                                                              model3.mse(valid=True))
def czechboardRF(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    # Training set has checkerboard pattern
    #Log.info("Importing czechboard_300x300.csv data...\n")
    board = h2o.import_frame(path=h2o.locate("smalldata/gbm_test/czechboard_300x300.csv"))
    board["C3"] = board["C3"].asfactor()
    #Log.info("Summary of czechboard_300x300.csv from H2O:\n")
    #board.summary()

    # Train H2O DRF Model:
    #Log.info("H2O DRF (Naive Split) with parameters:\nclassification = TRUE, ntree = 50, depth = 20, nbins = 500\n")
    model = h2o.random_forest(x=board[["C1", "C2"]], y=board["C3"], ntrees=50, max_depth=20, nbins=500)
    model.show()
def bigcatRF():
    # Training set has 100 categories from cat001 to cat100
    # Categories cat001, cat003, ... are perfect predictors of y = 1
    # Categories cat002, cat004, ... are perfect predictors of y = 0

    # Log.info("Importing bigcat_5000x2.csv data...\n")
    bigcat = h2o.import_file(path=h2o.locate("smalldata/gbm_test/bigcat_5000x2.csv"))
    bigcat["y"] = bigcat["y"].asfactor()
    # Log.info("Summary of bigcat_5000x2.csv from H2O:\n")
    # bigcat.summary()

    # Train H2O DRF Model:
    # Log.info("H2O DRF (Naive Split) with parameters:\nclassification = TRUE, ntree = 1, depth = 1, nbins = 100, nbins_cats=10\n")
    model = h2o.random_forest(x=bigcat[["X"]], y=bigcat["y"], ntrees=1, max_depth=1,
                              nbins=100, nbins_cats=10)
    model.show()
def rf_mean_residual_deviance(ip, port):
    cars = h2o.import_file(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))
    s = cars[0].runif()
    train = cars[s > 0.2]
    valid = cars[s <= 0.2]

    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    response_col = "economy"
    rf = h2o.random_forest(x=train[predictors], y=train[response_col],
                           validation_x=valid[predictors], validation_y=valid[response_col],
                           nfolds=3)

    rf_mrd = rf.mean_residual_deviance(train=True, valid=True, xval=True)
    assert isinstance(rf_mrd['train'], float), \
        "Expected training mean residual deviance to be a float, but got {0}".format(type(rf_mrd['train']))
    assert isinstance(rf_mrd['valid'], float), \
        "Expected validation mean residual deviance to be a float, but got {0}".format(type(rf_mrd['valid']))
    assert isinstance(rf_mrd['xval'], float), \
        "Expected cross-validation mean residual deviance to be a float, but got {0}".format(type(rf_mrd['xval']))
def fiftycatRF(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    # Training set has only 45 categories cat1 through cat45
    #Log.info("Importing 50_cattest_train.csv data...\n")
    train = h2o.import_frame(path=h2o.locate("smalldata/gbm_test/50_cattest_train.csv"))
    train["y"] = train["y"].asfactor()
    #Log.info("Summary of 50_cattest_train.csv from H2O:\n")
    #train.summary()

    # Train H2O DRF Model:
    #Log.info(paste("H2O DRF with parameters:\nclassification = TRUE, ntree = 50, depth = 20, nbins = 500\n", sep = ""))
    model = h2o.random_forest(x=train[["x1", "x2"]], y=train["y"], ntrees=50, max_depth=20, nbins=500)

    # Test dataset has all 50 categories cat1 through cat50
    #Log.info("Importing 50_cattest_test.csv data...\n")
    test = h2o.import_frame(path=h2o.locate("smalldata/gbm_test/50_cattest_test.csv"))
    #Log.info("Summary of 50_cattest_test.csv from H2O:\n")
    #test.summary()

    # Predict on test dataset with DRF model:
    #Log.info("Performing predictions on test dataset...\n")
    preds = model.predict(test)
    preds.head()

    # Get the confusion matrix and AUC
    #Log.info("Confusion matrix of predictions (max accuracy):\n")
    perf = model.model_performance(test)
    perf.show()
    cm = perf.confusion_matrices()
    print(cm)
def weights_vi(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    ##### create synthetic dataset1 with 3 predictors: p1 predicts response ~90% of the time, p2 ~70%, p3 ~50%
    response = ['a' for y in range(10000)]
    [response.append('b') for y in range(10000)]
    p1 = [(1 if random.uniform(0, 1) < 0.9 else 0) if y == 'a' else (0 if random.uniform(0, 1) < 0.9 else 1)
          for y in response]
    p2 = [(1 if random.uniform(0, 1) < 0.7 else 0) if y == 'a' else (0 if random.uniform(0, 1) < 0.7 else 1)
          for y in response]
    p3 = [(1 if random.uniform(0, 1) < 0.5 else 0) if y == 'a' else (0 if random.uniform(0, 1) < 0.5 else 1)
          for y in response]
    dataset1_python = [[r, one, two, three] for r, one, two, three in zip(response, p1, p2, p3)]
    dataset1_h2o = h2o.H2OFrame(python_obj=dataset1_python)
    dataset1_h2o.setNames(["response", "p1", "p2", "p3"])

    ##### create synthetic dataset2 with 3 predictors: p3 predicts response ~90% of the time, p1 ~70%, p2 ~50%
    response = ['a' for y in range(10000)]
    [response.append('b') for y in range(10000)]
    p1 = [(1 if random.uniform(0, 1) < 0.7 else 0) if y == 'a' else (0 if random.uniform(0, 1) < 0.7 else 1)
          for y in response]
    p2 = [(1 if random.uniform(0, 1) < 0.5 else 0) if y == 'a' else (0 if random.uniform(0, 1) < 0.5 else 1)
          for y in response]
    p3 = [(1 if random.uniform(0, 1) < 0.9 else 0) if y == 'a' else (0 if random.uniform(0, 1) < 0.9 else 1)
          for y in response]
    dataset2_python = [[r, one, two, three] for r, one, two, three in zip(response, p1, p2, p3)]
    dataset2_h2o = h2o.H2OFrame(python_obj=dataset2_python)
    dataset2_h2o.setNames(["response", "p1", "p2", "p3"])

    ##### compute variable importances on dataset1 and dataset2
    model_dataset1 = h2o.random_forest(x=dataset1_h2o[["p1", "p2", "p3"]], y=dataset1_h2o["response"])
    varimp_dataset1 = tuple([p[0] for p in model_dataset1.varimp(return_list=True)])
    assert varimp_dataset1 == ('p1', 'p2', 'p3'), \
        "Expected the following relative variable importance on dataset1: ('p1', 'p2', 'p3'), " \
        "but got: {0}".format(varimp_dataset1)

    model_dataset2 = h2o.random_forest(x=dataset2_h2o[["p1", "p2", "p3"]], y=dataset2_h2o["response"])
    varimp_dataset2 = tuple([p[0] for p in model_dataset2.varimp(return_list=True)])
    assert varimp_dataset2 == ('p3', 'p1', 'p2'), \
        "Expected the following relative variable importance on dataset2: ('p3', 'p1', 'p2'), " \
        "but got: {0}".format(varimp_dataset2)

    ############ Test1 #############
    ##### weight the combined dataset 80/20 in favor of dataset 1
    dataset1_python_weighted = copy.deepcopy(dataset1_python)
    [r.append(0.8) for r in dataset1_python_weighted]
    dataset2_python_weighted = copy.deepcopy(dataset2_python)
    [r.append(0.2) for r in dataset2_python_weighted]

    ##### combine dataset1 and dataset2
    combined_dataset_python = []
    [combined_dataset_python.append(r) for r in dataset1_python_weighted]
    [combined_dataset_python.append(r) for r in dataset2_python_weighted]
    combined_dataset_h2o = h2o.H2OFrame(python_obj=combined_dataset_python)
    combined_dataset_h2o.setNames(["response", "p1", "p2", "p3", "weights"])

    ##### recompute the variable importances. the relative order should be the same as above.
    model_combined_dataset = h2o.random_forest(x=combined_dataset_h2o[["p1", "p2", "p3"]],
                                               y=combined_dataset_h2o["response"],
                                               training_frame=combined_dataset_h2o,
                                               weights_column="weights")
    varimp_combined = tuple([p[0] for p in model_combined_dataset.varimp(return_list=True)])
    assert varimp_combined == ('p1', 'p2', 'p3'), \
        "Expected the following relative variable importance on the combined dataset: " \
        "('p1', 'p2', 'p3'), but got: {0}".format(varimp_combined)

    ############ Test2 #############
    ##### weight the combined dataset 80/20 in favor of dataset 2
    dataset1_python_weighted = copy.deepcopy(dataset1_python)
    [r.append(0.2) for r in dataset1_python_weighted]
    dataset2_python_weighted = copy.deepcopy(dataset2_python)
    [r.append(0.8) for r in dataset2_python_weighted]

    ##### combine dataset1 and dataset2
    combined_dataset_python = []
    [combined_dataset_python.append(r) for r in dataset1_python_weighted]
    [combined_dataset_python.append(r) for r in dataset2_python_weighted]
    combined_dataset_h2o = h2o.H2OFrame(python_obj=combined_dataset_python)
    combined_dataset_h2o.setNames(["response", "p1", "p2", "p3", "weights"])

    ##### recompute the variable importances. the relative order should be the same as above.
    model_combined_dataset = h2o.random_forest(x=combined_dataset_h2o[["p1", "p2", "p3"]],
                                               y=combined_dataset_h2o["response"],
                                               training_frame=combined_dataset_h2o,
                                               weights_column="weights")
    varimp_combined = tuple([p[0] for p in model_combined_dataset.varimp(return_list=True)])
    assert varimp_combined == ('p3', 'p1', 'p2'), \
        "Expected the following relative variable importance on the combined dataset: " \
        "('p3', 'p1', 'p2'), but got: {0}".format(varimp_combined)
def cars_checkpoint(ip, port):
    cars = h2o.upload_file(h2o.locate("smalldata/junit/cars_20mpg.csv"))
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    response_col = "economy"

    # build first model
    model1 = h2o.random_forest(x=cars[predictors], y=cars[response_col], ntrees=10,
                               max_depth=2, min_rows=10)

    # continue building the model
    model2 = h2o.random_forest(x=cars[predictors], y=cars[response_col], ntrees=11,
                               max_depth=3, min_rows=9, r2_stopping=0.8,
                               checkpoint=model1._id)

    # erroneous, not MODIFIABLE_BY_CHECKPOINT_FIELDS
    # PUBDEV-1833

    # mtries
    try:
        model = h2o.random_forest(y=cars[response_col], x=cars[predictors], mtries=2,
                                  checkpoint=model1._id)
        assert False, "Expected model-build to fail because mtries not modifiable by checkpoint"
    except EnvironmentError:
        assert True

    # sample_rate
    try:
        model = h2o.random_forest(y=cars[response_col], x=cars[predictors], sample_rate=0.5,
                                  checkpoint=model1._id)
        assert False, "Expected model-build to fail because sample_rate not modifiable by checkpoint"
    except EnvironmentError:
        assert True

    # nbins_cats
    try:
        model = h2o.random_forest(y=cars[response_col], x=cars[predictors], nbins_cats=99,
                                  checkpoint=model1._id)
        assert False, "Expected model-build to fail because nbins_cats not modifiable by checkpoint"
    except EnvironmentError:
        assert True

    # nbins
    try:
        model = h2o.random_forest(y=cars[response_col], x=cars[predictors], nbins=99,
                                  checkpoint=model1._id)
        assert False, "Expected model-build to fail because nbins not modifiable by checkpoint"
    except EnvironmentError:
        assert True

    # balance_classes
    try:
        model = h2o.random_forest(y=cars[response_col], x=cars[predictors], balance_classes=True,
                                  checkpoint=model1._id)
        assert False, "Expected model-build to fail because balance_classes not modifiable by checkpoint"
    except EnvironmentError:
        assert True

    # nfolds
    try:
        model = h2o.random_forest(y=cars[response_col], x=cars[predictors], nfolds=3,
                                  checkpoint=model1._id)
        assert False, "Expected model-build to fail because nfolds not modifiable by checkpoint"
    except EnvironmentError:
        assert True
def javapredict(algo, equality, train, test, x, y, **kwargs):
    print("Creating model in H2O")
    if algo == "gbm":
        model = h2o.gbm(x=train[x], y=train[y], **kwargs)
    elif algo == "random_forest":
        model = h2o.random_forest(x=train[x], y=train[y], **kwargs)
    elif algo == "deeplearning":
        model = h2o.deeplearning(x=train[x], y=train[y], **kwargs)
    elif algo == "glm":
        model = h2o.glm(x=train[x], y=train[y], **kwargs)
    else:
        raise ValueError("algo {0} is not supported".format(algo))
    print(model)

    print("Downloading Java prediction model code from H2O")
    tmpdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "results", model._id))
    os.mkdir(tmpdir)
    h2o.download_pojo(model, path=tmpdir)
    h2o_genmodel_jar = os.path.join(tmpdir, "h2o-genmodel.jar")
    assert os.path.exists(h2o_genmodel_jar), \
        "Expected file {0} to exist, but it does not.".format(h2o_genmodel_jar)
    print("h2o-genmodel.jar saved in {0}".format(h2o_genmodel_jar))
    java_file = os.path.join(tmpdir, model._id + ".java")
    assert os.path.exists(java_file), "Expected file {0} to exist, but it does not.".format(java_file)
    print("java code saved in {0}".format(java_file))

    print("Predicting in H2O")
    predictions = model.predict(test)
    predictions.summary()
    predictions.head()
    out_h2o_csv = os.path.join(tmpdir, "out_h2o.csv")
    h2o.download_csv(predictions, out_h2o_csv)
    assert os.path.exists(out_h2o_csv), "Expected file {0} to exist, but it does not.".format(out_h2o_csv)
    print("H2O Predictions saved in {0}".format(out_h2o_csv))

    print("Setting up for Java POJO")
    in_csv = os.path.join(tmpdir, "in.csv")
    h2o.download_csv(test[x], in_csv)
    # hack: the PredictCsv driver can't handle quoted strings, so remove them
    f = open(in_csv, 'r+')
    csv = f.read()
    csv = re.sub('\"', '', csv)
    f.seek(0)
    f.write(csv)
    f.truncate()
    f.close()
    assert os.path.exists(in_csv), "Expected file {0} to exist, but it does not.".format(in_csv)
    print("Input CSV to PredictCsv saved in {0}".format(in_csv))

    print("Compiling Java Pojo")
    javac_cmd = ["javac", "-cp", h2o_genmodel_jar, "-J-Xmx4g", "-J-XX:MaxPermSize=256m", java_file]
    subprocess.check_call(javac_cmd)

    print("Running PredictCsv Java Program")
    out_pojo_csv = os.path.join(tmpdir, "out_pojo.csv")
    cp_sep = ";" if sys.platform == "win32" else ":"
    java_cmd = ["java", "-ea", "-cp", h2o_genmodel_jar + cp_sep + tmpdir, "-Xmx4g",
                "-XX:MaxPermSize=256m", "-XX:ReservedCodeCacheSize=256m",
                "hex.genmodel.tools.PredictCsv", "--header", "--model", model._id,
                "--input", in_csv, "--output", out_pojo_csv]
    p = subprocess.Popen(java_cmd, stdout=PIPE, stderr=STDOUT)
    o, e = p.communicate()
    print("Java output: {0}".format(o))
    assert os.path.exists(out_pojo_csv), "Expected file {0} to exist, but it does not.".format(out_pojo_csv)
    predictions2 = h2o.import_file(path=out_pojo_csv)
    print("Pojo predictions saved in {0}".format(out_pojo_csv))

    print("Comparing predictions between H2O and Java POJO")
    # Dimensions
    hr, hc = predictions.dim
    pr, pc = predictions2.dim
    assert hr == pr, "Expected the same number of rows, but got {0} and {1}".format(hr, pr)
    assert hc == pc, "Expected the same number of cols, but got {0} and {1}".format(hc, pc)

    # Value
    for r in range(hr):
        hp = predictions[r, 0]
        if equality == "numeric":
            pp = float.fromhex(predictions2[r, 0])
            assert abs(hp - pp) < 1e-4, \
                "Expected predictions to be the same (within 1e-4) for row {0}, but got {1} and {2}".format(r, hp, pp)
        elif equality == "class":
            pp = predictions2[r, 0]
            assert hp == pp, "Expected predictions to be the same for row {0}, but got {1} and {2}".format(r, hp, pp)
        else:
            raise ValueError("equality type {0} is not supported".format(equality))
def cv_carsRF():
    # read in the dataset and construct training set (and validation set)
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))

    # choose the type of model-building exercise: 0 = regression,
    # 1 = binomial classification, 2 = multinomial classification
    problem = random.sample(range(3), 1)[0]
    problem = 2  # pinned to multinomial; this overrides the random choice above

    # pick the predictors and the correct response column
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == 1:
        response_col = "economy_20mpg"
        cars[response_col] = cars[response_col].asfactor()
    elif problem == 2:
        response_col = "cylinders"
        cars[response_col] = cars[response_col].asfactor()
    else:
        response_col = "economy"
    print "Response column: {0}".format(response_col)

    ## cross-validation
    # 1. check that cv metrics are the same over repeated seeded "Modulo" runs
    nfolds = random.randint(3, 10)
    rf1 = h2o.random_forest(y=cars[response_col], x=cars[predictors], nfolds=nfolds,
                            fold_assignment="Modulo", seed=1234)
    rf2 = h2o.random_forest(y=cars[response_col], x=cars[predictors], nfolds=nfolds,
                            fold_assignment="Modulo", seed=1234)
    pyunit_utils.check_models(rf1, rf2, True)

    # 2. check that cv metrics are different over repeated "Random" runs
    nfolds = random.randint(3, 10)
    rf1 = h2o.random_forest(y=cars[response_col], x=cars[predictors], nfolds=nfolds,
                            fold_assignment="Random")
    rf2 = h2o.random_forest(y=cars[response_col], x=cars[predictors], nfolds=nfolds,
                            fold_assignment="Random")
    # check_models raises AssertionError when the models differ, which is what we expect here
    try:
        pyunit_utils.check_models(rf1, rf2, True)
        models_differ = False
    except AssertionError:
        models_differ = True
    assert models_differ, "Expected models to be different over repeated Random runs"

    # 3. folds_column
    num_folds = random.randint(2, 5)
    fold_assignments = h2o.H2OFrame(
        python_obj=[[random.randint(0, num_folds - 1) for f in range(cars.nrow)]])
    fold_assignments.set_names(["fold_assignments"])
    cars = cars.cbind(fold_assignments)
    rf = h2o.random_forest(y=cars[response_col], x=cars[predictors], training_frame=cars,
                           fold_column="fold_assignments",
                           keep_cross_validation_predictions=True)
    num_cv_models = len(rf._model_json['output']['cross_validation_models'])
    assert num_cv_models == num_folds, "Expected {0} cross-validation models, but got " \
                                       "{1}".format(num_folds, num_cv_models)
    # fetch two of the cv models to verify they are retrievable
    cv_model1 = h2o.get_model(rf._model_json['output']['cross_validation_models'][0]['name'])
    cv_model2 = h2o.get_model(rf._model_json['output']['cross_validation_models'][1]['name'])

    # 4. keep_cross_validation_predictions
    cv_predictions = rf1._model_json['output']['cross_validation_predictions']
    assert cv_predictions is None, "Expected cross-validation predictions to be None, " \
                                   "but got {0}".format(cv_predictions)
    cv_predictions = rf._model_json['output']['cross_validation_predictions']
    assert len(cv_predictions) == num_folds, \
        "Expected the same number of cross-validation predictions " \
        "as folds, but got {0}".format(len(cv_predictions))

    ## boundary cases
    # 1. nfolds = number of observations (leave-one-out cross-validation)
    rf = h2o.random_forest(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow,
                           fold_assignment="Modulo")

    # 2. nfolds = 0; check that this is equivalent to no nfolds
    rf1 = h2o.random_forest(y=cars[response_col], x=cars[predictors], nfolds=0, seed=1234)
    rf2 = h2o.random_forest(y=cars[response_col], x=cars[predictors], seed=1234)
    pyunit_utils.check_models(rf1, rf2)

    # 3. cross-validation and regular validation attempted
    rf = h2o.random_forest(y=cars[response_col], x=cars[predictors],
                           nfolds=random.randint(3, 10),
                           validation_y=cars[response_col],
                           validation_x=cars[predictors])

    ## error cases
    # 1. nfolds == 1 or < 0
    try:
        rf = h2o.random_forest(y=cars[response_col], x=cars[predictors],
                               nfolds=random.sample([-1, 1], 1)[0])
        assert False, "Expected model-build to fail when nfolds is 1 or < 0"
    except EnvironmentError:
        assert True

    # 2. more folds than observations
    try:
        rf = h2o.random_forest(y=cars[response_col], x=cars[predictors],
                               nfolds=cars.nrow + 1, fold_assignment="Modulo")
        assert False, "Expected model-build to fail when nfolds > nobs"
    except EnvironmentError:
        assert True

    # 3. fold_column and nfolds both specified
    try:
        rf = h2o.random_forest(y=cars[response_col], x=cars[predictors], nfolds=3,
                               fold_column="fold_assignments", training_frame=cars)
        assert False, "Expected model-build to fail when fold_column and nfolds both specified"
    except EnvironmentError:
        assert True
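
# A small pure-Python sketch of fold_assignment="Modulo" as we understand it:
# row i is placed deterministically in fold i % nfolds, with no randomness
# involved, which is why the two seeded Modulo runs above must report
# identical cross-validation metrics.
def modulo_folds(nrows, nfolds):
    return [i % nfolds for i in range(nrows)]

assert modulo_folds(7, 3) == [0, 1, 2, 0, 1, 2, 0]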
def cars_checkpoint():
    cars = h2o.upload_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    s = cars.runif()
    train = cars[s > .2]
    valid = cars[s <= .2]

    print("\n*** Description (chunk distribution, etc) of training frame:")
    train.describe()
    print("\n*** Description (chunk distribution, etc) of validation frame:")
    valid.describe()

    # choose the type of model-building exercise: 0 = regression,
    # 1 = binomial classification, 2 = multinomial classification
    problem = random.sample(list(range(3)), 1)[0]

    # pick the predictors and the correct response column
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == 1:
        response_col = "economy_20mpg"
        train[response_col] = train[response_col].asfactor()
        valid[response_col] = valid[response_col].asfactor()
    elif problem == 2:
        response_col = "cylinders"
        train[response_col] = train[response_col].asfactor()
        valid[response_col] = valid[response_col].asfactor()
    else:
        response_col = "economy"
    print("\n*** Response column: {0}".format(response_col))

    # build first model
    ntrees1 = 5
    max_depth1 = random.sample(list(range(2, 6)), 1)[0]
    min_rows1 = random.sample(list(range(10, 16)), 1)[0]
    print("\n*** Building model 1 with the following parameters:")
    print("*** ntrees model 1: {0}".format(ntrees1))
    print("*** max_depth model 1: {0}".format(max_depth1))
    print("*** min_rows model 1: {0}".format(min_rows1))
    model1 = h2o.random_forest(x=train[predictors], y=train[response_col],
                               ntrees=ntrees1, max_depth=max_depth1, min_rows=min_rows1,
                               score_each_iteration=True,
                               validation_x=valid[predictors],
                               validation_y=valid[response_col], seed=1234)

    # save the model, then load the model
    model_path = h2o.save_model(model1, name="delete_model", force=True)
    restored_model = h2o.load_model(model_path)
    shutil.rmtree("delete_model")

    # continue building the model
    ntrees2 = ntrees1 + 5
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("\n*** Continuing to build model 1 (now called model 2) with the following parameters:")
    print("*** ntrees model 2: {0}".format(ntrees2))
    print("*** max_depth model 2: {0}".format(max_depth2))
    print("*** min_rows model 2: {0}".format(min_rows2))
    model2 = h2o.random_forest(x=train[predictors], y=train[response_col],
                               ntrees=ntrees2, max_depth=max_depth2, min_rows=min_rows2,
                               score_each_iteration=True,
                               validation_x=valid[predictors],
                               validation_y=valid[response_col],
                               checkpoint=restored_model._id, seed=1234)

    # continue building the model, but with a different number of trees
    ntrees3 = ntrees2 + 50
    max_depth3 = max_depth1
    min_rows3 = min_rows1
    print("\n*** Continuing to build model 1 (now called model 3) with the following parameters:")
    print("*** ntrees model 3: {0}".format(ntrees3))
    print("*** max_depth model 3: {0}".format(max_depth3))
    print("*** min_rows model 3: {0}".format(min_rows3))
    model3 = h2o.random_forest(x=train[predictors], y=train[response_col],
                               ntrees=ntrees3, max_depth=max_depth3, min_rows=min_rows3,
                               score_each_iteration=True,
                               validation_x=valid[predictors],
                               validation_y=valid[response_col],
                               checkpoint=restored_model._id, seed=1234)

    # build the equivalent of model 2 in one shot
    print("\n*** Building the equivalent of model 2 (called model 4) in one shot:")
    model4 = h2o.random_forest(x=train[predictors], y=train[response_col],
                               ntrees=ntrees2, max_depth=max_depth2, min_rows=min_rows2,
                               score_each_iteration=True,
                               validation_x=valid[predictors],
                               validation_y=valid[response_col], seed=1234)

    print("\n*** Model Summary for model 2:")
    print(model2.summary())
    print("\n*** Model Summary for model 3:")
    print(model3.summary())
    print("\n*** Model Summary for model 4:")
    print(model4.summary())

    print("\n*** Score History for model 2:")
    print(model2.score_history())
    print("\n*** Score History for model 3:")
    print(model3.score_history())
    print("\n*** Score History for model 4:")
    print(model4.score_history())

    # checks
    if problem == 0:
        assert isinstance(model2, type(model4))
        assert model2.mse(valid=True) == model4.mse(valid=True), \
            "Expected Model 2 MSE: {0} to be the same as Model 4 MSE: {1}".format(
                model2.mse(valid=True), model4.mse(valid=True))
        # assert model3.mse(valid=True) != model4.mse(valid=True), \
        #     "Expected Model 3 MSE: {0} to be different from Model 4 MSE: {1}".format(
        #         model3.mse(valid=True), model4.mse(valid=True))
    elif problem == 1:
        assert isinstance(model2, type(model4))
        assert model2.auc(valid=True) == model4.auc(valid=True), \
            "Expected Model 2 AUC: {0} to be the same as Model 4 AUC: {1}".format(
                model2.auc(valid=True), model4.auc(valid=True))
        # assert model3.auc(valid=True) != model4.auc(valid=True), \
        #     "Expected Model 3 AUC: {0} to be different from Model 4 AUC: {1}".format(
        #         model3.auc(valid=True), model4.auc(valid=True))
        assert model2.logloss(valid=True) == model4.logloss(valid=True), \
            "Expected Model 2 Log Loss: {0} to be the same as Model 4 Log Loss: {1}".format(
                model2.logloss(valid=True), model4.logloss(valid=True))
        # assert model3.logloss(valid=True) != model4.logloss(valid=True), \
        #     "Expected Model 3 Log Loss: {0} to be different from Model 4 Log Loss: {1}".format(
        #         model3.logloss(valid=True), model4.logloss(valid=True))
        assert model2.giniCoef(valid=True) == model4.giniCoef(valid=True), \
            "Expected Model 2 Gini Coef: {0} to be the same as Model 4 Gini Coef: {1}".format(
                model2.giniCoef(valid=True), model4.giniCoef(valid=True))
        # assert model3.giniCoef(valid=True) != model4.giniCoef(valid=True), \
        #     "Expected Model 3 Gini Coef: {0} to be different from Model 4 Gini Coef: {1}".format(
        #         model3.giniCoef(valid=True), model4.giniCoef(valid=True))
    else:
        assert isinstance(model2, type(model4))
        assert model2.mse(valid=True) == model4.mse(valid=True), \
            "Expected Model 2 MSE: {0} to be the same as Model 4 MSE: {1}".format(
                model2.mse(valid=True), model4.mse(valid=True))
        # assert model3.mse(valid=True) != model4.mse(valid=True), \
        #     "Expected Model 3 MSE: {0} to be different from Model 4 MSE: {1}".format(
        #         model3.mse(valid=True), model4.mse(valid=True))
        assert model2.r2(valid=True) == model4.r2(valid=True), \
            "Expected Model 2 R2: {0} to be the same as Model 4 R2: {1}".format(
                model2.r2(valid=True), model4.r2(valid=True))
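
# The checkpoint workflow exercised above, in miniature. This is a sketch
# assuming a frame `fr` with predictors "x1"/"x2" and response "y"; the names
# and parameter values are placeholders, not from the original test.
def checkpoint_roundtrip(fr):
    first = h2o.random_forest(x=fr[["x1", "x2"]], y=fr["y"], ntrees=5, seed=42)
    path = h2o.save_model(first, name="tmp_ckpt", force=True)
    restored = h2o.load_model(path)
    shutil.rmtree("tmp_ckpt")
    # resuming with a larger ntrees grows only the additional trees on top of
    # the restored model, so the result should match a one-shot ntrees=10 run
    return h2o.random_forest(x=fr[["x1", "x2"]], y=fr["y"], ntrees=10,
                             checkpoint=restored._id, seed=42)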
def weights_vi():
    ##### create synthetic dataset1 with 3 predictors: p1 predicts response ~90% of the time, p2 ~70%, p3 ~50%
    response = ['a'] * 10000 + ['b'] * 10000
    p1 = [(1 if random.uniform(0, 1) < 0.9 else 0) if y == 'a'
          else (0 if random.uniform(0, 1) < 0.9 else 1) for y in response]
    p2 = [(1 if random.uniform(0, 1) < 0.7 else 0) if y == 'a'
          else (0 if random.uniform(0, 1) < 0.7 else 1) for y in response]
    p3 = [(1 if random.uniform(0, 1) < 0.5 else 0) if y == 'a'
          else (0 if random.uniform(0, 1) < 0.5 else 1) for y in response]
    dataset1_python = [response, p1, p2, p3]
    dataset1_h2o = h2o.H2OFrame(dataset1_python)
    dataset1_h2o.set_names(["response", "p1", "p2", "p3"])

    ##### create synthetic dataset2 with 3 predictors: p3 predicts response ~90% of the time, p1 ~70%, p2 ~50%
    response = ['a'] * 10000 + ['b'] * 10000
    p1 = [(1 if random.uniform(0, 1) < 0.7 else 0) if y == 'a'
          else (0 if random.uniform(0, 1) < 0.7 else 1) for y in response]
    p2 = [(1 if random.uniform(0, 1) < 0.5 else 0) if y == 'a'
          else (0 if random.uniform(0, 1) < 0.5 else 1) for y in response]
    p3 = [(1 if random.uniform(0, 1) < 0.9 else 0) if y == 'a'
          else (0 if random.uniform(0, 1) < 0.9 else 1) for y in response]
    dataset2_python = [response, p1, p2, p3]
    dataset2_h2o = h2o.H2OFrame(dataset2_python)
    dataset2_h2o.set_names(["response", "p1", "p2", "p3"])

    ##### compute variable importances on dataset1 and dataset2
    model_dataset1 = h2o.random_forest(x=dataset1_h2o[["p1", "p2", "p3"]],
                                       y=dataset1_h2o["response"])
    varimp_dataset1 = tuple([p[0] for p in model_dataset1.varimp()])
    assert varimp_dataset1 == ('p1', 'p2', 'p3'), \
        "Expected the following relative variable importance on dataset1: " \
        "('p1', 'p2', 'p3'), but got: {0}".format(varimp_dataset1)
    model_dataset2 = h2o.random_forest(x=dataset2_h2o[["p1", "p2", "p3"]],
                                       y=dataset2_h2o["response"])
    varimp_dataset2 = tuple([p[0] for p in model_dataset2.varimp()])
    assert varimp_dataset2 == ('p3', 'p1', 'p2'), \
        "Expected the following relative variable importance on dataset2: " \
        "('p3', 'p1', 'p2'), but got: {0}".format(varimp_dataset2)

    ############ Test1 #############
    ##### weight the combined dataset 80/20 in favor of dataset 1
    dataset1_python_weighted = copy.deepcopy(dataset1_python) + [[.8] * 20000]
    dataset2_python_weighted = copy.deepcopy(dataset2_python) + [[.2] * 20000]

    ##### combine dataset1 and dataset2
    combined_dataset_python = [dataset1_python_weighted[i] + dataset2_python_weighted[i]
                               for i in range(len(dataset1_python_weighted))]
    combined_dataset_h2o = h2o.H2OFrame(combined_dataset_python)
    combined_dataset_h2o.set_names(["response", "p1", "p2", "p3", "weights"])

    ##### recompute the variable importances. the relative order should be the same as above.
    model_combined_dataset = h2o.random_forest(x=combined_dataset_h2o[["p1", "p2", "p3"]],
                                               y=combined_dataset_h2o["response"],
                                               training_frame=combined_dataset_h2o,
                                               weights_column="weights")
    varimp_combined = tuple([p[0] for p in model_combined_dataset.varimp()])
    assert varimp_combined == ('p1', 'p2', 'p3'), \
        "Expected the following relative variable importance on the combined " \
        "dataset: ('p1', 'p2', 'p3'), but got: {0}".format(varimp_combined)

    ############ Test2 #############
    ##### weight the combined dataset 80/20 in favor of dataset 2
    dataset1_python_weighted = copy.deepcopy(dataset1_python) + [[.2] * 20000]
    dataset2_python_weighted = copy.deepcopy(dataset2_python) + [[.8] * 20000]

    ##### combine dataset1 and dataset2
    combined_dataset_python = [dataset1_python_weighted[i] + dataset2_python_weighted[i]
                               for i in range(len(dataset1_python_weighted))]
    combined_dataset_h2o = h2o.H2OFrame(combined_dataset_python)
    combined_dataset_h2o.set_names(["response", "p1", "p2", "p3", "weights"])

    ##### recompute the variable importances. the relative order should be the same as above.
    model_combined_dataset = h2o.random_forest(x=combined_dataset_h2o[["p1", "p2", "p3"]],
                                               y=combined_dataset_h2o["response"],
                                               training_frame=combined_dataset_h2o,
                                               weights_column="weights")
    varimp_combined = tuple([p[0] for p in model_combined_dataset.varimp()])
    assert varimp_combined == ('p3', 'p1', 'p2'), \
        "Expected the following relative variable importance on the combined " \
        "dataset: ('p3', 'p1', 'p2'), but got: {0}".format(varimp_combined)
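
# The predictor-generation rule used in both synthetic datasets, factored into
# a helper for clarity. `make_predictor` is our name, not from the original:
# each predictor agrees with the response with probability `accuracy`, so a
# higher accuracy should yield a higher variable importance rank.
def make_predictor(response, accuracy):
    return [(1 if random.uniform(0, 1) < accuracy else 0) if y == 'a'
            else (0 if random.uniform(0, 1) < accuracy else 1)
            for y in response]

# e.g. p1 in dataset1 above is equivalent to make_predictor(response, 0.9)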
def domain_check():
    air_train = h2o.import_file(path=tests.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    air_train.show()
    air_test = h2o.import_file(path=tests.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    air_test.show()

    actual_domain = [u'YES', u'NO']
    print "actual domain of the response: {0}".format(actual_domain)

    ### DRF ###
    print
    print "-------------- DRF:"
    print
    rf = h2o.random_forest(x=air_train[["Origin", "Dest", "Distance", "UniqueCarrier",
                                        "fMonth", "fDayofMonth", "fDayOfWeek"]],
                           y=air_train["IsDepDelayed"].asfactor(),
                           training_frame=air_train)
    computed_domain = rf._model_json['output']['training_metrics']._metric_json['domain']
    domain_diff = list(set(computed_domain) - set(actual_domain))
    assert not domain_diff, \
        "There's a difference between the actual ({0}) and the computed ({1}) domains of the " \
        "response. The difference is {2}".format(actual_domain, computed_domain, domain_diff)

    perf = rf.model_performance(test_data=air_test)
    computed_domain = perf._metric_json['domain']
    domain_diff = list(set(computed_domain) - set(actual_domain))
    assert not domain_diff, \
        "There's a difference between the actual ({0}) and the computed ({1}) domains of the " \
        "response. The difference is {2}".format(actual_domain, computed_domain, domain_diff)

    ### GBM ###
    print
    print "-------------- GBM:"
    print
    gbm = h2o.gbm(x=air_train[["Origin", "Dest", "Distance", "UniqueCarrier",
                               "fMonth", "fDayofMonth", "fDayOfWeek"]],
                  y=air_train["IsDepDelayed"].asfactor(),
                  training_frame=air_train,
                  distribution="bernoulli")
    computed_domain = gbm._model_json['output']['training_metrics']._metric_json['domain']
    domain_diff = list(set(computed_domain) - set(actual_domain))
    assert not domain_diff, \
        "There's a difference between the actual ({0}) and the computed ({1}) domains of the " \
        "response. The difference is {2}".format(actual_domain, computed_domain, domain_diff)

    perf = gbm.model_performance(test_data=air_test)
    computed_domain = perf._metric_json['domain']
    domain_diff = list(set(computed_domain) - set(actual_domain))
    assert not domain_diff, \
        "There's a difference between the actual ({0}) and the computed ({1}) domains of the " \
        "response. The difference is {2}".format(actual_domain, computed_domain, domain_diff)

    ### Deeplearning ###
    print
    print "-------------- Deeplearning:"
    print
    dl = h2o.deeplearning(x=air_train[["Origin", "Dest", "Distance", "UniqueCarrier",
                                       "fMonth", "fDayofMonth", "fDayOfWeek"]],
                          y=air_train["IsDepDelayed"].asfactor(),
                          training_frame=air_train,
                          activation="Tanh", hidden=[2, 2, 2], epochs=10)
    computed_domain = dl._model_json['output']['training_metrics']._metric_json['domain']
    domain_diff = list(set(computed_domain) - set(actual_domain))
    assert not domain_diff, \
        "There's a difference between the actual ({0}) and the computed ({1}) domains of the " \
        "response. The difference is {2}".format(actual_domain, computed_domain, domain_diff)

    perf = dl.model_performance(test_data=air_test)
    computed_domain = perf._metric_json['domain']
    domain_diff = list(set(computed_domain) - set(actual_domain))
    assert not domain_diff, \
        "There's a difference between the actual ({0}) and the computed ({1}) domains of the " \
        "response. The difference is {2}".format(actual_domain, computed_domain, domain_diff)

    ### GLM ###
    print
    print "-------------- GLM:"
    print
    glm = h2o.glm(x=air_train[["Origin", "Dest", "Distance", "UniqueCarrier",
                               "fMonth", "fDayofMonth", "fDayOfWeek"]],
                  y=air_train["IsDepDelayed"],
                  training_frame=air_train,
                  family="binomial")
    computed_domain = glm._model_json['output']['training_metrics']._metric_json['domain']
    domain_diff = list(set(computed_domain) - set(actual_domain))
    assert not domain_diff, \
        "There's a difference between the actual ({0}) and the computed ({1}) domains of the " \
        "response. The difference is {2}".format(actual_domain, computed_domain, domain_diff)

    perf = glm.model_performance(test_data=air_test)
    computed_domain = perf._metric_json['domain']
    domain_diff = list(set(computed_domain) - set(actual_domain))
    assert not domain_diff, \
        "There's a difference between the actual ({0}) and the computed ({1}) domains of the " \
        "response. The difference is {2}".format(actual_domain, computed_domain, domain_diff)
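
# The same domain comparison, factored into a helper so the assertion is not
# repeated eight times. The helper name is ours, not from the original suite.
def assert_same_domain(computed_domain, actual_domain):
    domain_diff = list(set(computed_domain) - set(actual_domain))
    assert not domain_diff, \
        "There's a difference between the actual ({0}) and the computed ({1}) domains of the " \
        "response. The difference is {2}".format(actual_domain, computed_domain, domain_diff)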