def setup_data(self):
    """
    This function performs all initializations necessary:
    load the data sets and set the training set indices and response column index
    """
    # create and clean out the sandbox directory first
    self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, True)

    # randomly choose which family of GBM algo to use
    self.family = self.families[random.randint(0, len(self.families)-1)]

    # preload datasets, set x_indices, y_index and change response to factor for classification
    if 'multinomial' in self.family:
        self.training_metric = 'logloss'
        self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filenames[1]))
        self.y_index = self.training1_data.ncol-1
        self.x_indices = list(range(self.y_index))
        self.training1_data[self.y_index] = self.training1_data[self.y_index].round().asfactor()
        self.scale_model = 1
    else:
        self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filenames[0]))
        self.y_index = self.training1_data.ncol-1
        self.x_indices = list(range(self.y_index))
        self.scale_model = 0.75

    # save the training data files just in case the code crashed.
    pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy', new_dir_path=self.sandbox_dir)
def fiftycatRF():
    # Training set has only 45 categories cat1 through cat45
    # Log.info("Importing 50_cattest_train.csv data...\n")
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/50_cattest_train.csv"))
    train["y"] = train["y"].asfactor()

    # Log.info("Summary of 50_cattest_train.csv from H2O:\n")
    # train.summary()

    # Train H2O DRF Model:
    # Log.info("H2O DRF with parameters: classification = TRUE, ntree = 50, depth = 20, nbins = 500\n")
    model = h2o.random_forest(x=train[["x1", "x2"]], y=train["y"], ntrees=50, max_depth=20, nbins=500)

    # Test dataset has all 50 categories cat1 through cat50
    # Log.info("Importing 50_cattest_test.csv data...\n")
    test = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/50_cattest_test.csv"))
    # Log.info("Summary of 50_cattest_test.csv from H2O:\n")
    # test.summary()

    # Predict on test dataset with DRF model:
    # Log.info("Performing predictions on test dataset...\n")
    preds = model.predict(test)
    preds.head()

    # Get the confusion matrix and AUC
    # Log.info("Confusion matrix of predictions (max accuracy):\n")
    perf = model.model_performance(test)
    perf.show()
    cm = perf.confusion_matrix()
    print(cm)
def wide_dataset_large():
    print("Reading in Arcene training data for binomial modeling.")
    trainDataResponse = np.genfromtxt(pyunit_utils.locate("smalldata/arcene/arcene_train_labels.labels"), delimiter=' ')
    trainDataResponse = np.where(trainDataResponse == -1, 0, 1)
    trainDataFeatures = np.genfromtxt(pyunit_utils.locate("smalldata/arcene/arcene_train.data"), delimiter=' ')
    xtrain = np.transpose(trainDataFeatures).tolist()
    ytrain = trainDataResponse.tolist()
    trainData = h2o.H2OFrame.fromPython([ytrain] + xtrain)
    trainData[0] = trainData[0].asfactor()

    print("Run model on 3250 columns of Arcene with strong rules off.")
    model = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=False, alpha=1)
    model.train(x=range(1, 3250), y=0, training_frame=trainData)

    print("Test model on validation set.")
    validDataResponse = np.genfromtxt(pyunit_utils.locate("smalldata/arcene/arcene_valid_labels.labels"), delimiter=' ')
    validDataResponse = np.where(validDataResponse == -1, 0, 1)
    validDataFeatures = np.genfromtxt(pyunit_utils.locate("smalldata/arcene/arcene_valid.data"), delimiter=' ')
    xvalid = np.transpose(validDataFeatures).tolist()
    yvalid = validDataResponse.tolist()
    validData = h2o.H2OFrame.fromPython([yvalid] + xvalid)
    prediction = model.predict(validData)

    print("Check performance of predictions.")
    performance = model.model_performance(validData)

    print("Check that prediction AUC better than guessing (0.5).")
    assert performance.auc() > 0.5, "predictions should be better than pure chance"
def consistency_check():
    try:
        small = pyunit_utils.locate("h2o-py/demos/citi_bike_small.ipynb")
    except ValueError:
        small = pyunit_utils.locate("h2o-py/demos/citi_bike_small_NOPASS.ipynb")
    try:
        large = pyunit_utils.locate("h2o-py/demos/citi_bike_large.ipynb")
    except ValueError:
        large = pyunit_utils.locate("h2o-py/demos/citi_bike_large_NOPASS.ipynb")

    results_dir = pyunit_utils.locate("results")
    s = os.path.join(results_dir, os.path.basename(small).split('.')[0] + ".py")
    l = os.path.join(results_dir, os.path.basename(large).split('.')[0] + ".py")

    from tests import pydemo_utils
    pydemo_utils.ipy_notebook_exec(small, save_and_norun=s)
    pydemo_utils.ipy_notebook_exec(large, save_and_norun=l)

    small_list = list(open(s, 'r'))
    large_list = list(open(l, 'r'))

    # the only difference allowed between the two demos is the file they import
    for s, l in zip(small_list, large_list):
        if s != l:
            assert s == "data = h2o.import_file(path=small_test)\n" and \
                   l == "data = h2o.import_file(path=large_test)\n", \
                "This difference is not allowed between the small and large citibike demos.\nCitibike small: {0}\n" \
                "Citibike large: {1}".format(s, l)
def test_relevel():
    # First, compare against itself
    print("Importing prostate_cat.csv data...\n")
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"),
                        na_strings=["NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA"])

    mh2o1 = H2OGeneralizedLinearEstimator(family="binomial", Lambda=0, missing_values_handling="Skip")
    mh2o1.train(x=list(range(1, d.ncol)), y=0, training_frame=d)
    ns = mh2o1.coef().keys()
    print(ns)
    assert "DPROS.None" in ns, "None level IS NOT expected to be skipped by default"
    assert "DPROS.Both" not in ns, "Both level IS expected to be skipped by default"

    x = d["DPROS"].relevel("None")
    print(x)
    d["DPROS"] = x[0]
    mh2o2 = H2OGeneralizedLinearEstimator(family="binomial", Lambda=0, missing_values_handling="Skip")
    mh2o2.train(x=list(range(1, d.ncol)), y=0, training_frame=d)
    ns2 = mh2o2.coef().keys()
    print(ns2)
    assert "DPROS.None" in ns2, "None level IS NOT expected to be skipped by default"
    assert "DPROS.Both" not in ns2, "Both level IS expected to be skipped by default"

    # Second, compare against R input (taken from runit_relevel.R)
    dr = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
    dr["DPROS"] = d["DPROS"].relevel("None")
    # Results are from R but manually reordered and renamed to match h2o naming and order
    exp_coefs = {"Intercept": -7.63245, "DPROS.Both": 1.39185, "DPROS.Left": 0.73482, "DPROS.Right": 1.51437,
                 "RACE.White": 0.65160, "DCAPS.Yes": 0.49233, "AGE": -0.01189, "PSA": 0.02990, "VOL": -0.01141,
                 "GLEASON": 0.96466927}
    coeff_diff = {key: abs(exp_coefs[key] - mh2o2.coef().get(key, 0)) for key in exp_coefs.keys()}
    assert max(coeff_diff.values()) < 1e-4
def xgboost_milsongs_gaussian_medium():
    assert H2OXGBoostEstimator.available()

    # Import big dataset to ensure run across multiple nodes
    training_frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    test_frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
    x = list(range(1, training_frame.ncol))
    y = 0

    # Model with maximum of 2 trees
    model_2_trees = H2OXGBoostEstimator(training_frame=training_frame, learn_rate=0.3,
                                        booster='gbtree', seed=1, ntrees=2, distribution='gaussian')
    model_2_trees.train(x=x, y=y, training_frame=training_frame)
    prediction_2_trees = model_2_trees.predict(test_frame)
    assert prediction_2_trees.nrows == test_frame.nrows

    # Model with 10 trees
    model_10_trees = H2OXGBoostEstimator(training_frame=training_frame, learn_rate=0.3,
                                         booster='gbtree', seed=1, ntrees=10, distribution='gaussian')
    model_10_trees.train(x=x, y=y, training_frame=training_frame)
    prediction_10_trees = model_10_trees.predict(test_frame)
    assert prediction_10_trees.nrows == test_frame.nrows

    # Mean square error on model with lower number of decision trees should be higher
    assert model_2_trees.mse() > model_10_trees.mse()
def get_modelKmeans():
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    # Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    # benign_h2o.summary()

    benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"), delimiter=",")
    # Impute missing values with column mean
    imp = Imputer(missing_values="NaN", strategy="mean", axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    for i in range(2, 7):
        # Log.info("H2O K-Means")
        km_h2o = H2OKMeansEstimator(k=i)
        km_h2o.train(x=list(range(benign_h2o.ncol)), training_frame=benign_h2o)
        km_h2o.show()
        model = h2o.get_model(km_h2o._id)
        model.show()

        km_sci = KMeans(n_clusters=i, init="k-means++", n_init=1)
        km_sci.fit(benign_sci)
        print("scikit centers")
        print(km_sci.cluster_centers_)
def benignKmeans():
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    # Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    # benign_h2o.summary()

    benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"), delimiter=",")
    # Impute missing values with column mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    # Log.info(paste("H2O K-Means with ", i, " clusters:\n", sep = ""))
    from h2o.estimators.kmeans import H2OKMeansEstimator

    for i in range(1, 7):
        benign_h2o_km = H2OKMeansEstimator(k=i)
        benign_h2o_km.train(x=list(range(benign_h2o.ncol)), training_frame=benign_h2o)
        print("H2O centers")
        print(benign_h2o_km.centers())

        benign_sci_km = KMeans(n_clusters=i, init='k-means++', n_init=1)
        benign_sci_km.fit(benign_sci)
        print("scikit centers")
        print(benign_sci_km.cluster_centers_)
def iris_h2o_vs_sciKmeans():
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    iris_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))
    iris_sci = np.genfromtxt(pyunit_utils.locate("smalldata/iris/iris.csv"), delimiter=',')
    iris_sci = iris_sci[:, 0:4]

    s = [[4.9, 3.0, 1.4, 0.2],
         [5.6, 2.5, 3.9, 1.1],
         [6.5, 3.0, 5.2, 2.0]]
    start = h2o.H2OFrame(s)

    h2o_km = h2o.kmeans(x=iris_h2o[0:4], k=3, user_points=start, standardize=False)

    sci_km = KMeans(n_clusters=3, init=np.asarray(s), n_init=1)
    sci_km.fit(iris_sci)

    # Log.info("Cluster centers from H2O:")
    print("Cluster centers from H2O:")
    h2o_centers = h2o_km.centers()
    print(h2o_centers)

    # Log.info("Cluster centers from scikit:")
    print("Cluster centers from scikit:")
    sci_centers = sci_km.cluster_centers_.tolist()
    print(sci_centers)

    for hcenter, scenter in zip(h2o_centers, sci_centers):
        for hpoint, spoint in zip(hcenter, scenter):
            assert abs(hpoint - spoint) < 1e-10, "expected centers to be the same"
def table_check():
    df = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate.csv"))

    print(df[['AGE', 'RACE']].table(dense=True).head().as_data_frame(True))
    print(df[['AGE', 'RACE']].table(dense=False).head().as_data_frame(True))
    print(df[['RACE', 'AGE']].table(dense=True).head().as_data_frame(True))
    print(df[['RACE', 'AGE']].table(dense=False).head().as_data_frame(True))

    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))

    # single column (frame)
    table1 = iris["C5"].table()
    assert table1[0, 1] == 50, "Expected 50 of {0}, but got {1}".format(table1[0, 0], table1[0, 1])
    assert table1[1, 1] == 50, "Expected 50 of {0}, but got {1}".format(table1[1, 0], table1[1, 1])
    assert table1[2, 1] == 50, "Expected 50 of {0}, but got {1}".format(table1[2, 0], table1[2, 1])

    # two-column (one argument)
    # dense
    table2 = iris["C1"].table(iris["C5"])
    # not dense
    table3 = iris["C1"].table(iris["C5"], dense=False)
    # check same value
    assert (table3[table3['C1'] == 5, 'Iris-setosa'] ==
            table2[(table2['C1'] == 5) & (table2['C5'] == 'Iris-setosa'), 'Counts']).all()

    assert (table2 == iris[["C1", "C5"]].table()).all()
    assert (table3 == iris[["C1", "C5"]].table(dense=False)).all()

    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    table = cars[2].table().as_data_frame()
    table = dict(table[1:])
    table = {k: int(v) for k, v in list(table.items())}
    expected = Counter(itertools.chain(*cars[2].as_data_frame()[1:]))
    assert table == expected, "Expected {} for table counts but got {}".format(expected, table)
def offsets_and_distributions():
    # cars
    cars = h2o.upload_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    cars = cars[cars["economy_20mpg"].isna() == 0]
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    offset = h2o.H2OFrame.fromPython([[.5] for x in range(398)])
    offset.set_name(0, "x1")
    cars = cars.cbind(offset)

    # insurance
    insurance = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    # bernoulli - offset not supported
    # dl = h2o.deeplearning(x=cars[2:8], y=cars["economy_20mpg"], distribution="bernoulli", offset_column="x1",
    #                       training_frame=cars)
    # predictions = dl.predict(cars)

    # gamma
    dl = h2o.deeplearning(x=insurance[0:3], y=insurance["Claims"], distribution="gamma", offset_column="offset",
                          training_frame=insurance)
    predictions = dl.predict(insurance)

    # gaussian
    dl = h2o.deeplearning(x=insurance[0:3], y=insurance["Claims"], distribution="gaussian", offset_column="offset",
                          training_frame=insurance)
    predictions = dl.predict(insurance)

    # poisson
    dl = h2o.deeplearning(x=insurance[0:3], y=insurance["Claims"], distribution="poisson", offset_column="offset",
                          training_frame=insurance)
    predictions = dl.predict(insurance)

    # tweedie
    dl = h2o.deeplearning(x=insurance.names[0:3], y="Claims", distribution="tweedie", offset_column="offset",
                          training_frame=insurance)
    predictions = dl.predict(insurance)
def col_names_check():
    iris_wheader = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    assert iris_wheader.col_names == ["sepal_len", "sepal_wid", "petal_len", "petal_wid", "class"], \
        "Expected {0} for column names but got {1}".format(["sepal_len", "sepal_wid", "petal_len", "petal_wid", "class"],
                                                            iris_wheader.col_names)

    iris = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris.csv"))
    assert iris.col_names == ["C1", "C2", "C3", "C4", "C5"], \
        "Expected {0} for column names but got {1}".format(["C1", "C2", "C3", "C4", "C5"], iris.col_names)

    df = h2o.H2OFrame.from_python(list(zip(*np.random.randn(100, 4).tolist())),
                                  column_names=list("ABCD"), column_types=["enum"] * 4)
    df.head()
    assert df.col_names == list("ABCD"), "Expected {} for column names but got {}".format(list("ABCD"), df.col_names)
    assert list(df.types.values()) == ["enum"] * 4, \
        "Expected {} for column types but got {}".format(["enum"] * 4, df.types)

    df = h2o.H2OFrame(list(zip(*np.random.randn(100, 4).tolist())))
    df.head()
    assert df.col_names == ["C1", "C2", "C3", "C4"], \
        "Expected {} for column names but got {}".format(["C1", "C2", "C3", "C4"], df.col_names)
    assert list(df.types.values()) == ["real"] * 4, \
        "Expected {} for column types but got {}".format(["real"] * 4, df.types)

    df = h2o.H2OFrame({'B': ['a', 'a', 'b', 'NA', 'NA']})
    df.head()
    assert df.col_names == ["B"], "Expected {} for column names but got {}".format(["B"], df.col_names)

    df = h2o.H2OFrame.from_python({'B': ['a', 'a', 'b', 'NA', 'NA']}, column_names=["X"])
    df.head()
    assert df.col_names == ["X"], "Expected {} for column names but got {}".format(["X"], df.col_names)
def user():
    a = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))[0:4]
    a.head()

    print(a[0].names)         # Column header
    print(a[2, 0])            # column 0, row 2 value
    print(a[2, "sepal_len"])  # Column 0, row 2 value
    (a[0] + 2).show()         # Add 2 to every element; broadcast a constant
    (a[0] + a[1]).show()      # Add 2 columns; broadcast parallel add
    sum(a).show()
    print(a["sepal_len"].mean())

    print()
    print("Rows 50 through 77 in the `sepal_len` column")
    a[50:78, "sepal_len"].show()  # print out rows 50 thru 77 inclusive
    print()

    a["sepal_len"].show()

    print(a[50:78, ["sepal_len", "sepal_wid"]].show())
    a.show()

    print("The column means: ")
    print(a.mean())
    print()

    try:
        print(a["Sepal_len"].dim)  # Error, misspelt column name
    except Exception:
        pass  # Expected error

    b = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))[0:4]
    c = a + b
    d = c + c + sum(a)
    e = c + a + 1
    e.show()
    # Note that "d=c+..." keeps the internal C expressions alive, until "d" goes
    # out of scope even as we nuke "c"
    c.show()
    c = None
    # Internal "ExprNode(c=a+b)" not dead!

    print(1 + (a[0] + b[1]).mean())

    import collections
    c = h2o.H2OFrame(collections.OrderedDict({"A": [1, 2, 3], "B": [4, 5, 6]}))
    c.show()
    c.describe()
    c.head()

    c[0].show()
    print(c[1, 0])
    c[0:2, 0].show()

    sliced = a[0:51, 0]
    sliced.show()
def checkpoint_new_category_in_predictor():
    sv1 = h2o.upload_file(pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv"))
    sv2 = h2o.upload_file(pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv"))
    vir = h2o.upload_file(pyunit_utils.locate("smalldata/iris/virginica.csv"))
    print("checkpoint_new_category_in_predictor-1")

    m1 = H2ODeepLearningEstimator(epochs=100)
    m1.train(x=[0, 1, 2, 4], y=3, training_frame=sv1)

    m2 = H2ODeepLearningEstimator(epochs=200, checkpoint=m1.model_id)
    m2.train(x=[0, 1, 2, 4], y=3, training_frame=sv2)
    print("checkpoint_new_category_in_predictor-2")

    # attempt to continue building model, but with an expanded categorical predictor domain.
    # this should fail
    try:
        m3 = H2ODeepLearningEstimator(epochs=200, checkpoint=m1.model_id)
        m3.train(x=[0, 1, 2, 4], y=3, training_frame=vir)
        assert False, "Expected continued model-building to fail with new categories introduced in predictor"
    except EnvironmentError:
        pass
    print("checkpoint_new_category_in_predictor-3")

    # attempt to predict on new model, but with observations that have expanded categorical predictor domain.
    predictions = m2.predict(vir)
    print("checkpoint_new_category_in_predictor-4")
def stackedensemble_metalearner_seed_test():
    # Import training set
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/higgs_train_5k.csv"),
                            destination_frame="higgs_train_5k")
    test = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/higgs_test_5k.csv"),
                           destination_frame="higgs_test_5k")

    # Identify predictors and response
    x = train.columns
    y = "response"
    x.remove(y)

    # Convert response to a factor
    train[y] = train[y].asfactor()
    test[y] = test[y].asfactor()

    # Set number of folds for base learners
    nfolds = 3

    # Params for the GBM metalearner
    gbm_params = {"sample_rate": 0.3, "col_sample_rate": 0.3}

    # Train and cross-validate a GBM
    my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=10, nfolds=nfolds,
                                          keep_cross_validation_predictions=True, seed=1)
    my_gbm.train(x=x, y=y, training_frame=train)

    # Train and cross-validate a RF
    my_rf = H2ORandomForestEstimator(ntrees=10, nfolds=nfolds, keep_cross_validation_predictions=True, seed=1)
    my_rf.train(x=x, y=y, training_frame=train)

    # Train two SE models with the same metalearner seed
    stack_gbm1 = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf], metalearner_algorithm="gbm",
                                             metalearner_params=gbm_params, seed=55555)
    stack_gbm2 = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf], metalearner_algorithm="gbm",
                                             metalearner_params=gbm_params, seed=55555)
    stack_gbm1.train(x=x, y=y, training_frame=train)
    stack_gbm2.train(x=x, y=y, training_frame=train)
    meta_gbm1 = h2o.get_model(stack_gbm1.metalearner()['name'])
    meta_gbm2 = h2o.get_model(stack_gbm2.metalearner()['name'])

    assert meta_gbm1.rmse(train=True) == meta_gbm2.rmse(train=True), "RMSE should match if same seed"

    # Train two SE models with different metalearner seeds
    stack_gbm3 = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf], metalearner_algorithm="gbm",
                                             metalearner_params=gbm_params, seed=55555)
    stack_gbm4 = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf], metalearner_algorithm="gbm",
                                             metalearner_params=gbm_params, seed=98765)
    stack_gbm3.train(x=x, y=y, training_frame=train)
    stack_gbm4.train(x=x, y=y, training_frame=train)
    meta_gbm3 = h2o.get_model(stack_gbm3.metalearner()['name'])
    meta_gbm4 = h2o.get_model(stack_gbm4.metalearner()['name'])

    assert meta_gbm3.rmse(train=True) != meta_gbm4.rmse(train=True), "RMSE should NOT match if diff seed"
def smallcat_gbm():
    # Training set has 26 categories from A to Z
    # Categories A, C, E, G, ... are perfect predictors of y = 1
    # Categories B, D, F, H, ... are perfect predictors of y = 0
    alphabet = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/alphabet_cattest.csv"))
    alphabet["y"] = alphabet["y"].asfactor()
    # Log.info("Summary of alphabet_cattest.csv from H2O:\n")
    # alphabet.summary()

    # Prepare data for scikit use
    trainData = np.loadtxt(pyunit_utils.locate("smalldata/gbm_test/alphabet_cattest.csv"), delimiter=',', skiprows=1,
                           converters={0: lambda s: ord(s.decode().split("\"")[1])})
    trainDataResponse = trainData[:, 1]
    trainDataFeatures = trainData[:, 0]

    # Train H2O GBM Model:
    gbm_h2o = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=1, max_depth=1, nbins=100)
    gbm_h2o.train(x="X", y="y", training_frame=alphabet)
    gbm_h2o.show()

    # Train scikit GBM Model:
    # Log.info("scikit GBM with same parameters:")
    gbm_sci = ensemble.GradientBoostingClassifier(n_estimators=1, max_depth=1, max_features=None)
    gbm_sci.fit(trainDataFeatures[:, np.newaxis], trainDataResponse)
def glrm_catagorical_bug_fix():
    trainData = h2o.import_file(pyunit_utils.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    testData = h2o.import_file(pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    glrmModel = H2OGeneralizedLowRankEstimator(k=4)
    glrmModel.train(x=trainData.names, training_frame=trainData)
    predV = glrmModel.predict(testData)
    print(predV)
def dim_checks():
    # Log.info("Uploading smalldata/logreg/prostate.csv")
    h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    np_data = np.loadtxt(pyunit_utils.locate("smalldata/logreg/prostate.csv"), delimiter=',', skiprows=1)

    h2o_rows, h2o_cols = h2o_data.dim
    np_rows, np_cols = list(np_data.shape)

    print('The dimensions of h2o frame is: {0} x {1}'.format(h2o_rows, h2o_cols))
    print('The dimensions of numpy array is: {0} x {1}'.format(np_rows, np_cols))

    assert [h2o_rows, h2o_cols] == [np_rows, np_cols], "expected equal number of columns and rows"

    # Log.info("Slice out a column and data frame it, try dim on it...")
    h2o_slice = h2o_data[4]
    np_slice = np_data[:, 4]

    h2o_rows, h2o_cols = h2o_slice.dim
    np_rows = np_slice.shape[0]

    print('The dimensions of h2o column slice is: {0} x {1}'.format(h2o_rows, h2o_cols))
    print('The dimensions of numpy array column slice is: {0} x 1'.format(np_rows))

    assert [h2o_rows, h2o_cols] == [np_rows, 1], "expected equal number of columns and rows"

    # Log.info("OK, now try an operator, e.g. '&', and then check dimensions again...")
    h2oColAmpFive = h2o_slice & 5
    assert h2oColAmpFive.nrow == h2o_rows, "expected the number of rows to remain unchanged"
def link_functions_gaussian():
    print("Read in prostate data.")
    h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))
    h2o_data.head()

    sm_data = pd.read_csv(zipfile.ZipFile(pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))
                          .open("prostate_complete.csv")).as_matrix()
    sm_data_response = sm_data[:, 9]
    sm_data_features = sm_data[:, 1:9]

    print("Testing for family: GAUSSIAN")
    print("Set variables for h2o.")
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    print("Create models with canonical link: IDENTITY")
    h2o_model = H2OGeneralizedLinearEstimator(family="gaussian", link="identity", alpha=0.5, Lambda=0)
    h2o_model.train(x=myX, y=myY, training_frame=h2o_data)
    sm_model = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                      family=sm.families.Gaussian(sm.families.links.identity)).fit()

    print("Compare model deviances for link function identity")
    h2o_deviance = old_div(h2o_model.residual_deviance(), h2o_model.null_deviance())
    sm_deviance = old_div(sm_model.deviance, sm_model.null_deviance)
    assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measure"
def xgboost_insurance_gaussian_small():
    assert H2OXGBoostEstimator.available()

    # Import the small insurance dataset
    training_frame = h2o.import_file(pyunit_utils.locate("smalldata/testng/insurance_train1.csv"))
    test_frame = h2o.import_file(pyunit_utils.locate("smalldata/testng/insurance_validation1.csv"))
    x = ['Age', 'District']
    y = 'Claims'

    # Model with maximum of 2 trees
    model_2_trees = H2OXGBoostEstimator(training_frame=training_frame, learn_rate=0.7,
                                        booster='gbtree', seed=1, ntrees=2, distribution='gaussian')
    model_2_trees.train(x=x, y=y, training_frame=training_frame)
    prediction_2_trees = model_2_trees.predict(test_frame)
    assert prediction_2_trees.nrows == test_frame.nrows

    # Model with 10 trees
    model_10_trees = H2OXGBoostEstimator(training_frame=training_frame, learn_rate=0.7,
                                         booster='gbtree', seed=1, ntrees=10, distribution='gaussian')
    model_10_trees.train(x=x, y=y, training_frame=training_frame)
    prediction_10_trees = model_10_trees.predict(test_frame)
    assert prediction_10_trees.nrows == test_frame.nrows

    # Mean square error on model with lower number of decision trees should be higher
    assert model_2_trees.mse() > model_10_trees.mse()
def plot_test():
    kwargs = {}
    kwargs['server'] = True

    air = h2o.import_file(pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))

    # Constructing test and train sets by sampling (20/80)
    s = air[0].runif()
    air_train = air[s <= 0.8]
    air_valid = air[s > 0.8]

    myX = ["Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth", "fDayOfWeek"]
    myY = "IsDepDelayed"

    air_gbm = h2o.gbm(x=air_train[myX], y=air_train[myY], validation_x=air_valid[myX], validation_y=air_valid[myY],
                      distribution="bernoulli", ntrees=100, max_depth=3, learn_rate=0.01)

    # Plot ROC for training and validation sets
    air_gbm.plot(type="roc", train=True, **kwargs)
    air_gbm.plot(type="roc", valid=True, **kwargs)

    air_test = h2o.import_file(pyunit_utils.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    perf = air_gbm.model_performance(air_test)

    # Plot ROC for test set
    perf.plot(type="roc", **kwargs)
def anomaly():
    print("Deep Learning Anomaly Detection MNIST")

    train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/mnist/train.csv.gz"))
    test = h2o.import_file(pyunit_utils.locate("bigdata/laptop/mnist/test.csv.gz"))

    predictors = list(range(0, 784))
    resp = 784

    # unsupervised -> drop the response column (digit: 0-9)
    train = train[predictors]
    test = test[predictors]

    # 1) LEARN WHAT'S NORMAL
    # train unsupervised Deep Learning autoencoder model on train_hex
    ae_model = H2OAutoEncoderEstimator(activation="Tanh", hidden=[2], l1=1e-5, ignore_const_cols=False, epochs=1)
    ae_model.train(x=predictors, training_frame=train)

    # 2) DETECT OUTLIERS
    # anomaly app computes the per-row reconstruction error for the test data set
    # (passing it through the autoencoder model and computing mean square error (MSE) for each row)
    test_rec_error = ae_model.anomaly(test)

    # 3) VISUALIZE OUTLIERS
    # Let's look at the test set points with low/median/high reconstruction errors.
    # We will now visualize the original test set points and their reconstructions obtained
    # by propagating them through the narrow neural net.

    # Convert the test data into its autoencoded representation (pass through narrow neural net)
    test_recon = ae_model.predict(test)
def fiftycatGBM():
    # Training set has only 45 categories cat1 through cat45
    # Log.info("Importing 50_cattest_train.csv data...\n")
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/50_cattest_train.csv"))
    train["y"] = train["y"].asfactor()

    # Log.info("Summary of 50_cattest_train.csv from H2O:\n")
    # train.summary()

    # Train H2O GBM Model:
    # Log.info("H2O GBM with parameters: ntrees = 10, max_depth = 5, nbins = 20\n")
    model = h2o.gbm(x=train[["x1", "x2"]], y=train["y"], distribution="bernoulli", ntrees=10, max_depth=5, nbins=20)
    model.show()

    # Test dataset has all 50 categories cat1 through cat50
    # Log.info("Importing 50_cattest_test.csv data...\n")
    test = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/50_cattest_test.csv"))
    # Log.info("Summary of 50_cattest_test.csv from H2O:\n")
    # test.summary()

    # Predict on test dataset with GBM model:
    # Log.info("Performing predictions on test dataset...\n")
    predictions = model.predict(test)
    predictions.show()

    # Get the confusion matrix and AUC
    # Log.info("Confusion matrix of predictions (max accuracy):\n")
    performance = model.model_performance(test)
    test_cm = performance.confusion_matrix()
    test_auc = performance.auc()
def shuffling_large():
    print("Reading in Arcene training data for binomial modeling.")
    train_data = h2o.upload_file(path=pyunit_utils.locate("smalldata/arcene/shuffle_test_version/arcene.csv"))
    train_data_shuffled = h2o.upload_file(path=pyunit_utils.locate("smalldata/arcene/shuffle_test_version/arcene_shuffled.csv"))

    print("Create model on original Arcene dataset.")
    h2o_model = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True, alpha=0.5)
    h2o_model.train(x=list(range(1000)), y=1000, training_frame=train_data)

    print("Create second model on original Arcene dataset.")
    h2o_model_2 = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True, alpha=0.5)
    h2o_model_2.train(x=list(range(1000)), y=1000, training_frame=train_data)

    print("Create model on shuffled Arcene dataset.")
    h2o_model_s = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True, alpha=0.5)
    h2o_model_s.train(x=list(range(1000)), y=1000, training_frame=train_data_shuffled)

    print("Assert that number of predictors remaining and their respective coefficients are equal.")
    for x, y in zip(h2o_model._model_json['output']['coefficients_table'].cell_values,
                    h2o_model_2._model_json['output']['coefficients_table'].cell_values):
        assert (type(x[1]) == type(y[1])) and (type(x[2]) == type(y[2])), "coefficients should be the same type"
        if isinstance(x[1], float):
            assert abs(x[1] - y[1]) < 5e-10, "coefficients should be equal"
        if isinstance(x[2], float):
            assert abs(x[2] - y[2]) < 5e-10, "coefficients should be equal"

    for x, y in zip(h2o_model._model_json['output']['coefficients_table'].cell_values,
                    h2o_model_s._model_json['output']['coefficients_table'].cell_values):
        assert (type(x[1]) == type(y[1])) and (type(x[2]) == type(y[2])), "coefficients should be the same type"
        if isinstance(x[1], float):
            assert abs(x[1] - y[1]) < 5e-10, "coefficients should be equal"
        if isinstance(x[2], float):
            assert abs(x[2] - y[2]) < 5e-10, "coefficients should be equal"
def minmax_basic():
    print("Uploading iris data...")
    iris_h2o = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    iris_np = np.genfromtxt(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"), delimiter=",", skip_header=1)

    print("Computing min & max of the first column of iris...")
    iris1_min = iris_h2o[0].min()
    print("Minimum: {0}".format(iris1_min))
    iris1_max = iris_h2o[0].max()
    print("Maximum: {0}".format(iris1_max))
    np_min = iris_np[:, 0].min()
    np_max = iris_np[:, 0].max()
    assert iris1_min == np_min, "Expected the same min value. H2O got {0}, but numpy got {1}".format(iris1_min, np_min)
    assert iris1_max == np_max, "Expected the same max value. H2O got {0}, but numpy got {1}".format(iris1_max, np_max)

    print("Computing min & max of all numeric columns of iris...")
    irisall_min = iris_h2o[0:4].min()
    print("Minimum: {0}".format(irisall_min))
    irisall_max = iris_h2o[0:4].max()
    print("Maximum: {0}".format(irisall_max))
    np_min = iris_np[:, 0:4].min()
    np_max = iris_np[:, 0:4].max()
    assert irisall_min == np_min, "Expected the same min value. H2O got {0}, but numpy got {1}".format(irisall_min, np_min)
    assert irisall_max == np_max, "Expected the same max value. H2O got {0}, but numpy got {1}".format(irisall_max, np_max)

    print("min and max correctness")
    data = [1, -0.1, 0]
    mn = min(data)
    mx = max(data)
    h2o_min = h2o.H2OFrame(data).min()
    h2o_max = h2o.H2OFrame(data).max()
    assert h2o_min == mn, "Expected the same min value. H2O got {0}, but python got {1}".format(h2o_min, mn)
    assert h2o_max == mx, "Expected the same max value. H2O got {0}, but python got {1}".format(h2o_max, mx)
def export_file():
    pros_hex = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    pros_hex[1] = pros_hex[1].asfactor()
    pros_hex[3] = pros_hex[3].asfactor()
    pros_hex[4] = pros_hex[4].asfactor()
    pros_hex[5] = pros_hex[5].asfactor()
    pros_hex[8] = pros_hex[8].asfactor()

    p_sid = pros_hex.runif()
    pros_train = pros_hex[p_sid > 0.2, :]
    pros_test = pros_hex[p_sid <= 0.2, :]

    glm = H2OGeneralizedLinearEstimator(family="binomial")
    myglm = glm.train(x=list(range(2, pros_hex.ncol)), y=1, training_frame=pros_train)
    mypred = glm.predict(pros_test)

    def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
        return "".join(random.choice(chars) for _ in range(size))

    fname = id_generator() + "_prediction.csv"

    path = pyunit_utils.locate("results")
    dname = path + "/" + fname

    h2o.export_file(mypred, dname)

    py_pred = pd.read_csv(dname)
    print(py_pred.head())
    h_pred = mypred.as_data_frame(True)
    print(h_pred.head())

    # Test to check if py_pred & h_pred are identical
    assert_frame_equal(py_pred, h_pred)
def wide_dataset_large():
    print("Reading in Arcene training data for binomial modeling.")
    trainDataResponse = np.genfromtxt(pyunit_utils.locate("smalldata/arcene/arcene_train_labels.labels"), delimiter=' ')
    trainDataResponse = np.where(trainDataResponse == -1, 0, 1)
    trainDataFeatures = np.genfromtxt(pyunit_utils.locate("smalldata/arcene/arcene_train.data"), delimiter=' ')
    trainData = h2o.H2OFrame(np.column_stack((trainDataResponse, trainDataFeatures)).tolist())

    print("Run model on 3250 columns of Arcene with strong rules off.")
    model = h2o.glm(x=trainData[1:3250], y=trainData[0].asfactor(), family="binomial", lambda_search=False, alpha=[1])

    print("Test model on validation set.")
    validDataResponse = np.genfromtxt(pyunit_utils.locate("smalldata/arcene/arcene_valid_labels.labels"), delimiter=' ')
    validDataResponse = np.where(validDataResponse == -1, 0, 1)
    validDataFeatures = np.genfromtxt(pyunit_utils.locate("smalldata/arcene/arcene_valid.data"), delimiter=' ')
    validData = h2o.H2OFrame(np.column_stack((validDataResponse, validDataFeatures)).tolist())
    prediction = model.predict(validData)

    print("Check performance of predictions.")
    performance = model.model_performance(validData)

    print("Check that prediction AUC better than guessing (0.5).")
    assert performance.auc() > 0.5, "predictions should be better than pure chance"
def bigcat_gbm():
    covtype = h2o.import_file(path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()
    covtypeTest = h2o.import_file(path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    covtypeTest[54] = covtype[54].asfactor()

    regular = H2OGradientBoostingEstimator(ntrees=10, seed=1234)
    regular.train(x=list(range(54)), y=54, training_frame=covtype)

    # do prediction on original dataset, no warnings
    check_warnings(regular, 0, covtypeTest)

    # drop response, no warnings
    covtypeTest = covtypeTest.drop(54)
    check_warnings(regular, 0, covtypeTest)

    # drop two predictor columns, expect two warnings
    covtypeTest = covtypeTest.drop(1)
    covtypeTest = covtypeTest.drop(1)
    check_warnings(regular, 2, covtypeTest)

    # reload the test frame, drop three predictor columns, expect three warnings
    covtypeTest = h2o.import_file(path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    covtypeTest[54] = covtype[54].asfactor()
    covtypeTest = covtypeTest.drop(3)
    covtypeTest = covtypeTest.drop(5)
    covtypeTest = covtypeTest.drop(7)
    check_warnings(regular, 3, covtypeTest)
def import_multi():
    airlines = h2o.import_file(path=[
        pyunit_utils.locate("smalldata/testng/airlines_train.csv"),
        pyunit_utils.locate("smalldata/testng/airlines_test.csv")
    ])
    assert airlines.nrows == 24421 + 2691
def link_functions_binomial():
    print("Read in prostate data.")
    h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))
    h2o_data.head()

    sm_data = pd.read_csv(zipfile.ZipFile(pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))
                          .open("prostate_complete.csv")).as_matrix()
    sm_data_response = sm_data[:, 2]
    sm_data_features = sm_data[:, [1, 3, 4, 5, 6, 7, 8, 9]]

    print("Testing for family: BINOMIAL")
    print("Set variables for h2o.")
    myY = "CAPSULE"
    myX = ["ID", "AGE", "RACE", "GLEASON", "DCAPS", "PSA", "VOL", "DPROS"]

    print("Create models with canonical link: LOGIT")
    h2o_model = h2o.glm(x=h2o_data[myX], y=h2o_data[myY].asfactor(), family="binomial", link="logit",
                        alpha=[0.5], Lambda=[0])
    sm_model = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                      family=sm.families.Binomial(sm.families.links.logit)).fit()

    print("Compare model deviances for link function logit")
    h2o_deviance = h2o_model.residual_deviance() / h2o_model.null_deviance()
    sm_deviance = sm_model.deviance / sm_model.null_deviance
    assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measure"
def glm_alpha_arrays_null_lambda_cv():
    print("Testing glm cross-validation with alpha array, default lambda values for binomial models.")
    h2o_data = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    enum_columns = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]
    for cname in enum_columns:
        h2o_data[cname] = h2o_data[cname].asfactor()
    myY = "C21"
    h2o_data["C21"] = h2o_data["C21"].asfactor()
    myX = h2o_data.names
    myX.remove(myY)

    data_frames = h2o_data.split_frame(ratios=[0.8])
    training_data = data_frames[0]
    test_data = data_frames[1]

    # build model with CV but no validation dataset
    cv_model = glm(family='binomial', alpha=[0.1, 0.5, 0.9], nfolds=3)
    cv_model.train(training_frame=training_data, x=myX, y=myY)
    cv_r = glm.getGLMRegularizationPath(cv_model)

    # build model with CV and with validation dataset
    cv_model_valid = glm(family='binomial', alpha=[0.1, 0.5, 0.9], nfolds=3)
    cv_model_valid.train(training_frame=training_data, validation_frame=test_data, x=myX, y=myY)
    cv_r_valid = glm.getGLMRegularizationPath(cv_model_valid)

    for l in range(0, len(cv_r['lambdas'])):
        print("comparing coefficients for submodel {0}".format(l))
        pyunit_utils.assertEqualCoeffDicts(cv_r['coefficients'][l], cv_r_valid['coefficients'][l], tol=1e-6)
        pyunit_utils.assertEqualCoeffDicts(cv_r['coefficients_std'][l], cv_r_valid['coefficients_std'][l], tol=1e-6)
def hexdev_394():
    path = pyunit_utils.locate("smalldata/covtype/covtype.20k.data")
    c_types = [None] * 55
    c_types[10] = "enum"
    c_types[11] = "enum"
    c_types[12] = "enum"
    train = h2o.import_file(path, col_types=c_types)
    cols = train.col_names  # This returned space for first column name
    x_cols = [colname for colname in cols if colname != "C55"]

    splits = train.split_frame()
    newtrain = splits[0]
    newvalid = splits[1]
    newtrain[54] = newtrain[54].asfactor()
    newvalid[54] = newvalid[54].asfactor()

    my_gbm = H2OGradientBoostingEstimator(distribution="multinomial", ntrees=100, learn_rate=0.1, max_depth=6)
    my_gbm.train(x=x_cols, y=54, training_frame=newtrain, validation_frame=newvalid)

    split1, split2 = train.split_frame()
    split1[54] = split1[54].asfactor()
    split2[54] = split2[54].asfactor()

    my_gbm = H2OGradientBoostingEstimator(distribution="multinomial", ntrees=100, learn_rate=0.1, max_depth=6)
    my_gbm.train(x=x_cols, y=54, training_frame=split1, validation_frame=split2)
def test_interaction_constraints():
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    prostate.describe()
    prostate[1] = prostate[1].asfactor()

    constraints = [["AGE", "PSA"], ["GLEASON"]]
    ntrees = 5
    prostate_gbm = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=ntrees,
                                                interaction_constraints=constraints, seed=42)
    prostate_gbm.train(x=list(range(2, 9)), y=1, training_frame=prostate)
    prostate_gbm.predict(prostate)

    importance = prostate_gbm.varimp(use_pandas=True)
    print(importance)

    # variables RACE, DPROS, DCAPS, VOL should have zero importance
    assert importance["variable"][3] == "RACE"
    assert importance["relative_importance"][3] == 0
    assert importance["variable"][4] == "DPROS"
    assert importance["relative_importance"][4] == 0
    assert importance["variable"][5] == "DCAPS"
    assert importance["relative_importance"][5] == 0
    assert importance["variable"][6] == "VOL"
    assert importance["relative_importance"][6] == 0

    # check that each tree uses features from only one constraint set
    for i in range(ntrees):
        tree = H2OTree(model=prostate_gbm, tree_number=i)
        tree_features = set(filter(None, tree.features))
        print("iteration: " + str(i))
        print(set(constraints[0]))
        print(set(constraints[1]))
        print(tree_features)
        assert tree_features.issubset(set(constraints[0])) or tree_features.issubset(set(constraints[1]))
def test_gridsearch():
    h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gam_test/synthetic_20Cols_binomial_20KRows.csv"))
    h2o_data['response'] = h2o_data['response'].asfactor()
    h2o_data['C3'] = h2o_data['C3'].asfactor()
    h2o_data['C7'] = h2o_data['C7'].asfactor()
    h2o_data['C8'] = h2o_data['C8'].asfactor()
    h2o_data['C10'] = h2o_data['C10'].asfactor()
    myY = "response"
    myX = h2o_data.names
    myX.remove(myY)
    search_criteria = {'strategy': 'Cartesian'}
    hyper_parameters = {'lambda': [1, 2],
                        'subspaces': [{'scale': [[0.001], [0.0002]],
                                       'num_knots': [[5], [10]],
                                       'bs': [[1], [0]],
                                       'gam_columns': [[["c_0"]], [["c_1"]]]},
                                      {'scale': [[0.001, 0.001, 0.001], [0.0002, 0.0002, 0.0002]],
                                       'bs': [[1, 1, 1], [0, 1, 1]],
                                       'num_knots': [[5, 10, 12], [6, 11, 13]],
                                       'gam_columns': [[["c_0"], ["c_1", "c_2"], ["c_3", "c_4", "c_5"]],
                                                       [["c_1"], ["c_2", "c_3"], ["c_4", "c_5", "c_6"]]]}]}
    hyper_parameters2 = {'lambda': [1, 2],
                         'subspaces': [{'scale': [[0.001], [0.0002]],
                                        'num_knots': [[5], [10]],
                                        'bs': [[1], [0]],
                                        'gam_columns': [[["c_0"]], [["c_1"]]]},
                                       {'scale': [[0.001, 0.001, 0.001], [0.0002, 0.0002, 0.0002]],
                                        'bs': [[1, 1, 1], [0, 1, 1]],
                                        'num_knots': [[5, 10, 12], [6, 11, 13]],
                                        'gam_columns': [["c_0", ["c_1", "c_2"], ["c_3", "c_4", "c_5"]],
                                                        ["c_1", ["c_2", "c_3"], ["c_4", "c_5", "c_6"]]]}]}
    h2o_model = H2OGridSearch(H2OGeneralizedAdditiveEstimator(family="binomial", keep_gam_cols=True, seed=1),
                              hyper_params=hyper_parameters, search_criteria=search_criteria)
    h2o_model.train(x=myX, y=myY, training_frame=h2o_data)
    h2o_model2 = H2OGridSearch(H2OGeneralizedAdditiveEstimator(family="binomial", keep_gam_cols=True, seed=1),
                               hyper_params=hyper_parameters2, search_criteria=search_criteria)
    h2o_model2.train(x=myX, y=myY, training_frame=h2o_data)

    # compare the two grids by checking their coefficients.  They should be the same
    for index in range(0, len(h2o_model)):
        model1 = h2o_model[index]
        model2 = h2o_model2[index]
        pyunit_utils.assertEqualCoeffDicts(model1.coef(), model2.coef(), tol=1e-6)
def cv_nfolds_gbm():
    loan_data = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/lending-club/loan.csv"))
    loan_data["bad_loan"] = loan_data["bad_loan"].asfactor()

    try:
        # parallel main model building cannot be used when we use best CV iterations right now
        set_best_cv(False)

        model_default = H2OGradientBoostingEstimator(nfolds=5, distribution="bernoulli", ntrees=500,
                                                     score_tree_interval=3, stopping_rounds=2, seed=42)
        try:
            set_parallel(True)
            model_default.train(y="bad_loan", training_frame=loan_data)
        finally:
            set_parallel(False)
        preds_default = model_default.predict(loan_data)

        model_sequential = H2OGradientBoostingEstimator(nfolds=5, distribution="bernoulli", ntrees=500,
                                                        score_tree_interval=3, stopping_rounds=2, seed=42)
        model_sequential.train(y="bad_loan", training_frame=loan_data)
        preds_sequential = model_sequential.predict(loan_data)

        assert model_default.actual_params["ntrees"] == model_sequential.actual_params["ntrees"]
        pyunit_utils.compare_frames_local(preds_default, preds_sequential, prob=1.0)
    finally:
        set_best_cv(True)
def h2ogrid_checkpoints():
    """
    Python API test: H2OGridSearch with export_checkpoints_dir

    Copy from pyunit_gbm_random_grid.py
    """
    air_hex = h2o.import_file(path=pyunit_utils.locate("smalldata/airlines/allyears2k_headers.zip"),
                              destination_frame="air.hex")
    myX = ["DayofMonth", "DayOfWeek"]

    hyper_parameters = {
        'ntrees': [5, 10]
    }

    search_crit = {'strategy': "RandomDiscrete",
                   'max_models': 5,
                   'seed': 1234,
                   'stopping_rounds': 2,
                   'stopping_metric': "AUTO",
                   'stopping_tolerance': 1e-2}

    checkpoints_dir = tempfile.mkdtemp()

    air_grid = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_parameters, search_criteria=search_crit)
    air_grid.train(x=myX, y="IsDepDelayed", training_frame=air_hex, distribution="bernoulli",
                   learn_rate=0.1, max_depth=3, nfolds=3, export_checkpoints_dir=checkpoints_dir)

    checkpoint_files = listdir(checkpoints_dir)
    print(checkpoint_files)
    num_files = len(checkpoint_files)
    shutil.rmtree(checkpoints_dir)

    assert_is_type(air_grid, H2OGridSearch)
    # 1 grid file + (1 main model + 3 CV models) for each of the 2 models built
    assert num_files == 1 + (2 * (1 + 3)), "Unexpected number of checkpoint files"
    assert all(model in checkpoint_files for model in air_grid.get_grid().model_ids), \
        "Some models do not have corresponding checkpoints"
def iris_dl_grid():
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # Run DL
    hidden_opts = [[20, 20], [50, 50, 50]]
    loss_opts = ["Quadratic", "CrossEntropy"]
    size_of_hyper_space = len(hidden_opts) * len(loss_opts)
    hyper_parameters = {"hidden": hidden_opts, "loss": loss_opts}
    print("DL grid with the following hyper_parameters:", hyper_parameters)

    gs = H2OGridSearch(H2ODeepLearningEstimator, hyper_params=hyper_parameters)
    gs.train(x=list(range(4)), y=4, training_frame=train)
    print(gs.sort_by("mse"))

    assert len(gs) == size_of_hyper_space
    total_grid_space = list(map(list, itertools.product(*hyper_parameters.values())))
    for model in gs.models:
        combo = [model.parms['loss']['actual_value']] + [model.parms['hidden']['actual_value']]
        assert combo in total_grid_space
        total_grid_space.remove(combo)
def pubdev_random_cv():
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    response_col = "economy"
    distribution = "gaussian"
    predictors = ["displacement", "power", "weight", "acceleration", "year"]

    gbm1 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=3, distribution=distribution,
                   fold_assignment="Random")
    gbm2 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=3, distribution=distribution,
                   fold_assignment="Random")

    mse1 = gbm1.mse(xval=True)
    mse2 = gbm2.mse(xval=True)
    assert mse1 != mse2, "The first model has an MSE of {0} and the second model has an MSE of {1}. Expected the " \
                         "first to be different from the second.".format(mse1, mse2)
def h2oimport_file():
    """
    Python API test: h2o.import_file(path=None, destination_frame=None, parse=True, header=0, sep=None,
    col_names=None, col_types=None, na_strings=None)
    """
    try:
        col_types = ['enum', 'numeric', 'enum', 'enum', 'enum', 'numeric', 'numeric', 'numeric']
        col_headers = ["CAPSULE", "AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]
        hex_key = "training_data.hex"
        training_data = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"),
                                        destination_frame=hex_key, header=1, sep=',',
                                        col_names=col_headers, col_types=col_types, na_strings=["NA"])
        assert_is_type(training_data, H2OFrame)
        assert training_data.frame_id == hex_key, \
            "frame_id was not assigned correctly. h2o.import_file() is not working."
        assert len(set(training_data.col_names) & set(col_headers)) == len(col_headers), \
            "column names are incorrect. h2o.import_file() not working."
        assert training_data.nrow == 380, "number of rows is incorrect. h2o.import_file() is not working."
        assert training_data.ncol == 8, "number of columns is incorrect. h2o.import_file() is not working."
        assert sum(training_data.nacnt()) == 3, "NA count is incorrect. h2o.import_file() is not working."
    except Exception as e:
        assert False, "h2o.import_file() command is not working."
def test_saved_binary_model_produces_same_predictions_as_original():
    ds = prepare_data(blending)
    base_models = train_base_models(ds)
    se_model = train_stacked_ensemble(ds, base_models)

    # Predict in ensemble in Py client
    preds_py = se_model.predict(ds.test)

    tmp_dir = tempfile.mkdtemp()
    try:
        bin_file = h2o.save_model(se_model, tmp_dir)
        # Load binary model and predict
        bin_model = h2o.load_model(pu.locate(bin_file))
        preds_bin = bin_model.predict(ds.test)
    finally:
        shutil.rmtree(tmp_dir)

    # Predictions from model in Py and binary model should be the same
    pred_diff = preds_bin - preds_py
    assert pred_diff["p0"].max() < 1e-11
    assert pred_diff["p1"].max() < 1e-11
    assert pred_diff["p0"].min() > -1e-11
    assert pred_diff["p1"].min() > -1e-11
def cv_nfolds_gbm():
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    prostate[1] = prostate[1].asfactor()
    prostate.summary()

    from h2o.estimators.gbm import H2OGradientBoostingEstimator
    prostate_gbm = H2OGradientBoostingEstimator(nfolds=5, distribution="bernoulli")
    prostate_gbm.train(x=range(2, 9), y=1, training_frame=prostate)
    prostate_gbm.show()

    # Can specify both nfolds >= 2 and validation data at once
    try:
        H2OGradientBoostingEstimator(nfolds=5, distribution="bernoulli").train(x=range(2, 9), y=1,
                                                                               training_frame=prostate,
                                                                               validation_frame=prostate)
        assert True
    except EnvironmentError:
        assert False, "did not expect an error: nfolds and a validation frame can be specified together"
def pubdev_5167():
    training_data = h2o.import_file(pyunit_utils.locate("bigdata/laptop/airlines_all.05p.csv"))
    if 'IsDepDelayed' in training_data.names:
        training_data['IsDepDelayed'] = training_data['IsDepDelayed'].asfactor()
    else:
        raise AttributeError("label {0} not found".format('IsDepDelayed'))

    estimator = h2o.estimators.deeplearning.H2ODeepLearningEstimator(hidden=[50, 50, 50, 50, 50],
                                                                     activation='rectifier',
                                                                     adaptive_rate=True,
                                                                     balance_classes=True,
                                                                     epochs=50,
                                                                     shuffle_training_data=True,
                                                                     score_each_iteration=True,
                                                                     stopping_metric='auc',
                                                                     stopping_rounds=5,
                                                                     stopping_tolerance=.01,
                                                                     use_all_factor_levels=False,
                                                                     variable_importances=False,
                                                                     export_weights_and_biases=True,
                                                                     seed=200)
    estimator.train(x=training_data.names[:-1], y=training_data.names[-1], training_frame=training_data)
def mojo_model_glm_test():
    # GLM
    airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    glm = H2OGeneralizedLinearEstimator()
    glm.train(x=["Origin", "Dest"], y="Distance", training_frame=airlines)

    original_model_filename = tempfile.mkdtemp()
    original_model_filename = glm.download_mojo(original_model_filename)

    model = H2OGenericEstimator.from_file(original_model_filename)
    assert model is not None
    predictions = model.predict(airlines)
    assert predictions is not None
    assert predictions.nrows == 24421
    assert model._model_json["output"]["model_summary"] is not None
    assert len(model._model_json["output"]["model_summary"]._cell_values) > 0

    generic_mojo_filename = tempfile.mkdtemp("zip", "genericMojo")
    generic_mojo_filename = model.download_mojo(path=generic_mojo_filename)
    assert os.path.getsize(generic_mojo_filename) == os.path.getsize(original_model_filename)
def test_target_encoding_fit_method():
    print("Check fit method of the TargetEncoder class")
    targetColumnName = "survived"
    foldColumnName = "kfold_column"  # it is strange that we can't set name for generated kfold

    teColumns = ["home.dest", "cabin", "embarked"]
    targetEncoder = TargetEncoder(x=teColumns, y=targetColumnName, fold_column=foldColumnName,
                                  blended_avg=True, inflection_point=3, smoothing=1)
    trainingFrame = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)

    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor()
    trainingFrame[foldColumnName] = trainingFrame.kfold_column(n_folds=5, seed=1234)

    encodingMap = targetEncoder.fit(frame=trainingFrame)

    assert encodingMap.map_keys['string'] == teColumns
    assert encodingMap.frames[0]['num_rows'] == 583
def pubdev_5174():
    x = h2o.import_file(pyunit_utils.locate('smalldata/jira/PUBDEV-5174.csv'), header=1)
    tt = x['rr'].unique()
    gg = tt[:10000, 0]
    ww = x[~x['rr'].isin(gg['C1'].as_data_frame()['C1'].tolist())]
    print(x.nrow)
    print(tt.nrow)
    print(ww.nrow)
    assert x.nrow == 1000000, "Original data has 1000000 rows"
    assert tt.nrow == 499851, "Column rr has 499851 unique values"
    assert ww.nrow == 979992, "Original data reduced has 979992 rows"

    # What do we do with Tuples?
    # there are 2 instances of 'cTeYX' and 2 of 'Todxf'
    tup = ('cTeYX', 'Todxf')
    ww_tuple = x[~x['rr'].isin(tup)]
    assert ww_tuple.nrow == 999996, "Original data reduced has 999996 rows"
def test_infogram_personal_loan_cv_valid():
    """
    Make sure safe infogram plot works with cv and validation dataset.
    """
    fr = h2o.import_file(path=pyunit_utils.locate("smalldata/admissibleml_test/Bank_Personal_Loan_Modelling.csv"))
    target = "Personal Loan"
    fr[target] = fr[target].asfactor()
    x = ["Experience", "Income", "Family", "CCAvg", "Education", "Mortgage",
         "Securities Account", "CD Account", "Online", "CreditCard"]

    splits = fr.split_frame(ratios=[0.80])
    train = splits[0]
    test = splits[1]

    infogram_model_cv_v = H2OInfogram(seed=12345, protected_columns=["Age", "ZIP Code"], nfolds=5)
    infogram_model_cv_v.train(x=x, y=target, training_frame=train, validation_frame=test)  # cross-validation, validation

    # plot infogram from training dataset
    infogram_model_cv_v.plot(title="Infogram calculated from training dataset", server=True)
    # plot infogram from validation dataset
    infogram_model_cv_v.plot(train=True, valid=True, title="Infogram calculated from training/validation dataset",
                             server=True)
    # plot infogram from cv holdout dataset
    infogram_model_cv_v.plot(train=True, valid=True, xval=True,
                             title="Infogram calculated from training/validation/xval holdout dataset", server=True)

    relcmi_train = infogram_model_cv_v.get_admissible_score_frame()
    relcmi_valid = infogram_model_cv_v.get_admissible_score_frame(valid=True)
    assert relcmi_train.nrow == relcmi_valid.nrow
def setup_data(self):
    """
    This function performs all initializations necessary:
    load the data sets and set the training set indices and response column index
    """
    # create and clean out the sandbox directory first
    self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, True)

    # preload datasets
    self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filename))

    # set data set indices for predictors and response
    self.y_index = self.training1_data.ncol - 1
    self.x_indices = list(range(self.y_index))

    # save the training data files just in case the code crashed.
    pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy', new_dir_path=self.sandbox_dir)
def pca_prostate():
    print("Importing prostate.csv data...\n")
    prostate = h2o.upload_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    print("Converting CAPSULE, RACE, DPROS and DCAPS columns to factors")
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate["RACE"] = prostate["RACE"].asfactor()
    prostate["DPROS"] = prostate["DPROS"].asfactor()
    prostate["DCAPS"] = prostate["DCAPS"].asfactor()
    prostate.describe()

    print("PCA on columns 3 to 9 with k = 3, transform = 'NONE', pca_method = 'Power'")
    fitPCA = H2OPCA(k=3, transform="NONE", pca_method="Power")
    fitPCA.train(x=list(range(2, 9)), training_frame=prostate)
    pred = fitPCA.predict(prostate)

    print("Projection matrix:\n")
    pred.head()
def perfect_separation_balanced():
    print("Read in synthetic balanced dataset")
    data = h2o.import_file(path=pyunit_utils.locate("smalldata/synthetic_perfect_separation/balanced.csv"))

    print("Fit model on dataset")
    model = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True, alpha=0.5, Lambda=1e-8)
    model.train(x=["x1", "x2"], y="y", training_frame=data)

    print("Extract models' coefficients and assert reasonable values (i.e. no greater than 50)")
    print("Balanced dataset")
    coef = [c[1] for c in model._model_json['output']['coefficients_table'].cell_values if c[0] != "Intercept"]
    for c in coef:
        assert c < 50, "coefficient is too large"
def group_by():
    '''
    This test checks that if a groupby operation is specified for frames with string columns, a warning is
    generated about the string columns being skipped.  In addition, it checks that operations on numeric/enum
    columns are performed and generate the correct expected outputs.
    '''
    # Connect to a pre-existing cluster
    buffer = StringIO()  # redirect output
    sys.stderr = buffer

    h2o_f1 = h2o.import_file(path=pyunit_utils.locate("smalldata/jira/test_groupby_with_strings.csv"),
                             col_types=['real', 'string', 'string', 'real'])
    grouped = h2o_f1.group_by("C1")
    grouped.mean(na="all").median(na="all").max(na="all").min(na="all").sum(na="all")
    print(grouped.get_frame())

    print("Checking number of warning messages...")
    check_warnings(2, buffer)  # make sure we received two warnings, one per string column
def glrm_arrests():
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrestsH2O.describe()

    print("H2O initial Y matrix:\n")
    initial_y = [[5.412, 65.24, -7.54, -0.032],
                 [2.212, 92.24, -17.54, 23.268],
                 [0.312, 123.24, 14.46, 9.768],
                 [1.012, 19.24, -15.54, -1.732]]
    initial_y_h2o = h2o.H2OFrame(list(zip(*initial_y)))
    initial_y_h2o.show()

    print("H2O GLRM on de-meaned data with quadratic loss:\n")
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=4, transform="DEMEAN", loss="Quadratic", gamma_x=0, gamma_y=0,
                                              init="User", user_y=initial_y_h2o, recover_svd=True)
    glrm_h2o.train(x=arrestsH2O.names, training_frame=arrestsH2O)
    glrm_h2o.show()
def glrm_arrests():
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrestsH2O.describe()

    print("H2O initial Y matrix:\n")
    initial_y = [[5.412, 65.24, -7.54, -0.032],
                 [2.212, 92.24, -17.54, 23.268],
                 [0.312, 123.24, 14.46, 9.768],
                 [1.012, 19.24, -15.54, -1.732]]
    initial_y_h2o = h2o.H2OFrame(initial_y)
    initial_y_h2o.show()

    print("H2O GLRM on de-meaned data with quadratic loss:\n")
    glrm_h2o = h2o.glrm(x=arrestsH2O, k=4, transform="DEMEAN", loss="Quadratic", gamma_x=0, gamma_y=0,
                        init="User", user_y=initial_y_h2o, recover_svd=True)
    glrm_h2o.show()
def bigcatRF():
    # Training set has 100 categories from cat001 to cat100
    # Categories cat001, cat003, ... are perfect predictors of y = 1
    # Categories cat002, cat004, ... are perfect predictors of y = 0
    # Log.info("Importing bigcat_5000x2.csv data...\n")
    bigcat = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/bigcat_5000x2.csv"))
    bigcat["y"] = bigcat["y"].asfactor()

    # Log.info("Summary of bigcat_5000x2.csv from H2O:\n")
    # bigcat.summary()

    # Train H2O DRF Model:
    # Log.info("H2O DRF (Naive Split) with parameters: classification = TRUE, ntree = 1, depth = 1, nbins = 100, nbins_cats = 10\n")
    model = h2o.random_forest(x=bigcat[["X"]], y=bigcat["y"], ntrees=1, max_depth=1, nbins=100, nbins_cats=10)
    model.show()
def link_functions_tweedie_basic():
    print("Read in prostate data.")
    hdf = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))

    print("Testing for family: TWEEDIE")
    print("Set variables for h2o.")
    y = "CAPSULE"
    x = ["AGE", "RACE", "DCAPS", "PSA", "VOL", "DPROS", "GLEASON"]

    print("Create models with canonical link: TWEEDIE")
    model_h2o_tweedie = H2OGeneralizedLinearEstimator(family="tweedie", link="tweedie", alpha=0.5, Lambda=0)
    model_h2o_tweedie.train(x=x, y=y, training_frame=hdf)

    print("Compare model deviances for link function tweedie (using precomputed values from R)")
    deviance_h2o_tweedie = model_h2o_tweedie.residual_deviance() / model_h2o_tweedie.null_deviance()
    assert 0.721452 - deviance_h2o_tweedie <= 0.01, \
        "h2o's residual/null deviance is more than 0.01 lower than R's. h2o: {0}, r: {1}".format(deviance_h2o_tweedie,
                                                                                                 0.721452)
def parametersKmeans():
    print("Getting data...")
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))

    print("Create and duplicate...")
    iris_km = h2o.kmeans(x=iris[0:4], k=3, seed=1234)
    parameters = iris_km._model_json['parameters']
    param_dict = {}
    for p in range(len(parameters)):
        param_dict[parameters[p]['label']] = parameters[p]['actual_value']

    iris_km_again = h2o.kmeans(x=iris[0:4], **param_dict)

    print("wss")
    wss = sorted(iris_km.withinss())
    wss_again = sorted(iris_km_again.withinss())
    assert wss == wss_again, "expected wss to be equal"

    print("centers")
    centers = iris_km.centers()
    centers_again = iris_km_again.centers()
    assert centers == centers_again, "expected centers to be the same"
def setup_data(self):
    """
    This function performs all initializations necessary:
    load the data sets and set the training set indices and response column index
    """
    self.h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    self.h2o_data["C1"] = self.h2o_data["C1"].asfactor()
    self.h2o_data["C2"] = self.h2o_data["C2"].asfactor()
    self.myX = ["C1", "C2"]
    self.myY = "C21"
    for lambda_param in self.hyper_parameters['lambda']:
        for alpha_param in self.hyper_parameters['alpha']:
            self.manual_gam_models.append(H2OGeneralizedAdditiveEstimator(family="gaussian",
                                                                          gam_columns=["C11", "C12", "C13"],
                                                                          keep_gam_cols=True,
                                                                          scale=[1, 1, 1],
                                                                          num_knots=[5, 5, 5],
                                                                          alpha=alpha_param,
                                                                          lambda_=lambda_param,
                                                                          bs=[2, 0, 2]))
def vi_toy_test():
    toy_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/toy_data_RF.csv"))
    # toy_data.summary()

    toy_data[6] = toy_data[6].asfactor()
    toy_data.show()

    rf = h2o.random_forest(x=toy_data[[0, 1, 2, 3, 4, 5]], y=toy_data[6], ntrees=500, max_depth=20, nbins=100, seed=0)

    ranking = [rf._model_json['output']['variable_importances'].cell_values[v][0] for v in range(toy_data.ncol - 1)]
    print(ranking)
    assert tuple(ranking) == tuple(["V3", "V2", "V6", "V5", "V1", "V4"]), \
        "expected specific variable importance ranking"
def weights_gamma():
    htable = h2o.upload_file(pyunit_utils.locate("smalldata/gbm_test/moppe.csv"))
    htable["premiekl"] = htable["premiekl"].asfactor()
    htable["moptva"] = htable["moptva"].asfactor()
    htable["zon"] = htable["zon"]

    hh = H2OGradientBoostingEstimator(distribution="gamma", ntrees=20, max_depth=1, min_rows=1, learn_rate=1)
    hh.train(x=range(3), y="medskad", training_frame=htable, weights_column="antskad")
    ph = hh.predict(htable)

    assert abs(8.804447 - hh._model_json['output']['init_f']) < 1e-6 * 8.804447
    assert abs(3751.01 - ph[0].min()) < 1e-4 * 3751.01
    assert abs(15298.87 - ph[0].max()) < 1e-4 * 15298.87
    assert abs(8121.98 - ph[0].mean()[0]) < 1e-4 * 8121.98
def test_teColumns_parameter_as_single_element():
    print("Check fit method can accept non-array single column to encode")
    targetColumnName = "survived"
    foldColumnName = "kfold_column"  # it is strange that we can't set name for generated kfold

    teColumns = "home.dest"
    targetEncoder = TargetEncoder(x=teColumns, y=targetColumnName, fold_column=foldColumnName,
                                  blending_avg=True, inflection_point=3, smoothing=1)
    trainingFrame = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)

    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor()
    trainingFrame[foldColumnName] = trainingFrame.kfold_column(n_folds=5, seed=1234)

    encodingMap = targetEncoder.fit(frame=trainingFrame)

    assert encodingMap.map_keys['string'] == [teColumns]
    assert encodingMap.frames[0]['num_rows'] == 583
def group_by():
    # Connect to a pre-existing cluster
    h2o_iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    na_handling = ["rm", "ignore", "all"]

    print("Running smoke test")
    # smoke test
    for na in na_handling:
        grouped = h2o_iris.group_by("class")
        grouped \
            .count(na=na) \
            .min(na=na) \
            .max(na=na) \
            .mean(na=na) \
            .var(na=na) \
            .sd(na=na) \
            .ss(na=na) \
            .sum(na=na)
        print(grouped.get_frame())
        print(grouped.get_frame())  # call get_frame() again to ensure that Pasha bug fix works.