def main(do_tests=False, do_bench=False):
    # Neither tests nor benchmarks are being run.
    if not do_tests and not do_bench:
        print("test: neither tests nor benchmarks are enabled")
        parser.print_help()
        return

    if do_tests:
        print("[!] Running test suite!")
        tests.run_test()

    if do_bench:
        print("[!] Running benchmarks! (may take some time)")
        tests.run_bench()
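# The function above references a module-level `parser` that is not defined in
# this excerpt. A minimal sketch of the wiring it assumes, using argparse; the
# flag names `--tests` and `--bench` are hypothetical, not from the original:
import argparse

parser = argparse.ArgumentParser(description="Run tests and/or benchmarks.")
parser.add_argument("--tests", action="store_true", help="run the test suite")
parser.add_argument("--bench", action="store_true", help="run the benchmarks")

if __name__ == "__main__":
    args = parser.parse_args()
    main(do_tests=args.tests, do_bench=args.bench)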
import sys
sys.path.insert(1, "../../")
import h2o, tests


def vec_as_list():
    iris = h2o.import_file(path=tests.locate("smalldata/iris/iris_wheader.csv"))

    res = h2o.as_list(iris[0], use_pandas=False)
    assert abs(float(res[4][0]) - 4.6) < 1e-10 and abs(float(res[6][0]) - 5.4) < 1e-10 and \
        abs(float(res[10][0]) - 4.9) < 1e-10, "incorrect values"

    res = 2 - iris
    res2 = h2o.as_list(res[0], use_pandas=False)
    assert abs(float(res2[4][0]) - -2.6) < 1e-10 and abs(float(res2[18][0]) - -3.1) < 1e-10 and \
        abs(float(res2[25][0]) - -2.8) < 1e-10, "incorrect values"

    res3 = h2o.as_list(res[1], use_pandas=False)
    assert abs(float(res3[4][0]) - -1.1) < 1e-10 and abs(float(res3[6][0]) - -1.9) < 1e-10 and \
        abs(float(res3[10][0]) - -1.1) < 1e-10, "incorrect values"


if __name__ == "__main__":
    tests.run_test(sys.argv, vec_as_list)
    assert abs(1515.91815848623 - prostate_glm_h2o.residual_deviance()) < 0.1

    print "Checking poisson model without offset..."
    prostate_glm_h2o = h2o.glm(x=prostate_hex[["RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]],
                               y=prostate_hex["CAPSULE"], training_frame=prostate_hex,
                               family="poisson", standardize=False)
    print "h2o residual: {0}".format(prostate_glm_h2o.residual_deviance())
    print "r residual: {0}".format(216.339989007507)
    assert abs(216.339989007507 - prostate_glm_h2o.residual_deviance()) < 0.1

    print "Checking poisson model with offset..."
    prostate_glm_h2o = h2o.glm(x=prostate_hex[["RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON", "AGE"]],
                               y=prostate_hex["CAPSULE"], training_frame=prostate_hex,
                               family="poisson", offset_column="AGE", standardize=False)
    print "h2o residual: {0}".format(prostate_glm_h2o.residual_deviance())
    print "r residual: {0}".format(2761.76218461138)
    assert abs(2761.76218461138 - prostate_glm_h2o.residual_deviance()) < 0.1


if __name__ == "__main__":
    tests.run_test(sys.argv, offset_1897)
import sys
sys.path.insert(1, "../../../")
import h2o, tests


def weights_and_distributions(ip, port):
    htable = h2o.upload_file(h2o.locate("smalldata/gbm_test/moppe.csv"))
    htable["premiekl"] = htable["premiekl"].asfactor()
    htable["moptva"] = htable["moptva"].asfactor()
    htable["zon"] = htable["zon"]

    # gamma
    dl = h2o.deeplearning(x=htable[0:3], y=htable["medskad"], training_frame=htable,
                          distribution="gamma", weights_column="antskad")
    predictions = dl.predict(htable)

    # gaussian
    dl = h2o.deeplearning(x=htable[0:3], y=htable["medskad"], training_frame=htable,
                          distribution="gaussian", weights_column="antskad")
    predictions = dl.predict(htable)

    # poisson
    dl = h2o.deeplearning(x=htable[0:3], y=htable["medskad"], training_frame=htable,
                          distribution="poisson", weights_column="antskad")
    predictions = dl.predict(htable)

    # tweedie
    dl = h2o.deeplearning(x=htable[0:3], y=htable["medskad"], training_frame=htable,
                          distribution="tweedie", weights_column="antskad")
    predictions = dl.predict(htable)


if __name__ == "__main__":
    tests.run_test(sys.argv, weights_and_distributions)
import sys
sys.path.insert(1, "../../")
import h2o, tests


def rep_len_check():
    # Connect to a pre-existing cluster
    iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris.csv"))

    # data is a single column (vec)
    vec = iris[0].rep_len(length_out=301)
    assert vec.nrow == 301, "Expected an H2OVec with 301 rows, but got {0} rows".format(vec.nrow)
    for r in range(len(vec)):
        assert vec[r, :] == vec[r % 150, :], "Expected {0}, but got {1}".format(vec[r % 150, :], vec[r, :])

    # data is a frame
    fr = iris.rep_len(length_out=7)
    assert fr.nrow == 150 and fr.ncol == 7, \
        "Expected an H2OFrame with 150 rows and 7 columns, but got {0} rows and {1} cols".format(fr.nrow, fr.ncol)


if __name__ == "__main__":
    tests.run_test(sys.argv, rep_len_check)
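# For intuition: rep_len recycles the column's values until the requested
# length is reached, which is exactly what the assert above checks
# (vec[r] == vec[r % 150]). A pure-Python sketch of the same recycling rule
# (hypothetical helper, not part of the h2o API):
def rep_len_py(values, length_out):
    # cycle through `values` until `length_out` items have been produced
    return [values[i % len(values)] for i in range(length_out)]

assert rep_len_py([1, 2, 3], 7) == [1, 2, 3, 1, 2, 3, 1]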
import sys
sys.path.insert(1, "../../")
import h2o, tests


def javapredict_iris_drf():
    # optional parameters
    params = {'ntrees': 100, 'max_depth': 5, 'min_rows': 10}
    print "Parameter list:"
    for k, v in params.items():
        print "{0}, {1}".format(k, v)

    train = h2o.import_file(tests.locate("smalldata/iris/iris_train.csv"))
    test = h2o.import_file(tests.locate("smalldata/iris/iris_test.csv"))
    x = ["sepal_len", "sepal_wid", "petal_len", "petal_wid"]
    y = "species"

    tests.javapredict("random_forest", "class", train, test, x, y, **params)


if __name__ == "__main__":
    tests.run_test(sys.argv, javapredict_iris_drf)
import sys
sys.path.insert(1, "../../../")
import h2o, tests


def get_modelGBM():
    prostate = h2o.import_file(path=tests.locate("smalldata/logreg/prostate.csv"))
    prostate.describe()
    prostate[1] = prostate[1].asfactor()

    prostate_gbm = h2o.gbm(y=prostate[1], x=prostate[2:9], distribution="bernoulli")
    prostate_gbm.show()
    prostate_gbm.predict(prostate)

    model = h2o.get_model(prostate_gbm._id)
    model.show()


if __name__ == "__main__":
    tests.run_test(sys.argv, get_modelGBM)
import sys
sys.path.insert(1, "../../../")
import h2o, tests


def gbm_mean_residual_deviance():
    cars = h2o.import_file(path=tests.locate("smalldata/junit/cars_20mpg.csv"))
    s = cars[0].runif()
    train = cars[s > 0.2]
    valid = cars[s <= 0.2]

    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    response_col = "economy"

    gbm = h2o.gbm(x=train[predictors],
                  y=train[response_col],
                  validation_x=valid[predictors],
                  validation_y=valid[response_col],
                  nfolds=3)
    gbm_mrd = gbm.mean_residual_deviance(train=True, valid=True, xval=True)
    assert isinstance(gbm_mrd['train'], float), \
        "Expected training mean residual deviance to be a float, but got {0}".format(type(gbm_mrd['train']))
    assert isinstance(gbm_mrd['valid'], float), \
        "Expected validation mean residual deviance to be a float, but got {0}".format(type(gbm_mrd['valid']))
    assert isinstance(gbm_mrd['xval'], float), \
        "Expected cross-validation mean residual deviance to be a float, but got {0}".format(type(gbm_mrd['xval']))


if __name__ == '__main__':
    tests.run_test(sys.argv, gbm_mean_residual_deviance)
# NOTE: this fragment assumes the imports below; the module paths reflect the
# h2o-3 sklearn integration of this era and may vary by version.
from h2o.transforms.preprocessing import H2OScaler
from h2o.transforms.decomposition import H2OPCA
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.cross_validation import H2OKFold
from h2o.model.regression import h2o_r2_score
from sklearn.pipeline import Pipeline
from sklearn.grid_search import RandomizedSearchCV
from sklearn.metrics import make_scorer
from scipy.stats import randint

# build transformation pipeline using sklearn's Pipeline and H2O transforms
pipe = Pipeline([("standardize", H2OScaler()),
                 ("pca", H2OPCA(n_components=2)),
                 ("rf", H2ORandomForestEstimator(seed=42, ntrees=50))])

# parameters to test
params = {"standardize__center": [True, False],
          "standardize__scale":  [True, False],
          "pca__n_components":   randint(2, iris[1:].shape[1]),
          "rf__ntrees":          randint(50, 60),
          "rf__max_depth":       randint(4, 8),
          "rf__min_rows":        randint(5, 10)}

custom_cv = RandomizedSearchCV(pipe, params,
                               n_iter=5,
                               scoring=make_scorer(h2o_r2_score),
                               cv=H2OKFold(iris, n_folds=5, seed=42),
                               random_state=42,
                               n_jobs=1)
random_search = custom_cv
random_search.fit(iris[1:], iris[0])
print random_search.best_estimator_

if __name__ == "__main__":
    tests.run_test(sys.argv, scale_pca_rf_pipe)
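# A hedged usage note: RandomizedSearchCV follows sklearn's estimator contract,
# so the tuned pipeline can be applied to new data the usual sklearn way
# (this assumes the H2O wrappers expose predict(), as the contract requires):
#
#   best_pipe = random_search.best_estimator_
#   preds = best_pipe.predict(iris[1:])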
import sys
sys.path.insert(1, "../../")
import h2o, tests


def colname_set_basic():
    print "Uploading iris data..."
    no_headers = h2o.upload_file(tests.locate("smalldata/iris/iris.csv"))
    headers_and = h2o.upload_file(tests.locate("smalldata/iris/iris_header.csv"))
    print no_headers.names
    print headers_and.names

    no_headers.set_names(headers_and.names)
    assert no_headers.names == headers_and.names, \
        "Expected the same column names but got {0} and {1}".format(no_headers.names, headers_and.names)


if __name__ == "__main__":
    tests.run_test(sys.argv, colname_set_basic)
import sys
sys.path.insert(1, "../../")
import h2o, tests


def vec_show(ip, port):
    iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    print "iris:"
    iris.show()

    ###################################################################

    res = 2 - iris
    res2 = res[0]
    print "res2:"
    res2.show()

    res3 = res[1]
    print "res3:"
    res3.show()

    iris[2].show()


if __name__ == "__main__":
    tests.run_test(sys.argv, vec_show)
import sys
sys.path.insert(1, "../../../")
import h2o, tests


def checkpoint_new_category_in_response():
    sv = h2o.upload_file(tests.locate("smalldata/iris/setosa_versicolor.csv"))
    iris = h2o.upload_file(tests.locate("smalldata/iris/iris.csv"))

    m1 = h2o.gbm(x=sv[[0, 1, 2, 3]], y=sv[4], ntrees=100)

    # attempt to continue building the model, but with an expanded categorical
    # response domain. this should fail.
    try:
        m2 = h2o.gbm(x=iris[[0, 1, 2, 3]], y=iris[4], ntrees=200, checkpoint=m1.model_id)
        assert False, "Expected continued model-building to fail with new categories introduced in response"
    except EnvironmentError:
        pass


if __name__ == '__main__':
    tests.run_test(sys.argv, checkpoint_new_category_in_response)
import sys
sys.path.insert(1, "../../")  # path depth assumed, as in sibling tests
import h2o, tests


def benign():
    training_data = h2o.import_file(tests.locate("smalldata/logreg/benign.csv"))

    Y = 3
    X = range(3) + range(4, 11)

    # Log.info("Build the model")
    model = h2o.glm(y=training_data[Y].asfactor(), x=training_data[X], family="binomial",
                    alpha=[0], Lambda=[1e-5])

    # Log.info("Check that the columns used in the model are the ones we passed in.")
    # Log.info("===================Columns passed in: ================")
    in_names = [training_data.names[i] for i in X]
    # Log.info("===================Columns passed out: ================")
    out_names = [model._model_json['output']['coefficients_table'].cell_values[c][0]
                 for c in range(len(X) + 1)]
    assert in_names == out_names[1:]


if __name__ == "__main__":
    tests.run_test(sys.argv, benign)
    assert [h2o_rows, h2o_cols] == [np_rows, np_cols], "expected equal number of columns and rows"

    # Log.info("Slice out a column and data frame it, try dim on it...")
    h2o_slice = h2o_data[4]
    np_slice = np_data[:, 4]
    h2o_rows, h2o_cols = h2o_slice.dim
    np_rows = np_slice.shape[0]
    print 'The dimensions of the h2o column slice are: {0} x {1}'.format(h2o_rows, h2o_cols)
    print 'The dimensions of the numpy array column slice are: {0} x 1'.format(np_rows)
    assert [h2o_rows, h2o_cols] == [np_rows, 1], "expected equal number of columns and rows"

    # Log.info("OK, now try an operator, e.g. '&', and then check dimensions again...")
    h2oColAmpFive = h2o_slice & 5
    assert h2oColAmpFive.nrow == h2o_rows, "expected the number of rows to remain unchanged"


if __name__ == "__main__":
    tests.run_test(sys.argv, dim_checks)
import sys
sys.path.insert(1, "../../")
import h2o, tests


def http_import():
    url = "http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip"
    aa = h2o.import_file(path=url)
    aa.show()


if __name__ == "__main__":
    tests.run_test(sys.argv, http_import)
import sys
sys.path.insert(1, "../../")
import h2o, tests
import random


def pyunit_remove_vecs():
    # TODO PUBDEV-1789
    pros = h2o.import_file(h2o.locate("smalldata/prostate/prostate.csv"))
    rows, cols = pros.dim

    # remove by column index
    remove = random.randint(1, 5)
    p1 = pros.remove_vecs(cols=random.sample(range(cols), remove))
    new_rows, new_cols = p1.dim
    assert new_rows == rows and new_cols == cols - remove, \
        "Expected {0} rows and {1} columns, but got {2} rows and {3} columns.".format(rows, cols - remove,
                                                                                      new_rows, new_cols)

    # remove by column name
    remove = random.randint(1, 5)
    p1 = pros.remove_vecs(cols=random.sample(pros.names, remove))
    new_rows, new_cols = p1.dim
    assert new_rows == rows and new_cols == cols - remove, \
        "Expected {0} rows and {1} columns, but got {2} rows and {3} columns.".format(rows, cols - remove,
                                                                                      new_rows, new_cols)


if __name__ == "__main__":
    tests.run_test(sys.argv, pyunit_remove_vecs)
import sys
sys.path.insert(1, "../../")  # path depth assumed, as in sibling tests
import h2o, tests


# Training set has two predictor columns:
#   X1: 10 categorical levels, 100 observations per level
#   X2: Unif(0,1) noise
# Ratio of y = 1 per level: cat01 = 1.0 (strong predictor), cat02 to cat10 = 0.5 (weak predictors)
def swpredsRF():
    # Log.info("Importing swpreds_1000x3.csv data...\n")
    swpreds = h2o.import_file(path=tests.locate("smalldata/gbm_test/swpreds_1000x3.csv"))
    swpreds["y"] = swpreds["y"].asfactor()

    # Log.info("Summary of swpreds_1000x3.csv from H2O:\n")
    # swpreds.summary()

    # Train H2O DRF without the noise column
    # Log.info("Distributed Random Forest with only Predictor Column")
    model1 = h2o.random_forest(x=swpreds[["X1"]], y=swpreds["y"], ntrees=50, max_depth=20, nbins=500)
    model1.show()
    perf1 = model1.model_performance(swpreds)
    print(perf1.auc())

    # Train H2O DRF model including the noise column
    # Log.info("Distributed Random Forest including Noise Column")
    model2 = h2o.random_forest(x=swpreds[["X1", "X2"]], y=swpreds["y"], ntrees=50, max_depth=20, nbins=500)
    model2.show()
    perf2 = model2.model_performance(swpreds)
    print(perf2.auc())


if __name__ == "__main__":
    tests.run_test(sys.argv, swpredsRF)
        assert c == ncols, "incorrect number of cols. correct: {0}, computed: {1}".format(ncols, c)

    # prostate[int, slice]
    for ncols in range(1, cols + 1):
        r, c = prostate[random.randint(0, rows - 1), 0:ncols].dim
        assert r == 1, "incorrect number of rows. correct: {0}, computed: {1}".format(1, r)
        assert c == ncols, "incorrect number of cols. correct: {0}, computed: {1}".format(ncols, c)

    # prostate[slice, int]
    for nrows in range(1, 10):
        r, c = prostate[0:nrows, random.randint(0, cols - 1)].dim
        assert r == nrows, "incorrect number of rows. correct: {0}, computed: {1}".format(nrows, r)
        assert c == 1, "incorrect number of cols. correct: {0}, computed: {1}".format(1, c)

    # prostate[slice, slice]
    for nrows in range(1, 10):
        for ncols in range(1, cols + 1):
            r, c = prostate[0:nrows, 0:ncols].dim
            assert r == nrows, "incorrect number of rows. correct: {0}, computed: {1}".format(nrows, r)
            assert c == ncols, "incorrect number of cols. correct: {0}, computed: {1}".format(ncols, c)


if __name__ == "__main__":
    tests.run_test(sys.argv, slicing_shape)
    df_hex.summary()
    assert not df_hex['h1'].isfactor()
    assert df_hex['h2'].isfactor()
    assert not df_hex['h3'].isfactor()

    df_hex['h1'] = df_hex['h1'].asfactor()
    df_hex['h2'] = df_hex['h2'].asfactor()
    df_hex['h3'] = df_hex['h3'].asfactor()
    df_hex.show()
    df_hex.summary()
    assert df_hex['h1'].isfactor()
    assert df_hex['h2'].isfactor()
    assert df_hex['h3'].isfactor()

    df_hex['h1'] = df_hex['h1'].asnumeric()
    df_hex['h2'] = df_hex['h2'].asnumeric()
    df_hex['h3'] = df_hex['h3'].asnumeric()
    df_hex.show()
    df_hex.summary()
    assert not df_hex['h1'].isfactor()
    assert not df_hex['h2'].isfactor()
    assert not df_hex['h3'].isfactor()


if __name__ == "__main__":
    tests.run_test(sys.argv, continuous_or_categorical)
    print py_dict_to_h2o_2.describe()

    # using collections.OrderedDict
    import collections
    d = {"colA": ["bilbo", "baggins"], "colB": ["meow"]}  # still unordered!
    py_ordered_dict_to_h2o = H2OFrame(python_obj=collections.OrderedDict(d))
    py_ordered_dict_to_h2o.describe()

    # make an ordered dictionary!
    d2 = collections.OrderedDict()
    d2["colA"] = ["bilbo", "baggins"]
    d2["colB"] = ["meow"]
    py_ordered_dict_to_h2o_2 = H2OFrame(python_obj=collections.OrderedDict(d2))
    py_ordered_dict_to_h2o_2.describe()

    # numpy.array
    # import numpy as np
    #
    # py_numpy_ary_to_h2o = H2OFrame(python_obj=np.ones((50, 100), dtype=int))
    # py_numpy_ary_to_h2o.describe()


if __name__ == "__main__":
    tests.run_test(sys.argv, upload_file)
    #    res.show()
    #    assert False, "expected error. objects of different dimensions not supported."
    # except EnvironmentError:
    #    pass

    # vec/vec
    res = iris[0] > iris[1]
    res_rows = res.nrow
    assert res_rows == rows, "dimension mismatch"
    new_rows = iris[res].nrow
    assert new_rows == 150, "wrong number of rows returned"

    # frame/frame
    res = iris > iris
    res_rows, res_cols = res.dim
    assert res_rows == rows and res_cols == cols, "dimension mismatch"

    res = iris[0:2] > iris[1:3]
    res_rows, res_cols = res.dim
    assert res_rows == rows and res_cols == 2, "dimension mismatch"

    # try:
    #    res = iris > iris[0:3]
    #    res.show()
    #    assert False, "expected error. frames are different dimensions."
    # except EnvironmentError:
    #    pass


if __name__ == "__main__":
    tests.run_test(sys.argv, binop_gt)
import sys
sys.path.insert(1, "../../")
import h2o, tests


def hist_test(ip, port):
    kwargs = {}
    kwargs['server'] = True

    print "Import small prostate dataset"
    hex = h2o.import_file(h2o.locate("smalldata/logreg/prostate.csv"))
    hex["AGE"].hist(**kwargs)
    hex["VOL"].hist(**kwargs)


if __name__ == "__main__":
    tests.run_test(sys.argv, hist_test)
import sys
sys.path.insert(1, "../../../")
import h2o, tests


def vec_slicing():
    iris = h2o.import_file(path=tests.locate("smalldata/iris/iris_wheader.csv"))
    iris.show()

    ###################################################################

    # H2OVec[int]
    res = 2 - iris
    res2 = res[0]
    assert abs(res2[3, 0] - -2.6) < 1e-10 and abs(res2[17, 0] - -3.1) < 1e-10 and \
        abs(res2[24, 0] - -2.8) < 1e-10, "incorrect values"

    # H2OVec[slice]
    res = iris[12:25, 1]
    assert abs(res[0, 0] - 3.0) < 1e-10 and abs(res[1, 0] - 3.0) < 1e-10 and \
        abs(res[5, 0] - 3.5) < 1e-10, "incorrect values"


if __name__ == "__main__":
    tests.run_test(sys.argv, vec_slicing)
import os, sys
sys.path.insert(1, "../../../")
import h2o, tests


def deeplearning_multi():
    print("Test checks if Deep Learning works fine with a multiclass training and test dataset")

    prostate = h2o.import_file(tests.locate("smalldata/logreg/prostate.csv"))
    prostate[4] = prostate[4].asfactor()

    hh = h2o.deeplearning(x=prostate[0:2],
                          y=prostate[4],
                          validation_x=prostate[0:2],
                          validation_y=prostate[4],
                          loss='CrossEntropy')
    hh.show()


if __name__ == '__main__':
    tests.run_test(sys.argv, deeplearning_multi)
import sys
sys.path.insert(1, "../../")  # path depth assumed, as in sibling tests
import h2o, tests
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import Imputer


def get_modelKmeans():
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    # Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(path=h2o.locate("smalldata/logreg/benign.csv"))
    # benign_h2o.summary()

    benign_sci = np.genfromtxt(h2o.locate("smalldata/logreg/benign.csv"), delimiter=",")
    # impute missing values with the column mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    for i in range(2, 7):
        # Log.info("H2O K-Means")
        km_h2o = h2o.kmeans(x=benign_h2o, k=i)
        km_h2o.show()
        model = h2o.get_model(km_h2o._id)
        model.show()

        km_sci = KMeans(n_clusters=i, init='k-means++', n_init=1)
        km_sci.fit(benign_sci)
        print "scikit centers"
        print km_sci.cluster_centers_


if __name__ == "__main__":
    tests.run_test(sys.argv, get_modelKmeans)
app_name = config.get('application', 'name')
application = service.Application(app_name)

log_file = config.get('log', 'file')
log_path = config.get('log', 'directory')
log_level = config.get('log', 'level')
logfile = CustomDailyLogFile(log_file, log_path)
application.setComponent(ILogObserver,
                         log.FileLogObserver(logfile, log_level, exclude_systems=[]).emit)

if __name__ == '__main__':
    app_config = {'no_save': True, 'nodaemon': False, 'profile': False, 'debug': False}
    oldstdout = sys.stdout
    oldstderr = sys.stderr
    profiler = app.AppProfiler(app_config)
    logger = app.AppLogger(app_config)
    logger.start(application)
    sys.stdout = oldstdout
    run_test()
    logger.stop()
import sys
sys.path.insert(1, "../../../")
import h2o, tests


def anyfactor():
    iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris.csv"))

    # frame (positive example)
    assert iris.anyfactor(), "Expected true, but got false. Column 5 is a factor."

    # frame (negative example)
    assert not iris[:, :4].anyfactor(), "Expected false, but got true. Columns 1-4 are numeric."

    # vec (positive example)
    assert iris[4].anyfactor(), "Expected true, but got false. Column 5 is a factor."

    # vec (negative example)
    assert not iris[0].anyfactor(), "Expected false, but got true. Column 1 is numeric."


if __name__ == "__main__":
    tests.run_test(sys.argv, anyfactor)
    assert pros[2, 0] == 60, "Incorrect slicing result"
    assert pros[3, 0] == 62, "Incorrect slicing result"
    assert pros[4, 0] == 71, "Incorrect slicing result"
    assert pros[5, 0] == 67, "Incorrect slicing result"

    # prostate[int, slice] case
    # 189,1,69,1,3,2,8,31.2,6
    pros = prostate[188, 0:3]
    assert pros[0, 0] == 189, "Incorrect slicing result"
    assert pros[0, 1] + 1 == 2, "Incorrect slicing result"
    assert pros[0, 2] == 69, "Incorrect slicing result"

    # prostate[slice, slice] case
    # 84,0,75,1,2,1,11,35,7
    # 85,0,75,1,1,1,9.9,15.4,7
    # 86,1,75,1,3,1,3.7,0,6
    pros = prostate[83:86, 1:4]
    assert pros[0, 0] == 0, "Incorrect slicing result"
    assert pros[0, 1] == 75, "Incorrect slicing result"
    assert pros[0, 2] - 1 == 0, "Incorrect slicing result"
    assert pros[1, 0] == 0, "Incorrect slicing result"
    assert pros[1, 1] + 75 == 150, "Incorrect slicing result"
    assert pros[1, 2] == 1, "Incorrect slicing result"
    assert pros[2, 0] + 1 == 2, "Incorrect slicing result"
    assert pros[2, 1] == 75, "Incorrect slicing result"
    assert pros[2, 2] == 1, "Incorrect slicing result"


if __name__ == "__main__":
    tests.run_test(sys.argv, multi_dim_slicing)
    assert not mul_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) multinomial " \
                                "metric json. The difference is {2}".format(mul_metric_json_keys_have,
                                                                            mul_metric_json_keys_desired,
                                                                            mul_metric_diff)

    # Clustering metric json
    df = h2o.import_file(path=h2o.locate("smalldata/iris/iris.csv"))
    clus_mod = h2o.kmeans(x=df[0:4], k=3, standardize=False)
    clus_met = clus_mod.model_performance()
    clus_metric_json_keys_have = clus_met._metric_json.keys()
    clus_metric_json_keys_desired = [u'tot_withinss', u'model_category', u'description', u'frame',
                                     u'model_checksum', u'MSE', u'__meta', u'scoring_time', u'betweenss',
                                     u'predictions', u'totss', u'model', u'duration_in_ms',
                                     u'frame_checksum', u'centroid_stats']
    clus_metric_diff = list(set(clus_metric_json_keys_have) - set(clus_metric_json_keys_desired))
    assert not clus_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) clustering " \
                                 "metric json. The difference is {2}".format(clus_metric_json_keys_have,
                                                                             clus_metric_json_keys_desired,
                                                                             clus_metric_diff)


if __name__ == "__main__":
    tests.run_test(sys.argv, metric_json_check)
    try:
        h2o.glrm(x=prostateH2O, k=5, loss_by_col=rd.sample(NUM_LOSS, 1), loss_by_col_idx=rd.sample(CAT_COLS, 1))
        assert False, "Expected GLRM to throw error since numeric loss cannot apply to categorical column"
    except:
        pass

    try:
        h2o.glrm(x=prostateH2O, k=5, loss_by_col=rd.sample(CAT_LOSS, 1), loss_by_col_idx=rd.sample(NUM_COLS, 1))
        assert False, "Expected GLRM to throw error since categorical loss cannot apply to numeric column"
    except:
        pass

    print "Run GLRM with loss_by_col = [" + ', '.join(loss_all) + "] and loss_by_col_idx = [" + \
          ', '.join([str(a) for a in loss_idx_all]) + "]"
    glrm_h2o = h2o.glrm(x=prostateH2O, k=5, loss_by_col=loss_all, loss_by_col_idx=loss_idx_all)
    glrm_h2o.show()


if __name__ == "__main__":
    tests.run_test(sys.argv, glrm_set_loss_by_col_rand)
                             epochs=1,
                             reproducible=True,  # slow, turn off for real problems
                             seed=1234)

    # convert train_supervised with autoencoder to lower-dimensional space
    train_supervised_features = ae_model.deepfeatures(train_supervised[0:resp]._frame(), 0)

    assert train_supervised_features.ncol == nfeatures, "Dimensionality of reconstruction is wrong!"

    # train DRF on the extracted feature space
    drf_model = h2o.random_forest(x=train_supervised_features[0:20],
                                  y=train_supervised[resp],
                                  ntrees=10, min_rows=10, seed=1234)

    # test the DRF model on the test set (processed through deep features)
    test_features = ae_model.deepfeatures(test_hex[0:resp]._frame(), 0)
    test_features = test_features.cbind(test_hex[resp])._frame()

    # confusion matrix and assertion
    cm = drf_model.confusion_matrix(test_features)
    cm.show()

    # 10% error +/- 0.001
    assert abs(cm.cell_values[10][10] - 0.082) < 0.001, \
        "Error. Expected 0.082, but got {0}".format(cm.cell_values[10][10])


if __name__ == '__main__':
    tests.run_test(sys.argv, deeplearning_autoencoder)
################################################################################
##
## Verifying that Python can support user-specified strings to be treated as
## missing.
##
################################################################################
import sys, urllib
sys.path.insert(1, "../../")
import h2o, tests


def na_strings():
    path = "smalldata/jira/hexdev_29.csv"
    fhex = h2o.import_file(tests.locate(path))
    fhex.summary()
    fhex_col_summary = h2o.H2OConnection.get_json(
        "Frames/" + urllib.quote(fhex._id) + "/summary")["frames"][0]["columns"]
    fhex_missing_count = sum([e["missing_count"] for e in fhex_col_summary])

    fhex_na_strings = h2o.import_file(tests.locate(path), na_strings=[[], ["fish", "xyz"], []])
    fhex_na_strings.summary()
    fhex_na_strings_col_summary = h2o.H2OConnection.get_json(
        "Frames/" + urllib.quote(fhex_na_strings._id) + "/summary")["frames"][0]["columns"]
    fhex_na_strings_missing_count = sum([e["missing_count"] for e in fhex_na_strings_col_summary])

    assert fhex_missing_count == 0
    assert fhex_na_strings_missing_count == 2


if __name__ == "__main__":
    tests.run_test(sys.argv, na_strings)
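# Note: the list-of-lists form of na_strings is per-column: the second column's
# "fish" and "xyz" values are parsed as missing while the other columns get no
# NA strings, which is why exactly 2 missing values are expected above.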
import sys
sys.path.insert(1, "../../")  # path depth assumed, as in sibling tests
import h2o, tests
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import Imputer


def benignKmeans():
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    # Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(path=h2o.locate("smalldata/logreg/benign.csv"))
    # benign_h2o.summary()

    benign_sci = np.genfromtxt(h2o.locate("smalldata/logreg/benign.csv"), delimiter=",")
    # impute missing values with the column mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    # Log.info(paste("H2O K-Means with ", i, " clusters:\n", sep = ""))
    for i in range(1, 7):
        benign_h2o_km = h2o.kmeans(x=benign_h2o, k=i)
        print "H2O centers"
        print benign_h2o_km.centers()

        benign_sci_km = KMeans(n_clusters=i, init='k-means++', n_init=1)
        benign_sci_km.fit(benign_sci)
        print "scikit centers"
        print benign_sci_km.cluster_centers_


if __name__ == "__main__":
    tests.run_test(sys.argv, benignKmeans)
################################################################################
##
## Verifying that Python can define features as categorical or continuous on
## import.
##
################################################################################
import sys, os
sys.path.insert(1, "../../")
import h2o, tests


def continuous_or_categorical():
    fraw = h2o.lazy_import(tests.locate("smalldata/jira/hexdev_29.csv"))
    fsetup = h2o.parse_setup(fraw)
    fsetup["column_types"][0] = "ENUM"
    fsetup["column_types"][1] = "ENUM"
    fsetup["column_types"][2] = "ENUM"
    df_hex = h2o.parse_raw(fsetup)

    df_hex.summary()

    assert df_hex['h1'].isfactor()
    assert df_hex['h2'].isfactor()
    assert df_hex['h3'].isfactor()


if __name__ == "__main__":
    tests.run_test(sys.argv, continuous_or_categorical)
    h2o_zero_weights.set_names(["weights"])
    h2o_data_zero_weights = h2o_cars_data.cbind(h2o_zero_weights)
    h2o_data_zeros_removed = h2o_cars_data[h2o_zero_weights["weights"] == 1]
    print "\n\nChecking that using some zero weights is equivalent to removing those observations:"
    check_same(h2o_data_zeros_removed, h2o_data_zero_weights, 1)

    # doubled weights same as doubled observations
    doubled_weights = [[1] if random.randint(0, 1) else [2] for r in range(406)]
    h2o_doubled_weights = h2o.H2OFrame(python_obj=doubled_weights)
    h2o_doubled_weights.set_names(["weights"])
    h2o_data_doubled_weights = h2o_cars_data.cbind(h2o_doubled_weights)

    doubled_data = h2o.as_list(h2o_cars_data, use_pandas=False)
    colnames = doubled_data.pop(0)
    for idx, w in enumerate(doubled_weights):
        if w[0] == 2:
            doubled_data.append(doubled_data[idx])
    h2o_data_doubled = h2o.H2OFrame(python_obj=doubled_data)
    h2o_data_doubled.set_names(colnames)

    h2o_data_doubled["economy_20mpg"] = h2o_data_doubled["economy_20mpg"].asfactor()
    h2o_data_doubled["cylinders"] = h2o_data_doubled["cylinders"].asfactor()
    h2o_data_doubled_weights["economy_20mpg"] = h2o_data_doubled_weights["economy_20mpg"].asfactor()
    h2o_data_doubled_weights["cylinders"] = h2o_data_doubled_weights["cylinders"].asfactor()
    print "\n\nChecking that doubling some weights is equivalent to doubling those observations:"
    check_same(h2o_data_doubled, h2o_data_doubled_weights, 1)


if __name__ == "__main__":
    tests.run_test(sys.argv, weights_var_imp)
                      nfolds=cars.nrow + 1, family=family, fold_assignment="Modulo")
        assert False, "Expected model-build to fail when nfolds > nobs"
    except EnvironmentError:
        assert True

    # 3. fold_column and nfolds both specified
    try:
        glm = h2o.glm(y=cars[response_col], x=cars[predictors], nfolds=3,
                      fold_column="fold_assignments", family=family, training_frame=cars)
        assert False, "Expected model-build to fail when fold_column and nfolds both specified"
    except EnvironmentError:
        assert True

    # # 4. fold_column and fold_assignment both specified
    # try:
    #     glm = h2o.glm(y=cars[response_col], x=cars[predictors], fold_assignment="Random",
    #                   fold_column="fold_assignments", family=family, training_frame=cars)
    #     assert False, "Expected model-build to fail when fold_column and fold_assignment both specified"
    # except EnvironmentError:
    #     assert True


if __name__ == "__main__":
    tests.run_test(sys.argv, cv_carsGLM)
    res = iris[0] == 4.7
    res_rows = res.nrow
    assert res_rows == rows, "dimension mismatch"
    new_rows = iris[res].nrow
    assert new_rows == 2, "wrong number of rows returned"

    res = 3.5 == iris[1]
    res_rows = res.nrow
    assert res_rows == rows, "dimension mismatch"
    new_rows = iris[res].nrow
    assert new_rows == 6, "wrong number of rows returned"

    # frame/frame
    res = iris == iris
    res_rows, res_cols = res.dim
    assert res_rows == rows and res_cols == cols, "dimension mismatch"

    res = iris[0:2] == iris[1:3]
    res_rows, res_cols = res.dim
    assert res_rows == rows and res_cols == 2, "dimension mismatch"

    # try:
    #    res = iris == iris[0:3]
    #    res.show()
    #    assert False, "expected error. frames are different dimensions."
    # except EnvironmentError:
    #    pass


if __name__ == "__main__":
    tests.run_test(sys.argv, binop_eq)
import sys
sys.path.insert(1, "../../")
import h2o, tests


def sub_gsub_check():
    # Connect to a pre-existing cluster
    frame = h2o.import_file(path=tests.locate("smalldata/iris/iris.csv"),
                            col_types=["numeric", "numeric", "numeric", "numeric", "string"])

    # single column (frame)
    frame["C5"] = frame["C5"].gsub("s", "z")
    assert frame[0, 4] == "Iriz-zetoza", "Expected 'Iriz-zetoza', but got {0}".format(frame[0, 4])

    frame["C5"] = frame["C5"].sub("z", "s")
    assert frame[1, 4] == "Iris-zetoza", "Expected 'Iris-zetoza', but got {0}".format(frame[1, 4])

    # single column (vec)
    vec = frame["C5"]
    vec = vec.sub("z", "s")
    assert vec[2, 0] == "Iris-setoza", "Expected 'Iris-setoza', but got {0}".format(vec[2, 0])

    vec = vec.gsub("s", "z")
    assert vec[3, 0] == "Iriz-zetoza", "Expected 'Iriz-zetoza', but got {0}".format(vec[3, 0])


if __name__ == "__main__":
    tests.run_test(sys.argv, sub_gsub_check)
import sys
sys.path.insert(1, "../../")  # path depth assumed, as in sibling tests
import h2o, tests
import numpy as np


def wide_dataset_large():
    print("Reading in Arcene training data for binomial modeling.")
    trainDataResponse = np.genfromtxt(tests.locate("smalldata/arcene/arcene_train_labels.labels"), delimiter=' ')
    trainDataResponse = np.where(trainDataResponse == -1, 0, 1)
    trainDataFeatures = np.genfromtxt(tests.locate("smalldata/arcene/arcene_train.data"), delimiter=' ')
    trainData = h2o.H2OFrame(np.column_stack((trainDataResponse, trainDataFeatures)).tolist())

    print("Run model on 3250 columns of Arcene with strong rules off.")
    model = h2o.glm(x=trainData[1:3250], y=trainData[0].asfactor(), family="binomial",
                    lambda_search=False, alpha=[1])

    print("Test model on validation set.")
    validDataResponse = np.genfromtxt(tests.locate("smalldata/arcene/arcene_valid_labels.labels"), delimiter=' ')
    validDataResponse = np.where(validDataResponse == -1, 0, 1)
    validDataFeatures = np.genfromtxt(tests.locate("smalldata/arcene/arcene_valid.data"), delimiter=' ')
    validData = h2o.H2OFrame(np.column_stack((validDataResponse, validDataFeatures)).tolist())
    prediction = model.predict(validData)

    print("Check performance of predictions.")
    performance = model.model_performance(validData)

    print("Check that prediction AUC is better than guessing (0.5).")
    assert performance.auc() > 0.5, "predictions should be better than pure chance"


if __name__ == "__main__":
    tests.run_test(sys.argv, wide_dataset_large)
import sys
sys.path.insert(1, "../../")  # path depth assumed, as in sibling tests
import h2o, tests


def expr_as_list(ip, port):
    iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris_wheader.csv"))

    # multiple rows and columns
    res = 2 - iris
    res = h2o.as_list(res, use_pandas=False)
    assert abs(float(res[4][0]) - -2.6) < 1e-10 and abs(float(res[5][1]) - -1.6) < 1e-10 and \
        abs(float(res[11][2]) - 0.5) < 1e-10, "incorrect values"

    # single column
    res = 2 - iris
    res = h2o.as_list(res[0], use_pandas=False)
    assert abs(float(res[4][0]) - -2.6) < 1e-10 and abs(float(res[18][0]) - -3.1) < 1e-10 and \
        abs(float(res[25][0]) - -2.8) < 1e-10, "incorrect values"

    # local data
    frm = h2o.as_list(h2o.H2OFrame(python_obj=[1, 2, 3]), use_pandas=False)
    assert float(frm[1][2]) == 3, "incorrect values"

    frm = h2o.as_list(h2o.H2OFrame(python_obj=[[1, 2, 3], [4, 5, 6]]), use_pandas=False)
    assert float(frm[2][1]) == 5, "incorrect values"


if __name__ == "__main__":
    tests.run_test(sys.argv, expr_as_list)
import sys
sys.path.insert(1, "../../../")
import h2o, tests


def demo_glm():
    h2o.demo(func="glm", interactive=False, test=True)


if __name__ == "__main__":
    tests.run_test(sys.argv, demo_glm)
import sys
sys.path.insert(1, "../../")
import h2o, tests


def upload_import_small():
    # Connect to a pre-existing cluster
    various_datasets = ["smalldata/iris/iris.csv",
                        "smalldata/iris/iris_wheader.csv",
                        "smalldata/prostate/prostate.csv",
                        "smalldata/prostate/prostate_woheader.csv.gz"]

    for dataset in various_datasets:
        uploaded_frame = h2o.upload_file(tests.locate(dataset))
        imported_frame = h2o.import_file(tests.locate(dataset))

        rows_u, cols_u = uploaded_frame.dim
        rows_i, cols_i = imported_frame.dim

        assert rows_u == rows_i, \
            "Expected same number of rows regardless of method. upload: {0}, import: {1}.".format(rows_u, rows_i)
        assert cols_u == cols_i, \
            "Expected same number of cols regardless of method. upload: {0}, import: {1}.".format(cols_u, cols_i)


if __name__ == "__main__":
    tests.run_test(sys.argv, upload_import_small)
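# Note: upload_file pushes the file from the Python client to the cluster over
# the REST API, whereas import_file has the cluster read the path directly;
# the test above verifies that both ingestion routes yield the same dimensions.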
    print
    print "======================================================================"
    print "============================== Binomial =============================="
    print "======================================================================"
    for i in range(10):
        attack(pros_train, pros_valid, random.sample([2, 3, 4, 5, 6, 7, 8], random.randint(1, 7)), 1)

    print
    print "======================================================================"
    print "============================== Gaussian =============================="
    print "======================================================================"
    for i in range(10):
        attack(cars_train, cars_valid, random.sample([2, 3, 4, 5, 6, 7], random.randint(1, 6)), 1)

    print
    print "======================================================================"
    print "============================= Multinomial ============================"
    print "======================================================================"
    cars_train[2] = cars_train[2].asfactor()
    cars_valid[2] = cars_valid[2].asfactor()
    for i in range(10):
        attack(cars_train, cars_valid, random.sample([1, 3, 4, 5, 6, 7], random.randint(1, 6)), 2)


if __name__ == "__main__":
    tests.run_test(sys.argv, random_attack)
    # Connect to a pre-existing cluster
    insurance = h2o.import_file(h2o.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    gbm = h2o.gbm(x=insurance[0:3], y=insurance["Claims"], distribution="gaussian",
                  ntrees=600, max_depth=1, min_rows=1, learn_rate=.1,
                  offset_column="offset", training_frame=insurance)
    predictions = gbm.predict(insurance)

    # Comparison result generated from R's gbm:
    #   fit2 <- gbm(Claims ~ District + Group + Age + offset(log(Holders)), interaction.depth = 1,
    #               n.minobsinnode = 1, shrinkage = .1, bag.fraction = 1, train.fraction = 1,
    #               data = Insurance, distribution = "gaussian", n.trees = 600)
    #   pg = predict(fit2, newdata = Insurance, type = "response", n.trees = 600)
    #   pr = pg - - log(Insurance$Holders)
    assert abs(44.33016 - gbm._model_json['output']['init_f']) < 1e-5, \
        "expected init_f to be {0}, but got {1}".format(44.33016, gbm._model_json['output']['init_f'])
    assert abs(1491.135 - gbm.mse()) < 1e-2, "expected mse to be {0}, but got {1}".format(1491.135, gbm.mse())
    assert abs(49.23438 - predictions.mean()) < 1e-2, \
        "expected prediction mean to be {0}, but got {1}".format(49.23438, predictions.mean())
    assert abs(-45.5720659304 - predictions.min()) < 1e-2, \
        "expected prediction min to be {0}, but got {1}".format(-45.5720659304, predictions.min())
    assert abs(207.387 - predictions.max()) < 1e-2, \
        "expected prediction max to be {0}, but got {1}".format(207.387, predictions.max())


if __name__ == "__main__":
    tests.run_test(sys.argv, offset_gaussian)
        cross1_km = h2o.kmeans(training_frame=cross_h2o, x=cross_h2o[0:57], k=ncent, max_iterations=miters)
        print cross1_km

        print "Run k-means with init = final cluster centers and max_iterations = 1"
        init_centers = h2o.H2OFrame(cross1_km.centers())
        init_centers_key = init_centers.send_frame()
        cross2_km = h2o.kmeans(training_frame=cross_h2o, x=cross_h2o[0:57], k=ncent,
                               user_points=init_centers_key, max_iterations=1)
        print cross2_km

        print "Check k-means converged or maximum iterations reached"
        c1 = h2o.H2OFrame(cross1_km.centers())
        c2 = h2o.H2OFrame(cross2_km.centers())
        avg_change = ((c1 - c2) ** 2).sum() / ncent
        iters = cross1_km._model_json['output']['model_summary'].cell_values[0][3]
        assert avg_change < 1e-6 or iters > miters, \
            "Expected k-means to converge or reach max iterations. avg_change = {0} " \
            "and iterations = {1}".format(avg_change, iters)
    else:
        print "Not running on H2O internal network. No access to HDFS."


if __name__ == "__main__":
    tests.run_test(sys.argv, hdfs_kmeans_converge)
import sys
sys.path.insert(1, "../../")  # path depth assumed, as in sibling tests
import h2o, tests
import pandas as pd
import numpy as np


def group_by():
    # Connect to a pre-existing cluster
    h2o_iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    pd_iris = pd.read_csv(h2o.locate("smalldata/iris/iris_wheader.csv"))

    na_handling = ["ignore", "rm", "all"]
    col_names = h2o_iris.col_names[0:4]

    print "Running smoke test"
    # smoke test
    for na in na_handling:
        grouped = h2o_iris.group_by("class")
        grouped \
            .count(na=na) \
            .min(na=na) \
            .max(na=na) \
            .mean(na=na) \
            .var(na=na) \
            .sd(na=na) \
            .ss(na=na) \
            .sum(na=na)
        print grouped.get_frame()


if __name__ == "__main__":
    tests.run_test(sys.argv, group_by)
import sys
sys.path.insert(1, "../../")
import h2o, tests


def spaces_in_column_names():
    train_data = h2o.upload_file(path=tests.locate("smalldata/jira/spaces_in_column_names.csv"))
    train_data.show()
    train_data.describe()

    X = ["p r e d i c t o r 1", "predictor2", "p r e d i ctor3", "pre d ictor4", "predictor5"]

    gbm = h2o.gbm(x=train_data[X], y=train_data["r e s p o n s e"].asfactor(),
                  ntrees=1, distribution="bernoulli", min_rows=1)
    gbm.show()


if __name__ == "__main__":
    tests.run_test(sys.argv, spaces_in_column_names)
import sys
sys.path.insert(1, "../../")  # path depth assumed, as in sibling tests
import h2o, tests


# Ratio of y = 1 per level: cat01 = 1.0 (strong predictor), cat02 to cat10 = 0.5 (weak predictors)
def swpredsGBM():
    # Log.info("Importing swpreds_1000x3.csv data...\n")
    swpreds = h2o.import_file(path=tests.locate("smalldata/gbm_test/swpreds_1000x3.csv"))
    swpreds["y"] = swpreds["y"].asfactor()

    # Log.info("Summary of swpreds_1000x3.csv from H2O:\n")
    # swpreds.summary()

    # Train H2O GBM without the noise column
    # Log.info("H2O GBM with parameters:\nntrees = 50, max_depth = 20, nbins = 500\n")
    h2o_gbm_model1 = h2o.gbm(x=swpreds[["X1"]], y=swpreds["y"], distribution="bernoulli",
                             ntrees=50, max_depth=20, nbins=500)
    h2o_gbm_model1.show()
    h2o_gbm_perf1 = h2o_gbm_model1.model_performance(swpreds)
    h2o_auc1 = h2o_gbm_perf1.auc()

    # Train H2O GBM model including the noise column
    # Log.info("H2O GBM with parameters:\nntrees = 50, max_depth = 20, nbins = 500\n")
    h2o_gbm_model2 = h2o.gbm(x=swpreds[["X1", "X2"]], y=swpreds["y"], distribution="bernoulli",
                             ntrees=50, max_depth=20, nbins=500)
    h2o_gbm_model2.show()
    h2o_gbm_perf2 = h2o_gbm_model2.model_performance(swpreds)
    h2o_auc2 = h2o_gbm_perf2.auc()


if __name__ == "__main__":
    tests.run_test(sys.argv, swpredsGBM)
################################################################################
##
## Verifying that Python can support importing without parsing.
##
################################################################################
import sys, os
sys.path.insert(1, "../../")
import h2o, tests


def parse_false():
    fraw = h2o.import_file(tests.locate("smalldata/jira/hexdev_29.csv"), parse=False)
    assert isinstance(fraw, list)

    fhex = h2o.parse_raw(h2o.parse_setup(fraw))
    fhex.summary()
    assert fhex.__class__.__name__ == "H2OFrame"


if __name__ == "__main__":
    tests.run_test(sys.argv, parse_false)
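# Note: with parse=False the import returns only the raw file key(s); the
# explicit parse_setup/parse_raw step is what builds the H2OFrame, giving
# callers a chance to override parser guesses (e.g. column_types) first.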
import sys
sys.path.insert(1, "../../")
import h2o, tests


def score_history_test(ip, port):
    air_train = h2o.import_file(path=h2o.locate("smalldata/airlines/AirlinesTrain.csv.zip"))

    gbm_mult = h2o.gbm(x=air_train[["Origin", "Dest", "Distance", "UniqueCarrier",
                                    "IsDepDelayed", "fDayofMonth", "fMonth"]],
                       y=air_train["fDayOfWeek"].asfactor(),
                       distribution="multinomial")

    score_history = gbm_mult.score_history()
    print score_history


if __name__ == "__main__":
    tests.run_test(sys.argv, score_history_test)
                                     u'frame_checksum']
    mul_metric_diff = list(set(mul_metric_json_keys_have) - set(mul_metric_json_keys_desired))
    assert not mul_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) multinomial " \
                                "metric json. The difference is {2}".format(mul_metric_json_keys_have,
                                                                            mul_metric_json_keys_desired,
                                                                            mul_metric_diff)

    # Clustering metric json
    df = h2o.import_file(path=tests.locate("smalldata/iris/iris.csv"))
    clus_mod = h2o.kmeans(x=df[0:4], k=3, standardize=False)
    clus_met = clus_mod.model_performance()
    clus_metric_json_keys_have = clus_met._metric_json.keys()
    clus_metric_json_keys_desired = [u'tot_withinss', u'model_category', u'description', u'frame',
                                     u'model_checksum', u'MSE', u'__meta', u'scoring_time', u'betweenss',
                                     u'predictions', u'totss', u'model', u'duration_in_ms',
                                     u'frame_checksum', u'centroid_stats']
    clus_metric_diff = list(set(clus_metric_json_keys_have) - set(clus_metric_json_keys_desired))
    assert not clus_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) clustering " \
                                 "metric json. The difference is {2}".format(clus_metric_json_keys_have,
                                                                             clus_metric_json_keys_desired,
                                                                             clus_metric_diff)


if __name__ == "__main__":
    tests.run_test(sys.argv, metric_json_check)
##
# Test out the sdev() functionality
# If NAs in the frame, they are skipped in calculation unless na.rm = F
# If any categorical columns, throw an error
##
import sys
sys.path.insert(1, "../../../")
import h2o, tests
import numpy as np


def sdev(ip, port):
    iris_h2o = h2o.import_file(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    iris_np = np.genfromtxt(h2o.locate("smalldata/iris/iris_wheader.csv"),
                            delimiter=',',
                            skip_header=1,
                            usecols=(0, 1, 2, 3))

    # ddof=1 gives the sample standard deviation (n-1 denominator), which is
    # what the asserts below show H2O's sd() computes as well
    sd_np = np.std(iris_np, axis=0, ddof=1)
    for i in range(4):
        sd_h2o = iris_h2o[i].sd()
        assert abs(sd_np[i] - sd_h2o) < 1e-10, "expected standard deviations to be the same"

    iris_h2o[0:2].sd()


if __name__ == "__main__":
    tests.run_test(sys.argv, sdev)
import sys
sys.path.insert(1, "../../")
import h2o, tests


def expr_show():
    iris = h2o.import_file(path=tests.locate("smalldata/iris/iris_wheader.csv"))
    print "iris:"
    iris.show()

    ###################################################################

    # expr[int], expr._data is pending
    res = 2 - iris
    res2 = res[0]
    print "res2:"
    res2.show()

    # expr[int], expr._data is remote
    res3 = res[0]
    print "res3:"
    res3.show()


if __name__ == "__main__":
    tests.run_test(sys.argv, expr_show)
import sys
sys.path.insert(1, "../../")
import h2o, tests


def varimp_test(ip, port):
    train = h2o.import_file(path=h2o.locate("smalldata/iris/iris_wheader.csv"))

    # Run GBM
    my_gbm = h2o.gbm(y=train["class"], x=train[1:4], ntrees=50, learn_rate=0.1, distribution="multinomial")

    should_be_none = my_gbm.varimp()
    assert should_be_none is None, "expected varimp to return None, but returned {0}".format(should_be_none)

    should_be_list = my_gbm.varimp(return_list=True)
    assert len(should_be_list) == 3, \
        "expected varimp list to contain 3 entries, but it has {0}".format(len(should_be_list))
    assert len(should_be_list[0]) == 4, \
        "expected varimp entry to contain 4 elements (variable, relative_importance, scaled_importance, " \
        "percentage), but it has {0}".format(len(should_be_list[0]))


if __name__ == "__main__":
    tests.run_test(sys.argv, varimp_test)
import sys
sys.path.insert(1, "../../../")
import h2o, tests


def grid_wineGBM(ip, port):
    wine = h2o.import_file(path=h2o.locate("smalldata/gbm_test/wine.data"))
    # wine.summary()
    x_cols = range(2, 14) + [0]

    wine_grid = h2o.gbm(y=wine[1],
                        x=wine[x_cols],
                        distribution='gaussian',
                        ntrees=[5, 10, 15],
                        max_depth=[2, 3, 4],
                        learn_rate=[0.1, 0.2])
    wine_grid.show()


if __name__ == "__main__":
    tests.run_test(sys.argv, grid_wineGBM)
    print zz.show()
    zz = fr.apply(lambda row: h2o.ifelse(row[0] == 1, row[2], row[3]), axis=1)
    print zz.show()

    fr.apply(lambda col: col.abs()).show()
    fr.apply(lambda col: col.cos()).show()
    fr.apply(lambda col: col.sin()).show()
    fr.apply(lambda col: col.ceil()).show()
    fr.apply(lambda col: col.floor()).show()
    fr.apply(lambda col: col.cosh()).show()
    fr.apply(lambda col: col.exp()).show()
    fr.apply(lambda col: col.log()).show()
    fr.apply(lambda col: col.sqrt()).show()
    fr.apply(lambda col: col.tan()).show()
    fr.apply(lambda col: col.tanh()).show()

    fr.apply(lambda col: (col * col - col * 5 * col).abs() - 55 / col).show()

    fr.apply(lambda row: h2o.ifelse(row[0] < 5, (row[2] - 3).expm1(), (row[2] - 999).expm1()), axis=1)
    fr.apply(lambda row: h2o.ifelse(row[0] < 5, (row[2] - 3).expm1(), 55), axis=1)
    fr.apply(lambda row: h2o.ifelse(row[0] < 5, 3, (row[2] - 1).expm1()), axis=1)


if __name__ == "__main__":
    tests.run_test(sys.argv, pyunit_apply)
import sys
sys.path.insert(1, "../../../")
import h2o, tests


def bigcatGBM():
    # Log.info("Importing bigcat_5000x2.csv data...\n")
    bigcat = h2o.import_file(path=tests.locate("smalldata/gbm_test/bigcat_5000x2.csv"))
    bigcat["y"] = bigcat["y"].asfactor()

    # Log.info("Summary of bigcat_5000x2.csv from H2O:\n")
    # bigcat.summary()

    # Train H2O GBM Model:
    # Log.info("H2O GBM with parameters:\nntrees = 1, max_depth = 1, nbins = 100\n")
    model = h2o.gbm(x=bigcat[["X"]], y=bigcat["y"], distribution="bernoulli",
                    ntrees=1, max_depth=1, nbins=100)
    model.show()
    performance = model.model_performance(bigcat)
    performance.show()

    # Check AUC and overall prediction error
    # test_accuracy = performance.accuracy()
    test_auc = performance.auc()


if __name__ == "__main__":
    tests.run_test(sys.argv, bigcatGBM)
h2o.locate("smalldata/iris/iris_wheader.csv"), col_types=["numeric", "numeric", "numeric", "numeric", "string"] ) # import data assembly = H2OAssembly( steps=[ ("col_select", H2OColSelect(["sepal_len", "petal_len", "class"])), # col selection ("cos_sep_len", H2OColOp(fun=H2OFrame.cos, col="sepal_len", inplace=True)), # math operation ("str_cnt_species", H2OColOp(fun=H2OFrame.countmatches, col="class", inplace=False, pattern="s")), ] ) # string operation result = assembly.fit(fr) # fit the assembly result.show() # show the result of the fit assembly.to_pojo("MungingPojoDemo") # , path="/Users/spencer/Desktop/munging_pojo") # export POJO # java api usage: # # String rawRow = framework.nextTuple(); # H2OMungingPOJO munger = new GeneratedH2OMungingPojo_001(); # EasyPredictModelWrapper model = new EasyPredictModelWrapper(new GeneratedH2OGbmPojo_001()); # # RowData row = new RowData(); # row.fill(rawRow); # row = munger.fit(row); # BinomialModelPrediction pred = model.predictBinomial(row); # // Use prediction! if __name__ == "__main__": tests.run_test(sys.argv, assembly_demo)
print "min_rows model 2: {0}".format(min_rows2) model2 = h2o.random_forest(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2, max_depth=max_depth2, min_rows=min_rows2, validation_x=milsong_valid[1:], validation_y=milsong_valid[0], checkpoint=restored_model._id, seed=1234) # build the equivalent of model 2 in one shot model3 = h2o.random_forest(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2, max_depth=max_depth2, min_rows=min_rows2, validation_x=milsong_valid[1:], validation_y=milsong_valid[0], seed=1234) assert isinstance(model2, type(model3)) assert model2.mse(valid=True) == model3.mse( valid=True ), "Expected Model 2 MSE: {0} to be the same as Model 4 MSE: {1}".format( model2.mse(valid=True), model3.mse(valid=True)) if __name__ == "__main__": tests.run_test(sys.argv, milsong_checkpoint)