h2o.kmeans(x=frame, k=5) assert False, "expected an error" except EnvironmentError: assert True # Log.info("Training data with a categorical column(s)") data = [[random.choice(string.ascii_uppercase) for c in range(cols)] for r in range(rows)] frame = h2o.H2OFrame(data) km_model = h2o.kmeans(x=frame, k=5) centers = km_model.centers() assert len(centers) == 5, "expected 5 centers" for c in range(len(centers)): assert len(centers[c]) == 10, "expected center to be 10 " + str( len(centers[c])) # Log.info("Importing iris.csv data...\n") iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv")) km_model = h2o.kmeans(x=iris, k=5) centers = km_model.centers() assert len(centers) == 5, "expected 5 centers" for c in range(len(centers)): assert len( centers[c]) == 5, "expected center to be 5 " + str(len(centers[c])) if __name__ == "__main__": h2o.run_test(sys.argv, baddataKmeans)
row_sum = 0 for level in air.levels(16): if level == "ANC": continue r, c = air[str(level) == air["Origin"]].dim() row_sum = row_sum + r assert row_sum == rows - 1, "expected equal number of rows" # ==, != jan = air[1 == air["Month"]] not_jan = air[1 != air["Month"]] no_rows, no_cols = not_jan.dim() yes_rows, yes_cols = jan.dim() assert (no_rows + yes_rows) == rows and no_cols == yes_cols == cols, "expected equal number of rows and cols" # >, <= g = air[1990 <= air["Year"]] L = air[1990 > air["Year"]] g_rows, g_cols = g.dim() L_rows, L_cols = L.dim() assert (L_rows + g_rows) == rows and L_cols == g_cols == cols, "expected equal number of rows and cols" # >=, < G = air[15 < air["DayofMonth"]] l = air[15 >= air["DayofMonth"]] G_rows, G_cols = G.dim() l_rows, l_cols = l.dim() assert (l_rows + G_rows) == rows and l_cols == G_cols == cols, "expected equal number of rows and cols" if __name__ == "__main__": h2o.run_test(sys.argv, vec_scaler_comparisons)
################################################################### # H2OFrame[int] (column slice) res = 2 - iris res2 = res[0] assert abs(res2[3,:] - -2.6) < 1e-10 and abs(res2[17,:] - -3.1) < 1e-10 and abs(res2[24,:] - -2.8) < 1e-10, \ "incorrect values" # H2OFrame[int,int] assert abs(res[13, 3] - 1.9) < 1e-10, "incorrect values" # H2OFrame[int, slice] res4 = res[12, 0:4] assert abs(res4[0,0] - -2.8) < 1e-10 and abs(res4[0,1] - -1.0) < 1e-10 and abs(res4[0,2] - 0.6) < 1e-10 and \ abs(res4[0,3] - 1.9) < 1e-10, "incorrect values" # H2OFrame[slice, int] res5 = res[5:9, 1] assert abs(res5[0,:] - -1.9) < 1e-10 and abs(res5[1,:] - -1.4) < 1e-10 and abs(res5[2,:] - -1.4) < 1e-10 and \ abs(res5[3,:] - -0.9) < 1e-10, "incorrect values" # H2OFrame[slice, slice] res = iris * 2 res6 = res[5:9, 0:4] assert abs(res6[0,0] - 10.8) < 1e-10 and abs(res6[1,1] - 6.8) < 1e-10 and abs(res6[2,2] - 3.0) < 1e-10 and \ abs(res6[3,3] - 0.4) < 1e-10, "incorrect values" if __name__ == "__main__": h2o.run_test(sys.argv, expr_slicing)
import sys
sys.path.insert(1, "../../")
import h2o


def frame_show(ip, port):
    """Import three sample datasets and display each one with ``show()``.

    ``ip`` and ``port`` are forwarded to ``h2o.init``.
    """
    h2o.init(ip, port)

    dataset_paths = (
        "smalldata/iris/iris_wheader.csv",
        "smalldata/prostate/prostate.csv.zip",
        "smalldata/airlines/allyears2k.zip",
    )

    # Import every dataset first, then display them in the same order.
    frames = [h2o.import_frame(path=h2o.locate(rel_path))
              for rel_path in dataset_paths]
    for frame in frames:
        frame.show()


if __name__ == "__main__":
    h2o.run_test(sys.argv, frame_show)
import sys
sys.path.insert(1, "../../")
import h2o
import random
import numpy as np


def quantile(ip, port):
    """Compare ``H2OFrame.quantile()`` with ``numpy.percentile`` on random data.

    A single column of 1000 uniform random values is loaded into H2O and into
    numpy. Row ``i`` of the H2O result (value read from column 1) is compared
    with the i-th numpy percentile.
    NOTE(review): this relies on quantile()'s default probabilities lining up
    with the percentile list below -- confirm against the H2O API.
    """
    percentiles = [1, 10, 25, 33.3, 50, 66.7, 75, 90, 99]
    samples = [[random.uniform(-10000, 10000)] for _ in range(1000)]

    frame = h2o.H2OFrame(python_obj=samples)
    array = np.array(samples)

    frame_quantiles = frame.quantile()
    array_quantiles = np.percentile(array, percentiles, axis=0)

    failure = ("check unsuccessful! h2o computed {0} and numpy computed {1}. "
               "expected equal quantile values between h2o and numpy")
    for row in range(len(percentiles)):
        h2o_val = frame_quantiles[row, 1]
        np_val = array_quantiles[row][0]
        assert abs(h2o_val - np_val) < 1e-06, failure.format(h2o_val, np_val)


if __name__ == "__main__":
    h2o.run_test(sys.argv, quantile)
# # # d. jagged # python_obj = np.array([[6,7,8,9,10], [1,2,3,4], [3,2,2]]) # the_frame = h2o.H2OFrame(python_obj=python_obj) # # check_dims_values_jagged() TODO # # ## 6. pandas.DataFrame # # a. single row # python_obj = pd.DataFrame({'foo' : pd.Series([1]), 'bar' : pd.Series([6]), 'baz' : pd.Series(["a"]) }) # the_frame = h2o.H2OFrame(python_obj=python_obj) # h2o.check_dims_values(python_obj, the_frame, rows=1, cols=3) # # # b. single column # python_obj = pd.DataFrame({'foo' : pd.Series([1, 2, 3, 7.8, 9])}) # the_frame = h2o.H2OFrame(python_obj=python_obj) # h2o.check_dims_values(python_obj, the_frame, rows=5, cols=1) # # # c. multiple rows, columns # python_obj = pd.DataFrame({'foo' : pd.Series([6,7,8,9,10]), 'bar' : pd.Series([1,2,3,4,5]), # 'baz' : pd.Series([3,2,2,2,2])}) # the_frame = h2o.H2OFrame(python_obj=python_obj) # h2o.check_dims_values(python_obj, the_frame, rows=5, cols=3) # # # d. jagged # python_obj = pd.DataFrame({'foo' : pd.Series([6,7,8]), 'bar' : pd.Series([1,2,3,4,5]), 'baz' : pd.Series([3,2,2,2])}) # the_frame = h2o.H2OFrame(python_obj=python_obj) # # check_dims_values_jagged() TODO if __name__ == "__main__": h2o.run_test(sys.argv, to_H2OFrame)
gbm_sci = ensemble.GradientBoostingClassifier(learning_rate=learn_rate, n_estimators=ntrees, max_depth=max_depth, min_samples_leaf=min_rows, max_features=None) gbm_sci.fit(X_train,y_train) # Generate testing dataset test_rows = 2000 test_cols = 10 # Generate variables V1, ... V10 X_test = np.random.randn(test_rows, test_cols) # y = +1 if sum_i x_{ij}^2 > chisq median on 10 df y_test = np.asarray([1 if rs > scipy.stats.chi2.ppf(0.5, 10) else -1 for rs in [sum(r) for r in np.multiply(X_test,X_test).tolist()]]) # Score (AUC) the scikit gbm model on the test data auc_sci = roc_auc_score(y_test, gbm_sci.predict_proba(X_test)[:,1]) # Compare this result to H2O train_h2o = H2OFrame(np.column_stack((y_train, X_train)).tolist()) test_h2o = H2OFrame(np.column_stack((y_test, X_test)).tolist()) gbm_h2o = h2o.gbm(x=train_h2o[1:], y=train_h2o["C1"], distribution=distribution, ntrees=ntrees, min_rows=min_rows, max_depth=max_depth, learn_rate=learn_rate, nbins=nbins) gbm_perf = gbm_h2o.model_performance(test_h2o) auc_h2o = gbm_perf.auc() #Log.info(paste("scikit AUC:", auc_sci, "\tH2O AUC:", auc_h2o)) assert auc_h2o >= auc_sci, "h2o (auc) performance degradation, with respect to scikit" if __name__ == "__main__": h2o.run_test(sys.argv, bernoulli_synthetic_data_mediumGBM)
import h2o


def vi_toy_test(ip, port):
    """Train a random forest on the toy dataset and check the importance order.

    Column 6 is converted to a factor and used as the response; columns 0-5
    are the predictors. The first cell of each variable-importance row must
    match the hard-coded ranking.
    """
    toy_data = h2o.import_file(
        path=h2o.locate("smalldata/gbm_test/toy_data_RF.csv"))
    toy_data[6] = toy_data[6].asfactor()
    toy_data.show()

    rf = h2o.random_forest(x=toy_data[[0, 1, 2, 3, 4, 5]],
                           y=toy_data[6],
                           ntrees=500,
                           max_depth=20,
                           nbins=100,
                           seed=0)

    # One importance row per predictor (every column except the response).
    n_predictors = toy_data.ncol() - 1
    importances = rf._model_json['output']['variable_importances'].cell_values
    ranking = [importances[idx][0] for idx in range(n_predictors)]
    print(ranking)

    expected = ("V3", "V2", "V6", "V5", "V1", "V4")
    assert tuple(ranking) == expected, \
        "expected specific variable importance ranking"


if __name__ == "__main__":
    h2o.run_test(sys.argv, vi_toy_test)
def parametersKmeans(ip, port):
    """Check that a k-means model rebuilt from its own parameters is identical.

    A model is trained on the first four iris columns, its reported
    ``actual_value`` parameters are fed back into ``h2o.kmeans``, and the two
    models are compared on within-cluster MSE and on cluster centers.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))

    # Build a model, then rebuild it from the parameters it reports.
    iris_km = h2o.kmeans(x=iris[0:4], k=3, seed=1234)
    param_dict = {}
    for param in iris_km._model_json['parameters']:
        param_dict[param['label']] = param['actual_value']
    iris_km_again = h2o.kmeans(x=iris[0:4], **param_dict)

    # sorted() returns the ordered values. The previous `.sort()` call sorts
    # in place and evaluates to None, which made this comparison
    # `None == None` and therefore always true.
    # NOTE(review): assumes within_mse() returns a plain iterable of numbers.
    wmse = sorted(iris_km.within_mse())
    wmse_again = sorted(iris_km_again.within_mse())
    assert wmse == wmse_again, "expected wmse to be equal"

    centers = iris_km.centers()
    centers_again = iris_km_again.centers()
    assert centers == centers_again, "expected centers to be the same"


if __name__ == "__main__":
    h2o.run_test(sys.argv, parametersKmeans)
import sys
import os
sys.path.insert(1, "../../")
import h2o
import random


def download_csv(ip, port):
    """Round-trip iris through ``h2o.download_csv`` and compare a random cell.

    The frame is downloaded to a scratch CSV in the working directory,
    re-imported, and one randomly chosen cell from the first four columns is
    compared between the original and the re-imported frame.
    """
    scratch_csv = "iris_delete.csv"
    iris1 = h2o.import_file(path=h2o.locate("smalldata/iris/iris.csv"))

    try:
        h2o.download_csv(iris1, scratch_csv)
        iris2 = h2o.import_file(path=h2o.locate(scratch_csv))
    finally:
        # Remove the scratch file even when the download or re-import fails,
        # so a failed run does not leave it behind for the next one.
        if os.path.exists(scratch_csv):
            os.remove(scratch_csv)

    rand_row = random.randint(0, iris1.nrow() - 1)
    rand_col = random.randint(0, 3)
    original = iris1[rand_row, rand_col]
    reloaded = iris2[rand_row, rand_col]
    assert abs(original - reloaded) < 1e-10, \
        "Expected elements from the datasets to be the same, but got {0} and {1}".format(original, reloaded)


if __name__ == "__main__":
    h2o.run_test(sys.argv, download_csv)
myX = [ "Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth", "fDayOfWeek" ] myY = "IsDepDelayed" air_gbm = h2o.gbm(x=air_train[myX], y=air_train[myY], validation_x=air_valid[myX], validation_y=air_valid[myY], distribution="bernoulli", ntrees=100, max_depth=3, learn_rate=0.01) # Plot ROC for training and validation sets air_gbm.plot(type="roc", train=True, **kwargs) air_gbm.plot(type="roc", valid=True, **kwargs) air_test = h2o.import_frame( h2o.locate("smalldata/airlines/AirlinesTest.csv.zip")) perf = air_gbm.model_performance(air_test) #Plot ROC for test set perf.plot(type="roc", **kwargs) if __name__ == "__main__": h2o.run_test(sys.argv, plot_test)
k=ncent, user_points=centers_key, max_iterations=1) centers = h2o.H2OFrame(rep_fit.centers()) centers_key = centers.send_frame() # Log.info(paste("Run k-means with max_iter=miters")) all_fit = h2o.kmeans(x=ozone_h2o, k=ncent, user_points=start.eager(), max_iterations=miters) assert rep_fit.centers() == all_fit.centers( ), "expected the centers to be the same" # Log.info("Check cluster centers have converged") all_fit2 = h2o.kmeans(x=ozone_h2o, k=ncent, user_points=h2o.H2OFrame( all_fit.centers()).send_frame(), max_iterations=1) avg_change = sum([ sum([pow((e1 - e2), 2) for e1, e2 in zip(c1, c2)]) for c1, c2 in zip(all_fit.centers(), all_fit2.centers()) ]) / ncent assert avg_change < 1e-6 or all_fit._model_json['output'][ 'iterations'] < miters if __name__ == "__main__": h2o.run_test(sys.argv, convergeKmeans)
import sys
sys.path.insert(1, "../../../")
import h2o


def frame_as_list(ip, port):
    """Apply the modulo operator to frames and columns and display the results.

    Covers a whole frame (``prostate % 10``), a column selected by index
    (``prostate[4] % 10``) and a column selected by name
    (``airlines["CRSArrTime"] % 100``).
    """
    # Connect to h2o
    h2o.init(ip, port)

    prostate = h2o.import_frame(
        path=h2o.locate("smalldata/prostate/prostate.csv.zip"))

    # The original `print(expr).show()` relied on Python 2 parsing it as
    # `print ((expr).show())`; under Python 3 it would call .show() on None.
    # The explicit parentheses below parse the same way in both versions.
    print((prostate % 10).show())
    print((prostate[4] % 10).show())

    airlines = h2o.import_frame(
        path=h2o.locate("smalldata/airlines/allyears2k_headers.zip"))
    print((airlines["CRSArrTime"] % 100).show())


if __name__ == "__main__":
    h2o.run_test(sys.argv, frame_as_list)
def frame_reducers(ip, port):
    """Compare H2OFrame ``min``/``max``/``sum`` with their numpy equivalents.

    A 10x10 table of uniform random values is loaded into H2O and numpy, and
    each reducer must agree to within 1e-06.
    """
    # Connect to h2o
    h2o.init(ip, port)

    data = [[random.uniform(-10000, 10000) for _col in range(10)]
            for _row in range(10)]
    h2o_data = h2o.H2OFrame(python_obj=data)
    np_data = np.array(data)

    failure = ("check unsuccessful! h2o computed {0} and numpy computed {1}. "
               "expected equal {2} values between h2o and numpy")
    # Both libraries expose the reducers under the same names.
    for reducer in ("min", "max", "sum"):
        h2o_val = getattr(h2o_data, reducer)()
        num_val = getattr(np, reducer)(np_data)
        assert abs(h2o_val - num_val) < 1e-06, \
            failure.format(h2o_val, num_val, reducer)

    #h2o.np_comparison_check(h2o.var(h2o_data), np.cov(np_data, rowvar=0, ddof=1), 10)


if __name__ == "__main__":
    h2o.run_test(sys.argv, frame_reducers)
import sys
sys.path.insert(1, "../../../")
import h2o


def pca_scoring(ip, port):
    """Fit a 4-component PCA on USArrests and project the training data.

    The model is built with ``transform="DEMEAN"`` on the first four columns;
    the head of the projected frame is printed.
    """
    print("Importing arrests.csv data...")
    arrests = h2o.upload_file(h2o.locate("smalldata/pca_test/USArrests.csv"))

    print("Run PCA with transform = 'DEMEAN'")
    pca_model = h2o.prcomp(x=arrests[0:4], k=4, transform="DEMEAN")
    # TODO: pca_model.show()

    print("Project training data into eigenvector subspace")
    projection = pca_model.predict(arrests)
    print("H2O Projection:")
    print(projection.head())


if __name__ == "__main__":
    h2o.run_test(sys.argv, pca_scoring)
# Check if we are running inside the H2O network by seeing if we can touch # the namenode. running_inside_h2o = h2o.is_running_internal_to_h2o() if running_inside_h2o: hdfs_name_node = h2o.get_h2o_internal_hdfs_name_node() hdfs_file = "/datasets/airlines_all.csv" print "Import airlines_all.csv from HDFS" url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_file) airlines_h2o = h2o.import_frame(url) n = airlines_h2o.nrow() print "rows: {0}".format(n) print "Run k-means++ with k = 7 and max_iterations = 10" myX = range(8) + range(11, 16) + range(18, 21) + range(24, 29) + [9] airlines_km = h2o.kmeans(training_frame=airlines_h2o, x=airlines_h2o[myX], k=7, init="Furthest", max_iterations=10, standardize=True) print airlines_km else: print "Not running on H2O internal network. No access to HDFS." if __name__ == "__main__": h2o.run_test(sys.argv, hdfs_kmeans_airlines)
iris_sci = iris_sci[:,0:4] s =[[4.9,3.0,1.4,0.2], [5.6,2.5,3.9,1.1], [6.5,3.0,5.2,2.0]] start = h2o.H2OFrame(s) start_key = start.send_frame() h2o_km = h2o.kmeans(x=iris_h2o[0:4], k=3, user_points=start_key, standardize=False) sci_km = KMeans(n_clusters=3, init=np.asarray(s), n_init=1) sci_km.fit(iris_sci) # Log.info("Cluster centers from H2O:") print "Cluster centers from H2O:" h2o_centers = h2o_km.centers() print h2o_centers # Log.info("Cluster centers from scikit:") print "Cluster centers from scikit:" sci_centers = sci_km.cluster_centers_.tolist() print sci_centers for hcenter, scenter in zip(h2o_centers, sci_centers): for hpoint, spoint in zip(hcenter,scenter): assert (hpoint- spoint) < 1e-10, "expected centers to be the same" if __name__ == "__main__": h2o.run_test(sys.argv, iris_h2o_vs_sciKmeans)
import sys
import os
sys.path.insert(1, "../../")
import h2o
from h2o.model.binomial import H2OBinomialModel


def save_load_model(ip, port):
    """Save a binomial GLM, load it back, and check the loaded model's type.

    The GLM predicts CAPSULE (converted to a factor) from AGE, RACE, PSA and
    DCAPS on the prostate dataset.
    """
    prostate = h2o.import_file(h2o.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()

    response = prostate["CAPSULE"]
    predictors = prostate[["AGE", "RACE", "PSA", "DCAPS"]]
    prostate_glm = h2o.glm(y=response, x=predictors, family="binomial",
                           alpha=[0.5])

    # Write the model out, then read it back from the returned path.
    saved_path = h2o.save_model(prostate_glm, name="delete_model", force=True)
    reloaded = h2o.load_model(saved_path)

    assert isinstance(reloaded, H2OBinomialModel), \
        "Expected and H2OBinomialModel, but got {0}".format(reloaded)


if __name__ == "__main__":
    h2o.run_test(sys.argv, save_load_model)
import sys
sys.path.insert(1, "../../../")
import h2o


def iris_nfolds(ip, port):
    """Train random forests on iris with 5-fold cross-validation.

    The first model uses ``nfolds=5`` alone. The second combines ``nfolds=5``
    with an explicit validation set and must build without raising
    ``EnvironmentError``.
    """
    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))

    model = h2o.random_forest(y=iris[4], x=iris[0:4], ntrees=50, nfolds=5)
    model.show()

    # nfolds >= 2 and a validation set may be specified at the same time, so
    # an error here is a failure. The old message ("expected an error")
    # described the opposite of what this branch checks.
    try:
        h2o.random_forest(y=iris[4],
                          x=iris[0:4],
                          validation_y=iris[4],
                          validation_x=iris[0:4],
                          ntrees=50,
                          nfolds=5)
    except EnvironmentError as err:
        raise AssertionError(
            "expected the model to build when nfolds and a validation set "
            "are both specified, but got an error: {0}".format(err))


if __name__ == "__main__":
    h2o.run_test(sys.argv, iris_nfolds)
def _assert_naive_bayes_fails(message, **build_args):
    """Raise AssertionError(message) unless h2o.naive_bayes(**build_args) raises."""
    try:
        h2o.naive_bayes(**build_args)
    except Exception:
        # Any build error is the expected outcome for these invalid setups.
        return
    # Raised outside the try block on purpose: the previous code used
    # `assert False` inside `try: ... except: pass`, so the AssertionError was
    # swallowed and the test could never fail.
    raise AssertionError(message)


def nb_init_err(ip, port):
    """Check that naive Bayes rejects invalid training setups on iris.

    Three setups are expected to fail: a negative ``laplace``, a zero
    ``min_sdev``, and a non-categorical response column.
    """
    print("Importing iris_wheader.csv data...\n")
    iris = h2o.upload_file(h2o.locate("smalldata/iris/iris_wheader.csv"))
    # The original referenced `iris.describe` without calling it (a no-op).
    iris.describe()

    print("Laplace smoothing parameter is negative")
    _assert_naive_bayes_fails(
        "Expected naive bayes algo to fail on negative laplace training parameter",
        x=iris[0:4], y=iris[4], laplace=-1)

    print("Minimum standard deviation is zero")
    _assert_naive_bayes_fails(
        "Expected naive bayes algo to fail on min_sdev = 0",
        x=iris[0:4], y=iris[4], min_sdev=0)

    print("Response column is not categorical")
    # Use an otherwise valid parameter (laplace=0) so the numeric response is
    # the only possible cause of failure; the copy-pasted min_sdev=0 would
    # have failed the build on its own.
    _assert_naive_bayes_fails(
        "Expected naive bayes algo to fail on response not categorical",
        x=iris[0:3], y=iris[3], laplace=0)


if __name__ == "__main__":
    h2o.run_test(sys.argv, nb_init_err)
import sys
sys.path.insert(1, "../../../")
import h2o


def nb_prostate(ip, port):
    """Train naive Bayes on the prostate data and predict on the training set.

    CAPSULE, RACE, DCAPS and DPROS are converted to factors; column 1 is the
    response and columns 2-8 are the predictors.
    """
    h2o.init(ip, port)

    print("Importing prostate.csv data...")
    prostate = h2o.upload_file(h2o.locate("smalldata/logreg/prostate.csv"))

    print("Converting CAPSULE, RACE, DCAPS, and DPROS to categorical")
    # Each column is converted from itself. The original assigned
    # prostate["CAPSULE"].asfactor() to RACE, overwriting RACE with the
    # CAPSULE column.
    for column in ("CAPSULE", "RACE", "DCAPS", "DPROS"):
        prostate[column] = prostate[column].asfactor()

    print("Compare with Naive Bayes when x = 3:9, y = 2")
    prostate_nb = h2o.naive_bayes(x=prostate[2:9], y=prostate[1], laplace=0)
    prostate_nb.show()

    print("Predict on training data")
    prostate_pred = prostate_nb.predict(prostate)
    prostate_pred.head()


if __name__ == "__main__":
    h2o.run_test(sys.argv, nb_prostate)
import sys
sys.path.insert(1, "../../")
import h2o


def ls_test(ip, port):
    """Import the iris dataset, then call ``h2o.ls()``."""
    iris_path = h2o.locate("smalldata/iris/iris.csv")
    # Keep a reference to the imported frame while ls() runs.
    iris_frame = h2o.import_file(path=iris_path)

    h2o.ls()


if __name__ == "__main__":
    h2o.run_test(sys.argv, ls_test)
import sys
sys.path.insert(1, "../../")
import h2o
from h2o.frame import H2OVec


def vec_as_list(ip, port):
    """Check ``h2o.as_list`` on single columns of iris and of ``2 - iris``.

    Each check reads the first element of selected rows and compares it with
    a hard-coded value to within 1e-10.
    """
    def check_rows(listed, expectations):
        # expectations: (row index, expected value) pairs
        assert all(abs(listed[row][0] - want) < 1e-10
                   for row, want in expectations), "incorrect values"

    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))

    first_column = h2o.as_list(iris[0])
    check_rows(first_column, [(3, 4.6), (5, 5.4), (9, 4.9)])

    shifted = 2 - iris

    shifted_c0 = h2o.as_list(H2OVec(name="C0", expr=shifted[0]))
    check_rows(shifted_c0, [(3, -2.6), (17, -3.1), (24, -2.8)])

    shifted_c1 = h2o.as_list(H2OVec(name="C1", expr=shifted[1]))
    check_rows(shifted_c1, [(3, -1.1), (5, -1.9), (9, -1.1)])


if __name__ == "__main__":
    h2o.run_test(sys.argv, vec_as_list)
import sys
sys.path.insert(1, "../../")
import h2o


def rep_len_check(ip, port):
    """Check ``rep_len`` on a single iris column.

    The first column is extended to 301 rows; every row ``r`` of the result
    must equal row ``r % 150`` of the same result.
    """
    iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris.csv"))

    # data is single column (vec)
    target_length = 301
    repeated = iris[0].rep_len(length_out=target_length)
    assert repeated.nrow() == target_length, \
        "Expected an H2OVec with 301 rows, but got {0} rows".format(repeated.nrow())

    cycle = 150
    for row in range(len(repeated)):
        source_row = row % cycle
        assert repeated[row] == repeated[source_row], \
            "Expected {0}, but got {1}".format(repeated[source_row], repeated[row])

    # data is frame
    #TODO: there's a NPE bug here
    #fr = h2o.rep_len(iris, length_out=7)
    #assert fr.nrow() == 150 and fr.ncol() == 7, "Expected an H2OFrame with 150 rows and 7 columns, but got {0} rows and {1} " \
    #                                            "cols".format(fr.nrow(), fr.ncol())


if __name__ == "__main__":
    h2o.run_test(sys.argv, rep_len_check)
# res = iris ** iris[0:3] # res.show() # assert False, "expected error. frames are different dimensions." #except EnvironmentError: # pass # LHS: H2OFrame, RHS: H2OVec #try: # res = iris ** iris[0] # res.show() # assert False, "expected error. objects of different dimensions not supported." #except EnvironmentError: # pass # LHS: H2OFrame, RHS: scaler res = 1.2 ** iris[2] res2 = iris ** res[63,:] res2.show() # LHS: H2OFrame, RHS: scaler res = iris ** 2 res_rows, res_cols = res.dim() assert res_rows == rows and res_cols == cols, "dimension mismatch" for x, y in zip([res[c].sum() for c in range(cols-1)], [1800.33, 709.32, 382.69, 30.74]): assert abs(x - y) < 1e-2, "expected same values" ################################################################### if __name__ == "__main__": h2o.run_test(sys.argv, binop_pow)
import sys
sys.path.insert(1, "../../")
import h2o


def screeplot_test(ip, port):
    """Fit a PCA on AustraliaCoast and draw both screeplot variants.

    The plots are requested with ``server=True``, first as ``"barplot"`` and
    then as ``"lines"``.
    """
    # Connect to h2o
    h2o.init(ip, port)

    plot_options = {'server': True}

    australia = h2o.upload_file(
        h2o.locate("smalldata/pca_test/AustraliaCoast.csv"))
    australia_pca = h2o.prcomp(x=australia[0:8], k=4, transform="STANDARDIZE")

    for plot_type in ("barplot", "lines"):
        australia_pca.screeplot(type=plot_type, **plot_options)


if __name__ == "__main__":
    h2o.run_test(sys.argv, screeplot_test)
print "H2O Singular Values: {0}".format(h2o_d) for r, h in zip(r_d, h2o_d): assert abs(r - h) < 1e-6, "H2O got {0}, but R got {1}".format(h, r) print "Compare right singular vectors (V)" h2o_v = fitH2O._model_json['output']['v'] r_v = [[-0.04239181, 0.01616262, -0.06588426, 0.99679535], [-0.94395706, 0.32068580, 0.06655170, -0.04094568], [-0.30842767, -0.93845891, 0.15496743, 0.01234261], [-0.10963744, -0.12725666, -0.98347101, -0.06760284]] print "R Right Singular Vectors: {0}".format(r_v) print "H2O Right Singular Vectors: {0}".format(h2o_v) for rl, hl in zip(r_v, h2o_v): for r, h in zip(rl, hl): assert abs(abs(r) - abs(h)) < 1e-5, "H2O got {0}, but R got {1}".format(h, r) print "Compare left singular vectors (U)" h2o_u = h2o.as_list(h2o.get_frame(fitH2O._model_json['output']['u_key']['name']), use_pandas=False) h2o_u.pop(0) r_u = [[-0.1716251, 0.096325710, 0.06515480, 0.15369551], [-0.1891166, 0.173452566, -0.42665785, -0.17801438], [-0.2155930, 0.078998111, 0.02063740, -0.28070784], [-0.1390244, 0.059889811, 0.01392269, 0.01610418], [-0.2067788, -0.009812026, -0.17633244, -0.21867425], [-0.1558794, -0.064555293, -0.28288280, -0.11797419]] print "R Left Singular Vectors: {0}".format(r_u) print "H2O Left Singular Vectors: {0}".format(h2o_u) for rl, hl in zip(r_u, h2o_u): for r, h in zip(rl, hl): assert abs(abs(r) - abs(float(h))) < 1e-5, "H2O got {0}, but R got {1}".format(h, r) if __name__ == "__main__": h2o.run_test(sys.argv, svd_1_golden)
assert c4_imputed == 5, "Wrong value imputed. Expected imputed value of 5, but got {0}".format( c4_imputed) # mode-categorical h2o_data = h2o.H2OFrame(python_obj=data) h2o_data.impute(column="C5", method="mode") c5_imputed = h2o_data[4, 4] assert c5_imputed == 'b', "Wrong value imputed. Expected imputed value of b, but got {0}".format( c5_imputed) # mode-numeric h2o_data = h2o.H2OFrame(python_obj=data) h2o_data.impute(column="C6", method="mode") c6_imputed = h2o_data[5, 5] assert c6_imputed == 1, "Wrong value imputed. Expected imputed value of 1, but got {0}".format( c6_imputed) # mean-group by C7 h2o_data = h2o.H2OFrame(python_obj=data) h2o_data.impute(column="C3", method="mean", by="C7") imputed1 = h2o_data[2, 2] imputed2 = h2o_data[3, 2] assert imputed1 == 3.5, "Wrong value imputed. Expected imputed value of 3.5, but got {0}".format( imputed1) assert imputed2 == 9.5, "Wrong value imputed. Expected imputed value of 9.5, but got {0}".format( imputed2) if __name__ == "__main__": h2o.run_test(sys.argv, impute2)
if method == 3: s = [] for p in range(kwargs['k']): s.append([random.uniform(train[c].mean()-100,train[c].mean()+100) for c in x]) start = h2o.H2OFrame(python_obj=s) kwargs['user_points'] = start else: kwargs['init'] = ["Furthest","Random", "PlusPlus"][method] if random.randint(0,1): kwargs['seed'] = random.randint(1,10000) # display the parameters and their corresponding values print "-----------------------" print "x: {0}".format(x) for k, v in zip(kwargs.keys(), kwargs.values()): if k == 'user_points': print k + ": " start.show() else: print k + ": {0}".format(v) h2o.kmeans(x=train[x], **kwargs) print "-----------------------" print "Import and data munging..." ozone = h2o.import_frame(path=h2o.locate("smalldata/glm_test/ozone.csv")) for i in range(50): attack(ozone, random.sample([0,1,2,3],random.randint(1,4))) if __name__ == "__main__": h2o.run_test(sys.argv, random_attack)
import sys
sys.path.insert(1, "../../")
import h2o
import random


def create_frame_test(ip, port):
    """Create a frame of random size and check its reported dimensions.

    Row and column counts are each drawn uniformly from 1..1000.
    """
    # Connect to h2o
    h2o.init(ip, port)

    # REALLY basic test TODO: add more checks
    n_rows = random.randint(1, 1000)
    n_cols = random.randint(1, 1000)
    frame = h2o.create_frame(rows=n_rows, cols=n_cols)

    dims_match = frame.nrow() == n_rows and frame.ncol() == n_cols
    assert dims_match, \
        "Expected {0} rows and {1} cols, but got {2} rows and {3} cols.".format(
            n_rows, n_cols, frame.nrow(), frame.ncol())


if __name__ == "__main__":
    h2o.run_test(sys.argv, create_frame_test)
import sys
sys.path.insert(1, "../../")
import h2o


def frame_as_list(ip, port):
    """Check ``h2o.as_list`` on three whole frames.

    For each dataset one row of the converted list is inspected and its first
    three elements are compared with hard-coded values to within 1e-10.
    """
    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    prostate = h2o.import_frame(path=h2o.locate("smalldata/prostate/prostate.csv.zip"))
    airlines = h2o.import_frame(path=h2o.locate("smalldata/airlines/allyears2k.zip"))

    # (frame, row to inspect, expected leading values of that row)
    checks = (
        (iris, 8, (4.4, 2.9, 1.4)),
        (prostate, 6, (7, 0, 68)),
        (airlines, 3, (1987, 10, 18)),
    )
    for frame, row, expected in checks:
        listed = h2o.as_list(frame)
        assert all(abs(listed[row][col] - want) < 1e-10
                   for col, want in enumerate(expected)), "incorrect values"


if __name__ == "__main__":
    h2o.run_test(sys.argv, frame_as_list)
#----------------------------------------------------------------------
# Try to slice by using != factor_level
#----------------------------------------------------------------------

import sys
sys.path.insert(1, "../../")
import h2o


def not_equal_factor(ip, port):
    """Split the airlines frame on ``Origin == "SFO"`` and check dimensions.

    The ``!=`` and ``==`` selections must together contain every row of the
    original frame, and both must keep the original column count.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    air = h2o.import_frame(
        path=h2o.locate("smalldata/airlines/allyears2k_headers.zip"))

    # Dataset size.
    rows, cols = air.dim()

    #
    # Example 1: Select all flights not departing from SFO
    #
    other_origins = air[air["Origin"] != "SFO"]
    from_sfo = air[air["Origin"] == "SFO"]

    other_rows, other_cols = other_origins.dim()
    sfo_rows, sfo_cols = from_sfo.dim()

    rows_add_up = (other_rows + sfo_rows) == rows
    cols_unchanged = other_cols == sfo_cols == cols
    assert rows_add_up and cols_unchanged, "dimension mismatch"


if __name__ == "__main__":
    h2o.run_test(sys.argv, not_equal_factor)
assert set(['a', 'b', 'c']) == set(levels), \ "Expected levels to be {0}, but got {1}".format(set(['a', 'b', 'c']),levels) assert nlevels == 3, "Expected nlevels to be 3, but got {0}".format(nlevels) assert iris[0,4] == 'a' levels = iris[4].levels() nlevels = iris[4].nlevels() assert set(['a', 'b', 'c']) == set(levels), \ "Expected levels to be {0}, but got {1}".format(set(['a', 'b', 'c']),levels) assert nlevels == 3, "Expected nlevels to be 3, but got {0}".format(nlevels) iris[4] = iris[4].setLevel(level='b') levels = iris.levels(col=4) nlevels = iris.nlevels(col=4) assert set(['a', 'b', 'c']) == set(levels), \ "Expected levels to be {0}, but got {1}".format(set(['a', 'b', 'c']),levels) assert nlevels == 3, "Expected nlevels to be 3, but got {0}".format(nlevels) assert iris[0,4] == 'b' levels = iris[1].levels() nlevels = iris[1].nlevels() assert levels == None, "Expected levels to be None, but got {0}".format(levels) assert nlevels == 0, "Expected nlevels to be 0, but got {0}".format(nlevels) one_column_frame = iris[4] one_column_frame = one_column_frame.setLevel(level='c') assert one_column_frame[0,0] == 'c' if __name__ == "__main__": h2o.run_test(sys.argv, levels_nlevels_setlevel_setLevels_test)
imbalanced_perf.show() balanced = h2o.random_forest(x=covtype[0:54], y=covtype[54], ntrees=10, balance_classes=True, nfolds=3) balanced_perf = balanced.model_performance(covtype) balanced_perf.show() ##compare error for class 6 (difficult minority) class_6_err_imbalanced = imbalanced_perf.confusion_matrix( ).cell_values[5][7] class_6_err_balanced = balanced_perf.confusion_matrix().cell_values[5][7] print("--------------------") print("") print("class_6_err_imbalanced") print(class_6_err_imbalanced) print("") print("class_6_err_balanced") print(class_6_err_balanced) print("") print("--------------------") assert class_6_err_imbalanced >= 0.9 * class_6_err_balanced, "balance_classes makes it at least 10% worse!" if __name__ == "__main__": h2o.run_test(sys.argv, imbalanced)
df_hex.summary() assert (not df_hex['h1'].isfactor()) assert (df_hex['h2'].isfactor()) assert (not df_hex['h3'].isfactor()) df_hex['h1'] = df_hex['h1'].asfactor() df_hex['h2'] = df_hex['h2'].asfactor() df_hex['h3'] = df_hex['h3'].asfactor() df_hex.show() df_hex.summary() assert (df_hex['h1'].isfactor()) assert (df_hex['h2'].isfactor()) assert (df_hex['h3'].isfactor()) df_hex['h1'] = df_hex['h1'].asnumeric() df_hex['h2'] = df_hex['h2'].asnumeric() df_hex['h3'] = df_hex['h3'].asnumeric() df_hex.show() df_hex.summary() assert (not df_hex['h1'].isfactor()) assert (not df_hex['h2'].isfactor()) assert (not df_hex['h3'].isfactor()) if __name__ == "__main__": h2o.run_test(sys.argv, continuous_or_categorical)
dl = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=random.sample([-1,1], 1)[0]) assert False, "Expected model-build to fail when nfolds is 1 or < 0" except EnvironmentError: assert True # 2. more folds than observations try: dl = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow()+1, fold_assignment="Modulo") assert False, "Expected model-build to fail when nfolds > nobs" except EnvironmentError: assert True # 3. fold_column and nfolds both specified try: rf = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=3, fold_column="fold_assignments", training_frame=cars) assert False, "Expected model-build to fail when fold_column and nfolds both specified" except EnvironmentError: assert True # # 4. fold_column and fold_assignment both specified # try: # rf = h2o.deeplearning(y=cars[response_col], x=cars[predictors], fold_assignment="Random", # fold_column="fold_assignments", training_frame=cars) # assert False, "Expected model-build to fail when fold_column and fold_assignment both specified" # except EnvironmentError: # assert True if __name__ == "__main__": h2o.run_test(sys.argv, cv_carsDL)
covtype = h2o.import_file(path=h2o.locate("smalldata/covtype/covtype.20k.data")) covtype[54] = covtype[54].asfactor() hh_imbalanced = h2o.gbm(x=covtype[0:54], y=covtype[54], ntrees=10, balance_classes=False, nfolds=3, distribution="multinomial") hh_imbalanced_perf = hh_imbalanced.model_performance(covtype) hh_imbalanced_perf.show() hh_balanced = h2o.gbm(x=covtype[0:54], y=covtype[54], ntrees=10, balance_classes=True, seed=123, nfolds=3, distribution="multinomial") hh_balanced_perf = hh_balanced.model_performance(covtype) hh_balanced_perf.show() #compare error for class 6 (difficult minority) class_6_err_imbalanced = hh_imbalanced_perf.confusion_matrix().cell_values[5][7] class_6_err_balanced = hh_balanced_perf.confusion_matrix().cell_values[5][7] print("--------------------") print("") print("class_6_err_imbalanced") print(class_6_err_imbalanced) print("") print("class_6_err_balanced") print(class_6_err_balanced) print("") print("--------------------") assert class_6_err_imbalanced >= 0.90*class_6_err_balanced, "balance_classes makes it at least 10% worse!" if __name__ == "__main__": h2o.run_test(sys.argv, imbalancedGBM)
family="binomial", link="logit") assert h2o_model_specified._model_json['output']['coefficients_table'].cell_values == \ h2o_model_unspecified._model_json['output']['coefficients_table'].cell_values, "coefficient should be equal" print("POISSON: ") h2o_model_unspecified = h2o.glm(x=h2o_data[2:9], y=h2o_data[1], family="poisson") h2o_model_specified = h2o.glm(x=h2o_data[2:9], y=h2o_data[1], family="poisson", link="log") assert h2o_model_specified._model_json['output']['coefficients_table'].cell_values == \ h2o_model_unspecified._model_json['output']['coefficients_table'].cell_values, "coefficient should be equal" print("GAMMA: ") h2o_model_unspecified = h2o.glm(x=h2o_data[3:9], y=h2o_data[2], family="gamma") h2o_model_specified = h2o.glm(x=h2o_data[3:9], y=h2o_data[2], family="gamma", link="inverse") assert h2o_model_specified._model_json['output']['coefficients_table'].cell_values == \ h2o_model_unspecified._model_json['output']['coefficients_table'].cell_values, "coefficient should be equal" if __name__ == "__main__": h2o.run_test(sys.argv, link_correct_default)
import numpy as np


def _arcene_frame(labels_path, features_path):
    """Load one Arcene split into an H2OFrame with the response in column 0.

    Labels equal to -1 are recoded to 0 and every other label to 1; the
    recoded labels are then stacked in front of the feature columns.
    """
    labels = np.genfromtxt(h2o.locate(labels_path), delimiter=" ")
    labels = np.where(labels == -1, 0, 1)
    features = np.genfromtxt(h2o.locate(features_path), delimiter=" ")
    stacked = np.column_stack((labels, features))
    return h2o.H2OFrame(stacked.tolist())


def wide_dataset_large(ip, port):
    """Fit a binomial GLM on the wide Arcene data and check validation AUC.

    The model is trained on columns 1..3249 of the training frame with
    column 0 as the (factor) response, then scored on the validation
    split; the test fails unless the validation AUC exceeds 0.5.
    """
    print("Reading in Arcene training data for binomial modeling.")
    trainData = _arcene_frame("smalldata/arcene/arcene_train_labels.labels",
                              "smalldata/arcene/arcene_train.data")

    print("Run model on 3250 columns of Arcene with strong rules off.")
    response = trainData[0].asfactor()
    model = h2o.glm(x=trainData[1:3250],
                    y=response,
                    family="binomial",
                    lambda_search=False,
                    alpha=[1])

    print("Test model on validation set.")
    validData = _arcene_frame("smalldata/arcene/arcene_valid_labels.labels",
                              "smalldata/arcene/arcene_valid.data")
    # The predictions are not inspected; the call only exercises predict().
    model.predict(validData)

    print("Check performance of predictions.")
    performance = model.model_performance(validData)

    print("Check that prediction AUC better than guessing (0.5).")
    auc = performance.auc()
    assert auc > 0.5, "predictions should be better then pure chance"


if __name__ == "__main__":
    h2o.run_test(sys.argv, wide_dataset_large)
assert False, "Expected model-build to fail when nfolds is 1 or < 0" except EnvironmentError: assert True # 2. more folds than observations try: glm = h2o.glm(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow()+1, family=family, fold_assignment="Modulo") assert False, "Expected model-build to fail when nfolds > nobs" except EnvironmentError: assert True # 3. fold_column and nfolds both specified try: glm = h2o.glm(y=cars[response_col], x=cars[predictors], nfolds=3, fold_column="fold_assignments", family=family, training_frame=cars) assert False, "Expected model-build to fail when fold_column and nfolds both specified" except EnvironmentError: assert True # # 4. fold_column and fold_assignment both specified # try: # glm = h2o.glm(y=cars[response_col], x=cars[predictors], fold_assignment="Random", fold_column="fold_assignments", # family=family, training_frame=cars) # assert False, "Expected model-build to fail when fold_column and fold_assignment both specified" # except EnvironmentError: # assert True if __name__ == "__main__": h2o.run_test(sys.argv, cv_carsGLM)
def link_functions_binomial(ip,port):
    """Compare an H2O binomial/logit GLM against the statsmodels equivalent.

    Both models are fit on the prostate data and the ratio
    residual_deviance / null_deviance is compared. The check is one-sided:
    the test fails only if H2O's ratio exceeds the statsmodels ratio by
    0.01 or more.

    :param ip: address of the H2O instance, passed to ``h2o.init``.
    :param port: port of the H2O instance, passed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip,port)

    print("Read in prostate data.")
    data_path = h2o.locate("smalldata/prostate/prostate_complete.csv.zip")
    h2o_data = h2o.import_frame(path=data_path)
    h2o_data.head()

    # Close both the archive and the member stream once the CSV is parsed;
    # the original left them open. ``.values`` replaces ``as_matrix()``,
    # which was removed from pandas in 1.0.
    with zipfile.ZipFile(data_path) as archive:
        with archive.open("prostate_complete.csv") as csv_stream:
            sm_data = pd.read_csv(csv_stream).values
    sm_data_response = sm_data[:,2]
    sm_data_features = sm_data[:,[1,3,4,5,6,7,8,9]]

    print("Testing for family: BINOMIAL")
    print("Set variables for h2o.")
    myY = "CAPSULE"
    myX = ["ID","AGE","RACE","GLEASON","DCAPS","PSA","VOL","DPROS"]

    print("Create models with canonical link: LOGIT")
    h2o_model = h2o.glm(x=h2o_data[myX], y=h2o_data[myY].asfactor(), family="binomial", link="logit",alpha=[0.5],
                        Lambda=[0])
    sm_model = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                      family=sm.families.Binomial(sm.families.links.logit)).fit()

    print("Compare model deviances for link function logit")
    h2o_output = h2o_model._model_json['output']
    h2o_deviance = h2o_output['residual_deviance'] / h2o_output['null_deviance']
    sm_deviance = sm_model.deviance / sm_model.null_deviance
    # Deliberately one-sided: H2O may be better than statsmodels, just not
    # noticeably worse.
    assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measures"


if __name__ == "__main__":
    h2o.run_test(sys.argv, link_functions_binomial)
dataset2_python_weighted = copy.deepcopy(dataset2_python) [r.append(0.8) for r in dataset2_python_weighted] ##### combine dataset1 and dataset2 combined_dataset_python = [] [combined_dataset_python.append(r) for r in dataset1_python_weighted] [combined_dataset_python.append(r) for r in dataset2_python_weighted] combined_dataset_h2o = h2o.H2OFrame(python_obj=combined_dataset_python) combined_dataset_h2o.setNames(["response", "p1", "p2", "p3", "weights"]) ##### recompute the variable importances. the relative order should be the same as above. model_combined_dataset = h2o.deeplearning( x=combined_dataset_h2o[["p1", "p2", "p3"]], y=combined_dataset_h2o["response"], training_frame=combined_dataset_h2o, variable_importances=True, weights_column="weights", hidden=[1], reproducible=True, seed=1234, activation="Tanh") varimp_combined = tuple( [p[0] for p in model_combined_dataset.varimp(return_list=True)]) assert varimp_combined == ('p3', 'p1', 'p2'), "Expected the following relative variable importance on the combined " \ "dataset: ('p3', 'p1', 'p2'), but got: {0}".format(varimp_combined) if __name__ == "__main__": h2o.run_test(sys.argv, weights_vi)
assert False, "expected error. objects of different dimensions not supported." except EnvironmentError: pass #vec/vec res = iris[0] * iris[1] res.show() assert abs(sum([res[i].eager() for i in range(rows)]) - 2670.98) < 1e-2, "expected different column sum" res = iris[0] * iris[1] * iris[2] * iris[3] res.show() assert abs(sum([res[i].eager() for i in range(rows)]) - 16560.42) < 1e-2, "expected different sum" # frame/frame res = iris * iris res_rows, res_cols = res.dim() assert res_rows == rows and res_cols == cols, "dimension mismatch" res = iris[0:2] * iris[1:3] res_rows, res_cols = res.dim() assert res_rows == rows and res_cols == 2, "dimension mismatch" try: res = iris * iris[0:3] assert False, "expected error. frames are different dimensions." except EnvironmentError: pass if __name__ == "__main__": h2o.run_test(sys.argv, binop_star)
import sys
sys.path.insert(1, "../../../")
import h2o


def cupMediumGBM(ip, port):
    """Train a 5-tree bernoulli GBM on the cup98 learning set.

    TARGET_B is converted to a factor and used as the response; the
    unnamed column, TARGET_D, TARGET_B and CONTROLN are removed from the
    predictor list. The validation file is imported but not used further.
    """
    # Connect to h2o
    h2o.init(ip, port)

    learn_path = h2o.locate("bigdata/laptop/usecases/cup98LRN_z.csv")
    train = h2o.import_frame(path=learn_path)
    valid_path = h2o.locate("bigdata/laptop/usecases/cup98VAL_z.csv")
    test = h2o.import_frame(path=valid_path)

    train["TARGET_B"] = train["TARGET_B"].asfactor()

    # Train H2O GBM Model:
    train_cols = train.names()
    excluded = ('', "TARGET_D", "TARGET_B", "CONTROLN")
    for column_name in excluded:
        # list.remove raises ValueError when a name is absent, as before.
        train_cols.remove(column_name)

    model = h2o.gbm(x=train[train_cols],
                    y=train["TARGET_B"],
                    distribution="bernoulli",
                    ntrees=5)


if __name__ == "__main__":
    h2o.run_test(sys.argv, cupMediumGBM)
import sys
sys.path.insert(1, "../../")
import h2o


def https_import(ip,port):
    """Import a zipped CSV over HTTPS and display the resulting frame."""
    # Connect to h2o
    h2o.init(ip,port)

    bucket = "https://s3.amazonaws.com/h2o-public-test-data"
    url = bucket + "/smalldata/prostate/prostate.csv.zip"
    frame = h2o.import_frame(path=url)
    frame.show()


if __name__ == "__main__":
    h2o.run_test(sys.argv, https_import)
s = [[4.9, 3.0, 1.4, 0.2], [5.6, 2.5, 3.9, 1.1], [6.5, 3.0, 5.2, 2.0]] start = h2o.H2OFrame(s) start_key = start.send_frame() h2o_km = h2o.kmeans(x=iris_h2o[0:4], k=3, user_points=start_key, standardize=False) sci_km = KMeans(n_clusters=3, init=np.asarray(s), n_init=1) sci_km.fit(iris_sci) # Log.info("Cluster centers from H2O:") print "Cluster centers from H2O:" h2o_centers = h2o_km.centers() print h2o_centers # Log.info("Cluster centers from scikit:") print "Cluster centers from scikit:" sci_centers = sci_km.cluster_centers_.tolist() print sci_centers for hcenter, scenter in zip(h2o_centers, sci_centers): for hpoint, spoint in zip(hcenter, scenter): assert (hpoint - spoint) < 1e-10, "expected centers to be the same" if __name__ == "__main__": h2o.run_test(sys.argv, iris_h2o_vs_sciKmeans)
sys.path.insert(1, "../../")
import h2o


def hit_ratio_test(ip,port):
    """Show the hit-ratio tables of a multinomial GBM on the airlines data.

    The training, validation and test hit-ratio tables are displayed in
    that order; the test table comes from ``model_performance`` on a
    separately imported copy of the test file.
    """
    predictors = ("Origin", "Dest", "Distance", "UniqueCarrier", "IsDepDelayed", "fDayofMonth", "fMonth")
    response = "fDayOfWeek"

    train_path = h2o.locate("smalldata/airlines/AirlinesTrain.csv.zip")
    air_train = h2o.import_file(path=train_path)
    # Validation and test frames are two separate imports of the same file.
    air_valid = h2o.import_file(path=h2o.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    air_test = h2o.import_file(path=h2o.locate("smalldata/airlines/AirlinesTest.csv.zip"))

    gbm_mult = h2o.gbm(x=air_train[list(predictors)],
                       y=air_train[response].asfactor(),
                       validation_x=air_valid[list(predictors)],
                       validation_y=air_valid[response].asfactor(),
                       distribution="multinomial")

    gbm_mult.hit_ratio_table(train=True).show()
    gbm_mult.hit_ratio_table(valid=True).show()

    perf = gbm_mult.model_performance(air_test)
    perf.hit_ratio_table().show()


if __name__ == "__main__":
    h2o.run_test(sys.argv, hit_ratio_test)
print print "======================================================================" print "============================== Binomial ==============================" print "======================================================================" for i in range(10): attack(pros_train, pros_valid, random.sample([2, 3, 4, 5, 6, 7, 8], random.randint(1, 7)), 1) print print "======================================================================" print "============================== Gaussian ==============================" print "======================================================================" for i in range(10): attack(cars_train, cars_valid, random.sample([2, 3, 4, 5, 6, 7], random.randint(1, 6)), 1) print print "======================================================================" print "============================= Multinomial ============================" print "======================================================================" cars_train[2] = cars_train[2].asfactor() cars_valid[2] = cars_valid[2].asfactor() for i in range(10): attack(cars_train, cars_valid, random.sample([1, 3, 4, 5, 6, 7], random.randint(1, 6)), 2) if __name__ == "__main__": h2o.run_test(sys.argv, random_attack)
# Log.info("Importing covtype.20k.data...\n") covtype = h2o.import_file(path=h2o.locate("smalldata/covtype/covtype.20k.data")) # myY = 54 myX = [x for x in range(0,54) if x not in [20,28]] # Set response to be indicator of a particular class res_class = random.randint(1,4) # Log.info(paste("Setting response column", myY, "to be indicator of class", res_class, "\n")) covtype[54] = (covtype[54] == res_class) #covtype.summary() # L2: alpha = 0, lambda = 0 covtype_mod1 = h2o.glm(y=covtype[myY], x=covtype[myX], family="binomial", alpha=[0], Lambda=[0]) covtype_mod1.show() # Elastic: alpha = 0.5, lambda = 1e-4 covtype_mod2 = h2o.glm(y=covtype[myY], x=covtype[myX], family="binomial", alpha=[0.5], Lambda=[1e-4]) covtype_mod2.show() # L1: alpha = 1, lambda = 1e-4 covtype_mod3 = h2o.glm(y=covtype[myY], x=covtype[myX], family="binomial", alpha=[1], Lambda=[1e-4]) covtype_mod3.show() if __name__ == "__main__": h2o.run_test(sys.argv, covtype)
sys.path.insert(1, "../../../")
import h2o


def pca_prostate(ip, port):
    """Run a 3-component PCA on the prostate data and print the projection.

    CAPSULE, RACE, DPROS and DCAPS are converted to factors first. The PCA
    is fit on the column slice ``prostate[2:9]`` with ``transform="NONE"``
    and ``pca_method="Power"``, and the head of the projected frame is
    printed.

    :param ip: address of the H2O instance, passed to ``h2o.init``.
    :param port: port of the H2O instance, passed to ``h2o.init``.
    """
    h2o.init(ip, port)

    print("Importing prostate.csv data...\n")
    prostate = h2o.upload_file(h2o.locate("smalldata/logreg/prostate.csv"))

    print("Converting CAPSULE, RACE, DPROS and DCAPS columns to factors")
    for column_name in ("CAPSULE", "RACE", "DPROS", "DCAPS"):
        prostate[column_name] = prostate[column_name].asfactor()
    prostate.describe()

    # The message used to claim retx = FALSE and transform = 'STANDARDIZE',
    # neither of which matches the call below; it now reports what is run.
    print("PCA on columns 3 to 9 with k = 3, transform = 'NONE', pca_method = 'Power'")
    fitPCA = h2o.prcomp(x=prostate[2:9], k=3, transform="NONE", pca_method="Power")
    pred = fitPCA.predict(prostate)

    print("Projection matrix:\n")
    print(pred.head())


if __name__ == "__main__":
    h2o.run_test(sys.argv, pca_prostate)
h2o_data_zero_weights = h2o.cbind(h2o_data, h2o_zero_weights) h2o_data_zeros_removed = h2o_data[h2o_zero_weights["weights"] == 1] print "Checking that using some zero weights is equivalent to removing those observations:" print check_same(h2o_data_zeros_removed, h2o_data_zero_weights) # doubled weights same as doubled observations doubled_weights = [[1] if random.randint(0,1) else [2] for r in range(100)] h2o_doubled_weights = h2o.H2OFrame(python_obj=doubled_weights) h2o_doubled_weights.setNames(["weights"]) h2o_data_doubled_weights = h2o.cbind(h2o_data, h2o_doubled_weights) doubled_data = copy.deepcopy(data) for d, w in zip(data,doubled_weights): if w[0] == 2: doubled_data.append(d) h2o_data_doubled = h2o.H2OFrame(python_obj=doubled_data) print "Checking that doubling some weights is equivalent to doubling those observations:" print check_same(h2o_data_doubled, h2o_data_doubled_weights) # TODO: random weights # TODO: all zero weights??? # TODO: negative weights??? if __name__ == "__main__": h2o.run_test(sys.argv, weights_check)
import sys
sys.path.insert(1, "../../")
import h2o


def vec_show(ip, port):
    """Exercise ``show()`` on a frame and on single columns taken from it."""
    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    print("iris:")
    iris.show()

    ###################################################################

    res = 2 - iris
    # Show the first two columns of the arithmetic result, each under its label.
    for label, column_index in (("res2:", 0), ("res3:", 1)):
        column = res[column_index]
        print(label)
        column.show()

    # Finally show a column taken straight from the imported frame.
    iris[2].show()


if __name__ == "__main__":
    h2o.run_test(sys.argv, vec_show)
import sys
sys.path.insert(1, "../../")
import h2o


def show_jira(ip, port):
    """Build a small mixed numeric/string frame, name its columns, show it."""
    h2o.init(ip, port)

    rows = [
        [1, 'a'],
        [0, 'b'],
    ]
    frame = h2o.H2OFrame(python_obj=rows)
    frame.setNames(['response', 'predictor'])
    frame.show()


if __name__ == "__main__":
    h2o.run_test(sys.argv, show_jira)
import sys
sys.path.insert(1, "../../../")
import h2o


def demo_gbm(ip,port):
    """Run the packaged GBM demo non-interactively in test mode."""
    # Connect to a pre-existing cluster
    h2o.init(ip,port)

    # Execute gbm demo
    demo_options = {"func": "gbm", "interactive": False, "test": True}
    h2o.demo(**demo_options)


if __name__ == "__main__":
    h2o.run_test(sys.argv, demo_gbm)
[8,19,61,20.1], [16,256,69,9.7], [11,290,66,9.2], [14,274,68,10.9]] for i in random.sample(range(0,ncent-1), nempty): initial_centers[i] = [100*i for z in range(1,len(initial_centers[0])+1)] initial_centers_h2o = h2o.H2OFrame(initial_centers) initial_centers_h2o_key = initial_centers_h2o.send_frame() initial_centers_sci = np.asarray(initial_centers) #Log.info("Initial cluster centers:") print "H2O initial centers:" initial_centers_h2o.show() print "scikit initial centers:" print initial_centers_sci # H2O can handle empty clusters and so can scikit #Log.info("Check that H2O can handle badly initialized centers") km_sci = KMeans(n_clusters=ncent, init=initial_centers_sci, n_init=1) km_sci.fit(preprocessing.scale(ozone_sci)) print "scikit final centers" print km_sci.cluster_centers_ km_h2o = h2o.kmeans(x=ozone_h2o, k=ncent, user_points=initial_centers_h2o_key, standardize=True) print "H2O final centers" print km_h2o.centers() if __name__ == "__main__": h2o.run_test(sys.argv, emptyclusKmeans)
import h2o
import random


def get_set_list_timezones(ip, port):
    """Check that a timezone set through h2o is the one read back.

    A random entry (skipping row 0, which is the table header) is picked
    from ``h2o.list_timezones()``, set, and compared with
    ``h2o.get_timezone()``. The original timezone is restored afterwards,
    even when the comparison fails.

    :param ip: address of the H2O instance, passed to ``h2o.init``.
    :param port: port of the H2O instance, passed to ``h2o.init``.
    :raises AssertionError: if the timezone read back differs from the one set.
    """
    # Connect to h2o
    h2o.init(ip, port)

    origTZ = h2o.get_timezone()
    print("Original timezone: {0}".format(origTZ))

    timezones = h2o.list_timezones()
    # don't use the first one..it's a header for the table
    print("timezones[0]: {0}".format(timezones[0]))
    row = random.randint(1, timezones.nrow() - 1)
    # Takes the second space-separated token of the cell, up to the first comma.
    zone = timezones[row, 0].split(" ")[1].split(",")[0]
    print("Setting the timezone: {0}".format(zone))

    try:
        h2o.set_timezone(zone)
        newTZ = h2o.get_timezone()
        # The placeholder used to be written "{01}"; "{1}" is the intended index.
        assert newTZ == zone, "Expected new timezone to be {0}, but got {1}".format(zone, newTZ)
    finally:
        # Restore unconditionally so a failure does not leave the changed
        # timezone behind.
        print("Setting the timezone back to original: {0}".format(origTZ))
        h2o.set_timezone(origTZ)


if __name__ == "__main__":
    h2o.run_test(sys.argv, get_set_list_timezones)
# # 85,0,75,1,1,1,9.9,15.4,7 # # 86,1,75,1,3,1,3.7,0,6 # pros = prostate[[1,2,3],83:86] # assert pros[0,0] == 0, "Incorrect slicing result" # assert pros[1,0] == 75, "Incorrect slicing result" # assert pros[2,0] == 1, "Incorrect slicing result" # assert pros[0,1] == 0, "Incorrect slicing result" # assert pros[1,1] == 75, "Incorrect slicing result" # assert pros[2,1] == 1, "Incorrect slicing result" # assert pros[0,2] == 1, "Incorrect slicing result" # assert pros[1,2] == 75, "Incorrect slicing result" # assert pros[2,2] == 1, "Incorrect slicing result" # # # prostate [list,list] case # # 27,0,67,1,2,1,2.8,25.6,7 # # 9,0,69,1,1,1,3.9,24,7 # # 201,0,57,1,1,1,10.2,0,6 # pros = prostate[[5,6,7],[26,8,200]] # assert pros[0,0] == 1, "Incorrect slicing result" # assert (pros[1,0]-3.9) < 1e-10, "Incorrect slicing result" # assert pros[2,0] == 24, "Incorrect slicing result" # assert pros[0,1] == 1, "Incorrect slicing result" # assert (pros[1,1]-2.8) < 1e-10, "Incorrect slicing result" # assert (pros[2,1]-25.6) < 1e-10, "Incorrect slicing result" # assert pros[0,2] == 1, "Incorrect slicing result" # assert (pros[1,2]-10.2) < 1e-10, "Incorrect slicing result" # assert pros[2,2] == 0, "Incorrect slicing result" if __name__ == "__main__": h2o.run_test(sys.argv, multi_dim_slicing)
assert check_values(h2o.sign(h2o_data2), np.sign(np_data2)), "expected equal sign values between h2o and numpy" assert check_values(h2o.sqrt(h2o_data3), np.sqrt(np_data3)), "expected equal sqrt values between h2o and numpy" assert check_values(h2o.trunc(h2o_data3), np.trunc(np_data3)), "expected equal trunc values between h2o and numpy" assert check_values(h2o.ceil(h2o_data3), np.ceil(np_data3)), "expected equal ceil values between h2o and numpy" assert check_values(h2o.floor(h2o_data3), np.floor(np_data3)), "expected equal floor values between h2o and numpy" assert check_values(h2o.log(h2o_data3), np.log(np_data3)), "expected equal log values between h2o and numpy" assert check_values(h2o.log10(h2o_data3), np.log10(np_data3)), "expected equal log10 values between h2o and numpy" assert check_values(h2o.log1p(h2o_data3), np.log1p(np_data3)), "expected equal log1p values between h2o and numpy" assert check_values(h2o.log2(h2o_data3), np.log2(np_data3)), "expected equal log2 values between h2o and numpy" assert check_values(h2o.exp(h2o_data3), np.exp(np_data3)), "expected equal exp values between h2o and numpy" assert check_values(h2o.expm1(h2o_data3), np.expm1(np_data3)), "expected equal expm1 values between h2o and numpy" h2o_val = h2o.as_list(h2o.gamma(h2o_data3))[5][5] num_val = math.gamma(h2o.as_list(h2o_data3)[5][5]) assert abs(h2o_val - num_val) < max(abs(h2o_val), abs(num_val)) * 1e-6, \ "check unsuccessful! h2o computed {0} and math computed {1}. expected equal gamma values between h2o and math".format(h2o_val,num_val) h2o_val = h2o.as_list(h2o.lgamma(h2o_data3))[5][5] num_val = math.lgamma(h2o.as_list(h2o_data3)[5][5]) assert abs(h2o_val - num_val) < max(abs(h2o_val), abs(num_val)) * 1e-6, \ "check unsuccessful! h2o computed {0} and math computed {1}. 
expected equal lgamma values between h2o and math".format(h2o_val,num_val) h2o_val = h2o.as_list(h2o.digamma(h2o_data3))[5][5] num_val = scipy.special.polygamma(0,h2o.as_list(h2o_data3)[5][5]) assert abs(h2o_val - num_val) < max(abs(h2o_val), abs(num_val)) * 1e-6, \ "check unsuccessful! h2o computed {0} and math computed {1}. expected equal digamma values between h2o and math".format(h2o_val,num_val) h2o_val = h2o.as_list(h2o.trigamma(h2o_data3))[5][5] num_val = scipy.special.polygamma(1,h2o.as_list(h2o_data3)[5][5]) assert abs(h2o_val - num_val) < max(abs(h2o_val), abs(num_val)) * 1e-6, \ "check unsuccessful! h2o computed {0} and math computed {1}. expected equal trigamma values between h2o and math".format(h2o_val,num_val) if __name__ == "__main__": h2o.run_test(sys.argv, expr_math_ops)
import sys
sys.path.insert(1, "../../")
import h2o


def col_names_check(ip,port):
    """Verify ``col_names`` for an iris file with a header and one without.

    The file with a header must report its own column names; the file
    without one must report the names C1..C5.
    """
    header_names = ["sepal_len","sepal_wid","petal_len","petal_wid","class"]
    iris_wheader = h2o.import_file(h2o.locate("smalldata/iris/iris_wheader.csv"))
    assert iris_wheader.col_names == header_names, \
        "Expected {0} for column names but got {1}".format(header_names, iris_wheader.col_names)

    default_names = ["C1","C2","C3","C4","C5"]
    iris = h2o.import_file(h2o.locate("smalldata/iris/iris.csv"))
    assert iris.col_names == default_names, \
        "Expected {0} for column names but got {1}".format(default_names, iris.col_names)


if __name__ == "__main__":
    h2o.run_test(sys.argv, col_names_check)
res = iris[0] == 4.7 res_rows = len(res) assert res_rows == rows, "dimension mismatch" new_rows = iris[res].nrow() assert new_rows == 2, "wrong number of rows returned" res = 3.5 == iris[1] res_rows = len(res) assert res_rows == rows, "dimension mismatch" new_rows = iris[res].nrow() assert new_rows == 6, "wrong number of rows returned" # frame/frame res = iris == iris res_rows, res_cols = res.dim() assert res_rows == rows and res_cols == cols, "dimension mismatch" res = iris[0:2] == iris[1:3] res_rows, res_cols = res.dim() assert res_rows == rows and res_cols == 2, "dimension mismatch" try: res = iris == iris[0:3] assert False, "expected error. frames are different dimensions." except EnvironmentError: pass if __name__ == "__main__": h2o.run_test(sys.argv, binop_eq)