def test_anovaglm_serialization():
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))
    y = 'CAPSULE'
    x = ['AGE', 'VOL', 'DCAPS']
    train[y] = train[y].asfactor()
    anovaglm_model = anovaglm(family='binomial', lambda_=0, missing_values_handling="skip")
    anovaglm_model.train(x=x, y=y, training_frame=train)
    tmpdir = tempfile.mkdtemp()
    model_path = anovaglm_model.download_model(tmpdir)
    result_frame_filename = os.path.join(tmpdir, "result_frame.csv")
    h2o.download_csv(anovaglm_model.result(), result_frame_filename)

    h2o.remove_all()
    result_frame_original = h2o.import_file(result_frame_filename)
    loaded_anovaglm_model = h2o.load_model(model_path)
    result_frame_loaded = loaded_anovaglm_model.result()
    for cind in list(range(0, result_frame_original.ncols)):
        for rind in list(range(0, result_frame_original.nrows)):
            if result_frame_original.type(cind) == 'real':
                assert abs(result_frame_original[rind, cind] - result_frame_loaded[rind, cind]) < 1e-6, \
                    "Expected: {0}. Actual: {1}".format(result_frame_original[rind, cind], result_frame_loaded[rind, cind])
            else:
                assert result_frame_original[rind, cind] == result_frame_loaded[rind, cind], \
                    "Expected: {0}. Actual: {1}".format(result_frame_original[rind, cind], result_frame_loaded[rind, cind])
def gam_gaussian_mojo():
    h2o.remove_all()
    NTESTROWS = 200  # number of test dataset rows
    PROBLEM = "gaussian"
    params = set_params()  # set GAM model parameters
    df = pyunit_utils.random_dataset(PROBLEM, missing_fraction=0.001)  # generate random dataset
    dfnames = df.names
    # add GAM specific parameters
    params["gam_columns"] = []
    params["scale"] = []
    count = 0
    num_gam_cols = 3  # maximum number of gam columns
    for cname in dfnames:
        if not (cname == 'response') and (str(df.type(cname)) == "real"):
            params["gam_columns"].append(cname)
            params["scale"].append(0.001)
            count = count + 1
            if count >= num_gam_cols:
                break
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})
    TMPDIR = tempfile.mkdtemp()
    gamGaussianModel = pyunit_utils.build_save_model_generic(params, x, train, "response", "gam", TMPDIR)  # build and save mojo model
    MOJONAME = pyunit_utils.getMojoName(gamGaussianModel._id)

    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(gamGaussianModel, TMPDIR, MOJONAME)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)  # make sure operation sequence is preserved from Tomk
    h2o.save_model(gamGaussianModel, path=TMPDIR, force=True)  # save model for debugging
def glm_fractional_binomial_mojo_pojo():
    params = set_params()
    train = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/fraction_binommialOrig.csv"))
    test = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/fraction_binommialOrig.csv"))
    x = ["log10conc"]
    y = "y"

    glmModel = pyunit_utils.build_save_model_GLM(params, x, train, y)  # build and save mojo model
    MOJONAME = pyunit_utils.getMojoName(glmModel._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))

    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glmModel, TMPDIR, MOJONAME)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
    pred_pojo = pyunit_utils.pojo_predict(glmModel, TMPDIR, MOJONAME)
    pred_h2o = pred_h2o.drop(3)
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)  # make sure operation sequence is preserved from Tomk
    h2o.save_model(glmModel, path=TMPDIR, force=True)  # save model for debugging
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
def deeplearning_mojo_pojo():
    h2o.remove_all()
    params = set_params()  # set deeplearning model parameters
    df = random_dataset(PROBLEM)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})
    try:
        deeplearningModel = build_save_model(params, x, train)  # build and save mojo model
        h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
        pred_h2o, pred_mojo = pyunit_utils.mojo_predict(deeplearningModel, TMPDIR, MOJONAME)  # load model and perform predict
        # pred_pojo = pyunit_utils.pojo_predict(deeplearningModel, TMPDIR, MOJONAME)
        h2o.save_model(deeplearningModel, path=TMPDIR, force=True)  # save model for debugging
        print("Comparing mojo predict and h2o predict...")
        pyunit_utils.compare_numeric_frames(pred_h2o, pred_mojo, 0.1, tol=1e-6)
        # print("Comparing pojo predict and h2o predict...")
        # pyunit_utils.compare_numeric_frames(pred_mojo, pred_pojo, 0.1, tol=1e-6)
    except Exception as ex:
        print("*************** ERROR and type is ")
        print(str(type(ex)))
        print(ex)
        if "AssertionError" in str(type(ex)):  # only care if there is an AssertionError, ignore the others
            sys.exit(1)
def pubdev_1431(ip, port):
    running_inside_h2o = tests.is_running_internal_to_h2o()
    if running_inside_h2o:
        hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
        airlines_billion_file_1 = "/datasets/airlinesbillion.csv"
        url = "hdfs://{0}{1}".format(hdfs_name_node, airlines_billion_file_1)
        airlines_billion_1 = h2o.import_file(url)
        airlines_billion_1[30] = airlines_billion_1[30].asfactor()
        gbm = h2o.gbm(x=airlines_billion_1[0:30], y=airlines_billion_1[30], ntrees=1, distribution="bernoulli", max_depth=1)
        predictions = gbm.predict(airlines_billion_1)
        csv = os.path.join(os.getcwd(), "delete.csv")
        h2o.download_csv(predictions, csv)
        airlines_billion_2 = h2o.import_file(csv)
        os.remove(csv)
        r1, c1 = airlines_billion_1.dim
        r2, c2 = airlines_billion_2.dim
        assert r1 == r2 and c1 == c2, "Expect rows to be equal. r1: {0} and r2: {1}. Expect cols to be equal c1: {2} " \
                                      "c2: {3}".format(r1, r2, c1, c2)
    else:
        print("Not running on H2O internal network. No access to HDFS.")
def glm_binomial_mojo_pojo():
    h2o.remove_all()
    NTESTROWS = 200  # number of test dataset rows
    PROBLEM = "binomial"
    params = set_params()  # set GLM model parameters
    df = pyunit_utils.random_dataset(PROBLEM)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})
    TMPDIR = tempfile.mkdtemp()
    glmBinomialModel = pyunit_utils.build_save_model_generic(params, x, train, "response", "glm", TMPDIR)  # build and save mojo model
    MOJONAME = pyunit_utils.getMojoName(glmBinomialModel._id)

    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glmBinomialModel, TMPDIR, MOJONAME)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
    pred_pojo = pyunit_utils.pojo_predict(glmBinomialModel, TMPDIR, MOJONAME)
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)  # make sure operation sequence is preserved from Tomk
    h2o.save_model(glmBinomialModel, path=TMPDIR, force=True)  # save model for debugging
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
def run_comparison_tests(auto_encoder, act_fun, missing_values_handling, set_all_factor, train, test, x):
    # set deeplearning model parameters
    params = set_params(act_fun, missing_values_handling, set_all_factor, auto_encoder)

    if auto_encoder:
        try:
            # build and save mojo model
            deeplearning_model = build_save_model(params, x, train)
        except Exception as err:
            if not ("Trying to predict with an unstable model" in err.args[0]):
                raise Exception('Deeplearning autoencoder model failed to build. Fix it.')
            return
    else:
        # build and save mojo model
        deeplearning_model = build_save_model(params, x, train)

    # save test file, h2o predict/mojo use same file
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))
    # load model and perform predict
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(deeplearning_model, TMPDIR, MOJONAME)
    pred_pojo = pyunit_utils.pojo_predict(deeplearning_model, TMPDIR, MOJONAME)
    # save model for debugging
    h2o.save_model(deeplearning_model, path=TMPDIR, force=True)
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_h2o, pred_mojo, prob=1, tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_mojo, pred_pojo, prob=1, tol=1e-10)
def glrm_mojo():
    h2o.remove_all()
    NTESTROWS = 200  # number of test dataset rows
    df = pyunit_utils.random_dataset("regression")  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = df.names
    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types) - 1)]

    # build a GLRM model with random dataset generated earlier
    glrmModel = H2OGeneralizedLowRankEstimator(k=3, transform=transformN, max_iterations=10)
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(glrmModel._model_json['output']['representation_name'])

    assert glrmTrainFactor.nrows == train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)
    save_GLRM_mojo(glrmModel)  # save mojo model

    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))

    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=True)  # save mojo predict
    for col in range(pred_h2o.ncols):
        if pred_h2o[col].isfactor():
            pred_h2o[col] = pred_h2o[col].asnumeric()
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)

    frameID, mojoXFactor = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=False)  # save mojo XFactor
    glrmTestFactor = h2o.get_frame("GLRMLoading_" + frameID)  # store the x Factor for new test dataset
    print("Comparing mojo x Factor and model x Factor ...")
    pyunit_utils.compare_frames_local(glrmTestFactor, mojoXFactor, 1, tol=1e-10)
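# save_GLRM_mojo() is called by the GLRM tests in this file but not defined
# here. A minimal sketch of what it is assumed to do -- download the model's
# MOJO into the shared results directory and return that directory (the
# original helper in the h2o-3 test module may differ in detail):
import os
import h2o
from tests import pyunit_utils  # assumes the usual h2o-3 test-harness import

def save_GLRM_mojo(model):
    mojo_name = pyunit_utils.getMojoName(model._id)  # sanitized model id
    tmpdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", mojo_name))
    if not os.path.exists(tmpdir):
        os.makedirs(tmpdir)
    model.download_mojo(path=tmpdir)  # writes the MOJO zip later consumed by mojo_predict
    return tmpdir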
def test_gam_transformed_frame_serialization():
    h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"))
    h2o_data["C1"] = h2o_data["C1"].asfactor()
    h2o_data["C2"] = h2o_data["C2"].asfactor()
    myX = ["C1", "C2"]
    myY = "C11"
    h2o_data["C11"] = h2o_data["C11"].asfactor()
    h2o_model = H2OGeneralizedAdditiveEstimator(family="multinomial",
                                                gam_columns=["C6", "C7", "C8"],
                                                keep_gam_cols=True,
                                                scale=[1, 1, 1],
                                                num_knots=[5, 5, 5])
    h2o_model.train(x=myX, y=myY, training_frame=h2o_data)
    gam_frame = h2o.get_frame(h2o_model._model_json["output"]["gam_transformed_center_key"])
    tmpdir = tempfile.mkdtemp()
    filename = os.path.join(tmpdir, "gamXFrame.csv")
    h2o.download_csv(gam_frame, filename)
    model_path = h2o.save_model(h2o_model, tmpdir)

    h2o.remove_all()
    loaded_model = h2o.load_model(model_path)
    gam_frame_loaded = h2o.get_frame(loaded_model._model_json["output"]["gam_transformed_center_key"])
    gam_frame_original = h2o.import_file(filename)
    pyunit_utils.compare_frames_local(gam_frame_loaded[2:15], gam_frame_original[2:15], prob=1, tol=1e-6)
    print("Test completed.")
def runComparisonTests(autoEncoder, actFun, missingValuesHandling, setAllFactor, train, test, x):
    params = set_params(actFun, missingValuesHandling, setAllFactor, autoEncoder)  # set deeplearning model parameters
    if autoEncoder:
        try:
            deeplearningModel = build_save_model(params, x, train)  # build and save mojo model
        except Exception as err:
            if not ("Trying to predict with an unstable model" in err.args[0]):
                raise Exception('Deeplearning autoencoder model failed to build. Fix it.')
            return
    else:
        deeplearningModel = build_save_model(params, x, train)  # build and save mojo model

    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(deeplearningModel, TMPDIR, MOJONAME)  # load model and perform predict
    pred_pojo = pyunit_utils.pojo_predict(deeplearningModel, TMPDIR, MOJONAME)
    h2o.save_model(deeplearningModel, path=TMPDIR, force=True)  # save model for debugging
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_h2o, pred_mojo, prob=1, tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_mojo, pred_pojo, prob=1, tol=1e-10)
def javapredict_dynamic_data():
    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(range(5000, 15001), 1)[0]
    dataset_params['cols'] = random.sample(range(10, 21), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0, left_over), 1), 1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 2000)
    print("Dataset parameters: {0}".format(dataset_params))

    append_response = False
    distribution = random.sample(['bernoulli', 'multinomial', 'gaussian', 'poisson', 'tweedie', 'gamma'], 1)[0]
    if distribution == 'gaussian':
        dataset_params['response_factors'] = 1
    elif distribution == 'bernoulli':
        dataset_params['response_factors'] = 2
    elif distribution == 'multinomial':
        dataset_params['response_factors'] = random.randint(3, 100)
    else:
        dataset_params['has_response'] = False
        response = h2o.H2OFrame.fromPython([random.randint(1, 1000) for r in range(0, dataset_params['rows'])])
        append_response = True
    print("Distribution: {0}".format(distribution))

    train = h2o.create_frame(**dataset_params)
    if append_response:
        train = response.cbind(train)
        train.set_name(0, "response")
    if distribution == 'bernoulli' or distribution == 'multinomial':
        train['response'] = train['response'].asfactor()
    train = train.impute("response", method="mode")
    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train, os.path.join(results_dir, "training_dataset.log"))

    # Generate random parameters
    params = {}
    if random.randint(0, 1):
        params['ntrees'] = random.sample(range(1, 21), 1)[0]
    if random.randint(0, 1):
        params['max_depth'] = random.sample(range(1, 11), 1)[0]
    if random.randint(0, 1):
        params['min_rows'] = random.sample(range(1, 11), 1)[0]
    if random.randint(0, 1):
        params['nbins'] = random.sample(range(2, 21), 1)[0]
    if random.randint(0, 1):
        params['nbins_cats'] = random.sample(range(2, 1025), 1)[0]
    if random.randint(0, 1):
        params['learn_rate'] = random.random()
    params['distribution'] = distribution
    print("Parameter list: {0}".format(params))

    x = train.names
    x.remove("response")
    y = "response"

    pyunit_utils.javapredict(algo="gbm", equality=None, train=train, test=None, x=x, y=y, compile_only=True, **params)
def glm_gamma_offset_mojo():
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))
    y = "DPROS"
    x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL"]
    x_offset = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "C1"]
    params = {'family': "gamma", 'offset_column': "C1"}
    offset = pyunit_utils.random_dataset_real_only(train.nrow, 1, realR=3, misFrac=0, randSeed=12345)
    train = train.cbind(offset)
    tmpdir = tempfile.mkdtemp()
    glm_gamma_model = pyunit_utils.build_save_model_generic(params, x, train, y, "glm", tmpdir)  # build and save mojo model
    MOJONAME = pyunit_utils.getMojoName(glm_gamma_model._id)

    h2o.download_csv(train[x_offset], os.path.join(tmpdir, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glm_gamma_model, tmpdir, MOJONAME)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(tmpdir, "h2oPred.csv"))
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)  # compare mojo and model predict
def test_modelselection_backward_serialization():
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    y = "GLEASON"
    x = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    # make sure duplicate runs produce same results
    model_backward = modelSelection(seed=12345, mode="backward", family='negativebinomial', link="log",
                                    alpha=0.5, lambda_=0, theta=0.01)
    model_backward.train(training_frame=d, x=x, y=y)
    model_backward2 = modelSelection(seed=12345, mode="backward", family='negativebinomial', link="log",
                                     alpha=0.5, lambda_=0, theta=0.01)
    model_backward2.train(training_frame=d, x=x, y=y)
    result = model_backward.result()  # get result frame
    result2 = model_backward2.result()  # get result frame
    pyunit_utils.compare_frames_local(result[2:5], result2[2:5], prob=1.0)  # compare results from both models; they should be the same

    num_models = result.nrows  # number of models built
    one_model = h2o.get_model(result["model_id"][num_models - 1, 0])
    predict_frame = one_model.predict(d)
    tmpdir = tempfile.mkdtemp()
    file_dir = os.path.join(tmpdir, "predict.csv")
    h2o.download_csv(predict_frame, file_dir)  # save one scoring frame
    model_path_backward = model_backward.download_model(tmpdir)  # store the model

    h2o.remove_all()
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    loaded_backward_model = h2o.load_model(model_path_backward)
    result_frame_backward = loaded_backward_model.result()

    model_from_frame_backward = h2o.get_model(result_frame_backward["model_id"][num_models - 1, 0])
    pred_frame_backward = model_from_frame_backward.predict(d)
    pred_frame_model = h2o.import_file(file_dir)
    pyunit_utils.compare_frames_local(pred_frame_backward, pred_frame_model, prob=1.0)
def gam_binomial_mojo():
    params = set_params()
    train = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    test = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    train["C21"] = train["C21"].asfactor()
    test["C21"] = test["C21"].asfactor()
    x = ["C1"]
    y = "C21"
    TMPDIR = tempfile.mkdtemp()
    gamModel = pyunit_utils.build_save_model_generic(params, x, train, y, "gam", TMPDIR)  # build and save mojo model
    MOJONAME = pyunit_utils.getMojoName(gamModel._id)

    h2o.download_csv(test, os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(gamModel, TMPDIR, MOJONAME)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)  # make sure operation sequence is preserved from Tomk
    h2o.save_model(gamModel, path=TMPDIR, force=True)  # save model for debugging
def javapredict_dynamic_data():
    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(100, 200)), 1)[0]
    dataset_params['cols'] = random.sample(list(range(10, 21)), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0, left_over), 1), 1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.01)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 50)
    print("Dataset parameters: {0}".format(dataset_params))

    train = h2o.create_frame(**dataset_params)
    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train, os.path.join(results_dir, "pca_dynamic_training_dataset.log"))

    # Generate random parameters
    params = {}
    if random.randint(0, 1):
        params['max_iterations'] = random.sample(list(range(1, 1000)), 1)[0]
    if random.randint(0, 1):
        params['transform'] = random.sample(["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"], 1)[0]
    realNcol = train.ncol - 1
    params['k'] = random.sample(list(range(1, min(realNcol, train.nrow))), 1)[0]
    print("Parameter list: {0}".format(params))

    x = train.names
    x.remove("response")
    y = "response"

    pyunit_utils.javapredict(algo="pca", equality=None, train=train, test=None, x=x, y=y, compile_only=True, **params)
def glm_multinomial_mojo_pojo():
    PROBLEM = "multinomial"
    NTESTROWS = 200
    params = set_params()  # set GLM model parameters
    df = pyunit_utils.random_dataset(PROBLEM)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})

    glmMultinomialModel = pyunit_utils.build_save_model_GLM(params, x, train, "response")  # build and save mojo model
    MOJONAME = pyunit_utils.getMojoName(glmMultinomialModel._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))

    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glmMultinomialModel, TMPDIR, MOJONAME)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
    pred_pojo = pyunit_utils.pojo_predict(glmMultinomialModel, TMPDIR, MOJONAME)
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)  # make sure operation sequence is preserved from Tomk
    h2o.save_model(glmMultinomialModel, path=TMPDIR, force=True)  # save model for debugging
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
def glrm_mojo():
    h2o.remove_all()
    train = h2o.import_file(pyunit_utils.locate("smalldata/glrm_test/pubdev_5858_glrm_mojo_train.csv"))
    test = h2o.import_file(pyunit_utils.locate("smalldata/glrm_test/pubdev_5858_glrm_mojo_test.csv"))
    predict_10iter = h2o.import_file(pyunit_utils.locate("smalldata/glrm_test/pubdev_5858_glrm_predict_10iter.csv"))
    predict_1iter = h2o.import_file(pyunit_utils.locate("smalldata/glrm_test/pubdev_5858_glrm_predict_1iter.csv"))
    x = train.names
    transformN = "STANDARDIZE"

    # build a GLRM model with random dataset generated earlier
    glrmModel = H2OGeneralizedLowRankEstimator(k=3, transform=transformN, max_iterations=10, seed=1234, init="random")
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(glrmModel._model_json['output']['representation_name'])

    assert glrmTrainFactor.nrows == train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)
    save_GLRM_mojo(glrmModel)  # save mojo model

    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file

    # test and make sure setting the iteration number did not screw up the prediction
    predID, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmIterNumber=100)  # save mojo predict
    pred_h2o = h2o.get_frame("GLRMLoading_" + predID)
    print("Comparing mojo x Factor and model x Factor for 100 iterations")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)

    predID, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmIterNumber=1)  # save mojo predict
    print("Comparing mojo x Factor and model x Factor for 1 iteration")
    pyunit_utils.compare_frames_local(predict_1iter, pred_mojo, 1, tol=1e-10)

    predID, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmIterNumber=10)  # save mojo predict
    print("Comparing mojo x Factor and model x Factor for 10 iterations")
    pyunit_utils.compare_frames_local(predict_10iter, pred_mojo, 1, tol=1e-10)
def custom_distribution_mojo_test():
    rows = 2000
    df = random_dataset('binomial', verbose=False, NTESTROWS=rows)
    df['response'] = df['response'].asnumeric()
    train = df[rows:, :]
    test = df[:rows, :]
    x = list(set(df.names) - {"response"})

    params = {'ntrees': 10,
              'max_depth': 4,
              'distribution': "custom",
              'custom_distribution_func': custom_distribution_bernoulli()}

    my_gbm = build_save_model_GBM(params, x, train, "response")
    mojo_name = getMojoName(my_gbm._id)
    tmp_dir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", mojo_name))

    h2o.download_csv(test[x], os.path.join(tmp_dir, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = mojo_predict(my_gbm, tmp_dir, mojo_name)  # load model and perform predict
    assert compare_frames_local(pred_h2o, pred_mojo, returnResult=True), \
        "Predictions from model and MOJO model are not the same."
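# custom_distribution_bernoulli() is not defined in this file. A hedged
# sketch of what it is assumed to do, following h2o-3's custom-distribution
# API: upload the predefined Bernoulli distribution class to the backend so
# GBM can train with distribution="custom". The func_name and func_file
# values below are illustrative assumptions, not the original's exact values:
import h2o
from h2o.utils.distributions import CustomDistributionBernoulli

def custom_distribution_bernoulli():
    return h2o.upload_custom_distribution(CustomDistributionBernoulli,
                                          func_name="custom_bernoulli",
                                          func_file="custom_bernoulli.py")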
def deeplearning_mojo_pojo():
    h2o.remove_all()
    params = set_params()  # set deeplearning model parameters
    df = random_dataset(PROBLEM)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})
    try:
        deeplearningModel = build_save_model(params, x, train)  # build and save mojo model
        h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
        pred_h2o, pred_mojo = pyunit_utils.mojo_predict(deeplearningModel, TMPDIR, MOJONAME)  # load model and perform predict
        pred_pojo = pyunit_utils.pojo_predict(deeplearningModel, TMPDIR, MOJONAME)
        h2o.save_model(deeplearningModel, path=TMPDIR, force=True)  # save model for debugging
        print("Comparing mojo predict and h2o predict...")
        pyunit_utils.compare_numeric_frames(pred_h2o, pred_mojo, 0.1, tol=1e-10)  # make sure operation sequence is preserved from Tomk
        print("Comparing pojo predict and h2o predict...")
        pyunit_utils.compare_numeric_frames(pred_mojo, pred_pojo, 0.1, tol=1e-10)
    except Exception as ex:
        print("*************** ERROR and type is ")
        print(str(type(ex)))
        print(ex)
        if "AssertionError" in str(type(ex)):  # only care if there is an AssertionError, ignore the others
            sys.exit(1)
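# The deeplearning and comparison tests above rely on module-level globals
# (PROBLEM, NTESTROWS, TMPDIR, MOJONAME) and helpers (random_dataset,
# set_params, build_save_model) defined elsewhere in the original test
# modules. A rough sketch of that setup, reconstructed from how the names
# are used here -- everything below is an assumption, not a verbatim copy:
import os
import h2o
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from tests import pyunit_utils

NTESTROWS = 200          # number of test dataset rows
PROBLEM = "multinomial"  # problem type passed to random_dataset()
MOJONAME = ""            # filled in by build_save_model() below
TMPDIR = ""

def build_save_model(params, x, train):
    # train a deeplearning model and download its MOJO into the results dir
    global TMPDIR, MOJONAME
    model = H2ODeepLearningEstimator(**params)
    model.train(x=x, y="response", training_frame=train)
    MOJONAME = pyunit_utils.getMojoName(model._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))
    if not os.path.exists(TMPDIR):
        os.makedirs(TMPDIR)
    model.download_mojo(path=TMPDIR)
    return model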
def javapredict_dynamic_data():
    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000, 15001)), 1)[0]
    dataset_params['cols'] = random.sample(list(range(10, 21)), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0, left_over), 1), 1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 2000)
    print("Dataset parameters: {0}".format(dataset_params))

    append_response = False
    family = random.sample(['binomial', 'gaussian', 'poisson', 'tweedie', 'gamma'], 1)[0]
    if family == 'binomial':
        dataset_params['response_factors'] = 2
    elif family == 'gaussian':
        dataset_params['response_factors'] = 1
    else:
        dataset_params['has_response'] = False
        response = h2o.H2OFrame([random.randint(1, 1000) for r in range(0, dataset_params['rows'])])
        append_response = True
    print("Family: {0}".format(family))

    train = h2o.create_frame(**dataset_params)
    if append_response:
        train = response.cbind(train)
        train.set_name(0, "response")
    if family == 'binomial':
        train['response'] = train['response'].asfactor()
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train["response"], os.path.join(results_dir, "glm_dynamic_preimputed_response.log"))
    train = train.impute("response", method="mode")
    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    h2o.download_csv(train, os.path.join(results_dir, "glm_dynamic_training_dataset.log"))

    # Generate random parameters
    params = {}
    if random.randint(0, 1):
        params['alpha'] = random.random()
    params['family'] = family
    if params['family'] == "tweedie":
        if random.randint(0, 1):
            params['tweedie_variance_power'] = round(random.random() + 1, 6)
            params['tweedie_link_power'] = 1 - params['tweedie_variance_power']
    print("Parameter list: {0}".format(params))

    x = list(range(1, train.ncol))
    y = "response"

    pyunit_utils.javapredict(algo="glm", equality=None, train=train, test=None, x=x, y=y, compile_only=True, **params)
def test_csv_parser_column_skip():
    # generate a big frame with all datatypes and save it to csv. Load it back with different skipped_columns settings
    nrow = 10000
    ncol = 100
    seed = 12345
    frac1 = 0.16
    frac2 = 0.2
    f1 = h2o.create_frame(rows=nrow, cols=ncol, real_fraction=frac1, categorical_fraction=frac1,
                          integer_fraction=frac1, binary_fraction=frac1, time_fraction=frac1,
                          string_fraction=frac2, missing_fraction=0.1, has_response=False, seed=seed)
    tmpdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results"))
    if not os.path.isdir(tmpdir):
        os.mkdir(tmpdir)
    savefilenamewithpath = os.path.join(tmpdir, 'in.csv')
    h2o.download_csv(f1, savefilenamewithpath)

    # load in whole dataset
    skip_all = list(range(f1.ncol))
    skip_even = list(range(0, f1.ncol, 2))
    skip_odd = list(range(1, f1.ncol, 2))
    skip_start_end = [0, f1.ncol - 1]
    skip_except_last = list(range(0, f1.ncol - 2))
    skip_except_first = list(range(1, f1.ncol))
    temp = list(range(0, f1.ncol))
    random.shuffle(temp)
    skip_random = []
    for index in range(0, f1.ncol // 2):
        skip_random.append(temp[index])
    skip_random.sort()

    # skipping all columns should fail; catch Exception (not a bare except)
    # so the SystemExit raised by sys.exit(1) is not swallowed
    try:
        loadFileSkipAll = h2o.upload_file(savefilenamewithpath, skipped_columns=skip_all)
        sys.exit(1)  # should have failed here
    except Exception:
        pass

    try:
        importFileSkipAll = h2o.import_file(savefilenamewithpath, skipped_columns=skip_all)
        sys.exit(1)  # should have failed here
    except Exception:
        pass

    # skip even columns
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_even)
    # skip odd columns
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_odd)
    # skip the very beginning and the very end.
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_start_end)
    # skip all except the last column
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_except_last)
    # skip all except the very first column
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_except_first)
    # randomly skipped half the columns
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_random)
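# checkCorrectSkips() lives in pyunit_utils; roughly, it re-imports the csv
# with skipped_columns applied and verifies the surviving columns still match
# the original frame. A simplified sketch of that check (an assumption about
# the helper's behavior, not the real implementation):
import h2o

def check_correct_skips_sketch(original, csv_path, skipped_columns):
    loaded = h2o.import_file(csv_path, skipped_columns=skipped_columns)
    kept = [i for i in range(original.ncol) if i not in skipped_columns]
    assert loaded.ncol == len(kept), "wrong number of columns after skipping"
    for new_ind, old_ind in enumerate(kept):
        # column order and names must be preserved for the kept columns
        assert loaded.names[new_ind] == original.names[old_ind], "column order changed"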
def javapredict_dynamic_data():
    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000, 15001)), 1)[0]
    dataset_params['cols'] = random.sample(list(range(10, 21)), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0, left_over), 1), 1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 2000)
    print("Dataset parameters: {0}".format(dataset_params))

    append_response = False
    family = random.sample(['binomial', 'gaussian', 'poisson', 'tweedie', 'gamma'], 1)[0]
    if family == 'binomial':
        dataset_params['response_factors'] = 2
    elif family == 'gaussian':
        dataset_params['response_factors'] = 1
    else:
        dataset_params['has_response'] = False
        response = h2o.H2OFrame([[random.randint(1, 1000)] for r in range(0, dataset_params['rows'])])
        append_response = True
    print("Family: {0}".format(family))

    train = h2o.create_frame(**dataset_params)
    if append_response:
        train = response.cbind(train)
        train.set_name(0, "response")
    if family == 'binomial':
        train['response'] = train['response'].asfactor()
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train["response"], os.path.join(results_dir, "glm_dynamic_preimputed_response.log"))
    train = train.impute("response", method="mode")
    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    h2o.download_csv(train, os.path.join(results_dir, "glm_dynamic_training_dataset.log"))

    # Generate random parameters
    params = {}
    if random.randint(0, 1):
        params['alpha'] = random.random()
    params['family'] = family
    if params['family'] == "tweedie":
        if random.randint(0, 1):
            params['tweedie_variance_power'] = round(random.random() + 1, 6)
            params['tweedie_link_power'] = 1 - params['tweedie_variance_power']
    print("Parameter list: {0}".format(params))

    x = list(range(1, train.ncol))
    y = "response"

    pyunit_utils.javapredict(algo="glm", equality=None, train=train, test=None, x=x, y=y, compile_only=True, **params)
def test_csv_parser_column_skip():
    # generate a big frame with all datatypes and save it to csv. Load it back with different skipped_columns settings
    nrow = 10000
    ncol = 100
    seed = 12345
    frac1 = 0.16
    frac2 = 0.2
    f1 = h2o.create_frame(rows=nrow, cols=ncol, real_fraction=frac1, categorical_fraction=frac1,
                          integer_fraction=frac1, binary_fraction=frac1, time_fraction=frac1,
                          string_fraction=frac2, missing_fraction=0.1, has_response=False, seed=seed)
    tmpdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results"))
    if not os.path.isdir(tmpdir):
        os.mkdir(tmpdir)
    savefilenamewithpath = os.path.join(tmpdir, 'in.csv')
    h2o.download_csv(f1, savefilenamewithpath)

    # load in whole dataset
    skip_all = list(range(f1.ncol))
    skip_start_end = [0, f1.ncol - 1]
    skip_except_last = list(range(0, f1.ncol - 2))
    skip_except_first = list(range(1, f1.ncol))
    temp = list(range(0, f1.ncol))
    random.shuffle(temp)
    skip_random = []
    for index in range(0, f1.ncol // 2):
        skip_random.append(temp[index])
    skip_random.sort()

    # skipping all columns should fail; catch Exception (not a bare except)
    # so the SystemExit raised by sys.exit(1) is not swallowed
    try:
        importFileSkipAll = h2o.import_file(savefilenamewithpath, skipped_columns=skip_all)
        sys.exit(1)  # should have failed here
    except Exception:
        pass

    # skip the very beginning and the very end.
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_start_end)
    # skip all except the last column
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_except_last)
    # skip all except the very first column
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_except_first)
    # randomly skipped half the columns
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_random)
def javapredict(algo, train, test, x, y, **kwargs):
    print("Creating model in H2O")
    if algo == "gbm":
        model = h2o.gbm(x=train[x], y=train[y], **kwargs)
    elif algo == "random_forest":
        model = h2o.random_forest(x=train[x], y=train[y], **kwargs)
    else:
        raise ValueError("algo {0} is not supported".format(algo))
    print(model)

    print("Downloading Java prediction model code from H2O")
    tmpdir = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "results", model._id))
    os.makedirs(tmpdir)
    h2o.download_pojo(model, path=tmpdir)

    print("Predicting in H2O")
    predictions = model.predict(test)
    predictions.summary()
    predictions.head()
    h2o.download_csv(predictions, os.path.join(tmpdir, "out_h2o.csv"))

    print("Setting up for Java POJO")
    h2o.download_csv(test[x], os.path.join(tmpdir, "in.csv"))
    # hack: the PredictCsv driver can't handle quoted strings, so remove them
    f = open(os.path.join(tmpdir, "in.csv"), 'r+')
    in_csv = f.read()
    in_csv = re.sub('\"', '', in_csv)
    f.seek(0)
    f.write(in_csv)
    f.truncate()
    f.close()

    subprocess.call(["javac", "-cp", os.path.join(tmpdir, "h2o-genmodel.jar"), "-J-Xmx4g", "-J-XX:MaxPermSize=256m",
                     os.path.join(tmpdir, model._id + ".java")], stderr=subprocess.STDOUT)
    subprocess.call(["java", "-ea", "-cp", os.path.join(tmpdir, "h2o-genmodel.jar") + ":{0}".format(tmpdir),
                     "-Xmx4g", "-XX:MaxPermSize=256m", "-XX:ReservedCodeCacheSize=256m",
                     "hex.genmodel.tools.PredictCsv", "--header", "--model", model._id,
                     "--input", os.path.join(tmpdir, "in.csv"), "--output", os.path.join(tmpdir, "out_pojo.csv")],
                    stderr=subprocess.STDOUT)
    predictions2 = h2o.import_file(os.path.join(tmpdir, "out_pojo.csv"))

    print("Comparing predictions between H2O and Java POJO")
    # Dimensions
    hr, hc = predictions.dim
    pr, pc = predictions2.dim
    assert hr == pr, "Expected the same number of rows, but got {0} and {1}".format(hr, pr)
    assert hc == pc, "Expected the same number of cols, but got {0} and {1}".format(hc, pc)

    # Value
    for r in range(hr):
        hp = predictions[r, 0]
        if algo == "gbm":
            pp = float.fromhex(predictions2[r, 0])
            assert abs(hp - pp) < 1e-4, \
                "Expected predictions to be the same (within 1e-4) for row {0}, but got {1} and {2}".format(r, hp, pp)
        elif algo == "random_forest":
            pp = predictions2[r, 0]
            assert hp == pp, "Expected predictions to be the same for row {0}, but got {1} and {2}".format(r, hp, pp)
        else:
            raise ValueError("algo {0} is not supported".format(algo))
def javapredict_dynamic_data():
    dataset_params = {}
    dataset_params['rows'] = 13183
    dataset_params['cols'] = 13
    dataset_params['categorical_fraction'] = 0.4
    dataset_params['integer_fraction'] = 0.3
    dataset_params['missing_fraction'] = 0.27539154084819495
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = 819
    print("Dataset parameters: {0}".format(dataset_params))

    problem = 2
    print("Model-building exercise (0:regression, 1:binomial, 2:multinomial): {0}".format(problem))
    problem = ['regression', 'binomial', 'multinomial'][problem]  # map the numeric code to its name
    if problem == 'binomial':
        dataset_params['response_factors'] = 2
    elif problem == 'regression':
        dataset_params['response_factors'] = 1
    else:
        dataset_params['response_factors'] = 16

    train = h2o.create_frame(**dataset_params)
    if problem == 'binomial' or problem == 'multinomial':
        train['response'] = train['response'].asfactor()
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train["response"], os.path.join(results_dir, "drf_dynamic_preimputed_response.log"))
    train.impute("response", method="mode")
    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    h2o.download_csv(train, os.path.join(results_dir, "drf_dynamic_training_dataset.log"))

    params = {}
    params['nbins'] = 5
    params['min_rows'] = 7
    params['mtries'] = 4
    params['sample_rate'] = 0.7867986759373544
    params['seed'] = 1304644573760597606
    print("Parameter list: {0}".format(params))

    x = list(range(1, train.ncol))
    y = "response"

    pyunit_utils.javapredict(algo="random_forest", equality=None, train=train, test=None, x=x, y=y,
                             compile_only=True, **params)
def javapredict_dynamic_data():
    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000, 15001)), 1)[0]
    dataset_params['cols'] = random.sample(list(range(10, 21)), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0, left_over), 1), 1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 2000)
    print("Dataset parameters: {0}".format(dataset_params))

    problem = random.sample(list(range(0, 3)), 1)[0]
    print("Model-building exercise (0:regression, 1:binomial, 2:multinomial): {0}".format(problem))
    problem = ['regression', 'binomial', 'multinomial'][problem]  # map the numeric code to its name
    if problem == 'binomial':
        dataset_params['response_factors'] = 2
    elif problem == 'regression':
        dataset_params['response_factors'] = 1
    else:
        dataset_params['response_factors'] = random.randint(3, 100)

    train = h2o.create_frame(**dataset_params)
    if problem == 'binomial' or problem == 'multinomial':
        train['response'] = train['response'].asfactor()
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train["response"], os.path.join(results_dir, "drf_dynamic_preimputed_response.log"))
    train.impute("response", method="mode")
    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    h2o.download_csv(train, os.path.join(results_dir, "drf_dynamic_training_dataset.log"))

    # Generate random parameters
    params = {}
    if random.randint(0, 1):
        params['ntrees'] = random.sample(list(range(1, 21)), 1)[0]
    if random.randint(0, 1):
        params['max_depth'] = random.sample(list(range(1, 11)), 1)[0]
    if random.randint(0, 1):
        params['min_rows'] = random.sample(list(range(1, 11)), 1)[0]
    if random.randint(0, 1):
        params['nbins'] = random.sample(list(range(2, 21)), 1)[0]
    if random.randint(0, 1):
        params['nbins_cats'] = random.sample(list(range(2, 1025)), 1)[0]
    if random.randint(0, 1):
        params['mtries'] = random.sample(list(range(1, dataset_params['cols'] + 1)), 1)[0]
    if random.randint(0, 1):
        params['sample_rate'] = random.random()
    print("Parameter list: {0}".format(params))

    x = list(range(1, train.ncol))
    y = "response"

    pyunit_utils.javapredict(algo="random_forest", equality=None, train=train, test=None, x=x, y=y,
                             compile_only=True, **params)
def impute_data(method="mean", to_impute=to_impute, predictors=predictors):
    if method == "mean":
        print("Mean imputing missing data for predictors:", to_impute)
        # find mean for each time period in data for each predictor, save them in a matrix with a col
        # for the mean values of each predictor; then on holdout use this table to fill in all missing
        # values based on the time period (row) and the variable (col) of this matrix
        # if using python module h2o-3.1.0.3131:
        # grouped = data.group_by(["time_period"])
        # gm = [grouped.mean(predictor, na="rm").get_frame() for predictor in to_impute]
        gm = d["time_period"].unique()
        print("Finding means...")
        for predictor in to_impute:
            gm = gm.cbind(d.group_by(["time_period"], {predictor: ["mean", d.names().index(predictor), "rm"]}, order_by=0))
        gm.show()
        print("Saving the imputation means to disk...")
        h2o.download_csv(gm, filename=saving_means_fp)
        # df_py = h2o.as_list(gm)
        # Now that's stored for the holdout data, do this a faster way in java for the training data:
        for predictor in to_impute:
            d.impute(predictor, method='mean', by=['time_period'], inplace=True)
            print("Done imputing", predictor)
        print("Saving the final mean imputed data to disk...")
        h2o.export_file(frame=d, path=saving_meanImputed_fp, force=True)

    if method == "model":
        # sequentially impute 'newdata', not 'data', so the order of the predictor variables in the loop
        # does not matter; otherwise, you would be using increasingly imputed data to make predictions
        # as the loop progresses.
        newdata = d
        # With training data, build a model for each col and predict missing data, save the models,
        # use them on the holdout data to predict all missing data.
        for predictor in to_impute:
            print("Building model for imputing " + predictor)
            print("Subsetting the data into missing values for predictor and no missing values for predictor")
            na_ind = d[predictor].isna()
            not_na_ind = na_ind != 1.0
            to_train = d[not_na_ind]
            to_predict = d[na_ind]
            these_var = [var for var in predictors if var != predictor]
            trained = h2o.gbm(x=to_train[these_var], y=to_train[[predictor]], ntrees=300, max_depth=6, learn_rate=0.2)
            print("Saving the imputation tree model for " + predictor)
            h2o.save_model(trained, dir=saving_models_fp, name="dl_imputation_model_" + predictor)
            print("Imputing the missing " + predictor + " data by predicting with the model...")
            predicted = trained.predict(to_predict[these_var])
            tofillin = newdata[predictor]
            assert len(predicted) == len(tofillin[na_ind])
            tofillin[na_ind] = predicted  # mutate the column in place
            newdata[predictor] = tofillin
        print("Saving the final model-imputed data to disk...")
        h2o.export_file(frame=d, path=saving_modelImputed_fp, force=True)
def javapredict_dynamic_data():
    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000, 15001)), 1)[0]
    dataset_params['cols'] = random.sample(list(range(10, 21)), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0, left_over), 1), 1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 2000)
    dataset_params['response_factors'] = random.randint(3, 100)
    print("Dataset parameters: {0}".format(dataset_params))

    train = h2o.create_frame(**dataset_params)
    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train, os.path.join(results_dir, "nb_dynamic_training_dataset.log"))

    # Generate random parameters
    params = {}
    params['laplace'] = 0
    if random.randint(0, 1):
        params['laplace'] = random.uniform(0, 11)
    print("Parameter list: {0}".format(params))

    x = train.names
    x.remove("response")
    y = "response"

    pyunit_utils.javapredict(algo="naive_bayes", equality=None, train=train, test=None, x=x, y=y,
                             compile_only=True, **params)
def pubdev_1480():
    if not pyunit_utils.hadoop_namenode_is_accessible():
        raise EnvironmentError
    train = h2o.import_file("hdfs://mr-0xd6/datasets/kaggle/sf.crime.train.gz")
    test = h2o.import_file("hdfs://mr-0xd6/datasets/kaggle/sf.crime.test.gz")
    model = h2o.gbm(x=train[list(range(2, 9))], y=train[1])
    predictions = model.predict(test)
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(predictions, os.path.join(results_dir, "predictions.csv"))
def pubdev_1480():
    if not pyunit_utils.hadoop_namenode_is_accessible():
        raise EnvironmentError("Not running on H2O internal network. No access to HDFS.")
    train = h2o.import_file("hdfs://mr-0xd6/datasets/kaggle/sf.crime.train.gz")
    test = h2o.import_file("hdfs://mr-0xd6/datasets/kaggle/sf.crime.test.gz")
    model = h2o.gbm(x=train[range(2, 9)], y=train[1])
    predictions = model.predict(test)
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(predictions, os.path.join(results_dir, "predictions.csv"))
def make_predictions_and_save(classifier, test_data, output_file, columns_offset):
    if path.exists(output_file) or sdir_exists(output_file):
        print('already exists', output_file)
        return
    if type(classifier) == str:
        classifier = load_h2o_model(classifier)
    if type(test_data) == str:
        test_data = load_h2o_data(test_data)
    predictions = classifier.predict(test_data[:, columns_offset:])
    if '/' not in output_file:
        output_file = sdir_path(output_file)
    h2o.download_csv(predictions, output_file)
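# The sdir_* and load_h2o_* helpers used above are project-specific and not
# defined in this file. A minimal sketch of plausible implementations, under
# the assumption that "sdir" is a shared results directory (SHARED_DIR is a
# made-up name; the original may resolve paths quite differently):
import os
from os import path
import h2o

SHARED_DIR = os.environ.get("SHARED_RESULTS_DIR", "/tmp/shared_results")  # assumption

def sdir_path(name):
    return os.path.join(SHARED_DIR, name)  # resolve a bare filename into the shared dir

def sdir_exists(name):
    return path.exists(sdir_path(name))

def load_h2o_model(model_path):
    return h2o.load_model(model_path)  # thin wrapper over the h2o API

def load_h2o_data(data_path):
    return h2o.import_file(data_path)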
def pubdev_1480():
    if not pyunit_utils.hadoop_namenode_is_accessible():
        raise EnvironmentError
    train = h2o.import_file("hdfs://mr-0xd6/datasets/kaggle/sf.crime.train.gz")
    test = h2o.import_file("hdfs://mr-0xd6/datasets/kaggle/sf.crime.test.gz")
    model = H2OGradientBoostingEstimator()
    model.train(x=list(range(2, 9)), y=1, training_frame=train)
    predictions = model.predict(test)
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(predictions, os.path.join(results_dir, "predictions.csv"))
def get_glrm_xmatrix(train, test, K=3, compare_predict=True, tol=1e-1):
    x = train.names
    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types) - 1)]
    print("dataset transform is {0}.".format(transformN))

    # build a GLRM model with random dataset generated earlier
    glrmModel = H2OGeneralizedLowRankEstimator(k=K, transform=transformN, max_iterations=1000, seed=12345)
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(glrmModel._model_json['output']['representation_name'])
    # assert glrmTrainFactor.nrows==train.nrows, \
    #     "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)
    mojoDir = save_GLRM_mojo(glrmModel)  # save mojo model

    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    h2o.download_csv(test[x], os.path.join(mojoDir, 'in.csv'))  # save test file, h2o predict/mojo use same file
    frameID, mojoXFactor = pyunit_utils.mojo_predict(glrmModel, mojoDir, MOJONAME, glrmReconstruct=False)  # save mojo XFactor
    print("Comparing mojo x Factor and model x Factor ...")

    if transformN == "NONE" or not compare_predict:  # bad performance with no transformation on dataset
        pyunit_utils.check_data_rows(mojoXFactor, glrmTrainFactor, num_rows=mojoXFactor.nrow)
    else:
        pyunit_utils.compare_data_rows(mojoXFactor, glrmTrainFactor, index_list=range(2, mojoXFactor.nrows - 1), tol=tol)

    if compare_predict:  # only compare reconstructed data frames with numerical data
        pred2 = glrmModel.predict(test)  # predict using mojo
        pred1 = glrmModel.predict(train)  # predict using the X from A=X*Y from training
        predictDiff = pyunit_utils.compute_frame_diff(train, pred1)
        mojoDiff = pyunit_utils.compute_frame_diff(train, pred2)
        print("absolute difference of mojo predict and original frame is {0} and model predict and original frame is {1}".format(mojoDiff, predictDiff))
def download_csv(ip, port):
    iris1 = h2o.import_file(path=h2o.locate("smalldata/iris/iris.csv"))
    h2o.download_csv(iris1, "iris_delete.csv")
    iris2 = h2o.import_file(path=h2o.locate("iris_delete.csv"))
    os.remove("iris_delete.csv")

    rand_row = random.randint(0, iris1.nrow - 1)
    rand_col = random.randint(0, 3)
    assert abs(iris1[rand_row, rand_col] - iris2[rand_row, rand_col]) < 1e-10, \
        "Expected elements from the datasets to be the same, but got {0} and {1}".format(
            iris1[rand_row, rand_col], iris2[rand_row, rand_col])
def load_feature_label_table(save_as_csv=False, femq12=None):
    r"""
    ultimate source of data is D:\data\PreliminaryAnalysis\BogusDealers\<dta & csv files>
    see also funcs from Shekhar's files:
        load_*() in D:\shekhar_code_github\BogusFirmCatching\init_sm.py
        load_everything() in D:\shekhar_code_github\BogusFirmCatching\ml_funcs.py
    """
    if femq12 is None:
        femq12 = load_everything()
    init()
    ffemq12 = load_h2odataframe_returns(femq12)
    if save_as_csv:
        h2o.download_csv(ffemq12, r"Z:\all_returns_features_minus_q12.csv")
    # TrainData, ValidData, TestData = divide_train_test(ffemq12)
    return femq12, ffemq12
def glrm_mojo():
    h2o.remove_all()
    NTESTROWS = 200  # number of test dataset rows
    df = pyunit_utils.random_dataset("regression", seed=1234)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = df.names
    transformN = "STANDARDIZE"

    # build a GLRM model with random dataset generated earlier
    glrmModel = H2OGeneralizedLowRankEstimator(k=3, transform=transformN, max_iterations=10, seed=1234)
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(glrmModel._model_json['output']['representation_name'])

    assert glrmTrainFactor.nrows == train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)
    save_GLRM_mojo(glrmModel)  # save mojo model

    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file

    # test and make sure setting the iteration number did not screw up the prediction
    predID, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmIterNumber=100)  # save mojo predict
    pred_h2o = h2o.get_frame("GLRMLoading_" + predID)
    print("Comparing mojo x Factor and model x Factor for 100 iterations")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)

    # scoring with 2 iterations should be shorter than scoring with 8000 iterations
    starttime = time.time()
    runMojoPredictOnly(TMPDIR, MOJONAME, glrmIterNumber=8000)  # save mojo predict
    time8000 = time.time() - starttime
    starttime = time.time()
    runMojoPredictOnly(TMPDIR, MOJONAME, glrmIterNumber=2)  # save mojo predict
    time2 = time.time() - starttime
    print("Time taken for 2 iterations: {0}s. Time taken for 8000 iterations: {1}s.".format(time2, time8000))
def runComparisonTests(autoEncoder, problemType):
    params = set_params(autoEncoder)  # set deeplearning model parameters
    df = random_dataset(problemType)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})
    deeplearningModel = build_save_model(params, x, train)  # build and save mojo model
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(deeplearningModel, TMPDIR, MOJONAME)  # load model and perform predict
    pred_pojo = pyunit_utils.pojo_predict(deeplearningModel, TMPDIR, MOJONAME)
    h2o.save_model(deeplearningModel, path=TMPDIR, force=True)  # save model for debugging
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_h2o, pred_mojo, prob=1, tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_mojo, pred_pojo, prob=1, tol=1e-10)
def drf_leaf_node_assignment_mojo_test():
    problems = ['binomial', 'multinomial', 'regression']
    PROBLEM = problems[randint(0, len(problems) - 1)]
    TESTROWS = 2000
    df = pyunit_utils.random_dataset(PROBLEM, verbose=False, NTESTROWS=TESTROWS)
    train = df[TESTROWS:, :]
    test = df[:TESTROWS, :]
    x = list(set(df.names) - {"response"})  # fixed typo: was "respose", which left the response among the predictors
    params = {'ntrees': 50, 'max_depth': 4}
    TMPDIR = tempfile.mkdtemp()
    my_drf = pyunit_utils.build_save_model_generic(params, x, train, "response", "DRF", TMPDIR)  # renamed from my_gbm: this is a DRF model
    MOJONAME = pyunit_utils.getMojoName(my_drf._id)
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(my_drf, TMPDIR, MOJONAME,
                                                    get_leaf_node_assignment=True)  # load model and perform predict
    pyunit_utils.compare_string_frames_local(pred_h2o, pred_mojo, 0.5)
def pubdev_1431(ip, port):
    running_inside_h2o = tests.is_running_internal_to_h2o()
    if running_inside_h2o:
        hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
        airlines_billion_file = "/datasets/airlinesbillion.csv"
        url = "hdfs://{0}{1}".format(hdfs_name_node, airlines_billion_file)
        airlines_billion = h2o.import_file(url)
        airlines_billion[30] = airlines_billion[30].asfactor()
        gbm = h2o.gbm(x=airlines_billion[0:30], y=airlines_billion[30], ntrees=1, distribution="bernoulli", max_depth=1)
        predictions = gbm.predict(airlines_billion)
        csv = os.path.join(os.getcwd(), "delete.csv")
        h2o.download_csv(predictions, csv)
        os.remove(csv)
    else:
        print("Not running on H2O internal network. No access to HDFS.")  # modernized from the Python 2 print statement
def gbm_leaf_node_assignment_mojo_test():
    problems = ['binomial', 'multinomial', 'regression']
    PROBLEM = problems[randint(0, len(problems) - 1)]
    TESTROWS = 2000
    df = pyunit_utils.random_dataset(PROBLEM, verbose=False, NTESTROWS=TESTROWS)
    train = df[TESTROWS:, :]
    test = df[:TESTROWS, :]
    x = list(set(df.names) - {"response"})  # fixed typo: was "respose"
    params = {'ntrees': 50, 'learn_rate': 0.1, 'max_depth': 4}
    my_gbm = pyunit_utils.build_save_model_GBM(params, x, train, "response")
    MOJONAME = pyunit_utils.getMojoName(my_gbm._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(my_gbm, TMPDIR, MOJONAME,
                                                    get_leaf_node_assignment=True)  # load model and perform predict
    pyunit_utils.compare_string_frames_local(pred_h2o, pred_mojo, 0.5)
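Note: outside the pyunit_utils wrappers, the same leaf-node-assignment output can be requested directly from a trained tree model. A sketch, where `model` and `test` are stand-ins for any trained GBM/DRF model and a compatible frame:

# Each output column holds the tree path (e.g. "RRLL") for the corresponding tree;
# the MOJO comparisons above check these path strings column by column.
assignments = model.predict_leaf_node_assignment(test)
assignments.head()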
def download_csv(ip, port):
    iris1 = h2o.import_file(path=h2o.locate("smalldata/iris/iris.csv"))
    h2o.download_csv(iris1, "iris_delete.csv")
    iris2 = h2o.import_file(path=h2o.locate("iris_delete.csv"))
    os.remove("iris_delete.csv")
    rand_row = random.randint(0, iris1.nrow - 1)
    rand_col = random.randint(0, 3)
    assert abs(iris1[rand_row, rand_col] - iris2[rand_row, rand_col]) < 1e-10, \
        "Expected elements from the datasets to be the same, but got {0} and {1}".format(
            iris1[rand_row, rand_col], iris2[rand_row, rand_col])
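Note: the download/re-import round trip above generalizes into a small helper; a sketch (the helper name is mine, and the spot check only makes sense for numeric cells):

import os
import random

import h2o

def csv_roundtrip_ok(frame, path, tol=1e-10):
    # download the frame, re-import it, and spot-check one random numeric cell
    h2o.download_csv(frame, path)
    frame2 = h2o.import_file(path)
    os.remove(path)
    r = random.randint(0, frame.nrow - 1)
    c = random.randint(0, frame.ncol - 1)
    return abs(frame[r, c] - frame2[r, c]) < tol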
def pubdev_1431():
    hadoop_namenode_is_accessible = tests.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = tests.hadoop_namenode()
        airlines_billion_file = "/datasets/airlinesbillion.csv"
        url = "hdfs://{0}{1}".format(hdfs_name_node, airlines_billion_file)
        airlines_billion = h2o.import_file(url)
        airlines_billion[30] = airlines_billion[30].asfactor()
        gbm = h2o.gbm(x=airlines_billion[0:30], y=airlines_billion[30], ntrees=1, distribution="bernoulli", max_depth=1)
        predictions = gbm.predict(airlines_billion)
        csv = os.path.join(os.getcwd(), "delete.csv")
        h2o.download_csv(predictions, csv)
        os.remove(csv)
    else:
        # fixed: raise(EnvironmentError, "...") raises a tuple on Python 2 and is a TypeError on Python 3
        raise EnvironmentError("Not running on H2O internal network. No access to HDFS.")
def javapredict_dynamic_data():
    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(range(5000, 15001), 1)[0]
    dataset_params['cols'] = random.sample(range(10, 21), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0, left_over), 1), 1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 2000)
    print("Dataset parameters: {0}".format(dataset_params))  # modernized from the Python 2 print statements
    train = h2o.create_frame(**dataset_params)
    print("Training dataset:")
    print(train)
    # Save dataset to results directory
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train, os.path.join(results_dir, "kmeans_dynamic_training_dataset.log"))
    # Generate random parameters
    params = {}
    params['k'] = random.sample(range(1, 10), 1)[0]
    if random.randint(0, 1):
        params['max_iterations'] = random.sample(range(1, 1000), 1)[0]
    if random.randint(0, 1):
        params['standardize'] = random.sample([True, False], 1)[0]
    if random.randint(0, 1):
        params['seed'] = random.sample(range(1, 1000), 1)[0]
    if random.randint(0, 1):
        params['init'] = random.sample(['Random', 'PlusPlus', 'Furthest'], 1)[0]
    print("Parameter list: {0}".format(params))
    x = train.names
    x.remove("response")
    y = "response"
    pyunit_utils.javapredict(algo="kmeans", equality=None, train=train, test=None, x=x, y=y,
                             compile_only=True, **params)
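Note: the fraction juggling in this and the following variants keeps the categorical and integer fractions strictly below 1, so h2o.create_frame is left room for at least one real-valued column. The invariant is easier to see in isolation; a sketch (the helper name is mine):

import random

def pick_column_fractions():
    # mirrors the adjustment above: if the two fractions sum to exactly 1,
    # shave 0.1 off the larger one to leave room for real-valued columns
    cat = round(random.random(), 1)
    left_over = 1 - cat
    integer = round(left_over - round(random.uniform(0, left_over), 1), 1)
    if integer + cat == 1:
        if integer > cat:
            integer -= 0.1
        else:
            cat -= 0.1
    return cat, integer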
def javapredict_dynamic_data():
    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(100, 200)), 1)[0]
    dataset_params['cols'] = random.sample(list(range(10, 21)), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0, left_over), 1), 1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.01)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 50)
    print("Dataset parameters: {0}".format(dataset_params))
    train = h2o.create_frame(**dataset_params)
    print("Training dataset:")
    print(train)
    # Save dataset to results directory
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train, os.path.join(results_dir, "pca_dynamic_training_dataset.log"))
    # Generate random parameters
    params = {}
    if random.randint(0, 1):
        params['max_iterations'] = random.sample(list(range(1, 1000)), 1)[0]
    if random.randint(0, 1):
        params['transform'] = random.sample(["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"], 1)[0]
    params['k'] = random.sample(list(range(1, min(train.ncol, train.nrow))), 1)[0]
    print("Parameter list: {0}".format(params))
    x = train.names
    x.remove("response")
    y = "response"
    pyunit_utils.javapredict(algo="pca", equality=None, train=train, test=None, x=x, y=y,
                             compile_only=True, **params)
def javapredict_dynamic_data():
    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000, 15001)), 1)[0]
    dataset_params['cols'] = random.sample(list(range(10, 21)), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0, left_over), 1), 1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 2000)
    dataset_params['response_factors'] = random.randint(3, 100)
    print("Dataset parameters: {0}".format(dataset_params))
    train = h2o.create_frame(**dataset_params)
    print("Training dataset:")
    print(train)
    # Save dataset to results directory
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train, os.path.join(results_dir, "nb_dynamic_training_dataset.log"))
    # Generate random parameters
    params = {}
    params['laplace'] = 0
    if random.randint(0, 1):
        params['laplace'] = random.uniform(0, 11)
    print("Parameter list: {0}".format(params))
    x = train.names
    x.remove("response")
    y = "response"
    pyunit_utils.javapredict(algo="naive_bayes", equality=None, train=train, test=None, x=x, y=y,
                             compile_only=True, **params)
def glrm_mojo():
    h2o.remove_all()
    NTESTROWS = 200  # number of test dataset rows
    df = pyunit_utils.random_dataset("regression", seed=1234)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = df.names
    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types) - 1)]
    # build a GLRM model with the random dataset generated earlier
    glrmModel = H2OGeneralizedLowRankEstimator(k=3, transform=transformN, max_iterations=10, seed=1234)
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(glrmModel._model_json['output']['representation_name'])
    assert glrmTrainFactor.nrows == train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)
    save_GLRM_mojo(glrmModel)  # save mojo model
    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=True)  # save mojo predict
    h2o.save_model(glrmModel, TMPDIR)  # save GLRM model
    glrmModel2 = h2o.load_model(os.path.join(TMPDIR, MOJONAME))
    predict_model = glrmModel2.predict(test)
    for col in range(pred_h2o.ncols):
        if pred_h2o[col].isfactor():
            pred_h2o[col] = pred_h2o[col].asnumeric()
            predict_model[col] = predict_model[col].asnumeric()
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)
    print("Comparing mojo predict and h2o predict from saved model...")
    pyunit_utils.compare_frames_local(pred_mojo, predict_model, 1, tol=1e-10)
    frameID, mojoXFactor = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=False)  # save mojo XFactor
    glrmTestFactor = h2o.get_frame("GLRMLoading_" + frameID)  # store the x Factor for the new test dataset
    print("Comparing mojo x Factor and model x Factor ...")
    pyunit_utils.compare_frames_local(glrmTestFactor, mojoXFactor, 1, tol=1e-10)
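Note: the factor-to-numeric conversion before comparison exists because compare_frames_local does arithmetic on cell values. The same pattern pulled out on its own, following the loop above exactly (the helper name is mine):

def defactor(frame):
    # turn factor columns numeric so two frames can be compared cell-by-cell
    # with a tolerance, mirroring the conversion loop in glrm_mojo above
    for col in range(frame.ncols):
        if frame[col].isfactor():
            frame[col] = frame[col].asnumeric()
    return frame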
def glm_multinomial_mojo_pojo():
    PROBLEM = "multinomial"
    NTESTROWS = 200
    params = set_params()  # set GLM model parameters (comment previously said "deeplearning")
    df = pyunit_utils.random_dataset(PROBLEM)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})
    glmMultinomialModel = pyunit_utils.build_save_model_GLM(params, x, train, "response")  # build and save mojo model
    MOJONAME = pyunit_utils.getMojoName(glmMultinomialModel._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glmMultinomialModel, TMPDIR, MOJONAME)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
    pred_pojo = pyunit_utils.pojo_predict(glmMultinomialModel, TMPDIR, MOJONAME)
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)  # make sure operation sequence is preserved from Tomk
    h2o.save_model(glmMultinomialModel, path=TMPDIR, force=True)  # save model for debugging; fixed: was the undefined glmOrdinalModel
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
def javapredict_dynamic_data():
    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000, 15001)), 1)[0]
    dataset_params['cols'] = random.sample(list(range(10, 21)), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0, left_over), 1), 1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 2000)
    print("Dataset parameters: {0}".format(dataset_params))
    append_response = False
    distribution = random.sample(['bernoulli', 'multinomial', 'gaussian', 'poisson', 'gamma'], 1)[0]
    if distribution == 'bernoulli':
        dataset_params['response_factors'] = 2
    elif distribution == 'gaussian':
        dataset_params['response_factors'] = 1
    elif distribution == 'multinomial':
        dataset_params['response_factors'] = random.randint(3, 100)
    else:
        dataset_params['has_response'] = False
        response = h2o.H2OFrame([[random.randint(1, 1000)] for r in range(0, dataset_params['rows'])])
        append_response = True
    print("Distribution: {0}".format(distribution))
    train = h2o.create_frame(**dataset_params)
    if append_response:
        train = response.cbind(train)
        train.set_name(0, "response")
    if distribution == 'bernoulli' or distribution == 'multinomial':
        train['response'] = train['response'].asfactor()
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train["response"], os.path.join(results_dir, "dl_dynamic_preimputed_response.log"))
    train.impute("response", method="mode")
    print("Training dataset:")
    print(train)
    # Save dataset to results directory
    h2o.download_csv(train, os.path.join(results_dir, "dl_dynamic_training_dataset.log"))
    # Generate random parameters
    params = {}
    if random.randint(0, 1):
        params['activation'] = random.sample(["Rectifier", "Tanh", "TanhWithDropout",
                                              "RectifierWithDropout", "MaxoutWithDropout"], 1)[0]
    if random.randint(0, 1):
        params['epochs'] = random.sample(list(range(1, 10)), 1)[0]
    if random.randint(0, 1):
        h = random.randint(10, 21)
        params['hidden'] = [h for x in range(random.randint(2, 3))]
    params['distribution'] = distribution
    params['l1'] = random.random()
    print("Parameter list: {0}".format(params))
    x = train.names
    x.remove("response")
    y = "response"
    pyunit_utils.javapredict(algo="deeplearning", equality=None, train=train, test=None, x=x, y=y,
                             compile_only=True, **params)
def javapredict(algo, equality, train, test, x, y, **kwargs):
    print("Creating model in H2O")  # modernized from the Python 2 print statements throughout
    if algo == "gbm":
        model = h2o.gbm(x=train[x], y=train[y], **kwargs)
    elif algo == "random_forest":
        model = h2o.random_forest(x=train[x], y=train[y], **kwargs)
    elif algo == "deeplearning":
        model = h2o.deeplearning(x=train[x], y=train[y], **kwargs)
    elif algo == "glm":
        model = h2o.glm(x=train[x], y=train[y], **kwargs)
    else:
        raise ValueError("algo {0} is not supported".format(algo))  # fixed: raise(ValueError, "...") is invalid on Python 3
    print(model)

    print("Downloading Java prediction model code from H2O")
    tmpdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "results", model._id))
    os.mkdir(tmpdir)
    h2o.download_pojo(model, path=tmpdir)
    h2o_genmodel_jar = os.path.join(tmpdir, "h2o-genmodel.jar")
    assert os.path.exists(h2o_genmodel_jar), "Expected file {0} to exist, but it does not.".format(h2o_genmodel_jar)
    print("h2o-genmodel.jar saved in {0}".format(h2o_genmodel_jar))
    java_file = os.path.join(tmpdir, model._id + ".java")
    assert os.path.exists(java_file), "Expected file {0} to exist, but it does not.".format(java_file)
    print("java code saved in {0}".format(java_file))

    print("Predicting in H2O")
    predictions = model.predict(test)
    predictions.summary()
    predictions.head()
    out_h2o_csv = os.path.join(tmpdir, "out_h2o.csv")
    h2o.download_csv(predictions, out_h2o_csv)
    assert os.path.exists(out_h2o_csv), "Expected file {0} to exist, but it does not.".format(out_h2o_csv)
    print("H2O Predictions saved in {0}".format(out_h2o_csv))

    print("Setting up for Java POJO")
    in_csv = os.path.join(tmpdir, "in.csv")
    h2o.download_csv(test[x], in_csv)
    # hack: the PredictCsv driver can't handle quoted strings, so remove them
    f = open(in_csv, 'r+')
    csv = f.read()
    csv = re.sub('\"', '', csv)
    f.seek(0)
    f.write(csv)
    f.truncate()
    f.close()
    assert os.path.exists(in_csv), "Expected file {0} to exist, but it does not.".format(in_csv)
    print("Input CSV to PredictCsv saved in {0}".format(in_csv))

    print("Compiling Java Pojo")
    javac_cmd = ["javac", "-cp", h2o_genmodel_jar, "-J-Xmx4g", "-J-XX:MaxPermSize=256m", java_file]
    subprocess.check_call(javac_cmd)

    print("Running PredictCsv Java Program")
    out_pojo_csv = os.path.join(tmpdir, "out_pojo.csv")
    cp_sep = ";" if sys.platform == "win32" else ":"
    java_cmd = ["java", "-ea", "-cp", h2o_genmodel_jar + cp_sep + tmpdir, "-Xmx4g", "-XX:MaxPermSize=256m",
                "-XX:ReservedCodeCacheSize=256m", "hex.genmodel.tools.PredictCsv", "--header", "--model", model._id,
                "--input", in_csv, "--output", out_pojo_csv]
    p = subprocess.Popen(java_cmd, stdout=PIPE, stderr=STDOUT)
    o, e = p.communicate()
    print("Java output: {0}".format(o))
    assert os.path.exists(out_pojo_csv), "Expected file {0} to exist, but it does not.".format(out_pojo_csv)
    predictions2 = h2o.import_file(path=out_pojo_csv)
    print("Pojo predictions saved in {0}".format(out_pojo_csv))

    print("Comparing predictions between H2O and Java POJO")
    # Dimensions
    hr, hc = predictions.dim
    pr, pc = predictions2.dim
    assert hr == pr, "Expected the same number of rows, but got {0} and {1}".format(hr, pr)  # fixed typo: "Exepcted"
    assert hc == pc, "Expected the same number of cols, but got {0} and {1}".format(hc, pc)
    # Values
    for r in range(hr):
        hp = predictions[r, 0]
        if equality == "numeric":
            pp = float.fromhex(predictions2[r, 0])
            assert abs(hp - pp) < 1e-4, \
                "Expected predictions to be the same (within 1e-4) for row {0}, but got {1} and {2}".format(r, hp, pp)
        elif equality == "class":
            pp = predictions2[r, 0]
            assert hp == pp, "Expected predictions to be the same for row {0}, but got {1} and {2}".format(r, hp, pp)
        else:
            raise ValueError("equality type {0} is not supported".format(equality))
def gen_data():
    floatA = []
    intA = []
    sizeMat = range(0, 30)  # used to generate data with values 0 and +/- 2^0 to +/- 2^64 (fixed typo: "+/1")
    lowBoundF = -100000
    upperBoundF = -1 * lowBoundF  # 2 million rows
    upperBoundL = pow(2, 35)
    lowBoundL = upperBoundL - 100000
    numZeros = 0
    numNans = 0  # number of NaNs to generate
    numInfs = 500
    numRep = 2  # number of times to repeat array
    csvFile = "/Users/wendycwong/temp/TopBottomNRep4.csv"
    fMult = 1.1
    fintA = []
    ffloatA = []
    for ind in range(0, 1000):
        floatA = []
        intA = []
        genRandomData(intA, floatA, sizeMat)
        fintA.extend(intA)
        ffloatA.extend(floatA)
    shuffle(fintA)
    shuffle(ffloatA)
    bottom20FrameL = h2o.H2OFrame(python_obj=list(zip(fintA)))  # list() added: zip is a lazy iterator on Python 3
    bottom20FrameF = h2o.H2OFrame(python_obj=list(zip(ffloatA)))
    h2o.download_csv(bottom20FrameL.cbind(bottom20FrameF), "/Users/wendycwong/temp/smallIntFloats.csv")
    genStaticData(intA, floatA, upperBoundL, lowBoundF, upperBoundF, fMult)
    # save the correct sequence before shuffling for comparison purposes
    tempL = intA[0:int(round(len(intA) * 0.2))]  # comes in decreasing value
    tempF = floatA[0:int(round(len(floatA) * 0.2))]  # comes in decreasing value
    bottom20FrameL = h2o.H2OFrame(python_obj=list(zip(tempL)))
    bottom20FrameF = h2o.H2OFrame(python_obj=list(zip(tempF)))
    h2o.download_csv(bottom20FrameL.cbind(bottom20FrameF), "/Users/wendycwong/temp/Bottom20Per.csv")
    tempL = intA[int(round(len(intA) * 0.8)):len(intA)]
    tempL.sort()
    tempF = floatA[int(round(len(floatA) * 0.8)):len(floatA)]
    tempF.sort()
    bottom20FrameL = h2o.H2OFrame(python_obj=list(zip(tempL)))
    bottom20FrameF = h2o.H2OFrame(python_obj=list(zip(tempF)))
    h2o.download_csv(bottom20FrameL.cbind(bottom20FrameF), "/Users/wendycwong/temp/Top20Per.csv")
    # repeat the columns a few times to seriously test the algo with duplicated data
    for val in range(0, numRep):
        intA.extend(intA)
        floatA.extend(floatA)
    shuffle(intA)  # randomly shuffle the values
    shuffle(floatA)
    intFrame = h2o.H2OFrame(python_obj=list(zip(intA)))
    floatFrame = h2o.H2OFrame(python_obj=list(zip(floatA)))
    h2o.download_csv(intFrame.cbind(floatFrame), csvFile)
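Note: the zip(...) calls above rely on a small trick: zipping a single flat list produces one-tuple rows, which H2OFrame reads as a single column. On Python 3 the list(...) wrapper added above is what forces the lazy iterator into a concrete list. A minimal sketch, assuming an H2O cluster is reachable:

import h2o

h2o.init()  # the surrounding code assumes an already-running cluster
col = h2o.H2OFrame(python_obj=list(zip([1, 2, 3])))  # [(1,), (2,), (3,)] -> 3 rows x 1 column
assert col.dim == [3, 1]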
def download(_self, filename):
    h2o_frame = utils.Utils.dataframe_2_h2oframe(_self)
    h2o.download_csv(h2o_frame, filename)
def test_mojo_model(target_dir):
    """
    Test the correctness of the "MOJO" model format.

    This test will create a random dataset, split it into training/testing parts, train a model
    (DeepWater, DRF, or GBM) on it, download the model's MOJO, score the model remotely and fetch
    the predictions, score the model locally by running the genmodel jar, and finally compare the
    prediction results.
    """
    genmodel_jar = os.path.abspath("../../../h2o-genmodel/build/libs/h2o-genmodel-all.jar")
    assert os.path.exists(genmodel_jar), "Cannot find " + genmodel_jar

    report = []
    for estimator, estimator_name in [(H2ODeepWaterEstimator, "DeepWater"),
                                      (H2ORandomForestEstimator, "DRF"),
                                      (H2OGradientBoostingEstimator, "GBM")]:
        if estimator == H2ODeepWaterEstimator and not H2ODeepWaterEstimator.available():
            continue
        print(colorama.Fore.LIGHTYELLOW_EX + "\n#================================================")
        print("# Estimator: " + estimator.__name__)
        print("#================================================\n" + colorama.Fore.RESET)
        for problem in ["binomial", "multinomial", "regression"]:
            print("========================")
            print("%s problem" % problem.capitalize())
            print("========================")
            if estimator == H2ODeepWaterEstimator and problem == "regression":
                print("Skipping %s" % problem.capitalize())  # fixed: capitalize was missing its call parentheses
                continue
            df = random_dataset(problem, verbose=False)
            print("Created dataset with %d rows x %d columns" % (df.nrow, df.ncol))
            train = df[NTESTROWS:, :]
            test0 = df[0, :]
            test1 = df[:NTESTROWS, :]
            test2 = test1.rbind(test1)
            time0 = time.time()
            print("\n\nTraining %s model..." % estimator.__name__)
            if estimator == H2ODeepWaterEstimator:
                model = estimator(epochs=EPOCHS)  # , categorical_encoding="enum")
            else:
                model = estimator(ntrees=NTREES, max_depth=DEPTH)
            model.train(training_frame=train)
            print(model.summary())
            print(" Time taken = %.3fs" % (time.time() - time0))

            print("\nDownloading MOJO...")
            time0 = time.time()
            mojo_file = model.download_mojo(target_dir)
            print(" => %s (%d bytes)" % (mojo_file, os.stat(mojo_file).st_size))
            assert os.path.exists(mojo_file)
            print(" Time taken = %.3fs" % (time.time() - time0))

            if estimator != H2ODeepWaterEstimator:
                print("\nDownloading POJO...")
                time0 = time.time()
                pojo_file = model.download_pojo(target_dir)
                pojo_size = os.stat(pojo_file).st_size
                pojo_name = os.path.splitext(os.path.basename(pojo_file))[0]
                print(" => %s (%d bytes)" % (pojo_file, pojo_size))
                print(" Time taken = %.3fs" % (time.time() - time0))

            print("\nDownloading the test datasets for local use: ")
            time0 = time.time()
            test0_file = os.path.join(target_dir, "test0_%s.csv" % test0.frame_id)
            test1_file = os.path.join(target_dir, "test1_%s.csv" % test1.frame_id)
            test2_file = os.path.join(target_dir, "test2_%s.csv" % test2.frame_id)
            print(" => " + test0_file)
            print(" => " + test1_file)
            print(" => " + test2_file)
            h2o.download_csv(test0, test0_file)
            h2o.download_csv(test1, test1_file)
            h2o.download_csv(test2, test2_file)
            print(" Time taken = %.3fs" % (time.time() - time0))

            print("\nScoring the model remotely and downloading to files...")
            times = []
            h2o_pred_file0 = os.path.join(target_dir, "predR_%s.csv" % test0.frame_id)
            h2o_pred_file1 = os.path.join(target_dir, "predR_%s.csv" % test1.frame_id)
            h2o_pred_file2 = os.path.join(target_dir, "predR_%s.csv" % test2.frame_id)
            for testframe, outfile in [(test0, h2o_pred_file0), (test1, h2o_pred_file1), (test2, h2o_pred_file2)]:
                predictions = model.predict(testframe)
                h2o.download_csv(predictions, outfile)
                print(" => " + outfile)
                times.append(time.time())
            print(" Time taken = %.3fs (1st run: %.3f, 2nd run: %.3f)" %
                  (times[2] + times[0] - 2 * times[1], times[1] - times[0], times[2] - times[1]))
            report.append((estimator_name, problem, "Server", times[1] - times[0], times[2] - times[1]))

            print("\nScoring the model locally and saving to files... ")
            times = []
            local_pred_file0 = os.path.join(target_dir, "predL_%s.csv" % test0.frame_id)
            local_pred_file1 = os.path.join(target_dir, "predL_%s.csv" % test1.frame_id)
            local_pred_file2 = os.path.join(target_dir, "predL_%s.csv" % test2.frame_id)
            for inpfile, outfile in [(test0_file, local_pred_file0), (test1_file, local_pred_file1),
                                     (test2_file, local_pred_file2)]:
                load_csv(inpfile)
                java_cmd = ["java", "-cp", genmodel_jar, "-ea", "-Xmx12g", "-XX:ReservedCodeCacheSize=256m",
                            "hex.genmodel.tools.PredictCsv", "--input", inpfile, "--output", outfile,
                            "--mojo", mojo_file, "--decimal"]
                print(" %r" % java_cmd)
                ret = subprocess.call(java_cmd)
                assert ret == 0, "GenModel finished with return code %d" % ret
                print(" => " + outfile)  # fixed: previously always printed local_pred_file1
                times.append(time.time())
            print(" Time taken = %.3fs (1st run: %.3f, 2nd run: %.3f)" %
                  (times[2] + times[0] - 2 * times[1], times[1] - times[0], times[2] - times[1]))
            report.append((estimator_name, problem, "Mojo", times[1] - times[0], times[2] - times[1]))

            if estimator != H2ODeepWaterEstimator and pojo_size <= 1000 << 20:  # 1000 Mb
                time0 = time.time()
                print("\nCompiling Java Pojo")
                javac_cmd = ["javac", "-cp", genmodel_jar, "-J-Xmx12g", pojo_file]
                subprocess.check_call(javac_cmd)
                print(" Time taken = %.3fs" % (time.time() - time0))

                pojo_pred_file0 = os.path.join(target_dir, "predP_%s.csv" % test0.frame_id)
                pojo_pred_file1 = os.path.join(target_dir, "predP_%s.csv" % test1.frame_id)
                pojo_pred_file2 = os.path.join(target_dir, "predP_%s.csv" % test2.frame_id)
                print("\nScoring POJO and saving to file...")
                times = []
                cp_sep = ";" if sys.platform == "win32" else ":"
                for inpfile, outfile in [(test0_file, pojo_pred_file0), (test1_file, pojo_pred_file1),
                                         (test2_file, pojo_pred_file2)]:
                    load_csv(inpfile)
                    java_cmd = ["java", "-cp", cp_sep.join([genmodel_jar, target_dir]), "-ea", "-Xmx12g",
                                "-XX:ReservedCodeCacheSize=256m", "-XX:MaxPermSize=256m",
                                "hex.genmodel.tools.PredictCsv", "--pojo", pojo_name,
                                "--input", inpfile, "--output", outfile, "--decimal"]
                    print(" %r" % java_cmd)
                    ret = subprocess.call(java_cmd)
                    assert ret == 0, "GenModel finished with return code %d" % ret
                    times.append(time.time())
                print(" Time taken = %.3fs (1st run: %.3f, 2nd run: %.3f)" %
                      (times[2] + times[0] - 2 * times[1], times[1] - times[0], times[2] - times[1]))
                report.append((estimator_name, problem, "POJO", times[1] - times[0], times[2] - times[1]))
            else:
                pojo_pred_file1 = None

            print("\nChecking whether the predictions coincide...")
            time0 = time.time()
            local_pred = load_csv(local_pred_file1)
            server_pred = load_csv(h2o_pred_file1)
            pojo_pred = load_csv(pojo_pred_file1) if pojo_pred_file1 else local_pred
            assert len(local_pred) == len(server_pred) == len(pojo_pred) == test1.nrow, \
                "Number of rows in prediction files do not match: %d vs %d vs %d vs %d" % \
                (len(local_pred), len(server_pred), len(pojo_pred), test1.nrow)
            for i in range(test1.nrow):
                lpred = local_pred[i]
                rpred = server_pred[i]
                ppred = pojo_pred[i]
                assert type(lpred) == type(rpred) == type(ppred), \
                    "Types of predictions do not match: %r / %r / %r" % (lpred, rpred, ppred)
                if isinstance(lpred, float):
                    same = abs(lpred - rpred) + abs(lpred - ppred) <= 1e-8 * (abs(lpred) + abs(rpred) + abs(ppred))
                else:
                    same = lpred == rpred == ppred
                assert same, \
                    "Predictions are different for row %d: mojo=%r, pojo=%r, server=%r" % (i + 1, lpred, ppred, rpred)
            print(" Time taken = %.3fs" % (time.time() - time0))
            print(colorama.Fore.LIGHTGREEN_EX + "\nPredictions match!\n" + colorama.Fore.RESET)

    print(colorama.Fore.LIGHTYELLOW_EX + "\n\n#================================================")
    print("# Timing report")
    print("#================================================\n" + colorama.Fore.RESET)
    print(tabulate.tabulate(report,
                            headers=["Model", "Problem type", "Scorer",
                                     "%d rows" % NTESTROWS, "%d rows" % (2 * NTESTROWS)],
                            floatfmt=".3f"), end="\n\n\n")
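Note: the float comparison buried in the checking loop above is a combined relative-tolerance test: the summed absolute error across the three scorers must be small relative to their combined magnitude. Pulled out as a helper (the name is mine), it reads:

def predictions_agree(lpred, rpred, ppred, rel_tol=1e-8):
    # floats must agree to within rel_tol of their combined magnitude;
    # everything else (class labels, leaf paths) must match exactly
    if isinstance(lpred, float):
        return abs(lpred - rpred) + abs(lpred - ppred) <= rel_tol * (abs(lpred) + abs(rpred) + abs(ppred))
    return lpred == rpred == ppred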
def javamunge(assembly, pojoname, test, compile_only=False):
    """
    How to use: `assembly` is an already fit H2OAssembly; the test set is used to compare
    the munged output here against the output of the POJO.
    """
    print("Downloading munging POJO code from H2O")
    tmpdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "results", pojoname))
    os.mkdir(tmpdir)
    assembly.to_pojo(pojoname, path=tmpdir, get_jar=True)
    h2o_genmodel_jar = os.path.join(tmpdir, "h2o-genmodel.jar")
    assert os.path.exists(h2o_genmodel_jar), "Expected file {0} to exist, but it does not.".format(h2o_genmodel_jar)
    print("h2o-genmodel.jar saved in {0}".format(h2o_genmodel_jar))
    java_file = os.path.join(tmpdir, pojoname + ".java")
    assert os.path.exists(java_file), "Expected file {0} to exist, but it does not.".format(java_file)
    print("java code saved in {0}".format(java_file))

    print("Compiling Java Pojo")
    javac_cmd = ["javac", "-cp", h2o_genmodel_jar, "-J-Xmx12g", "-J-XX:MaxPermSize=256m", java_file]
    subprocess.check_call(javac_cmd)

    if not compile_only:
        print("Setting up for Java POJO")
        in_csv = os.path.join(tmpdir, "in.csv")
        h2o.download_csv(test, in_csv)
        assert os.path.exists(in_csv), "Expected file {0} to exist, but it does not.".format(in_csv)
        print("Input CSV to MungeCsv saved in {0}".format(in_csv))

        print("Munging in H2O")  # clarified: this step munges rather than predicts
        munged = assembly.fit(test)
        munged.head()
        out_h2o_csv = os.path.join(tmpdir, "out_h2o.csv")
        h2o.download_csv(munged, out_h2o_csv)
        assert os.path.exists(out_h2o_csv), "Expected file {0} to exist, but it does not.".format(out_h2o_csv)
        print("Munged frame saved in {0}".format(out_h2o_csv))

        print("Running MungeCsv Java Program")
        out_pojo_csv = os.path.join(tmpdir, "out_pojo.csv")
        cp_sep = ";" if sys.platform == "win32" else ":"
        java_cmd = ["java", "-ea", "-cp", h2o_genmodel_jar + cp_sep + tmpdir, "-Xmx12g", "-XX:MaxPermSize=2g",
                    "-XX:ReservedCodeCacheSize=256m", "hex.genmodel.tools.MungeCsv", "--header", "--munger", pojoname,
                    "--input", in_csv, "--output", out_pojo_csv]
        print("JAVA COMMAND: " + " ".join(java_cmd))
        p = subprocess.Popen(java_cmd, stdout=PIPE, stderr=STDOUT)
        o, e = p.communicate()
        print("Java output: {0}".format(o))
        assert os.path.exists(out_pojo_csv), "Expected file {0} to exist, but it does not.".format(out_pojo_csv)
        munged2 = h2o.upload_file(path=out_pojo_csv)
        print("Pojo munged frame saved in {0}".format(out_pojo_csv))

        print("Comparing munged frames between H2O and Java POJO")
        # Dimensions
        hr, hc = munged.dim
        pr, pc = munged2.dim
        assert hr == pr, "Expected the same number of rows, but got {0} and {1}".format(hr, pr)
        assert hc == pc, "Expected the same number of cols, but got {0} and {1}".format(hc, pc)
        # Values
        import math
        munged.show()
        munged2.show()
        for r in range(hr):
            for c in range(hc):
                hp = munged[r, c]
                pp = munged2[r, c]
                if isinstance(hp, float):
                    assert isinstance(pp, float)
                    assert (math.fabs(hp - pp) < 1e-8) or (math.isnan(hp) and math.isnan(pp)), \
                        "Expected munged rows to be the same for row {0}, but got {1} and {2}".format(r, hp, pp)
                else:
                    assert hp == pp, "Expected munged rows to be the same for row {0}, but got {1} and {2}".format(r, hp, pp)
def javapredict(algo, equality, train, test, x, y, compile_only=False, **kwargs):
    print("Creating model in H2O")
    if algo == "gbm":
        model = H2OGradientBoostingEstimator(**kwargs)
    elif algo == "random_forest":
        model = H2ORandomForestEstimator(**kwargs)
    elif algo == "deeplearning":
        model = H2ODeepLearningEstimator(**kwargs)
    elif algo == "glm":
        model = H2OGeneralizedLinearEstimator(**kwargs)
    elif algo == "naive_bayes":
        model = H2ONaiveBayesEstimator(**kwargs)
    elif algo == "kmeans":
        model = H2OKMeansEstimator(**kwargs)
    elif algo == "pca":
        model = H2OPCA(**kwargs)
    else:
        raise ValueError("algo {0} is not supported".format(algo))  # message added to the previously bare raise
    if algo == "kmeans" or algo == "pca":
        model.train(x=x, training_frame=train)
    else:
        model.train(x=x, y=y, training_frame=train)
    print(model)

    # HACK: munge model._id so that it conforms to Java class name. For example, change K-means to K_means.
    # TODO: clients should extract the Java class name from the header.
    regex = re.compile("[+\\-* !@#$%^&()={}\\[\\]|;:'\"<>,.?/]")
    pojoname = regex.sub("_", model._id)

    print("Downloading Java prediction model code from H2O")
    tmpdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "results", pojoname))
    os.mkdir(tmpdir)
    h2o.download_pojo(model, path=tmpdir)
    h2o_genmodel_jar = os.path.join(tmpdir, "h2o-genmodel.jar")
    assert os.path.exists(h2o_genmodel_jar), "Expected file {0} to exist, but it does not.".format(h2o_genmodel_jar)
    print("h2o-genmodel.jar saved in {0}".format(h2o_genmodel_jar))
    java_file = os.path.join(tmpdir, pojoname + ".java")
    assert os.path.exists(java_file), "Expected file {0} to exist, but it does not.".format(java_file)
    print("java code saved in {0}".format(java_file))

    print("Compiling Java Pojo")
    javac_cmd = ["javac", "-cp", h2o_genmodel_jar, "-J-Xmx12g", "-J-XX:MaxPermSize=256m", java_file]
    subprocess.check_call(javac_cmd)

    if not compile_only:
        print("Predicting in H2O")
        predictions = model.predict(test)
        predictions.summary()
        predictions.head()
        out_h2o_csv = os.path.join(tmpdir, "out_h2o.csv")
        h2o.download_csv(predictions, out_h2o_csv)
        assert os.path.exists(out_h2o_csv), "Expected file {0} to exist, but it does not.".format(out_h2o_csv)
        print("H2O Predictions saved in {0}".format(out_h2o_csv))

        print("Setting up for Java POJO")
        in_csv = os.path.join(tmpdir, "in.csv")
        h2o.download_csv(test[x], in_csv)
        # hack: the PredictCsv driver can't handle quoted strings, so remove them
        f = open(in_csv, 'r+')
        csv = f.read()
        csv = re.sub('\"', '', csv)
        f.seek(0)
        f.write(csv)
        f.truncate()
        f.close()
        assert os.path.exists(in_csv), "Expected file {0} to exist, but it does not.".format(in_csv)
        print("Input CSV to PredictCsv saved in {0}".format(in_csv))

        print("Running PredictCsv Java Program")
        out_pojo_csv = os.path.join(tmpdir, "out_pojo.csv")
        cp_sep = ";" if sys.platform == "win32" else ":"
        java_cmd = ["java", "-ea", "-cp", h2o_genmodel_jar + cp_sep + tmpdir, "-Xmx12g", "-XX:MaxPermSize=2g",
                    "-XX:ReservedCodeCacheSize=256m", "hex.genmodel.tools.PredictCsv", "--header", "--model", pojoname,
                    "--input", in_csv, "--output", out_pojo_csv]
        p = subprocess.Popen(java_cmd, stdout=PIPE, stderr=STDOUT)
        o, e = p.communicate()
        print("Java output: {0}".format(o))
        assert os.path.exists(out_pojo_csv), "Expected file {0} to exist, but it does not.".format(out_pojo_csv)
        predictions2 = h2o.upload_file(path=out_pojo_csv)
        print("Pojo predictions saved in {0}".format(out_pojo_csv))

        print("Comparing predictions between H2O and Java POJO")
        # Dimensions
        hr, hc = predictions.dim
        pr, pc = predictions2.dim
        assert hr == pr, "Expected the same number of rows, but got {0} and {1}".format(hr, pr)
        assert hc == pc, "Expected the same number of cols, but got {0} and {1}".format(hc, pc)
        # Values
        for r in range(hr):
            hp = predictions[r, 0]
            if equality == "numeric":
                pp = float.fromhex(predictions2[r, 0])
                assert abs(hp - pp) < 1e-4, \
                    "Expected predictions to be the same (within 1e-4) for row {0}, but got {1} and {2}".format(r, hp, pp)
            elif equality == "class":
                pp = predictions2[r, 0]
                assert hp == pp, "Expected predictions to be the same for row {0}, but got {1} and {2}".format(r, hp, pp)
            else:
                raise ValueError("equality type {0} is not supported".format(equality))  # message added to the bare raise
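Note: the float.fromhex calls in both javapredict variants exist because PredictCsv can emit numeric predictions as hexadecimal float literals, which round-trip exactly where decimal text may lose precision. A quick stdlib illustration:

x = 0.1
s = x.hex()                   # e.g. '0x1.999999999999ap-4'
assert float.fromhex(s) == x  # lossless round-trip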