def gbm_on_hive():
    connection_url = "jdbc:hive2://localhost:10000/default"
    krb_enabled = os.getenv('KRB_ENABLED', 'false').lower() == 'true'
    use_token = os.getenv('KRB_USE_TOKEN', 'false').lower() == 'true'
    if krb_enabled:
        if use_token:
            connection_url += ";auth=delegationToken"
        else:
            connection_url += ";principal=%s" % os.getenv('HIVE_PRINCIPAL', 'hive/[email protected]')
    select_query = "select * from airlinestest"
    username = "******"
    password = ""

    # read from S3
    airlines_dataset_original = h2o.import_file(
        path="https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/AirlinesTest.csv.zip")
    # read from Hive Streaming
    airlines_dataset_streaming = h2o.import_sql_select(connection_url, select_query, username, password,
                                                       fetch_mode="SINGLE")
    airlines_dataset_streaming = adapt_airlines(airlines_dataset_streaming)

    # datasets should be identical from the user's point of view
    pyunit_utils.compare_frames_local(airlines_dataset_original, airlines_dataset_streaming, 1)

    airlines_X_col_names = airlines_dataset_streaming.col_names[:-2]
    airlines_y_col_name = airlines_dataset_streaming.col_names[-2]
    gbm_v1 = H2OGradientBoostingEstimator(model_id="gbm_airlines_v1", seed=2000000)
    gbm_v1.train(airlines_X_col_names, airlines_y_col_name,
                 training_frame=airlines_dataset_streaming,
                 validation_frame=airlines_dataset_streaming)
    # demonstrates that metrics can be slightly different due to different chunking on the backend
    assert isclose(gbm_v1.auc(train=True), gbm_v1.auc(valid=True), rtol=1e-4)
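
# adapt_airlines() is defined elsewhere in the original test module. The sketch below is only a
# hedged guess at its intent: coerce columns of the Hive-streamed frame back to factors so that
# compare_frames_local() sees the same column types as the CSV import. The real helper may differ.
def adapt_airlines(frame):
    # hypothetical: convert string columns delivered by Hive streaming into factors
    for name in frame.col_names:
        if frame.type(name) == "string":
            frame[name] = frame[name].asfactor()
    return frame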
def test_glm_multinomial_makeGLMModel():
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    mL = glm(family='multinomial', alpha=[0.1], Lambda=[0.9])
    d[54] = d[54].asfactor()
    mL.train(training_frame=d, x=list(range(0, 54)), y=54)
    r = glm.getGLMRegularizationPath(mL)
    rank = check_nonzero_coefs(r['coefficients'][0])
    assert rank == mL._model_json["output"]["rank"], \
        "expected rank: {0}, actual rank: {1}.".format(rank, mL._model_json["output"]["rank"])
    m2 = glm.makeGLMModel(model=mL, coefs=r['coefficients'][0])  # model generated from setting coefficients to model
    f1 = mL.predict(d)
    f2 = m2.predict(d)
    pyunit_utils.compare_frames_local(f1, f2, prob=1)

    coefs = r['coefficients'][0]
    coefs["wendy_dreams"] = 0.123  # add an extra coefficient to the model coefficients
    try:
        glm.makeGLMModel(model=mL, coefs=coefs)
        assert False, "Should have thrown an exception!"
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
               ("model coefficient length 371 is different from coefficient provided by user" in temp), \
            "Wrong exception was received."
    print("glm Multinomial makeGLMModel test completed!")
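
# check_nonzero_coefs() is referenced above but not defined in this snippet. The sketch below is
# an assumption about its intent: count how many entries of the multinomial coefficient map are
# non-zero, which is compared against the "rank" that GLM reports. The real helper may differ.
def check_nonzero_coefs(coef_dict):
    # hypothetical: number of non-zero coefficients across all classes and predictors
    return sum(1 for value in coef_dict.values() if abs(value) > 0)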
def calibration_test():
    df = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
    df["Angaus"] = df["Angaus"].asfactor()
    df["Weights"] = h2o.H2OFrame.from_python(abs(np.random.randn(df.nrow, 1)).tolist())[0]
    print(df.col_names)
    train, calib = df.split_frame(ratios=[.8], destination_frames=["eco_train", "eco_calib"], seed=42)

    model = H2ORandomForestEstimator(
        ntrees=100,
        distribution="bernoulli",
        min_rows=10,
        max_depth=5,
        weights_column="Weights",
        calibrate_model=True,
        calibration_frame=calib
    )
    model.train(
        x=list(range(2, train.ncol)),
        y="Angaus",
        training_frame=train
    )

    preds = model.predict(train)
    # Check that calibrated probabilities were appended to the output frame
    assert preds.col_names == ["predict", "p0", "p1", "cal_p0", "cal_p1"]

    # Manually calibrate the probabilities with a binomial GLM (Platt scaling) and compare
    preds_calib = model.predict(calib)
    manual_calib_input = preds_calib["p1"].cbind(calib[["Angaus", "Weights"]])
    manual_calib_input.col_names = ["p1", "response", "weights"]
    manual_calib_model = H2OGeneralizedLinearEstimator(
        family="binomial",
        weights_column="weights",
        lambda_=0,
        intercept=True
    )
    manual_calib_model.train(y="response", training_frame=manual_calib_input)
    manual_calib_predicted = manual_calib_model.predict(preds["p1"])

    pyunit_utils.compare_frames_local(preds["cal_p1"], manual_calib_predicted["p1"], prob=1)
def checkCorrectSkips(originalFullFrame, csvfile, skipped_columns, uuidNames):
    skippedFrameUF = h2o.upload_file(csvfile, skipped_columns=skipped_columns)
    skippedFrameIF = h2o.import_file(csvfile, skipped_columns=skipped_columns)
    # these two frames should be the same
    pyunit_utils.compare_frames_local(skippedFrameUF, skippedFrameIF, prob=0.5)

    skipCounter = 0
    typeDict = originalFullFrame.types
    frameNames = originalFullFrame.names
    for cindex in range(len(frameNames)):
        if cindex not in skipped_columns:
            if typeDict[frameNames[cindex]] == u'enum':
                pyunit_utils.compare_frames_local_onecolumn_NA_enum(originalFullFrame[cindex],
                                                                    skippedFrameIF[skipCounter], prob=1, tol=1e-10,
                                                                    returnResult=False)
            elif typeDict[frameNames[cindex]] == u'string':
                pyunit_utils.compare_frames_local_onecolumn_NA_string(originalFullFrame[cindex],
                                                                      skippedFrameIF[skipCounter], prob=1,
                                                                      returnResult=False)
            else:
                pyunit_utils.compare_frames_local_onecolumn_NA(originalFullFrame[cindex], skippedFrameIF[skipCounter],
                                                               prob=1, tol=1e-10, returnResult=False)
            skipCounter = skipCounter + 1

    # since we cannot check uuid contents, we at least need to know that the returned frame
    # contains the correct column names
    frameNames.extend(uuidNames)
    skippedFrameNames = skippedFrameIF.names
    for skipIndex in skipped_columns:
        assert frameNames[skipIndex] not in skippedFrameNames, \
            "This column: {0}/{1} should have been skipped but is not!".format(frameNames[skipIndex], skipIndex)
def glm_multinomial_mojo_pojo():
    PROBLEM = "multinomial"
    NTESTROWS = 200
    params = set_params()  # set GLM model parameters
    df = pyunit_utils.random_dataset(PROBLEM)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})

    glmMultinomialModel = pyunit_utils.build_save_model_GLM(params, x, train, "response")  # build and save mojo model
    MOJONAME = pyunit_utils.getMojoName(glmMultinomialModel._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))

    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glmMultinomialModel, TMPDIR, MOJONAME)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
    pred_pojo = pyunit_utils.pojo_predict(glmMultinomialModel, TMPDIR, MOJONAME)
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)  # make sure operation sequence is preserved
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
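
# set_params() comes from module-level code that is not shown here. The version below is only a
# plausible stand-in so the test reads self-contained: it returns a GLM parameter dict matching
# the multinomial PROBLEM above. The actual parameter choices in the real helper may differ.
def set_params():
    params = {
        'family': "multinomial",               # assumed: matches PROBLEM above
        'alpha': [0.5],                        # assumed regularization mix
        'lambda_': [1e-4],                     # assumed regularization strength
        'missing_values_handling': "MeanImputation",
    }
    print(params)
    return params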
def hive_import():
    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hive_host = os.getenv("HIVE_HOST")
    connection_url = "jdbc:hive2://{0}:10000/default".format(hive_host)
    krb_enabled = os.getenv('KRB_ENABLED', 'false').lower() == 'true'
    if krb_enabled:
        connection_url += ";auth=delegationToken"

    # read original
    file_url = "hdfs://{0}{1}".format(hdfs_name_node, "/user/jenkins/smalldata/chicago/chicagoCensus.csv")
    dataset_original = h2o.import_file(file_url)

    # read TABLE from Hive JDBC
    table_jdbc = h2o.import_sql_table(connection_url, "chicago", "", "", fetch_mode="SINGLE")
    table_jdbc = adapt_frame(table_jdbc, column_prefix="chicago.")
    pyunit_utils.compare_frames_local(dataset_original, table_jdbc, prob=1)

    # read TABLE from Hive FS
    table_direct = h2o.import_hive_table(connection_url, "chicago")
    table_direct = adapt_frame(table_direct)
    pyunit_utils.compare_frames_local(dataset_original, table_direct, prob=1)
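
# adapt_frame() is not part of this snippet. A minimal sketch is given below under the assumption
# that it only strips the "table." prefix that Hive JDBC prepends to column names, so the imported
# table lines up with the original CSV frame; the real helper may also adjust column types.
def adapt_frame(dataset, column_prefix=""):
    if column_prefix:
        # hypothetical: drop the "chicago." style prefix from every column name
        dataset.set_names([name[len(column_prefix):] if name.startswith(column_prefix) else name
                           for name in dataset.names])
    return dataset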
def testOrdinalLogit():
    Dtrain = h2o.import_file(
        pyunit_utils.locate("bigdata/laptop/glm_ordinal_logit/ordinal_ordinal_20_training_set.csv"))
    Dtrain["C21"] = Dtrain["C21"].asfactor()

    print("Fit model on dataset")
    model = glm(family="ordinal", alpha=[0.5], lambda_=[0.001], max_iterations=1000, beta_epsilon=1e-8,
                objective_epsilon=1e-8)
    model.train(x=list(range(0, 20)), y="C21", training_frame=Dtrain)
    predH2O = model.predict(Dtrain)
    r = glm.getGLMRegularizationPath(model)
    m2 = glm.makeGLMModel(model=model, coefs=r['coefficients'][0])  # model generated from setting coefficients to model
    f2 = m2.predict(Dtrain)
    pyunit_utils.compare_frames_local(predH2O, f2, prob=1)

    coefs = r['coefficients'][0]
    coefs['h2o_dream'] = 3.1415
    try:
        glm.makeGLMModel(model=model, coefs=coefs)
        assert False, "Should have thrown an exception!"
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
               ("model coefficient length 189 is different from coefficient provided by user " in temp), \
            "Wrong exception was received."
    print("coefficient test passed!")
def testFrameTransform():
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/anovaGlm/Moore.csv"))
    answer = h2o.import_file(path=pyunit_utils.locate("smalldata/anovaGlm/MooreTransformed.csv"))
    y = 'conformity'
    x = ['fcategory', 'partner.status']

    model = H2OANOVAGLMEstimator(family='gaussian', lambda_=0, save_transformed_framekeys=True)
    model.train(x=x, y=y, training_frame=train)
    transformFrame = h2o.get_frame(model._model_json['output']['transformed_columns_key']['name'])
    pyunit_utils.compare_frames_local(
        answer[['fcategory1', 'fcategory2', 'partner.status1', 'fcategory1:partner.status1',
                'fcategory2:partner.status1']],
        transformFrame[['fcategory_high', 'fcategory_low', 'partner.status_high',
                        'fcategory_high:partner.status_high', 'fcategory_low:partner.status_high']],
        prob=1)
def checkCorrectSkips(originalFullFrame, csvfile, skipped_columns):
    skippedFrameUF = h2o.upload_file(csvfile, skipped_columns=skipped_columns)
    skippedFrameIF = h2o.import_file(csvfile, skipped_columns=skipped_columns)
    # these two frames should be the same
    pyunit_utils.compare_frames_local(skippedFrameUF, skippedFrameIF, prob=0.5)

    skipCounter = 0
    typeDict = originalFullFrame.types
    frameNames = originalFullFrame.names
    for cindex in range(len(frameNames)):
        if cindex not in skipped_columns:
            print("Checking column {0}...".format(cindex))
            if typeDict[frameNames[cindex]] == u'enum' and cindex == 10:  # look at original frame
                continue
            elif typeDict[frameNames[cindex]] == u'enum' and not (skipCounter == 10):
                pyunit_utils.compare_frames_local_onecolumn_NA_enum(originalFullFrame[cindex],
                                                                    skippedFrameIF[skipCounter], prob=1, tol=1e-10,
                                                                    returnResult=False)
            elif typeDict[frameNames[cindex]] == u'string':
                pyunit_utils.compare_frames_local_onecolumn_NA_string(originalFullFrame[cindex],
                                                                      skippedFrameIF[skipCounter], prob=1,
                                                                      returnResult=False)
            elif typeDict[frameNames[cindex]] == u'int':
                pyunit_utils.compare_frames_local_onecolumn_NA(originalFullFrame[cindex],
                                                               skippedFrameIF[skipCounter].asnumeric(), prob=1,
                                                               tol=1e-10, returnResult=False)
            skipCounter = skipCounter + 1
def test_makeGLMModel():
    # read in the dataset and construct training set (and validation set)
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    m = glm(family='binomial', Lambda=[0.001], alpha=[0.5], solver='COORDINATE_DESCENT')
    m.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    r = glm.getGLMRegularizationPath(m)
    m2 = glm.makeGLMModel(model=m, coefs=r['coefficients'][0])
    f1 = m.predict(d)   # predict with original model
    f2 = m2.predict(d)  # predict with model out of makeGLMModel
    pyunit_utils.compare_frames_local(f1[1], f2[1], prob=1)

    coefs = r['coefficients'][0]
    coefs['wendy_dreams'] = 8
    try:
        glm.makeGLMModel(model=m, coefs=coefs)
        assert False, "Test failed: should have thrown an exception for the bad coefficient length!"
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
               ("model coefficient length 8 is different from coefficient provided by user " in temp), \
            "Wrong exception was received."
    print("makeGLMModel test passed!")
def test_modelselection_gaussian_model_id():
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    my_y = "GLEASON"
    my_x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    allsubsets_model = modelSelection(seed=12345, max_predictor_number=7, mode="allsubsets")
    allsubsets_model.train(training_frame=d, x=my_x, y=my_y)
    result_frame_allsubsets = allsubsets_model.result()
    numRows = result_frame_allsubsets.nrows
    modelIDs_allsubsets = allsubsets_model._model_json["output"]["best_model_ids"]

    maxr_model = modelSelection(seed=12345, max_predictor_number=7, mode="maxr")
    maxr_model.train(training_frame=d, x=my_x, y=my_y)
    result_frame_maxr = maxr_model.result()

    for ind in list(range(numRows)):
        model_from_frame_allsubsets = h2o.get_model(result_frame_allsubsets["model_id"][ind, 0])
        pred_frame_allsubsets = model_from_frame_allsubsets.predict(d)
        model_from_frame_allsubsets = h2o.get_model(modelIDs_allsubsets[ind]['name'])
        pred_id_allsubsets = model_from_frame_allsubsets.predict(d)
        pyunit_utils.compare_frames_local(pred_frame_allsubsets, pred_id_allsubsets, prob=1)
        model_from_frame_maxr = h2o.get_model(result_frame_maxr["model_id"][ind, 0])
        pred_frame_maxr = model_from_frame_maxr.predict(d)
        pyunit_utils.compare_frames_local(pred_frame_allsubsets, pred_frame_maxr, prob=1, tol=1e-6)
def test_modelselection_backward_serialization():
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    y = "GLEASON"
    x = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    # make sure duplicate runs produce the same results
    model_backward = modelSelection(seed=12345, mode="backward", family='negativebinomial', link="log", alpha=0.5,
                                    lambda_=0, theta=0.01)
    model_backward.train(training_frame=d, x=x, y=y)
    model_backward2 = modelSelection(seed=12345, mode="backward", family='negativebinomial', link="log", alpha=0.5,
                                     lambda_=0, theta=0.01)
    model_backward2.train(training_frame=d, x=x, y=y)
    result = model_backward.result()    # get result frame
    result2 = model_backward2.result()  # get result frame of the duplicate run
    pyunit_utils.compare_frames_local(result[2:5], result2[2:5], prob=1.0)  # results from both models should be the same

    num_models = result.nrows  # number of models built
    one_model = h2o.get_model(result["model_id"][num_models - 1, 0])
    predict_frame = one_model.predict(d)
    tmpdir = tempfile.mkdtemp()
    file_dir = os.path.join(tmpdir, "predict.csv")
    h2o.download_csv(predict_frame, file_dir)  # save one scoring frame
    model_path_backward = model_backward.download_model(tmpdir)  # store the model

    h2o.remove_all()
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    loaded_backward_model = h2o.load_model(model_path_backward)
    result_frame_backward = loaded_backward_model.result()

    model_from_frame_backward = h2o.get_model(result_frame_backward["model_id"][num_models - 1, 0])
    pred_frame_backward = model_from_frame_backward.predict(d)
    pred_frame_model = h2o.import_file(file_dir)
    pyunit_utils.compare_frames_local(pred_frame_backward, pred_frame_model, prob=1.0)
def test_gam_gamColumns():
    h2o_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"))
    h2o_data["C1"] = h2o_data["C1"].asfactor()
    h2o_data["C2"] = h2o_data["C2"].asfactor()
    myX = ["C1", "C2"]
    myY = "C11"
    h2o_data["C11"] = h2o_data["C11"].asfactor()

    h2o_model = H2OGeneralizedAdditiveEstimator(family="multinomial", gam_columns=["C6", "C7", "C8"],
                                                keep_gam_cols=True, scale=[1, 1, 1], num_knots=[5, 5, 5])
    h2o_model.train(x=myX, y=myY, training_frame=h2o_data)
    gamFrame = h2o.get_frame(h2o_model._model_json["output"]["gam_transformed_center_key"])
    gamFrame = gamFrame.drop("C1").drop("C2").drop("C11")

    gamFrameAns = h2o.import_file(
        pyunit_utils.locate("smalldata/gam_test/multinomial_10_classes_10_cols_10000_Rows_train_C6Gam_center.csv"))
    gamFrameAns = gamFrameAns.cbind(h2o.import_file(
        pyunit_utils.locate("smalldata/gam_test/multinomial_10_classes_10_cols_10000_Rows_train_C7Gam_center.csv")))
    gamFrameAns = gamFrameAns.cbind(h2o.import_file(
        pyunit_utils.locate("smalldata/gam_test/multinomial_10_classes_10_cols_10000_Rows_train_C8Gam_center.csv")))
    pyunit_utils.compare_frames_local(gamFrameAns, gamFrame)
    print("gam gamcolumn test completed successfully")
def glm_binomial_mojo_pojo():
    h2o.remove_all()
    NTESTROWS = 200  # number of test dataset rows
    PROBLEM = "binomial"
    params = set_params()  # set GLM model parameters
    df = pyunit_utils.random_dataset(PROBLEM)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})

    TMPDIR = tempfile.mkdtemp()
    glmBinomialModel = pyunit_utils.build_save_model_generic(params, x, train, "response", "glm",
                                                             TMPDIR)  # build and save mojo model
    MOJONAME = pyunit_utils.getMojoName(glmBinomialModel._id)

    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glmBinomialModel, TMPDIR, MOJONAME)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
    pred_pojo = pyunit_utils.pojo_predict(glmBinomialModel, TMPDIR, MOJONAME)
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)  # make sure operation sequence is preserved
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
def test_glrm_transform():
    # generate training and test frames
    m = 1000
    n = 100
    k = 8
    np.random.seed(12345)

    print("Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n))
    Y = np.random.rand(k, n)
    X = np.random.rand(m, k)
    train = np.dot(X, Y)
    train_h2o = h2o.H2OFrame(train.tolist())
    frames = train_h2o.split_frame(ratios=[0.9])
    train = frames[0]
    test = frames[1]

    glrm_h2o = H2OGeneralizedLowRankEstimator(k=k, loss="Quadratic", seed=12345)
    glrm_h2o.train(x=train_h2o.names, training_frame=train)
    predFrame = glrm_h2o.predict(test)
    xFrame = glrm_h2o.transform_frame(test)

    glrm_h2o2 = H2OGeneralizedLowRankEstimator(k=k, loss="Quadratic", seed=12345)
    glrm_h2o2.train(x=train_h2o.names, training_frame=train)
    xFrame2 = glrm_h2o2.transform_frame(test)

    assert predFrame.nrows == xFrame.nrows, \
        "predictor frame number of rows: {0}, transform frame number of rows: {1}".format(predFrame.nrows, xFrame.nrows)
    pyunit_utils.compare_frames_local(xFrame, xFrame2, prob=1.0, tol=1e-6)
def gam_binomial_mojo():
    params = set_params()
    train = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    test = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    train["C21"] = train["C21"].asfactor()
    test["C21"] = test["C21"].asfactor()
    x = ["C1"]
    y = "C21"

    TMPDIR = tempfile.mkdtemp()
    gamModel = pyunit_utils.build_save_model_generic(params, x, train, y, "gam", TMPDIR)  # build and save mojo model
    MOJONAME = pyunit_utils.getMojoName(gamModel._id)

    h2o.download_csv(test, os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(gamModel, TMPDIR, MOJONAME)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)  # make sure operation sequence is preserved
def glm_fractional_binomial_mojo_pojo():
    params = set_params()
    train = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/fraction_binommialOrig.csv"))
    test = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/fraction_binommialOrig.csv"))
    x = ["log10conc"]
    y = "y"

    glmModel = pyunit_utils.build_save_model_GLM(params, x, train, y)  # build and save mojo model
    MOJONAME = pyunit_utils.getMojoName(glmModel._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))

    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glmModel, TMPDIR, MOJONAME)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
    pred_pojo = pyunit_utils.pojo_predict(glmModel, TMPDIR, MOJONAME)
    pred_h2o = pred_h2o.drop(3)
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)  # make sure operation sequence is preserved
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
def test_arrange_OOM():
    '''
    PUBDEV-5990: a customer reported that h2o.arrange (sorting) takes far more memory than
    expected for sparse datasets of about 1 GB. Thanks to Lauren DiPerna for finding the dataset
    that reproduces the problem.
    '''
    df = h2o.import_file(pyunit_utils.locate("bigdata/laptop/jira/sort_OOM.csv"))
    t1 = time.time()
    newFrame = df.sort("sort_col")
    print(newFrame[0, 0])
    elapsed_time = time.time() - t1
    print("time taken to perform sort is {0}".format(elapsed_time))

    # check and make sure the sort column contains the right values after sorting!
    answerFrame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/jira/sort_OOM_answer.csv"))

    # compare sort_col from my sort with the answer frame
    pyunit_utils.compare_frames_local(answerFrame["sort_col"], newFrame["sort_col"])

    # compare a few more randomly chosen columns with the answer frame; comparing all columns would take too long
    allColumns = list(range(0, df.ncols))
    random.shuffle(allColumns)
    pyunit_utils.compare_frames_local(answerFrame[allColumns[0:5]], newFrame[allColumns[0:5]])
def test_negBinomial_makeGLMModel():
    print("Read in prostate data.")
    h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))
    print("Testing for family: Negative Binomial")
    print("Set variables for h2o.")
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    thetas = [0.000000001, 0.01, 0.1, 0.5, 1]
    for thetaO in thetas:
        h2o_model_log = H2OGeneralizedLinearEstimator(family="negativebinomial", link="log", alpha=0.5, Lambda=0.0001,
                                                      theta=thetaO)
        h2o_model_log.train(x=myX, y=myY, training_frame=h2o_data)
        predictModel = h2o_model_log.predict(h2o_data)
        r = H2OGeneralizedLinearEstimator.getGLMRegularizationPath(h2o_model_log)
        makeModel = H2OGeneralizedLinearEstimator.makeGLMModel(
            model=h2o_model_log, coefs=r['coefficients'][0])  # model generated from setting coefficients to model
        predictMake = makeModel.predict(h2o_data)
        pyunit_utils.compare_frames_local(predictModel, predictMake, prob=1)
def glm_ordinal_mojo_pojo():
    h2o.remove_all()
    params = set_params()  # set GLM model parameters
    df = random_dataset(PROBLEM)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})

    try:
        glmOrdinalModel = build_save_model(params, x, train, "response")  # build and save mojo model
        h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
        pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glmOrdinalModel, TMPDIR, MOJONAME)  # load model and perform predict
        h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
        pred_pojo = pyunit_utils.pojo_predict(glmOrdinalModel, TMPDIR, MOJONAME)
        print("Comparing mojo predict and h2o predict...")
        pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)  # make sure operation sequence is preserved
        h2o.save_model(glmOrdinalModel, path=TMPDIR, force=True)  # save model for debugging
        print("Comparing pojo predict and h2o predict...")
        pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
    except Exception as ex:
        print("*************** ERROR and type is ")
        print(str(type(ex)))
        print(ex)
        if "AssertionError" in str(type(ex)):  # only care if there is an AssertionError, ignore the others
            sys.exit(1)
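
# build_save_model(), PROBLEM, NTESTROWS, TMPDIR and MOJONAME come from module-level code that is
# not shown here. The sketch below is only a hedged guess at build_save_model(): train an ordinal
# GLM with the supplied parameters and download its MOJO into TMPDIR. Treat it as an illustration
# under those assumptions, not the original helper.
def build_save_model(params, x, train, response_col):
    # hypothetical: train the ordinal GLM and export the MOJO next to the test results
    model = H2OGeneralizedLinearEstimator(**params)
    model.train(x=x, y=response_col, training_frame=train)
    model.download_mojo(path=TMPDIR)  # assumes TMPDIR is defined at module level
    return model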
def link_functions_tweedie_vpow():
    np.random.seed(1234)
    n_rows = 10
    data = {
        "X1": np.random.randn(n_rows),
        "X2": np.random.randn(n_rows),
        "X3": np.random.randn(n_rows),
        "W": np.random.choice([10, 20], size=n_rows),
        "Y": np.random.choice([0, 0, 0, 0, 0, 10, 20, 30], size=n_rows)
    }
    train = h2o.H2OFrame(pd.DataFrame(data))
    test = train.drop("W")
    print(train)

    h2o_model = H2OGeneralizedAdditiveEstimator(family="tweedie",
                                                gam_columns=["X3"],
                                                weights_column="W",
                                                lambda_=0,
                                                tweedie_variance_power=1.5,
                                                bs=[2],
                                                tweedie_link_power=0)
    h2o_model.train(x=["X1", "X2"], y="Y", training_frame=train)

    predict_w = h2o_model.predict(train)
    predict = h2o_model.predict(test)  # scoring without weight column

    # should produce the same frame
    pyunit_utils.compare_frames_local(predict_w, predict, prob=1, tol=1e-6)
def buildModelCheckPredict(train_data, test_data, model_test_data, myy, gamX, family):
    numKnots = [5, 5, 5]
    x = ["C1", "C2"]

    h2o_model = H2OGeneralizedAdditiveEstimator(family=family, gam_columns=gamX, scale=[1, 1, 1], num_knots=numKnots,
                                                standardize=True, Lambda=[0], alpha=[0], max_iterations=3,
                                                compute_p_values=False, solver="irlsm")
    h2o_model.train(x=x, y=myy, training_frame=train_data)
    pred = h2o_model.predict(test_data)
    if pred.ncols < model_test_data.ncols:
        ncolT = model_test_data.ncols - 1
        model_test_data = model_test_data.drop(ncolT)
    if family == 'gaussian':
        pyunit_utils.compare_frames_local(pred, model_test_data, prob=1)
    else:
        pred = pred.drop('predict')
        model_test_data = model_test_data.drop('predict')
        pyunit_utils.compare_frames_local(pred, model_test_data, prob=1)
def test_maxrglm_cross_validation_result_frame_model_id():
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    my_y = "C21"
    my_x = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10", "C11", "C12", "C13", "C14", "C15", "C16",
            "C17", "C18", "C19", "C20"]
    factorX = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]
    for x in factorX:
        d[x] = d[x].asfactor()
    n_folds = 3

    maxrglm_model = maxrglm(seed=12345, max_predictor_number=3, nfolds=n_folds, fold_assignment="auto")
    maxrglm_model.train(training_frame=d, x=my_x, y=my_y)
    resultFrame = maxrglm_model.result()
    numRows = resultFrame.nrows
    modelIDs = maxrglm_model._model_json["output"]["best_model_ids"]
    for ind in list(range(numRows)):
        model_frame = h2o.get_model(resultFrame["model_id"][ind, 0])
        pred_frame = model_frame.predict(d)
        model_id = h2o.get_model(modelIDs[ind]['name'])
        pred_id = model_id.predict(d)
        pyunit_utils.compare_frames_local(pred_frame, pred_id, prob=1)
def mergeOneEmptyFrame():
    # PUBDEV-6987: merge an empty frame with a normal frame.
    file1 = h2o.H2OFrame({"A1": [1], "A2": [0]})
    file2 = h2o.H2OFrame({"A1": [], "A2": []})

    # all_x = all_y = False, only merge rows that appear in both the right and left frames
    f1Mergef2 = file1.merge(file2)  # right frame is empty (this used to stall)
    f2Mergef1 = file2.merge(file1)  # left frame is empty, should return empty frame
    f2Mergef2 = file2.merge(file2)  # merging of empty frame with just headers

    # all three frames should have zero rows
    assert f1Mergef2.nrows == 0, "Expected empty rows but actual number of rows is {0}!".format(f1Mergef2.nrows)
    assert f2Mergef1.nrows == 0, "Expected empty rows but actual number of rows is {0}!".format(f2Mergef1.nrows)
    assert f2Mergef2.nrows == 0, "Expected empty rows but actual number of rows is {0}!".format(f2Mergef2.nrows)

    f1Mergef2 = file1.merge(file2, all_x=True)  # should contain the content of file1, merge everything in f1
    f2Mergef1 = file2.merge(file1, all_y=True)  # should contain the content of file1, merge everything in f2
    assert f1Mergef2.nrow == 1, "Expected one row but actual number of rows is {0}!".format(f1Mergef2.nrows)
    assert f2Mergef1.nrow == 1, "Expected one row but actual number of rows is {0}!".format(f2Mergef1.nrows)
    pyunit_utils.compare_frames_local(f1Mergef2, file1, prob=1)
    pyunit_utils.compare_frames_local(f2Mergef1, file1, prob=1)
def test_gam_transformed_frame_serialization():
    h2o_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"))
    h2o_data["C1"] = h2o_data["C1"].asfactor()
    h2o_data["C2"] = h2o_data["C2"].asfactor()
    myX = ["C1", "C2"]
    myY = "C11"
    h2o_data["C11"] = h2o_data["C11"].asfactor()

    h2o_model = H2OGeneralizedAdditiveEstimator(family="multinomial", gam_columns=["C6", "C7", "C8"],
                                                keep_gam_cols=True, scale=[1, 1, 1], num_knots=[5, 5, 5])
    h2o_model.train(x=myX, y=myY, training_frame=h2o_data)
    gam_frame = h2o.get_frame(h2o_model._model_json["output"]["gam_transformed_center_key"])
    tmpdir = tempfile.mkdtemp()
    filename = os.path.join(tmpdir, "gamXFrame.csv")
    h2o.download_csv(gam_frame, filename)
    model_path = h2o.save_model(h2o_model, tmpdir)

    h2o.remove_all()
    loaded_model = h2o.load_model(model_path)
    gam_frame_loaded = h2o.get_frame(loaded_model._model_json["output"]["gam_transformed_center_key"])
    gam_frame_original = h2o.import_file(filename)
    pyunit_utils.compare_frames_local(gam_frame_loaded[2:15], gam_frame_original[2:15], prob=1, tol=1e-6)
    print("Test completed.")
def test_glrm_seeds():
    print("Importing iris_wheader.csv data...")
    irisH2O = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    irisH2O.describe()
    initMethods = ["random", "svd", "plus_plus", "user"]  # user mode without init values is equivalent to randomized
    seeds = [123456789, 987654321]

    for initM in initMethods:
        # the first two models are trained with the same seed and should be the same
        glrm_h2o_seed0 = setupTrainModel(initM, seeds[0])
        predict_seed0 = predGLRM(irisH2O, glrm_h2o_seed0)
        glrm_h2o_seed0Same = setupTrainModel(initM, seeds[0])
        predict_seed0Same = predGLRM(irisH2O, glrm_h2o_seed0Same)
        # trained with the same seed, the reconstructed datasets should be the same
        pyunit_utils.compare_frames_local(predict_seed0[0:4], predict_seed0Same[0:4], prob=1.0)

        # trained with a different seed, the reconstructed datasets should be different
        glrm_h2o_seed1 = setupTrainModel(initM, seeds[1])
        predict_seed1 = predGLRM(irisH2O, glrm_h2o_seed1)
        assert not pyunit_utils.compare_frames_local(predict_seed0[0:4], predict_seed1[0:4], prob=1.0,
                                                     returnResult=True), \
            "GLRM returned the same results with different random seeds."
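
# setupTrainModel() and predGLRM() live elsewhere in the original test module. The sketches below
# show one plausible shape for them: setupTrainModel() builds a GLRM on the iris frame with the
# given init method and seed, and predGLRM() reconstructs a frame with that model. The k and
# max_iterations values, and the reliance on a module-level irisH2O frame, are assumptions.
def setupTrainModel(init_method, seed):
    model = H2OGeneralizedLowRankEstimator(k=3, init=init_method, seed=seed, max_iterations=10)
    model.train(x=irisH2O.names, training_frame=irisH2O)  # assumes irisH2O is visible at module level
    return model

def predGLRM(frame, model):
    # reconstructed dataset from the GLRM factors
    return model.predict(frame)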
def glm_gamma_offset_mojo():
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))
    y = "DPROS"
    x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL"]
    x_offset = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "C1"]
    params = {'family': "gamma", 'offset_column': "C1"}
    offset = pyunit_utils.random_dataset_real_only(train.nrow, 1, realR=3, misFrac=0, randSeed=12345)
    train = train.cbind(offset)

    tmpdir = tempfile.mkdtemp()
    glm_gamma_model = pyunit_utils.build_save_model_generic(params, x, train, y, "glm", tmpdir)  # build and save mojo model
    MOJONAME = pyunit_utils.getMojoName(glm_gamma_model._id)

    h2o.download_csv(train[x_offset], os.path.join(tmpdir, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glm_gamma_model, tmpdir, MOJONAME)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(tmpdir, "h2oPred.csv"))
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)  # compare mojo and model predict
def gam_gaussian_mojo():
    h2o.remove_all()
    NTESTROWS = 200  # number of test dataset rows
    PROBLEM = "gaussian"
    params = set_params()  # set GAM model parameters
    df = pyunit_utils.random_dataset(PROBLEM, missing_fraction=0.001)  # generate random dataset
    dfnames = df.names

    # add GAM specific parameters
    params["gam_columns"] = []
    params["scale"] = []
    count = 0
    num_gam_cols = 3  # maximum number of gam columns
    for cname in dfnames:
        if not (cname == 'response') and (str(df.type(cname)) == "real"):
            params["gam_columns"].append(cname)
            params["scale"].append(0.001)
            count = count + 1
            if count >= num_gam_cols:
                break

    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})
    TMPDIR = tempfile.mkdtemp()
    gamGaussianModel = pyunit_utils.build_save_model_generic(params, x, train, "response", "gam",
                                                             TMPDIR)  # build and save mojo model
    MOJONAME = pyunit_utils.getMojoName(gamGaussianModel._id)

    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(gamGaussianModel, TMPDIR, MOJONAME)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)  # make sure operation sequence is preserved
def glrm_mojo():
    h2o.remove_all()
    NTESTROWS = 200  # number of test dataset rows
    df = pyunit_utils.random_dataset("regression")  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = df.names

    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types) - 1)]
    # build a GLRM model with the random dataset generated earlier
    glrmModel = H2OGeneralizedLowRankEstimator(k=3, transform=transformN, max_iterations=10)
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(glrmModel._model_json['output']['representation_name'])

    assert glrmTrainFactor.nrows == train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)
    save_GLRM_mojo(glrmModel)  # save mojo model
    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file

    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=True)  # save mojo predict
    for col in range(pred_h2o.ncols):
        if pred_h2o[col].isfactor():
            pred_h2o[col] = pred_h2o[col].asnumeric()
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)

    frameID, mojoXFactor = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=False)  # save mojo XFactor
    glrmTestFactor = h2o.get_frame("GLRMLoading_" + frameID)  # store the x Factor for the new test dataset
    print("Comparing mojo x Factor and model x Factor ...")
    pyunit_utils.compare_frames_local(glrmTestFactor, mojoXFactor, 1, tol=1e-10)
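
# save_GLRM_mojo() is defined at module level in the original test. The sketch below captures the
# assumed intent: export the model's MOJO zip into the results directory that mojo_predict() later
# reads from. The exact destination path is an assumption.
def save_GLRM_mojo(model):
    # hypothetical: download the MOJO next to the other test artifacts
    mojo_name = pyunit_utils.getMojoName(model._id)
    results_dir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", mojo_name))
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    model.download_mojo(path=results_dir)  # standard h2o model API call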
def test_makeGLMModel():
    # read in the dataset and construct training set (and validation set)
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    m = glm(family='gaussian', Lambda=[0.001], alpha=[0.5])
    m.train(training_frame=d, x=myX, y=myY)
    r = glm.getGLMRegularizationPath(m)
    m2 = glm.makeGLMModel(model=m, coefs=r['coefficients'][0])
    f1 = m.predict(d)   # predict with original model
    f2 = m2.predict(d)  # predict with model out of makeGLMModel
    pyunit_utils.compare_frames_local(f1, f2, prob=1)

    coefs = r['coefficients'][0]
    coefs['wendy_dreams'] = 8
    try:
        glm.makeGLMModel(model=m, coefs=coefs)
        assert False, "Should have thrown an exception for the bad coefficient length"
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
               ("model coefficient length 9 is different from coefficient provided by user " in temp), \
            "Wrong exception was received."
    print("coefficient test passed!")
def test_load_glrm():
    print("Importing iris_wheader.csv data...")
    irisH2O = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    irisH2O.describe()

    g_model = H2OGeneralizedLowRankEstimator(k=3)
    g_model.train(x=irisH2O.names, training_frame=irisH2O)
    yarch_old = g_model.archetypes()
    x_old = h2o.get_frame(g_model._model_json["output"]["representation_name"])
    predOld = g_model.predict(irisH2O)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "../..", "results"))
    try:
        TMPDIR = pyunit_utils.locate("results")  # find directory path to results folder
    except:
        os.makedirs(TMPDIR)
    h2o.save_model(g_model, path=TMPDIR, force=True)  # save model
    full_path_filename = os.path.join(TMPDIR, g_model._id)

    h2o.remove(g_model)
    model_reloaded = h2o.load_model(full_path_filename)
    pred = model_reloaded.predict(irisH2O)
    yarch = model_reloaded.archetypes()
    x = h2o.get_frame(model_reloaded._model_json["output"]["representation_name"])

    # assert differences between old and new are close; archetypes should be the same
    pyunit_utils.compare_frames_local(x, x_old, tol=1e-6)
    pyunit_utils.compare_frames_local(pred[0], predOld[0], tol=1)
    for k in range(3):
        pyunit_utils.equal_two_arrays(yarch_old[k], yarch[k], eps=1e-4, tolerance=1e-10)
    print("glrm model successfully loaded...")
def test_parquet_parser_column_skip():
    # import the airlines data as CSV and as Parquet, then re-load the Parquet file with
    # different skipped_columns settings
    csv = h2o.import_file(path=pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    parquetNoSkip = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/parquet/airlines-simple.snappy.parquet"))
    pyunit_utils.compare_frames_local(csv, parquetNoSkip, prob=1)  # should be the same here

    path = pyunit_utils.locate("smalldata/parser/parquet/airlines-simple.snappy.parquet")
    skip_all = list(range(csv.ncol))
    skip_even = list(range(0, csv.ncol, 2))
    skip_odd = list(range(1, csv.ncol, 2))
    skip_start_end = [0, csv.ncol - 1]
    skip_except_last = list(range(0, csv.ncol - 2))
    skip_except_first = list(range(1, csv.ncol))
    temp = list(range(0, csv.ncol))
    random.shuffle(temp)
    skip_random = []
    for index in range(0, csv.ncol//2):
        skip_random.append(temp[index])
    skip_random.sort()

    try:
        loadFileSkipAll = h2o.upload_file(path, skipped_columns=skip_all)
        sys.exit(1)  # should have failed here
    except Exception:
        pass

    try:
        importFileSkipAll = h2o.import_file(path, skipped_columns=skip_all)
        sys.exit(1)  # should have failed here
    except Exception:
        pass

    # skip even columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_even)

    # skip odd columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_odd)

    # skip the very beginning and the very end
    pyunit_utils.checkCorrectSkips(csv, path, skip_start_end)

    # skip all except the last column
    pyunit_utils.checkCorrectSkips(csv, path, skip_except_last)

    # skip all except the very first column
    pyunit_utils.checkCorrectSkips(csv, path, skip_except_first)

    # randomly skip half the columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_random)
def pubdev_3567():
    train = h2o.import_file(pyunit_utils.locate("smalldata/jira/frameA2.csv"), header=1)
    test = h2o.import_file(pyunit_utils.locate("smalldata/jira/frameB2.csv"), header=1)
    mergedAns = h2o.import_file(pyunit_utils.locate("smalldata/jira/merged2.csv"), header=1)
    mergedAnsLeft = h2o.import_file(pyunit_utils.locate("smalldata/jira/merged2Left.csv"), header=1)
    mergedAnsRight = h2o.import_file(pyunit_utils.locate("smalldata/jira/merged2Right.csv"), header=1)

    merged = train.merge(test, by_x=["A"], by_y=["A"], method="auto")  # default is radix
    print(merged[0, 0])
    mergedLeft = train.merge(test, by_x=["A"], by_y=["A"], all_x=True)
    print(mergedLeft[0, 0])
    mergedRight = train.merge(test, by_x=["A"], by_y=["A"], all_y=True)  # new feature
    print(mergedRight[0, 0])

    pyunit_utils.compare_frames_local(mergedAnsRight, mergedRight, 1, tol=1e-10)
    pyunit_utils.compare_frames_local(mergedAns, merged, 1, tol=1e-10)
    pyunit_utils.compare_frames_local(mergedAnsLeft, mergedLeft, 1, tol=1e-10)
def checkCorrectSkips(csvfile, originalFrame):
    skippedFrameUF = h2o.upload_file(csvfile)
    skippedFrameIF = h2o.import_file(csvfile)
    # these two frames should be the same
    pyunit_utils.compare_frames_local(skippedFrameUF, skippedFrameIF, prob=1)

    # test with an empty skipped_columns list
    skippedFrameUF2 = h2o.upload_file(csvfile, skipped_columns=[])
    skippedFrameIF2 = h2o.import_file(csvfile, skipped_columns=[])
    # these two frames should be the same
    pyunit_utils.compare_frames_local(skippedFrameUF2, skippedFrameIF2, prob=1)

    # frames parsed without a skipped_columns specification and with an empty skipped_columns
    # list should return the same result
    pyunit_utils.compare_frames_local(skippedFrameUF2, skippedFrameIF, prob=1)

    # compare the skipped frame with originalFrame
    assert originalFrame.ncol == skippedFrameUF.ncol, \
        "Expected return frame column number: {0}, actual frame column number: " \
        "{1}".format(originalFrame.ncol, skippedFrameUF.ncol)
    pyunit_utils.compare_frames_local_svm(originalFrame, skippedFrameIF2, prob=1)
def glrm_mojo():
    h2o.remove_all()
    NTESTROWS = 200  # number of test dataset rows
    df = pyunit_utils.random_dataset("regression", seed=1234)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = df.names

    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types) - 1)]
    # build a GLRM model with the random dataset generated earlier
    glrmModel = H2OGeneralizedLowRankEstimator(k=3, transform=transformN, max_iterations=10, seed=1234)
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(glrmModel._model_json['output']['representation_name'])

    assert glrmTrainFactor.nrows == train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)
    save_GLRM_mojo(glrmModel)  # save mojo model
    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file

    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=True)  # save mojo predict
    h2o.save_model(glrmModel, TMPDIR)  # save GLRM model
    glrmModel2 = h2o.load_model(os.path.join(TMPDIR, MOJONAME))
    predict_model = glrmModel2.predict(test)
    for col in range(pred_h2o.ncols):
        if pred_h2o[col].isfactor():
            pred_h2o[col] = pred_h2o[col].asnumeric()
            predict_model[col] = predict_model[col].asnumeric()
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)
    print("Comparing mojo predict and h2o predict from saved model...")
    pyunit_utils.compare_frames_local(pred_mojo, predict_model, 1, tol=1e-10)

    frameID, mojoXFactor = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=False)  # save mojo XFactor
    glrmTestFactor = h2o.get_frame("GLRMLoading_" + frameID)  # store the x Factor for the new test dataset
    print("Comparing mojo x Factor and model x Factor ...")
    pyunit_utils.compare_frames_local(glrmTestFactor, mojoXFactor, 1, tol=1e-10)
def random_seeds_test():
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        # train H2O XGBoost first
        higgs_h2o_train = h2o.import_file(pyunit_utils.locate('bigdata/laptop/higgs_train_imbalance_100k.csv'))
        higgs_h2o_train[0] = higgs_h2o_train[0].asfactor()
        higgs_h2o_test = h2o.import_file(pyunit_utils.locate('bigdata/laptop/higgs_test_imbalance_100k.csv'))
        higgs_h2o_test[0] = higgs_h2o_test[0].asfactor()
        myX = list(higgs_h2o_train.names)
        y = "response"
        myX.remove(y)

        # run with the old, fixed random seed
        h2oParams = {"ntrees": 10, "max_depth": 4, "learn_rate": 0.7, "col_sample_rate_per_tree": 0.9,
                     "min_rows": 5, "score_tree_interval": 100, "seed": -12345}
        print("Model 1 trained with old seed {0}.".format(h2oParams['seed']))
        # train model 1 with the same seed as previous runs
        h2oModel1 = H2OXGBoostEstimator(**h2oParams)
        # gather, print and save performance numbers for h2o model
        h2oModel1.train(x=myX, y=y, training_frame=higgs_h2o_train)
        h2oPredict1 = h2oModel1.predict(higgs_h2o_test)

        h2oModel1_2 = H2OXGBoostEstimator(**h2oParams)
        # gather, print and save performance numbers for h2o model
        h2oModel1_2.train(x=myX, y=y, training_frame=higgs_h2o_train)
        h2oPredict1_2 = h2oModel1_2.predict(higgs_h2o_test)

        # run with a new random seed
        seed2 = random.randint(1, 1073741824)  # seed cannot be long, must be int size
        h2oParams2 = {"ntrees": 100, "max_depth": 10, "learn_rate": 0.7, "col_sample_rate_per_tree": 0.9,
                      "min_rows": 5, "score_tree_interval": 100, "seed": seed2}
        print("Model 2 trained with new seed {0}.".format(h2oParams2['seed']))
        h2oModel2 = H2OXGBoostEstimator(**h2oParams2)
        # gather, print and save performance numbers for h2o model
        h2oModel2.train(x=myX, y=y, training_frame=higgs_h2o_train)
        h2oPredict2 = h2oModel2.predict(higgs_h2o_test)

        # Result comparison in terms of prediction output. In theory, h2oModel1 should be the same as the saved run.
        # compare the logloss
        assert abs(h2oModel1._model_json["output"]["training_metrics"]._metric_json["logloss"] -
                   h2oModel1_2._model_json["output"]["training_metrics"]._metric_json["logloss"]) < 1e-10, \
            "Model outputs should be the same with same seeds but are not! Expected: {0}, actual: " \
            "{1}".format(h2oModel1._model_json["output"]["training_metrics"]._metric_json["logloss"],
                         h2oModel1_2._model_json["output"]["training_metrics"]._metric_json["logloss"])
        assert abs(h2oModel1._model_json["output"]["training_metrics"]._metric_json["logloss"] -
                   h2oModel2._model_json["output"]["training_metrics"]._metric_json["logloss"]) > 1e-10, \
            "Model outputs should be different with different seeds but are not!"

        # compare some prediction probabilities
        model1Pred = [h2oPredict1[0, "p1"], h2oPredict1[1, "p1"], h2oPredict1[2, "p1"], h2oPredict1[3, "p1"]]
        model1_2Pred = [h2oPredict1_2[0, "p1"], h2oPredict1_2[1, "p1"], h2oPredict1_2[2, "p1"], h2oPredict1_2[3, "p1"]]
        assert model1Pred == model1_2Pred, \
            "Model 1 should have the same predictions as the previous run with the same seed but does not."
        try:
            pyunit_utils.compare_frames_local(h2oPredict1[['p0', 'p1']], h2oPredict2[['p0', 'p1']], prob=0.1,
                                              tol=1e-6)  # should fail
            assert False, "Predict frames from two different seeds should be different but are not. FAIL!"
        except:
            assert True  # expected: predictions from different seeds differ

        # train native XGBoost models
        nativeTrain = genDMatrix(higgs_h2o_train, myX, y)
        nativeTest = genDMatrix(higgs_h2o_test, myX, y)
        h2o.remove_all()
        nativeParam = {'eta': h2oParams["learn_rate"], 'objective': 'binary:logistic', 'booster': 'gbtree',
                       'max_depth': h2oParams["max_depth"], 'seed': h2oParams["seed"],
                       'min_child_weight': h2oParams["min_rows"],
                       'colsample_bytree': h2oParams["col_sample_rate_per_tree"], 'alpha': 0.0,
                       'nrounds': h2oParams["ntrees"]}
        nativeModel1 = xgb.train(params=nativeParam, dtrain=nativeTrain)
        nativePred1 = nativeModel1.predict(data=nativeTest)
        nativeModel1_2 = xgb.train(params=nativeParam, dtrain=nativeTrain)
        nativePred1_2 = nativeModel1_2.predict(data=nativeTest)

        nativeParam2 = {'eta': h2oParams["learn_rate"], 'objective': 'binary:logistic', 'booster': 'gbtree',
                        'max_depth': h2oParams["max_depth"], 'seed': h2oParams2["seed"],
                        'min_child_weight': h2oParams["min_rows"],
                        'colsample_bytree': h2oParams["col_sample_rate_per_tree"], 'alpha': 0.0,
                        'nrounds': h2oParams["ntrees"]}
        nativeModel2 = xgb.train(params=nativeParam2, dtrain=nativeTrain, num_boost_round=h2oParams["ntrees"])
        nativePred2 = nativeModel2.predict(data=nativeTest)

        # nativeModel1 and nativeModel1_2 share a seed and should generate the same results, while
        # nativeModel2 uses a different seed and should produce different results.
        # compare prediction probabilities; they should agree when the models use the same seed
        nativePreds1_2 = [nativePred1_2[0], nativePred1_2[1], nativePred1_2[2], nativePred1_2[3]]
        nativePreds1 = [nativePred1[0], nativePred1[1], nativePred1[2], nativePred1[3]]
        # plot_tree(nativeModel1, num_trees=4)
        # plt.show()
        for ind in range(len(nativePreds1)):
            assert abs(nativePreds1_2[ind] - nativePreds1[ind]) < 1e-7, \
                "Native XGBoost Model 1 should have the same predictions as the previous run with the same seed " \
                "but does not."
        for ind in range(4):
            assert abs(nativePred1[ind] - nativePred2[ind]) >= 1e-6, \
                "Native XGBoost model 1 prediction prob: {0} and native XGBoost model 2 prediction prob: {1}. " \
                "They are too similar.".format(nativePred1[ind], nativePred2[ind])
    else:
        print("******** Test skipped. This test cannot be performed in a multinode environment.")
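
# genDMatrix() is a helper from the original test module. A hedged sketch is shown below: it pulls
# the H2OFrame into pandas and wraps the feature columns and the binary response into a native
# xgboost DMatrix. The exact conversion details in the real helper may differ.
def genDMatrix(h2o_frame, feature_names, response_name):
    pdf = h2o_frame.as_data_frame(use_pandas=True)
    # the response was converted to a factor above; coerce it back to 0/1 numerics for native XGBoost
    labels = pdf[response_name].astype(float).values
    return xgb.DMatrix(data=pdf[feature_names].values, label=labels)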
def import_folder_orc():
    # Check if we are running inside the H2O network by seeing if we can touch the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_airlines_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/prostate_NA.orc"
            hdfs_csv_file = "/datasets/orc_parser/prostate_NA.csv"

            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            csv = h2o.import_file(url_csv, na_strings=['\\N'])
            multi_file_orc1 = h2o.import_file(url_orc)
            pyunit_utils.compare_frames_local(csv, multi_file_orc1, prob=1)  # should be the same here

            path = url_orc
            skip_all = list(range(csv.ncol))
            skip_even = list(range(0, csv.ncol, 2))
            skip_odd = list(range(1, csv.ncol, 2))
            skip_start_end = [0, csv.ncol - 1]
            skip_except_last = list(range(0, csv.ncol - 2))
            skip_except_first = list(range(1, csv.ncol))
            temp = list(range(0, csv.ncol))
            random.shuffle(temp)
            skip_random = []
            for index in range(0, csv.ncol // 2):  # integer division so the loop bound works under Python 3
                skip_random.append(temp[index])
            skip_random.sort()

            try:
                loadFileSkipAll = h2o.upload_file(path, skipped_columns=skip_all)
                sys.exit(1)  # should have failed here
            except Exception:
                pass

            try:
                importFileSkipAll = h2o.import_file(path, skipped_columns=skip_all)
                sys.exit(1)  # should have failed here
            except Exception:
                pass

            # skip even columns
            pyunit_utils.checkCorrectSkips(csv, path, skip_even)

            # skip odd columns
            pyunit_utils.checkCorrectSkips(csv, path, skip_odd)

            # skip the very beginning and the very end
            pyunit_utils.checkCorrectSkips(csv, path, skip_start_end)

            # skip all except the last column
            pyunit_utils.checkCorrectSkips(csv, path, skip_except_last)

            # skip all except the very first column
            pyunit_utils.checkCorrectSkips(csv, path, skip_except_first)

            # randomly skip half the columns
            pyunit_utils.checkCorrectSkips(csv, path, skip_random)