def glrm_mojo():
    """Compare GLRM MOJO output with the H2O model on held-out rows.

    A GLRM (k=3, 10 iterations) is trained on a random "regression" dataset
    using a randomly picked transform. Two comparisons are then made with
    ``tol=1e-10``: the reconstructed frame (``glrmReconstruct=True``) and the
    X factor (``glrmReconstruct=False``).
    """
    h2o.remove_all()
    n_test_rows = 200  # leading rows are held out for scoring
    full_frame = pyunit_utils.random_dataset("regression")
    train = full_frame[n_test_rows:, :]
    test = full_frame[:n_test_rows, :]
    x = full_frame.names
    transforms = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    chosen_transform = transforms[randint(0, len(transforms) - 1)]

    # Train the model and make sure the X factor has one row per training row.
    model = H2OGeneralizedLowRankEstimator(k=3, transform=chosen_transform, max_iterations=10)
    model.train(x=x, training_frame=train)
    representation_key = model._model_json['output']['representation_name']
    train_factor = h2o.get_frame(representation_key)
    assert train_factor.nrows == train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(train_factor.nrows, train.nrows)

    save_GLRM_mojo(model)  # save mojo model
    mojo_name = pyunit_utils.getMojoName(model._id)
    # The *string* '__file__' is resolved here (not the module attribute), so
    # the results directory is located relative to the current working dir.
    cwd_anchor = os.path.dirname(os.path.realpath('__file__'))
    work_dir = os.path.normpath(os.path.join(cwd_anchor, "..", "results", mojo_name))
    # H2O predict and MOJO predict both read this same input file.
    h2o.download_csv(test[x], os.path.join(work_dir, 'in.csv'))

    # Reconstruction: turn categorical columns into numerics before comparing.
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(model, work_dir, mojo_name, glrmReconstruct=True)
    for idx in range(pred_h2o.ncols):
        if pred_h2o[idx].isfactor():
            pred_h2o[idx] = pred_h2o[idx].asnumeric()
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)

    # X factor: the H2O-side frame is looked up by its "GLRMLoading_" key.
    frame_id, mojo_x_factor = pyunit_utils.mojo_predict(model, work_dir, mojo_name, glrmReconstruct=False)
    test_factor = h2o.get_frame("GLRMLoading_" + frame_id)
    print("Comparing mojo x Factor and model x Factor ...")
    pyunit_utils.compare_frames_local(test_factor, mojo_x_factor, 1, tol=1e-10)
def glrm_mojo():
    """Check GLRM MOJO X factors for several ``glrmIterNumber`` settings.

    Uses the fixed pubdev_5858 train/test files and a seeded, randomly
    initialised GLRM. The 100-iteration MOJO result is compared with the
    frame H2O stored under ``GLRMLoading_<id>``; the 1- and 10-iteration
    results are compared with the stored reference CSVs.
    """
    h2o.remove_all()
    data_dir = "smalldata/glrm_test/"
    train = h2o.import_file(pyunit_utils.locate(data_dir + "pubdev_5858_glrm_mojo_train.csv"))
    test = h2o.import_file(pyunit_utils.locate(data_dir + "pubdev_5858_glrm_mojo_test.csv"))
    expected_10iter = h2o.import_file(pyunit_utils.locate(data_dir + "pubdev_5858_glrm_predict_10iter.csv"))
    expected_1iter = h2o.import_file(pyunit_utils.locate(data_dir + "pubdev_5858_glrm_predict_1iter.csv"))
    x = train.names

    model = H2OGeneralizedLowRankEstimator(k=3, transform="STANDARDIZE", max_iterations=10,
                                           seed=1234, init="random")
    model.train(x=x, training_frame=train)
    train_factor = h2o.get_frame(model._model_json['output']['representation_name'])
    assert train_factor.nrows == train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(train_factor.nrows, train.nrows)

    save_GLRM_mojo(model)  # save mojo model
    mojo_name = pyunit_utils.getMojoName(model._id)
    # The literal string '__file__' is resolved, i.e. relative to the cwd.
    work_dir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", mojo_name))
    h2o.download_csv(test[x], os.path.join(work_dir, 'in.csv'))  # shared input for H2O and MOJO

    # Setting the iteration number must not break the prediction.
    pred_id, pred_mojo = pyunit_utils.mojo_predict(model, work_dir, mojo_name, glrmIterNumber=100)
    pred_h2o = h2o.get_frame("GLRMLoading_" + pred_id)
    print("Comparing mojo x Factor and model x Factor for 100 iterations")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)

    pred_id, pred_mojo = pyunit_utils.mojo_predict(model, work_dir, mojo_name, glrmIterNumber=1)
    print("Comparing mojo x Factor and model x Factor for 1 iterations")
    pyunit_utils.compare_frames_local(expected_1iter, pred_mojo, 1, tol=1e-10)

    pred_id, pred_mojo = pyunit_utils.mojo_predict(model, work_dir, mojo_name, glrmIterNumber=10)
    print("Comparing mojo x Factor and model x Factor for 10 iterations")
    pyunit_utils.compare_frames_local(expected_10iter, pred_mojo, 1, tol=1e-10)
def custom_distribution_mojo_test():
    """Verify MOJO predictions for a GBM trained with a custom distribution.

    Builds a 10-tree GBM with ``distribution="custom"`` on a random binomial
    dataset (response converted to numeric) and asserts that the model's and
    the MOJO's predictions on the held-out rows compare equal.
    """
    n_test_rows = 2000
    frame = random_dataset('binomial', verbose=False, NTESTROWS=n_test_rows)
    frame['response'] = frame['response'].asnumeric()
    train = frame[n_test_rows:, :]
    test = frame[:n_test_rows, :]
    x = list(set(frame.names) - {"response"})

    gbm_params = {
        'ntrees': 10,
        'max_depth': 4,
        'distribution': "custom",
        'custom_distribution_func': custom_distribution_bernoulli(),
    }
    model = build_save_model_GBM(gbm_params, x, train, "response")
    mojo_name = getMojoName(model._id)
    # The literal string '__file__' is resolved, i.e. relative to the cwd.
    base_dir = os.path.dirname(os.path.realpath('__file__'))
    tmp_dir = os.path.normpath(os.path.join(base_dir, "..", "results", mojo_name))

    # H2O predict and MOJO predict read the same input file.
    h2o.download_csv(test[x], os.path.join(tmp_dir, 'in.csv'))
    pred_h2o, pred_mojo = mojo_predict(model, tmp_dir, mojo_name)
    same = compare_frames_local(pred_h2o, pred_mojo, returnResult=True)
    assert same, "Predictions from model and MOJO model are not the same."
def glm_multinomial_mojo_pojo():
    """Compare H2O, MOJO and POJO predictions of a multinomial GLM.

    A GLM is built on a random multinomial dataset with the parameters from
    ``set_params()``; the first 200 rows are held out and scored by the H2O
    model, the MOJO and the POJO. Frames are compared with ``tol=1e-10``.
    """
    PROBLEM = "multinomial"
    NTESTROWS = 200
    params = set_params()  # GLM model parameters
    df = pyunit_utils.random_dataset(PROBLEM)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})

    # build and save mojo model
    glmMultinomialModel = pyunit_utils.build_save_model_GLM(params, x, train, "response")
    MOJONAME = pyunit_utils.getMojoName(glmMultinomialModel._id)
    # The literal string '__file__' is resolved, i.e. relative to the cwd.
    TMPDIR = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))

    # save test file, h2o predict/mojo use same file
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glmMultinomialModel, TMPDIR, MOJONAME)
    # keep a CSV copy of the H2O predictions next to the MOJO input
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
    pred_pojo = pyunit_utils.pojo_predict(glmMultinomialModel, TMPDIR, MOJONAME)

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    # the POJO frame is checked against the MOJO frame
    pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
def deeplearning_mojo_pojo():
    """Compare H2O, MOJO and POJO predictions of a deep learning model.

    Relies on the module-level ``PROBLEM``, ``NTESTROWS``, ``TMPDIR`` and
    ``MOJONAME``. Any exception is printed; the process exits with status 1
    only when the exception is an ``AssertionError`` (a failed comparison).
    Other errors are deliberately ignored.
    """
    h2o.remove_all()
    params = set_params()  # set deeplearning model parameters
    df = random_dataset(PROBLEM)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})

    try:
        deeplearningModel = build_save_model(params, x, train)  # build and save mojo model
        # save test file, h2o predict/mojo use same file
        h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))
        pred_h2o, pred_mojo = pyunit_utils.mojo_predict(deeplearningModel, TMPDIR, MOJONAME)
        pred_pojo = pyunit_utils.pojo_predict(deeplearningModel, TMPDIR, MOJONAME)
        # save model for debugging, before any comparison can fail
        h2o.save_model(deeplearningModel, path=TMPDIR, force=True)
        print("Comparing mojo predict and h2o predict...")
        pyunit_utils.compare_numeric_frames(pred_h2o, pred_mojo, 0.1, tol=1e-10)
        print("Comparing pojo predict and h2o predict...")
        # the POJO frame is checked against the MOJO frame
        pyunit_utils.compare_numeric_frames(pred_mojo, pred_pojo, 0.1, tol=1e-10)
    except Exception as ex:
        print("*************** ERROR and type is ")
        print(str(type(ex)))
        print(ex)
        # Only a failed assertion fails the test; isinstance also covers
        # subclasses, unlike matching on the printed type name.
        if isinstance(ex, AssertionError):
            sys.exit(1)
def runComparisonTests(autoEncoder, actFun, missingValuesHandling, setAllFactor, train, test, x):
    """Build a deep learning model and compare H2O, MOJO and POJO predictions.

    For autoencoders, a build failure whose first exception argument contains
    "Trying to predict with an unstable model" ends the test silently; any
    other build failure is re-raised as a generic ``Exception``.
    Relies on the module-level ``TMPDIR`` and ``MOJONAME``.
    """
    # set deeplearning model parameters
    params = set_params(actFun, missingValuesHandling, setAllFactor, autoEncoder)

    if not autoEncoder:
        model = build_save_model(params, x, train)  # build and save mojo model
    else:
        try:
            model = build_save_model(params, x, train)  # build and save mojo model
        except Exception as err:
            if "Trying to predict with an unstable model" in err.args[0]:
                return  # unstable autoencoder: nothing to compare
            raise Exception('Deeplearning autoencoder model failed to build. Fix it.')

    # save test file, h2o predict/mojo use same file
    input_csv = os.path.join(TMPDIR, 'in.csv')
    h2o.download_csv(test[x], input_csv)
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(model, TMPDIR, MOJONAME)
    pred_pojo = pyunit_utils.pojo_predict(model, TMPDIR, MOJONAME)
    h2o.save_model(model, path=TMPDIR, force=True)  # save model for debugging

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_h2o, pred_mojo, prob=1, tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    # the POJO frame is checked against the MOJO frame
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_mojo, pred_pojo, prob=1, tol=1e-10)
def glm_binomial_mojo_pojo():
    """Compare H2O, MOJO and POJO predictions of a binomial GLM.

    A GLM is built on a random binomial dataset with the parameters from
    ``set_params()`` and saved into a fresh temporary directory; the first
    200 rows are held out and scored by the H2O model, the MOJO and the POJO.
    """
    h2o.remove_all()
    NTESTROWS = 200  # number of test dataset rows
    PROBLEM = "binomial"
    params = set_params()  # GLM model parameters
    df = pyunit_utils.random_dataset(PROBLEM)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})

    TMPDIR = tempfile.mkdtemp()
    # build and save mojo model
    glmBinomialModel = pyunit_utils.build_save_model_generic(params, x, train, "response", "glm", TMPDIR)
    MOJONAME = pyunit_utils.getMojoName(glmBinomialModel._id)

    # save test file, h2o predict/mojo use same file
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glmBinomialModel, TMPDIR, MOJONAME)
    # keep a CSV copy of the H2O predictions next to the MOJO input
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
    pred_pojo = pyunit_utils.pojo_predict(glmBinomialModel, TMPDIR, MOJONAME)

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    # the POJO frame is checked against the MOJO frame
    pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
def glm_gamma_offset_mojo():
    """Compare H2O and MOJO predictions of a gamma GLM with an offset column.

    The prostate data is extended with one random real column (seeded) via
    ``cbind``; the model uses ``offset_column="C1"`` and is scored on the
    training rows themselves, with the offset column included in the input.
    """
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))
    y = "DPROS"
    x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL"]
    x_offset = x + ["C1"]  # predictors plus the offset column
    params = {'family': "gamma", 'offset_column': "C1"}

    # presumably the generated single column is named "C1" - TODO confirm
    offset = pyunit_utils.random_dataset_real_only(prostate.nrow, 1, realR=3, misFrac=0, randSeed=12345)
    train = prostate.cbind(offset)

    tmpdir = tempfile.mkdtemp()
    # build and save mojo model
    model = pyunit_utils.build_save_model_generic(params, x, train, y, "glm", tmpdir)
    mojo_name = pyunit_utils.getMojoName(model._id)

    # save test file, h2o predict/mojo use same file
    h2o.download_csv(train[x_offset], os.path.join(tmpdir, 'in.csv'))
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(model, tmpdir, mojo_name)
    h2o.download_csv(pred_h2o, os.path.join(tmpdir, "h2oPred.csv"))

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)  # compare mojo and model predict
def run_comparison_tests(auto_encoder, act_fun, missing_values_handling, set_all_factor, train, test, x):
    """Build a deep learning model and compare H2O, MOJO and POJO predictions.

    For autoencoders, a build failure whose first exception argument contains
    "Trying to predict with an unstable model" ends the test silently; any
    other build failure is re-raised as a generic ``Exception``.
    Relies on the module-level ``TMPDIR`` and ``MOJONAME``.
    """
    params = set_params(act_fun, missing_values_handling, set_all_factor, auto_encoder)

    if not auto_encoder:
        model = build_save_model(params, x, train)
    else:
        try:
            model = build_save_model(params, x, train)
        except Exception as err:
            unstable = "Trying to predict with an unstable model" in err.args[0]
            if unstable:
                return  # unstable autoencoder: nothing to compare
            raise Exception('Deeplearning autoencoder model failed to build. Fix it.')

    # H2O predict and MOJO predict read the same input file.
    input_csv = os.path.join(TMPDIR, 'in.csv')
    h2o.download_csv(test[x], input_csv)
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(model, TMPDIR, MOJONAME)
    pred_pojo = pyunit_utils.pojo_predict(model, TMPDIR, MOJONAME)
    # Keep a binary copy of the model around for debugging.
    h2o.save_model(model, path=TMPDIR, force=True)

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_h2o, pred_mojo, prob=1, tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    # the POJO frame is checked against the MOJO frame
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_mojo, pred_pojo, prob=1, tol=1e-10)
def gam_gaussian_mojo():
    """Compare H2O and MOJO predictions of a gaussian GAM on random data.

    Up to three real-valued, non-response columns are used as GAM columns
    (each with scale 0.001). The first 200 rows are held out for scoring.
    """
    h2o.remove_all()
    NTESTROWS = 200  # number of test dataset rows
    PROBLEM = "gaussian"
    params = set_params()  # GAM model parameters
    df = pyunit_utils.random_dataset(PROBLEM, missing_fraction=0.001)  # generate random dataset

    # add GAM specific parameters
    params["gam_columns"] = []
    params["scale"] = []
    num_gam_cols = 3  # maximum number of gam columns
    for cname in df.names:
        if cname != 'response' and str(df.type(cname)) == "real":
            params["gam_columns"].append(cname)
            params["scale"].append(0.001)
            if len(params["gam_columns"]) >= num_gam_cols:
                break

    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})

    TMPDIR = tempfile.mkdtemp()
    # build and save mojo model
    gamGaussianModel = pyunit_utils.build_save_model_generic(params, x, train, "response", "gam", TMPDIR)
    MOJONAME = pyunit_utils.getMojoName(gamGaussianModel._id)

    # save test file, h2o predict/mojo use same file
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(gamGaussianModel, TMPDIR, MOJONAME)
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)
def gam_binomial_mojo():
    """Compare H2O and MOJO predictions of a binomial GAM.

    Train and test frames are both loaded from the same
    ``binomial_20_cols_10KRows.csv`` file; column ``C21`` is the categorical
    response and ``C1`` the only predictor. The whole test frame is written
    as the MOJO input.
    """
    params = set_params()
    data_path = pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv")
    train = h2o.import_file(data_path)
    test = h2o.import_file(data_path)
    train["C21"] = train["C21"].asfactor()
    test["C21"] = test["C21"].asfactor()
    x = ["C1"]
    y = "C21"

    TMPDIR = tempfile.mkdtemp()
    # build and save mojo model
    gamModel = pyunit_utils.build_save_model_generic(params, x, train, y, "gam", TMPDIR)
    MOJONAME = pyunit_utils.getMojoName(gamModel._id)

    # save test file, h2o predict/mojo use same file
    h2o.download_csv(test, os.path.join(TMPDIR, 'in.csv'))
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(gamModel, TMPDIR, MOJONAME)
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)
def deeplearning_mojo_pojo():
    """Compare H2O and MOJO predictions of a deep learning model.

    Only the MOJO comparison is performed here (``tol=1e-6``); no POJO
    scoring is done. Relies on the module-level ``PROBLEM``, ``NTESTROWS``,
    ``TMPDIR`` and ``MOJONAME``. Any exception is printed; the process exits
    with status 1 only when the exception is an ``AssertionError`` (a failed
    comparison). Other errors are deliberately ignored.
    """
    h2o.remove_all()
    params = set_params()  # set deeplearning model parameters
    df = random_dataset(PROBLEM)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})

    try:
        deeplearningModel = build_save_model(params, x, train)  # build and save mojo model
        # save test file, h2o predict/mojo use same file
        h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))
        pred_h2o, pred_mojo = pyunit_utils.mojo_predict(deeplearningModel, TMPDIR, MOJONAME)
        # save model for debugging, before the comparison can fail
        h2o.save_model(deeplearningModel, path=TMPDIR, force=True)
        print("Comparing mojo predict and h2o predict...")
        pyunit_utils.compare_numeric_frames(pred_h2o, pred_mojo, 0.1, tol=1e-6)
    except Exception as ex:
        print("*************** ERROR and type is ")
        print(str(type(ex)))
        print(ex)
        # Only a failed assertion fails the test; isinstance also covers
        # subclasses, unlike matching on the printed type name.
        if isinstance(ex, AssertionError):
            sys.exit(1)
def glm_fractional_binomial_mojo_pojo():
    """Compare H2O, MOJO and POJO predictions of a fractional-binomial GLM.

    Train and test frames are both loaded from ``fraction_binommialOrig.csv``;
    ``log10conc`` is the only predictor and ``y`` the response. Column index 3
    is dropped from the H2O prediction frame before it is compared.
    """
    params = set_params()
    data_path = pyunit_utils.locate("smalldata/glm_test/fraction_binommialOrig.csv")
    train = h2o.import_file(data_path)
    test = h2o.import_file(data_path)
    x = ["log10conc"]
    y = "y"

    glmModel = pyunit_utils.build_save_model_GLM(params, x, train, y)  # build and save mojo model
    MOJONAME = pyunit_utils.getMojoName(glmModel._id)
    # The literal string '__file__' is resolved, i.e. relative to the cwd.
    TMPDIR = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))

    # save test file, h2o predict/mojo use same file
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glmModel, TMPDIR, MOJONAME)
    # the CSV copy is written before the extra column is dropped below
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
    pred_pojo = pyunit_utils.pojo_predict(glmModel, TMPDIR, MOJONAME)
    # presumably the MOJO/POJO output lacks this column - TODO confirm
    pred_h2o = pred_h2o.drop(3)

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    # the POJO frame is checked against the MOJO frame
    pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
def glrm_mojo():
    """Compare GLRM MOJO output with the live model and a reloaded model.

    A seeded GLRM (k=3, 10 iterations, random transform) is trained on a
    seeded random "regression" dataset. The MOJO reconstruction is compared
    with the H2O prediction and with the prediction of the model after
    ``h2o.save_model`` / ``h2o.load_model``; finally the MOJO X factor is
    compared with the frame H2O stored under ``GLRMLoading_<id>``.
    """
    h2o.remove_all()
    n_test_rows = 200  # leading rows are held out for scoring
    full_frame = pyunit_utils.random_dataset("regression", seed=1234)
    train = full_frame[n_test_rows:, :]
    test = full_frame[:n_test_rows, :]
    x = full_frame.names
    transforms = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    chosen_transform = transforms[randint(0, len(transforms) - 1)]

    model = H2OGeneralizedLowRankEstimator(k=3, transform=chosen_transform, max_iterations=10, seed=1234)
    model.train(x=x, training_frame=train)
    train_factor = h2o.get_frame(model._model_json['output']['representation_name'])
    assert train_factor.nrows == train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(train_factor.nrows, train.nrows)

    save_GLRM_mojo(model)  # save mojo model
    mojo_name = pyunit_utils.getMojoName(model._id)
    # The literal string '__file__' is resolved, i.e. relative to the cwd.
    work_dir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", mojo_name))
    h2o.download_csv(test[x], os.path.join(work_dir, 'in.csv'))  # shared input for H2O and MOJO
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(model, work_dir, mojo_name, glrmReconstruct=True)

    # Round-trip the binary model through disk and score the test rows again.
    h2o.save_model(model, work_dir)
    reloaded = h2o.load_model(os.path.join(work_dir, mojo_name))
    predict_model = reloaded.predict(test)

    # Categorical prediction columns are turned into numerics in both frames.
    for idx in range(pred_h2o.ncols):
        if pred_h2o[idx].isfactor():
            pred_h2o[idx] = pred_h2o[idx].asnumeric()
            predict_model[idx] = predict_model[idx].asnumeric()

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)
    print("Comparing mojo predict and h2o predict from saved model...")
    pyunit_utils.compare_frames_local(pred_mojo, predict_model, 1, tol=1e-10)

    frame_id, mojo_x_factor = pyunit_utils.mojo_predict(model, work_dir, mojo_name, glrmReconstruct=False)
    test_factor = h2o.get_frame("GLRMLoading_" + frame_id)  # x Factor for the new test dataset
    print("Comparing mojo x Factor and model x Factor ...")
    pyunit_utils.compare_frames_local(test_factor, mojo_x_factor, 1, tol=1e-10)
def get_glrm_xmatrix(train, test, K=3, compare_predict=True, tol=1e-1):
    """Train a GLRM and check the MOJO X factor against the training X factor.

    :param train: frame used for training; all of its columns are used.
    :param test: frame written out as the MOJO input.
    :param K: rank passed to the estimator as ``k``.
    :param compare_predict: when true (and the transform is not "NONE") rows
        are compared with ``compare_data_rows``; otherwise only
        ``check_data_rows`` is run. When true, reconstruction differences
        against ``train`` are also computed and printed.
    :param tol: tolerance forwarded to ``compare_data_rows``.
    """
    x = train.names
    transforms = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    chosen_transform = transforms[randint(0, len(transforms) - 1)]
    print("dataset transform is {0}.".format(chosen_transform))

    model = H2OGeneralizedLowRankEstimator(k=K, transform=chosen_transform, max_iterations=1000, seed=12345)
    model.train(x=x, training_frame=train)
    train_factor = h2o.get_frame(model._model_json['output']['representation_name'])

    mojo_dir = save_GLRM_mojo(model)  # save mojo model
    mojo_name = pyunit_utils.getMojoName(model._id)
    h2o.download_csv(test[x], os.path.join(mojo_dir, 'in.csv'))  # shared input for H2O and MOJO
    frame_id, mojo_x_factor = pyunit_utils.mojo_predict(model, mojo_dir, mojo_name, glrmReconstruct=False)

    print("Comparing mojo x Factor and model x Factor ...")
    if compare_predict and chosen_transform != "NONE":
        row_indices = range(2, mojo_x_factor.nrows - 1)
        pyunit_utils.compare_data_rows(mojo_x_factor, train_factor, index_list=row_indices, tol=tol)
    else:
        # bad performance with no transformation on dataset
        pyunit_utils.check_data_rows(mojo_x_factor, train_factor, num_rows=mojo_x_factor.nrow)

    if not compare_predict:
        return

    # only compare reconstructed data frames with numerical data
    test_reconstruction = model.predict(test)
    train_reconstruction = model.predict(train)  # uses the X from A=X*Y from training
    predict_diff = pyunit_utils.compute_frame_diff(train, train_reconstruction)
    mojo_diff = pyunit_utils.compute_frame_diff(train, test_reconstruction)
    print(
        "absolute difference of mojo predict and original frame is {0} and model predict and original frame is {1}"
        .format(mojo_diff, predict_diff))
def glrm_mojo():
    """Check the GLRM MOJO with ``glrmIterNumber`` set and time two settings.

    The 100-iteration MOJO X factor is compared with the frame H2O stored
    under ``GLRMLoading_<id>``. MOJO scoring is then timed for 8000 and for
    2 iterations and both durations are printed; the timings are reported
    only, not asserted.
    """
    h2o.remove_all()
    n_test_rows = 200  # leading rows are held out for scoring
    full_frame = pyunit_utils.random_dataset("regression", seed=1234)
    train = full_frame[n_test_rows:, :]
    test = full_frame[:n_test_rows, :]
    x = full_frame.names

    model = H2OGeneralizedLowRankEstimator(k=3, transform="STANDARDIZE", max_iterations=10, seed=1234)
    model.train(x=x, training_frame=train)
    train_factor = h2o.get_frame(model._model_json['output']['representation_name'])
    assert train_factor.nrows == train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(train_factor.nrows, train.nrows)

    save_GLRM_mojo(model)  # save mojo model
    mojo_name = pyunit_utils.getMojoName(model._id)
    # The literal string '__file__' is resolved, i.e. relative to the cwd.
    work_dir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", mojo_name))
    h2o.download_csv(test[x], os.path.join(work_dir, 'in.csv'))  # shared input for H2O and MOJO

    # Setting the iteration number must not break the prediction.
    pred_id, pred_mojo = pyunit_utils.mojo_predict(model, work_dir, mojo_name, glrmIterNumber=100)
    pred_h2o = h2o.get_frame("GLRMLoading_" + pred_id)
    print("Comparing mojo x Factor and model x Factor for 100 iterations")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)

    # Scoring with 2 iterations should be shorter than with 8000 iterations.
    started = time.time()
    runMojoPredictOnly(work_dir, mojo_name, glrmIterNumber=8000)
    elapsed_8000 = time.time() - started

    started = time.time()
    runMojoPredictOnly(work_dir, mojo_name, glrmIterNumber=2)
    elapsed_2 = time.time() - started

    print(
        "Time taken for 2 iterations: {0}s. Time taken for 8000 iterations: {1}s."
        .format(elapsed_2, elapsed_8000))
def drf_leaf_node_assignment_mojo_test():
    """Compare leaf-node assignments of a DRF model and its MOJO.

    A random problem type is picked, a 50-tree DRF is built on random data
    and saved into a fresh temporary directory, and the leaf node
    assignments of the first 2000 rows are compared as strings.
    """
    problems = ['binomial', 'multinomial', 'regression']
    PROBLEM = problems[randint(0, (len(problems) - 1))]
    TESTROWS = 2000
    df = pyunit_utils.random_dataset(PROBLEM, verbose=False, NTESTROWS=TESTROWS)
    train = df[TESTROWS:, :]
    test = df[:TESTROWS, :]
    # Predictors are every column except the response, so the response is
    # neither offered as a predictor nor written into the scoring input.
    x = list(set(df.names) - {"response"})

    params = {'ntrees': 50, 'max_depth': 4}
    TMPDIR = tempfile.mkdtemp()
    drf_model = pyunit_utils.build_save_model_generic(params, x, train, "response", "DRF", TMPDIR)
    MOJONAME = pyunit_utils.getMojoName(drf_model._id)

    # save test file, h2o predict/mojo use same file
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(drf_model, TMPDIR, MOJONAME,
                                                    get_leaf_node_assignment=True)
    pyunit_utils.compare_string_frames_local(pred_h2o, pred_mojo, 0.5)
def runComparisonTests(autoEncoder, probleyType):
    """Build a deep learning model on random data and compare predictions.

    :param autoEncoder: forwarded to ``set_params``.
    :param probleyType: problem type handed to ``random_dataset``.

    H2O, MOJO and POJO predictions on the first ``NTESTROWS`` rows are
    compared. Relies on the module-level ``NTESTROWS``, ``TMPDIR`` and
    ``MOJONAME``.
    """
    params = set_params(autoEncoder)  # set deeplearning model parameters
    frame = random_dataset(probleyType)  # generate random dataset
    train = frame[NTESTROWS:, :]
    test = frame[:NTESTROWS, :]
    x = list(set(frame.names) - {"response"})

    model = build_save_model(params, x, train)  # build and save mojo model

    # H2O predict and MOJO predict read the same input file.
    input_csv = os.path.join(TMPDIR, 'in.csv')
    h2o.download_csv(test[x], input_csv)
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(model, TMPDIR, MOJONAME)
    pred_pojo = pyunit_utils.pojo_predict(model, TMPDIR, MOJONAME)
    h2o.save_model(model, path=TMPDIR, force=True)  # save model for debugging

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_h2o, pred_mojo, prob=1, tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    # the POJO frame is checked against the MOJO frame
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_mojo, pred_pojo, prob=1, tol=1e-10)
def gbm_leaf_node_assignment_mojo_test():
    """Compare leaf-node assignments of a GBM model and its MOJO.

    A random problem type is picked, a 50-tree GBM is built on random data,
    and the leaf node assignments of the first 2000 rows are compared as
    strings.
    """
    problems = ['binomial', 'multinomial', 'regression']
    PROBLEM = problems[randint(0, (len(problems) - 1))]
    TESTROWS = 2000
    df = pyunit_utils.random_dataset(PROBLEM, verbose=False, NTESTROWS=TESTROWS)
    train = df[TESTROWS:, :]
    test = df[:TESTROWS, :]
    # Predictors are every column except the response, so the response is
    # neither offered as a predictor nor written into the scoring input.
    x = list(set(df.names) - {"response"})

    params = {'ntrees': 50, 'learn_rate': 0.1, 'max_depth': 4}
    my_gbm = pyunit_utils.build_save_model_GBM(params, x, train, "response")
    MOJONAME = pyunit_utils.getMojoName(my_gbm._id)
    # The literal string '__file__' is resolved, i.e. relative to the cwd.
    TMPDIR = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))

    # save test file, h2o predict/mojo use same file
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(my_gbm, TMPDIR, MOJONAME,
                                                    get_leaf_node_assignment=True)
    pyunit_utils.compare_string_frames_local(pred_h2o, pred_mojo, 0.5)
def pca_mojo():
    """Compare PCA MOJO and H2O predictions for every transform type.

    One seeded, wide random "regression" dataset (5000-8000 columns, a small
    fraction of missing values) is generated; for each transform a k=3 PCA
    model is trained on it and its MOJO predictions on the first 200 rows are
    compared with H2O's, using ``tol=1e-10``.
    """
    h2o.remove_all()
    n_test_rows = 200  # leading rows are held out for scoring
    full_frame = pyunit_utils.random_dataset("regression", ncol_upper=8000, ncol_lower=5000,
                                             missing_fraction=0.001, seed=1234)
    train = full_frame[n_test_rows:, :]
    test = full_frame[:n_test_rows, :]
    x = full_frame.names
    transforms = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]

    for transform in transforms:
        model = H2OPrincipalComponentAnalysisEstimator(k=3, transform=transform, seed=1234,
                                                       impute_missing=True, use_all_factor_levels=False)
        model.train(x=x, training_frame=train)
        pyunit_utils.saveModelMojo(model)  # save mojo model
        mojo_name = pyunit_utils.getMojoName(model._id)
        # The literal string '__file__' is resolved, i.e. relative to the cwd.
        work_dir = os.path.normpath(
            os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", mojo_name))
        h2o.download_csv(test[x], os.path.join(work_dir, 'in.csv'))  # shared input for H2O and MOJO
        pred_h2o, pred_mojo = pyunit_utils.mojo_predict(model, work_dir, mojo_name)

        # Categorical prediction columns are turned into numerics first.
        for idx in range(pred_h2o.ncols):
            if pred_h2o[idx].isfactor():
                pred_h2o[idx] = pred_h2o[idx].asnumeric()

        print("Comparing mojo predict and h2o predict...")
        pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)
def glm_multinomial_mojo_pojo():
    """Compare H2O, MOJO and POJO predictions of a multinomial GLM.

    A GLM is built on a random multinomial dataset with the parameters from
    ``set_params()``; the first 200 rows are held out and scored by the H2O
    model, the MOJO and the POJO. Frames are compared with ``tol=1e-10``.
    """
    PROBLEM = "multinomial"
    NTESTROWS = 200
    params = set_params()  # GLM model parameters
    df = pyunit_utils.random_dataset(PROBLEM)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})

    # build and save mojo model
    glmMultinomialModel = pyunit_utils.build_save_model_GLM(params, x, train, "response")
    MOJONAME = pyunit_utils.getMojoName(glmMultinomialModel._id)
    # The literal string '__file__' is resolved, i.e. relative to the cwd.
    TMPDIR = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))

    # save test file, h2o predict/mojo use same file
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glmMultinomialModel, TMPDIR, MOJONAME)
    # keep a CSV copy of the H2O predictions next to the MOJO input
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
    pred_pojo = pyunit_utils.pojo_predict(glmMultinomialModel, TMPDIR, MOJONAME)

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    # the POJO frame is checked against the MOJO frame
    pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
def gam_gaussian_mojo():
    """Compare H2O and MOJO predictions of a gaussian GAM with missing data.

    The seeded random dataset has ``missing_fraction=0.5``. Up to three
    real-valued, non-response columns become GAM columns (scale 0.001 each);
    the first of them is left out of the regular predictor list. The whole
    test frame is written as the MOJO input.
    """
    h2o.remove_all()
    n_test_rows = 200  # leading rows are held out for scoring
    problem = "gaussian"
    params = set_params()
    frame = pyunit_utils.random_dataset(problem, seed=2, missing_fraction=0.5)

    # add GAM specific parameters
    params["gam_columns"] = []
    params["scale"] = []
    max_gam_cols = 3  # maximum number of gam columns
    for col_name in frame.names:
        if col_name != 'response' and str(frame.type(col_name)) == "real":
            params["gam_columns"].append(col_name)
            params["scale"].append(0.001)
            if len(params["gam_columns"]) >= max_gam_cols:
                break

    train = frame[n_test_rows:, :]
    test = frame[:n_test_rows, :]
    exclude_list = {"response", params["gam_columns"][0]}
    x = list(set(frame.names) - exclude_list)

    tmp_dir = tempfile.mkdtemp()
    # build and save mojo model
    model = pyunit_utils.build_save_model_generic(params, x, train, "response", "gam", tmp_dir)
    mojo_name = pyunit_utils.getMojoName(model._id)

    # save test file, h2o predict/mojo use same file
    h2o.download_csv(test, os.path.join(tmp_dir, 'in.csv'))
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(model, tmp_dir, mojo_name)
    h2o.download_csv(pred_h2o, os.path.join(tmp_dir, "h2oPred.csv"))

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)
def test_negativebinomial_GAM_MOJO():
    """Compare H2O and MOJO predictions of a GAM built on the prostate data.

    Train and test frames are both loaded from ``prostate_complete.csv.zip``;
    ``GLEASON`` is the response. Model parameters, including the family, come
    from ``set_params()``.
    """
    print("Read in prostate data.")
    data_path = pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip")
    h2o_data = h2o.import_file(path=data_path)
    test = h2o.import_file(path=data_path)
    print("Testing for family: Negative Binomial")
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    params = set_params()

    TMPDIR = tempfile.mkdtemp()
    # build and save mojo model
    gamModel = pyunit_utils.build_save_model_generic(params, myX, h2o_data, "GLEASON", "gam", TMPDIR)
    MOJONAME = pyunit_utils.getMojoName(gamModel._id)

    # save test file, h2o predict/mojo use same file
    h2o.download_csv(test[myX], os.path.join(TMPDIR, 'in.csv'))
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(gamModel, TMPDIR, MOJONAME)
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)
def runComparisonTests(autoEncoder, probleyType):
    """Build a deep learning model on random data and compare predictions.

    :param autoEncoder: forwarded to ``set_params``; also enables the
        tolerant build path described below.
    :param probleyType: problem type handed to ``random_dataset``.

    For autoencoders, a build failure whose first exception argument contains
    "Trying to predict with an unstable model" ends the test silently; any
    other build failure is re-raised as a generic ``Exception``.
    Relies on the module-level ``NTESTROWS``, ``TMPDIR`` and ``MOJONAME``.
    """
    params = set_params(autoEncoder)  # set deeplearning model parameters
    frame = random_dataset(probleyType)  # generate random dataset
    train = frame[NTESTROWS:, :]
    test = frame[:NTESTROWS, :]
    x = list(set(frame.names) - {"response"})

    if not autoEncoder:
        model = build_save_model(params, x, train)  # build and save mojo model
    else:
        try:
            model = build_save_model(params, x, train)  # build and save mojo model
        except Exception as err:
            if "Trying to predict with an unstable model" in err.args[0]:
                return  # unstable autoencoder: nothing to compare
            raise Exception('Deeplearning autoencoder model failed to build. Fix it.')

    # H2O predict and MOJO predict read the same input file.
    input_csv = os.path.join(TMPDIR, 'in.csv')
    h2o.download_csv(test[x], input_csv)
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(model, TMPDIR, MOJONAME)
    pred_pojo = pyunit_utils.pojo_predict(model, TMPDIR, MOJONAME)
    h2o.save_model(model, path=TMPDIR, force=True)  # save model for debugging

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_h2o, pred_mojo, prob=1, tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    # the POJO frame is checked against the MOJO frame
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_mojo, pred_pojo, prob=1, tol=1e-10)