def test_glrm_transform():
    """Check that GLRM ``transform_frame`` is reproducible for identically seeded models.

    A random matrix built as the product of two uniform factors is uploaded
    and split with ``ratios=[0.9]``. Two GLRM models with the same settings
    and seed are trained on the first split. The X factors they produce for
    the second split must match, and the X factor must have as many rows as
    the frame returned by ``predict``.
    """
    num_rows = 1000
    num_cols = 100
    rank = 8
    np.random.seed(12345)
    print("Uploading random uniform matrix with rows = " + str(num_rows) + " and cols = " + str(num_cols))
    # Draw the (rank x cols) factor first, then the (rows x rank) factor,
    # so the seeded random stream is consumed in a fixed order.
    right_factor = np.random.rand(rank, num_cols)
    left_factor = np.random.rand(num_rows, rank)
    full_h2o = h2o.H2OFrame(np.dot(left_factor, right_factor).tolist())

    splits = full_h2o.split_frame(ratios=[0.9])
    train_part = splits[0]
    test_part = splits[1]

    first_model = H2OGeneralizedLowRankEstimator(k=rank, loss="Quadratic", seed=12345)
    first_model.train(x=full_h2o.names, training_frame=train_part)
    reconstructed = first_model.predict(test_part)
    first_x = first_model.transform_frame(test_part)

    second_model = H2OGeneralizedLowRankEstimator(k=rank, loss="Quadratic", seed=12345)
    second_model.train(x=full_h2o.names, training_frame=train_part)
    second_x = second_model.transform_frame(test_part)

    assert reconstructed.nrows == first_x.nrows, \
        "predictor frame number of row: {0}, transform frame number of row: {1}".format(
            reconstructed.nrows, first_x.nrows)
    pyunit_utils.compare_frames_local(first_x, second_x, prob=1.0, tol=1e-6)
def glrm_cancar():
    """Build GLRM models on cancar.csv with PlusPlus and SVD initialization and show each."""
    print("Importing cancar.csv data...")
    cancar_path = pyunit_utils.locate("smalldata/glrm_test/cancar.csv")
    cancarH2O = h2o.upload_file(cancar_path)
    cancarH2O.describe()

    # Same configuration for both runs; only the initialization method differs.
    for init_method in ("PlusPlus", "SVD"):
        print("Building GLRM model with init = {0}:\n".format(init_method))
        model = H2OGeneralizedLowRankEstimator(k=4,
                                               transform="NONE",
                                               init=init_method,
                                               loss="Quadratic",
                                               regularization_x="None",
                                               regularization_y="None",
                                               max_iterations=1000)
        model.train(x=cancarH2O.names, training_frame=cancarH2O)
        model.show()
def glrm_long_run():
    """Train GLRM on the milsongs data with and without a runtime cap and print timings.

    The first run passes ``max_runtime_secs=60`` to ``train``; the second run
    passes no time limit. For each run the elapsed time (``end_time`` minus
    ``start_time`` from the model output, divided by 1000) and the iteration
    count are printed. The function makes no assertions.
    """
    acs_orig = h2o.upload_file(path=pyunit_utils.locate(
        "bigdata/laptop/milsongs/milsongs-cls-train.csv.gz"))

    def _new_model():
        # Both runs use exactly the same estimator configuration.
        return H2OGeneralizedLowRankEstimator(k=10,
                                              transform='STANDARDIZE',
                                              loss='Quadratic',
                                              multi_loss="Categorical",
                                              model_id="clients_core_glrm",
                                              regularization_x="L2",
                                              regularization_y="L1",
                                              gamma_x=0.2,
                                              gamma_y=0.5,
                                              init="SVD",
                                              seed=1234)

    def _report(model, runtime_template):
        # Print elapsed seconds and iteration count taken from the model output.
        output = model._model_json['output']
        print(runtime_template.format((output['end_time'] - output['start_time']) / 1000.0))
        print("number of iterations: {0}".format(output['iterations']))

    # Run GLRM with a max_runtime_secs restriction.
    capped_model = _new_model()
    capped_model.train(x=acs_orig.names, training_frame=acs_orig, max_runtime_secs=60)
    _report(capped_model, "Run time in s with max_runtime_secs of 60 second: {0}")

    # Run GLRM again without any explicit runtime limit.
    uncapped_model = _new_model()
    uncapped_model.train(x=acs_orig.names, training_frame=acs_orig)
    _report(uncapped_model, "Run time in s with no max time restrication: {0}")

    sys.stdout.flush()
def glrm_export():
    """Train a GLRM model on prostate_cat.csv and exercise its POJO and MOJO downloads.

    The POJO download is handed to ``expect_error``; the MOJO is then
    downloaded into ``RESULT_DIR``.
    """
    print("###### GLRM ######")
    data_path = pyunit_utils.locate("smalldata/prostate/prostate_cat.csv")
    prostate = h2o.upload_file(data_path)

    glrm = H2OGeneralizedLowRankEstimator(k=8, init="svd", recover_svd=True)
    glrm.train(x=prostate.names, training_frame=prostate)

    expect_error(glrm.download_pojo, model="GLRM", format='POJO')
    glrm.download_mojo(path=RESULT_DIR)
def glrm_iris_error_message():
    """Check that GLRM rejects identical ``model_id`` and ``representation_name`` values.

    Building and training the model with both set to ``"one"`` must raise an
    exception whose text mentions that the two cannot share the same string.

    Raises:
        AssertionError: if no exception is raised, or if the exception text
            does not contain the expected message.
    """
    print("Importing iris_wheader.csv data...")
    irisH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    rank = 3
    gx = 0.5
    gy = 0.5
    trans = "STANDARDIZE"
    print("H2O GLRM with rank k = " + str(rank) + ", gamma_x = " + str(gx) +
          ", gamma_y = " + str(gy) + ", transform = " + trans)

    try:
        glrm_h2o = H2OGeneralizedLowRankEstimator(k=rank,
                                                  loss="Quadratic",
                                                  gamma_x=gx,
                                                  gamma_y=gy,
                                                  transform=trans,
                                                  model_id="one",
                                                  representation_name="one")
        glrm_h2o.train(x=irisH2O.names, training_frame=irisH2O)
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("representation_name and model_id cannot use the same string"
                in temp), "Wrong exception was received."
    else:
        # Kept outside the try block so this failure is not caught by the
        # handler above and misreported as a wrong-exception failure.
        raise AssertionError("Should have thrown an exception!")
def grid_glrm_iris():
    """Run a GLRM grid search over ``k`` and ``transform`` on iris and check the grid is complete.

    Every model in the grid must be a GLRM estimator, the grid must contain
    one model per hyper-parameter combination, and each model's actual
    (k, transform) pair must be a distinct member of the Cartesian product.
    """
    print("Importing iris_wheader.csv data...")
    iris_path = pyunit_utils.locate("smalldata/iris/iris_wheader.csv")
    irisH2O = h2o.upload_file(iris_path)
    irisH2O.describe()

    transform_opts = ["NONE", "DEMEAN", "DESCALE", "STANDARDIZE"]
    k_opts = random.sample(list(range(1, 8)), 3)
    expected_models = len(transform_opts) * len(k_opts)

    # Ordered so that combinations come out as [k, transform].
    hyper_parameters = OrderedDict()
    hyper_parameters["k"] = k_opts
    hyper_parameters["transform"] = transform_opts

    gx = random.uniform(0, 1)
    gy = random.uniform(0, 1)
    print("H2O GLRM with , gamma_x = " + str(gx) + ", gamma_y = " + str(gy) +
          ", hyperparameters = " + str(hyper_parameters))

    base_model = H2OGeneralizedLowRankEstimator(loss="Quadratic", gamma_x=gx, gamma_y=gy)
    gs = H2OGridSearch(base_model, hyper_params=hyper_parameters)
    gs.train(x=list(range(4)), y=4, training_frame=irisH2O)

    for model in gs:
        assert isinstance(model, H2OGeneralizedLowRankEstimator)
    print(gs.get_grid(sort_by="mse"))
    assert len(gs) == expected_models

    # Each trained model must consume exactly one not-yet-seen combination.
    remaining = [list(combo) for combo in itertools.product(*list(hyper_parameters.values()))]
    for model in gs.models:
        combo = [model.parms['k']['actual_value'], model.parms['transform']['actual_value']]
        assert combo in remaining
        remaining.remove(combo)
def glrm_mojo():
    """Compare GLRM MOJO X factors with in-cluster and stored reference results.

    A GLRM model is trained on the pubdev_5858 training file and saved as a
    MOJO. The MOJO then scores the test file with 100, 1 and 10 iterations.
    The 100-iteration result is compared with the frame produced in the
    cluster; the 1- and 10-iteration results are compared with reference CSVs.
    """
    h2o.remove_all()
    train = h2o.import_file(pyunit_utils.locate("smalldata/glrm_test/pubdev_5858_glrm_mojo_train.csv"))
    test = h2o.import_file(pyunit_utils.locate("smalldata/glrm_test/pubdev_5858_glrm_mojo_test.csv"))
    predict_10iter = h2o.import_file(
        pyunit_utils.locate("smalldata/glrm_test/pubdev_5858_glrm_predict_10iter.csv"))
    predict_1iter = h2o.import_file(
        pyunit_utils.locate("smalldata/glrm_test/pubdev_5858_glrm_predict_1iter.csv"))

    x = train.names
    transformN = "STANDARDIZE"

    # Build the GLRM model on the imported training frame.
    glrmModel = H2OGeneralizedLowRankEstimator(k=3, transform=transformN, max_iterations=10, seed=1234,
                                               init="random")
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(glrmModel._model_json['output']['representation_name'])
    assert glrmTrainFactor.nrows == train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)

    save_GLRM_mojo(glrmModel)  # save mojo model
    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    script_dir = os.path.dirname(os.path.realpath('__file__'))
    TMPDIR = os.path.normpath(os.path.join(script_dir, "..", "results", MOJONAME))
    # Save the test file so H2O predict and the MOJO score the same input.
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))

    # Make sure setting the iteration number does not corrupt the prediction.
    predID, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmIterNumber=100)
    pred_h2o = h2o.get_frame("GLRMLoading_" + predID)
    print("Comparing mojo x Factor and model x Factor for 100 iterations")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)

    predID, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmIterNumber=1)
    print("Comparing mojo x Factor and model x Factor for 1 iterations")
    pyunit_utils.compare_frames_local(predict_1iter, pred_mojo, 1, tol=1e-10)

    predID, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmIterNumber=10)
    print("Comparing mojo x Factor and model x Factor for 10 iterations")
    pyunit_utils.compare_frames_local(predict_10iter, pred_mojo, 1, tol=1e-10)
def glrm_iris():
    """Compare PCA (``pca_method="GLRM"``) with a GLRM model using ``recover_svd`` on iris.

    The importance and eigenvector tables of the two models must agree
    within 1e-6, and the checked PCA importance cell must not exceed 1.
    """
    print("Importing iris.csv data...")
    irisH2O = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris.csv"))
    irisH2O.describe()

    print("@@@@@@ Building PCA with GramSVD...\n")
    glrmPCA = H2OPCA(k=5, transform="STANDARDIZE", pca_method="GLRM", use_all_factor_levels=True, seed=21)
    glrmPCA.train(x=irisH2O.names, training_frame=irisH2O)

    glrm_h2o = H2OGeneralizedLowRankEstimator(k=5, loss="Quadratic", transform="STANDARDIZE", recover_svd=True,
                                              seed=21)
    glrm_h2o.train(x=irisH2O.names, training_frame=irisH2O)

    pca_output = glrmPCA._model_json["output"]
    glrm_output = glrm_h2o._model_json["output"]

    # Compare the importance tables of the two models.
    print("@@@@@@ Comparing eigenvalues between GramSVD and GLRM...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        pca_output["importance"],
        glrm_output["importance"],
        ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"],
        tolerance=1e-6)

    # Compare the eigenvector tables (check_sign enabled).
    print("@@@@@@ Comparing eigenvectors between GramSVD and GLRM...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        pca_output["eigenvectors"],
        glrm_output["eigenvectors"],
        glrm_output["names"],
        tolerance=1e-6,
        check_sign=True)

    # Check that the maximum proportional variance is <= 1.
    assert pca_output["importance"].cell_values[1][1] <= 1, \
        "Expected value <= 1.0 but received {0}".format(pca_output["importance"].cell_values[1][1])
def glrm_mojo():
    """Compare GLRM MOJO reconstruction and X factor with in-cluster results on a random dataset.

    A random regression dataset is split into the first ``NTESTROWS`` rows
    (test) and the remainder (train). A GLRM model with a randomly chosen
    transform is trained and saved as a MOJO. Both the reconstructed frame
    and the X factor from the MOJO must match the H2O results within 1e-10.
    """
    h2o.remove_all()
    NTESTROWS = 200  # number of test dataset rows
    df = pyunit_utils.random_dataset("regression")  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = df.names

    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types) - 1)]

    # Build a GLRM model with the random dataset generated above.
    glrmModel = H2OGeneralizedLowRankEstimator(k=3, transform=transformN, max_iterations=10)
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(glrmModel._model_json['output']['representation_name'])
    assert glrmTrainFactor.nrows == train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)

    save_GLRM_mojo(glrmModel)  # save mojo model
    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    script_dir = os.path.dirname(os.path.realpath('__file__'))
    TMPDIR = os.path.normpath(os.path.join(script_dir, "..", "results", MOJONAME))
    # Save the test file so H2O predict and the MOJO score the same input.
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))

    # Reconstructed frame: H2O predict versus MOJO predict.
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=True)
    for col in range(pred_h2o.ncols):
        if pred_h2o[col].isfactor():
            pred_h2o[col] = pred_h2o[col].asnumeric()
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)

    # X factor: frame stored in the cluster versus MOJO output.
    frameID, mojoXFactor = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=False)
    glrmTestFactor = h2o.get_frame("GLRMLoading_" + frameID)  # X factor for the new test dataset
    print("Comparing mojo x Factor and model x Factor ...")
    pyunit_utils.compare_frames_local(glrmTestFactor, mojoXFactor, 1, tol=1e-10)
def glrm_subset():
    """Train GLRM on the milsongs data several times with different seeds and print run statistics.

    For each run the elapsed time (``end_time - start_time``), the iteration
    count and the objective value are collected from the model output and
    printed at the end. The function makes no assertions.
    """
    acs_orig = h2o.upload_file(path=pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-cls-train.csv.gz"))

    seeds = [2297378124, 3849570216, 6733652048, 8915337442, 8344418400,
             9416580152, 2598632624, 4977008454, 8273228579, 8185554539,
             3219125000, 2998879373, 7707012513, 5786923379, 5029788935,
             935945790, 7092607078, 9305834745, 6173975590, 5397294255]
    num_runs = 10  # number of times to repeat experiments

    run_time_ms, iterations, objective = [], [], []
    for ind in range(num_runs):
        run_seed = seeds[ind % len(seeds)]
        acs_model = H2OGeneralizedLowRankEstimator(k=10,
                                                   transform='STANDARDIZE',
                                                   loss='Quadratic',
                                                   multi_loss="Categorical",
                                                   model_id="clients_core_glrm",
                                                   regularization_x="L2",
                                                   regularization_y="L1",
                                                   gamma_x=0.2,
                                                   gamma_y=0.5,
                                                   init="SVD",
                                                   max_iterations=1000,
                                                   seed=run_seed)
        acs_model.train(x=acs_orig.names, training_frame=acs_orig, seed=run_seed)

        output = acs_model._model_json['output']
        run_time_ms.append(output['end_time'] - output['start_time'])
        iterations.append(output['iterations'])
        objective.append(output['objective'])

    print("Run time in ms: {0}".format(run_time_ms))
    print("number of iterations: {0}".format(iterations))
    print("objective function value: {0}".format(objective))
    sys.stdout.flush()
def glrm_arrests():
    """Check that PCA and GLRM (SVD init, ``recover_svd``) report matching importance on USArrests."""
    print("Importing USArrests.csv data...")
    arrests_path = pyunit_utils.locate("smalldata/pca_test/USArrests.csv")
    arrestsH2O = h2o.upload_file(arrests_path)

    pca_h2o = H2OPCA(k=4, transform="STANDARDIZE")
    pca_h2o.train(x=list(range(4)), training_frame=arrestsH2O)
    pca_h2o.summary()
    pca_h2o.show()

    print("H2O GLRM on standardized data with quadratic loss:\n")
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=4,
                                              transform="STANDARDIZE",
                                              loss="Quadratic",
                                              gamma_x=0,
                                              gamma_y=0,
                                              init="SVD",
                                              recover_svd=True)
    glrm_h2o.train(x=arrestsH2O.names, training_frame=arrestsH2O)
    glrm_h2o.show()

    # The importance tables of PCA and GLRM must agree within tolerance.
    pca_importance = pca_h2o._model_json["output"]["importance"]._cell_values
    glrm_importance = glrm_h2o._model_json["output"]["importance"]._cell_values
    assert pyunit_utils.equal_2d_tables(pca_importance, glrm_importance, tolerance=1e-4), \
        "PCA and GLRM variance metrics do not agree.  Fix it please."
    sys.stdout.flush()
def glrm_subset():
    """Train a one-iteration GLRM on the ACS census data and print X-factor rows for selected ZCTAs.

    The ``ZCTA5`` column is dropped before training. It is then used to build
    a row mask selecting six ZIP codes, and the first two X-factor columns
    are printed for those rows.
    """
    acs_orig = h2o.upload_file(
        path=pyunit_utils.locate("bigdata/laptop/census/ACS_13_5YR_DP02_cleaned.zip"),
        col_types=(['enum'] + ['numeric'] * 149))
    acs_full = acs_orig.drop("ZCTA5")

    acs_model = H2OGeneralizedLowRankEstimator(k=10,
                                               transform='STANDARDIZE',
                                               loss='Quadratic',
                                               regularization_x='Quadratic',
                                               regularization_y='L1',
                                               gamma_x=0.25,
                                               gamma_y=0.5,
                                               max_iterations=1)
    acs_model.train(x=acs_full.names, training_frame=acs_full)

    zcta_arch_x = h2o.get_frame(acs_model._model_json['output']['representation_name'])
    print(zcta_arch_x)

    acs_zcta_col = acs_orig["ZCTA5"].asfactor()
    # Place labels are carried over from the original comments; not verified here.
    zip_codes = [
        '10065',  # Manhattan, NY (Upper East Side)
        '11219',  # Manhattan, NY (East Harlem)
        '66753',  # McCune, KS
        '84104',  # Salt Lake City, UT
        '94086',  # Sunnyvale, CA
        '95014',  # Cupertino, CA
    ]
    # OR the per-ZIP masks together left to right.
    idx = acs_zcta_col == zip_codes[0]
    for code in zip_codes[1:]:
        idx = idx | (acs_zcta_col == code)
    print(zcta_arch_x[idx, [0, 1]])
def glrm_start(grid_id, export_dir, train, params, hyper_parameters):
    """Start a GLRM grid search and return the grid object.

    Args:
        grid_id: identifier passed to ``H2OGridSearch`` as ``grid_id``.
        export_dir: directory passed to ``H2OGridSearch`` as ``recovery_dir``.
        train: training frame; all of its columns are used as ``x``.
        params: extra keyword arguments forwarded to ``grid.start``.
        hyper_parameters: hyper-parameter space for the grid.

    Returns:
        The ``H2OGridSearch`` instance on which ``start`` was called.
    """
    base_model = H2OGeneralizedLowRankEstimator(seed=42)
    grid = H2OGridSearch(
        base_model,
        grid_id=grid_id,
        hyper_params=hyper_parameters,
        recovery_dir=export_dir,
        parallelism=2,
    )
    grid.start(x=train.names, training_frame=train, **params)
    return grid
def glrm_nnmf():
    """Run GLRM with non-negative regularization on a random matrix and check the factors and metrics.

    Checks that the fitted X and Y contain no negative entries, that the
    reported objective equals the squared reconstruction error, that
    ``predict`` returns the product X*Y, and that the numeric and
    categorical training error metrics match expectations.
    """
    m = 1000
    n = 100
    k = 10

    print("Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n))
    Y = np.random.rand(k, n)
    X = np.random.rand(m, k)
    train = np.dot(X, Y)
    train_h2o = h2o.H2OFrame(train.tolist())

    print("Run GLRM with non-negative regularization")
    initial_y = np.random.rand(k, n)
    initial_y_h2o = h2o.H2OFrame(initial_y.tolist())

    glrm_h2o = H2OGeneralizedLowRankEstimator(k=k,
                                              init="User",
                                              user_y=initial_y_h2o,
                                              loss="Quadratic",
                                              regularization_x="NonNegative",
                                              regularization_y="NonNegative",
                                              gamma_x=1,
                                              gamma_y=1)
    glrm_h2o.train(x=train_h2o.names, training_frame=train_h2o)
    glrm_h2o.show()
    output = glrm_h2o._model_json['output']

    print("Check that X and Y matrices are non-negative")
    # Skip the first cell of each archetype row before converting to float.
    fit_y_np = np.array([[float(cell) for cell in list(row)[1:]]
                         for row in output['archetypes'].cell_values])
    fit_x = h2o.get_frame(output['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))
    assert np.all(fit_y_np >= 0), "Y must contain only non-negative elements"
    assert np.all(fit_x_np >= 0), "X must contain only non-negative elements"

    print("Check final objective function value")
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = output['objective']
    sse = np.sum(np.square(train - fit_xy))
    assert abs(glrm_obj - sse) < 1e-6, \
        "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)

    print("Impute XY and check error metrics")
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), \
        "Imputation for numerics with quadratic loss should equal XY product"

    training_metrics = output['training_metrics']._metric_json
    glrm_numerr = training_metrics['numerr']
    glrm_caterr = training_metrics['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, \
        "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, \
        "Categorical error was " + str(glrm_caterr) + " but should be zero"
def glrm_catagorical_bug_fix():
    """Train GLRM on one airlines file and predict on the other; the call must complete.

    The model is trained on AirlinesTest and scored on AirlinesTrain, and
    the prediction frame is printed. There are no assertions.
    """
    # NOTE(review): training uses the *Test* file and scoring the *Train*
    # file; presumably intentional for this regression test -- confirm.
    train_path = pyunit_utils.locate("smalldata/airlines/AirlinesTest.csv.zip")
    score_path = pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip")
    trainData = h2o.import_file(train_path)
    testData = h2o.import_file(score_path)

    glrmModel = H2OGeneralizedLowRankEstimator(k=4)
    glrmModel.train(x=trainData.names, training_frame=trainData)

    print(glrmModel.predict(testData))
def glrm_benign():
    """Build GLRM models of rank 8, 10, 12 and 14 on benign.csv with SVD initialization and show each."""
    # print() calls (rather than Python 2 print statements) keep this valid
    # on Python 3.
    print("Importing benign.csv data...")
    benignH2O = h2o.upload_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))
    benignH2O.describe()

    for i in range(8, 16, 2):
        print("H2O GLRM with rank " + str(i) + " decomposition:\n")
        glrm_h2o = H2OGeneralizedLowRankEstimator(k=i, init="SVD", recover_svd=True)
        glrm_h2o.train(x=benignH2O.names, training_frame=benignH2O)
        glrm_h2o.show()
def glrm_catagorical_bug_fix():
    """Check that GLRM archetypes on prostate_cat.csv match stored reference values.

    A GLRM model (k=5, ``recover_svd=True``, seed 1234) is trained and its
    archetype table is compared, within 1e-4, with the hard-coded reference
    table below.
    """
    print("Importing prostate.csv data...")

    # Reference archetype values recorded before the GLRM memory optimization.
    cell_values = [
        ['Arch1', 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0,
         58.295918367346935, 8.810102040816325, 11.344897959183678, 6.285714285714286],
        ['Arch2', 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0,
         69.35514018691589, 7.538224299065424, 10.087757009345797, 5.6168224299065415],
        ['Arch3', 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0,
         64.68, 75.892, 10.812000000000001, 7.44],
        ['Arch4', 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0,
         68.77083333333333, 13.368750000000002, 49.44583333333334, 5.9375],
        ['Arch5', 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0,
         69.04901960784314, 16.140196078431373, 11.510000000000005, 7.235294117647059],
    ]
    col_header = [
        'dprosboth', 'dprosleft', 'dprosnone', 'dprosright', 'raceblack',
        'racena', 'racewhite', 'capsuleno', 'capsuleyes', 'dcapsno',
        'dcapsyes', 'age', 'psa', 'vol', 'gleason',
    ]
    row_header = ['Arch1', 'Arch2', 'Arch3', 'Arch4', 'Arch5']
    table_header = "archetypes"
    correct_archetype = H2OTwoDimTable(cell_values=cell_values,
                                       col_header=col_header,
                                       row_header=row_header,
                                       table_header=table_header)

    prostateF = h2o.upload_file(
        pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=5, recover_svd=True, seed=1234)
    glrm_h2o.train(x=prostateF.names, training_frame=prostateF)
    glrm_h2o.show()

    assert pyunit_utils.equal_2d_tables(glrm_h2o._model_json["output"]["archetypes"]._cell_values,
                                        correct_archetype._cell_values, tolerance=1e-4), \
        "GLRM model archetypes generated from current model are not correct."
def setupTrainModel(initM, seed):
    """Return an untrained rank-3 GLRM estimator with fixed settings.

    Args:
        initM: value passed as the estimator's ``init`` argument.
        seed: value passed as the estimator's ``seed`` argument.

    Returns:
        An ``H2OGeneralizedLowRankEstimator`` with quadratic loss,
        ``gamma_x = gamma_y = 0.25`` and the STANDARDIZE transform.
    """
    return H2OGeneralizedLowRankEstimator(
        k=3,
        loss="Quadratic",
        gamma_x=0.25,
        gamma_y=0.25,
        transform="STANDARDIZE",
        init=initM,
        seed=seed,
    )
def hdfs_glrm():
    """Train GLRM on USArrests with 10%..90% inserted missing values and check the error metrics.

    For each missing ratio the full dataset is used as the validation frame.
    The numeric training error must match the objective, the categorical
    errors must be zero, and the validation frame must contain more
    non-missing numeric entries than the training frame (and exactly
    rows * cols of them). A summary line is printed per ratio at the end.
    """
    missing_ratios = np.arange(0.1, 1, 0.1).tolist()

    print("Importing USArrests.csv data and saving for validation...")
    arrests_full = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrests_full.describe()
    totobs = arrests_full.nrow * arrests_full.ncol

    train_err = []
    valid_err = []
    for ratio in missing_ratios:
        print("Importing USArrests.csv and inserting {0}% missing entries".format(100*ratio))
        arrests_miss = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
        arrests_miss = arrests_miss.insert_missing_values(fraction=ratio)
        arrests_miss.describe()

        print("H2O GLRM with {0}% missing entries".format(100*ratio))
        arrests_glrm = H2OGeneralizedLowRankEstimator(k=4,
                                                      ignore_const_cols=False,
                                                      loss="Quadratic",
                                                      regularization_x="None",
                                                      regularization_y="None",
                                                      init="PlusPlus",
                                                      max_iterations=10,
                                                      min_step_size=1e-6)
        arrests_glrm.train(x=arrests_miss.names, training_frame=arrests_miss, validation_frame=arrests_full)
        arrests_glrm.show()

        # Check imputed data and error metrics.
        output = arrests_glrm._model_json['output']
        train_metrics = output['training_metrics']._metric_json
        valid_metrics = output['validation_metrics']._metric_json
        glrm_obj = output['objective']
        train_numerr = train_metrics['numerr']
        train_caterr = train_metrics['caterr']
        valid_numerr = valid_metrics['numerr']
        valid_caterr = valid_metrics['caterr']
        assert abs(train_numerr - glrm_obj) < 1e-3, \
            "Numeric error on training data was " + str(train_numerr) + " but should equal final objective " + str(glrm_obj)
        assert train_caterr == 0, \
            "Categorical error on training data was " + str(train_caterr) + " but should be zero"
        assert valid_caterr == 0, \
            "Categorical error on validation data was " + str(valid_caterr) + " but should be zero"

        train_numcnt = train_metrics['numcnt']
        valid_numcnt = valid_metrics['numcnt']
        assert valid_numcnt > train_numcnt, \
            "Number of non-missing numerical entries in training data should be less than validation data"
        assert valid_numcnt == totobs, \
            "Number of non-missing numerical entries in validation data was " + str(valid_numcnt) + " but should be " + str(totobs)

        train_err.append(train_numerr)
        valid_err.append(valid_numerr)

    for ratio, t_err, v_err in zip(missing_ratios, train_err, valid_err):
        print("Missing ratio: {0}% --> Training error: {1}\tValidation error: {2}".format(ratio*100, t_err, v_err))
def get_glrm_xmatrix(train, test, K=3, compare_predict=True, tol=1e-1):
    """Train GLRM with a random transform, save its MOJO and compare the MOJO X factor with the model's.

    Args:
        train: training frame; all of its columns are used as predictors.
        test: frame whose predictor columns are written to ``in.csv`` for MOJO scoring.
        K: GLRM rank.
        compare_predict: when true (and the transform is not "NONE"), X-factor
            rows are compared numerically and reconstruction differences are
            printed; otherwise only ``check_data_rows`` is run on the X factors.
        tol: tolerance passed to ``compare_data_rows``.
    """
    x = train.names
    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types) - 1)]
    print("dataset transform is {0}.".format(transformN))

    # Build a GLRM model on the supplied training frame.
    glrmModel = H2OGeneralizedLowRankEstimator(k=K, transform=transformN, max_iterations=1000, seed=12345)
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(glrmModel._model_json['output']['representation_name'])

    mojoDir = save_GLRM_mojo(glrmModel)  # save mojo model
    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    # Save the test file so H2O predict and the MOJO score the same input.
    h2o.download_csv(test[x], os.path.join(mojoDir, 'in.csv'))
    frameID, mojoXFactor = pyunit_utils.mojo_predict(glrmModel, mojoDir, MOJONAME, glrmReconstruct=False)

    print("Comparing mojo x Factor and model x Factor ...")
    numeric_compare = compare_predict and transformN != "NONE"
    if numeric_compare:
        pyunit_utils.compare_data_rows(mojoXFactor,
                                       glrmTrainFactor,
                                       index_list=range(2, mojoXFactor.nrows - 1),
                                       tol=tol)
    else:
        # Bad performance with no transformation on the dataset.
        pyunit_utils.check_data_rows(mojoXFactor, glrmTrainFactor, num_rows=mojoXFactor.nrow)

    if compare_predict:  # only compare reconstructed data frames with numerical data
        test_reconstruction = glrmModel.predict(test)
        train_reconstruction = glrmModel.predict(train)  # uses the X from A=X*Y found in training
        predictDiff = pyunit_utils.compute_frame_diff(train, train_reconstruction)
        mojoDiff = pyunit_utils.compute_frame_diff(train, test_reconstruction)
        print(
            "absolute difference of mojo predict and original frame is {0} and model predict and original frame is {1}"
            .format(mojoDiff, predictDiff))
def pca_wideDataset_rotterdam_glrm():
    """Compare PCA with GLRM (``recover_svd=True``) on the Rotterdam dataset.

    Both models use every column except ``relapse``. Their importance tables
    are compared with tolerance 1 and their eigenvector tables with
    tolerance 2e-5; both comparisons pass ``check_all=False``.
    """
    tol = 2e-5
    h2o.remove_all()

    print("Importing Rotterdam.csv data...")
    rotterdamH2O = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/jira/rotterdam.csv.zip"))
    excluded = set(["relapse"])
    x = list(set(rotterdamH2O.names) - excluded)

    # Special test with GLRM; use_all_factor_levels must be true for PCA.
    print("------  Testing GLRM PCA --------")
    gramSVD = H2OPCA(k=8,
                     impute_missing=True,
                     transform="DEMEAN",
                     seed=12345,
                     use_all_factor_levels=True)
    gramSVD.train(x=x, training_frame=rotterdamH2O)

    glrmPCA = H2OGeneralizedLowRankEstimator(k=8,
                                             transform="DEMEAN",
                                             seed=12345,
                                             init="Random",
                                             recover_svd=True,
                                             regularization_x="None",
                                             regularization_y="None",
                                             max_iterations=11)
    glrmPCA.train(x=x, training_frame=rotterdamH2O)

    pca_output = gramSVD._model_json["output"]
    glrm_output = glrmPCA._model_json["output"]

    # Compare the importance tables.
    print("@@@@@@  Comparing eigenvectors and eigenvalues between GramSVD and GLRM...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        pca_output["importance"],
        glrm_output["importance"],
        ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"],
        tolerance=1,
        check_all=False)

    # Compare the eigenvector tables.
    pyunit_utils.assert_H2OTwoDimTable_equal(
        pca_output["eigenvectors"],
        glrm_output["eigenvectors"],
        glrm_output["names"],
        tolerance=tol,
        check_sign=True,
        check_all=False)
def glrm_mojo():
    """Check the GLRM MOJO X factor with an explicit iteration count and time two iteration settings.

    A seeded random regression dataset is split into test (first
    ``NTESTROWS`` rows) and train. The MOJO X factor computed with 100
    iterations must match the frame stored in the cluster within 1e-10.
    MOJO scoring is then timed with 8000 and with 2 iterations; the timings
    are printed but not asserted.
    """
    h2o.remove_all()
    NTESTROWS = 200  # number of test dataset rows
    df = pyunit_utils.random_dataset("regression", seed=1234)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = df.names

    transformN = "STANDARDIZE"
    # Build a GLRM model with the random dataset generated above.
    glrmModel = H2OGeneralizedLowRankEstimator(k=3, transform=transformN, max_iterations=10, seed=1234)
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(glrmModel._model_json['output']['representation_name'])
    assert glrmTrainFactor.nrows == train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)

    save_GLRM_mojo(glrmModel)  # save mojo model
    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    script_dir = os.path.dirname(os.path.realpath('__file__'))
    TMPDIR = os.path.normpath(os.path.join(script_dir, "..", "results", MOJONAME))
    # Save the test file so H2O predict and the MOJO score the same input.
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))

    # Make sure setting the iteration number does not corrupt the prediction.
    predID, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmIterNumber=100)
    pred_h2o = h2o.get_frame("GLRMLoading_" + predID)
    print("Comparing mojo x Factor and model x Factor for 100 iterations")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)

    # Scoring with 2 iterations is expected to take less time than with 8000.
    began = time.time()
    runMojoPredictOnly(TMPDIR, MOJONAME, glrmIterNumber=8000)
    elapsed_many_iters = time.time() - began

    began = time.time()
    runMojoPredictOnly(TMPDIR, MOJONAME, glrmIterNumber=2)
    elapsed_few_iters = time.time() - began

    print("Time taken for 2 iterations: {0}s.  Time taken for 8000 iterations: {1}s."
          .format(elapsed_few_iters, elapsed_many_iters))
def glrm_set_loss_by_col():
    """Train GLRM on USArrests with per-column losses and verify the reported objective.

    Column 0 uses Absolute loss, column 3 uses Huber loss and the remaining
    columns use the default Quadratic loss. The objective is recomputed with
    numpy from the fitted X and Y and must match the model's value within 1e-6.
    """
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrestsPy = np.array(h2o.as_list(arrestsH2O))
    arrestsH2O.describe()

    print("H2O GLRM with loss by column = Absolute, Quadratic, Quadratic, Huber")
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=3,
                                              loss="Quadratic",
                                              loss_by_col=["Absolute", "Huber"],
                                              loss_by_col_idx=[0, 3],
                                              regularization_x="None",
                                              regularization_y="None")
    glrm_h2o.train(x=arrestsH2O.names, training_frame=arrestsH2O)
    glrm_h2o.show()
    output = glrm_h2o._model_json['output']

    # Skip the first cell of each archetype row before converting to float.
    fit_y_np = np.array([[float(cell) for cell in list(row)[1:]]
                         for row in output['archetypes'].cell_values])
    fit_x = h2o.get_frame(output['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))

    print("Check final objective function value")
    fit_xy = np.dot(fit_x_np, fit_y_np)
    fit_diff = arrestsPy - fit_xy

    def _huber_scalar(a):
        # Huber loss as used by this check: quadratic inside [-1, 1], linear outside.
        return a * a / 2 if abs(a) <= 1 else abs(a) - 0.5

    huber = np.vectorize(_huber_scalar)

    # Per-row loss, accumulated column by column in a fixed order.
    obj_val = np.absolute(fit_diff[:, 0]) + np.square(fit_diff[:, 1]) + np.square(fit_diff[:, 2])
    obj_val = obj_val + huber(fit_diff[:, 3])
    obj_val = np.sum(obj_val)

    glrm_obj = output['objective']
    assert abs(glrm_obj - obj_val) < 1e-6, \
        "Final objective was " + str(glrm_obj) + " but should equal " + str(obj_val)
def glrm_iris():
    """Check that GLRM ``predict`` warns about missing columns when column names differ.

    A GLRM model is trained on iris_wheader.csv and then used to predict on
    iris_wheader_bad_cnames.csv. Everything written to ``sys.stderr`` during
    ``predict`` is captured and must contain both "UserWarning" and
    "missing column".

    Raises:
        AssertionError: if the captured stderr output lacks either phrase
            (including when nothing was captured at all).
    """
    print("Importing iris_wheader.csv data...")
    irisH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    irisTest = h2o.upload_file(
        pyunit_utils.locate("smalldata/iris/iris_wheader_bad_cnames.csv"))

    rank = 3
    gx = 0.5
    gy = 0.5
    trans = "STANDARDIZE"
    print("H2O GLRM with rank k = " + str(rank) + ", gamma_x = " + str(gx) +
          ", gamma_y = " + str(gy) + ", transform = " + trans)
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=rank,
                                              loss="Quadratic",
                                              gamma_x=gx,
                                              gamma_y=gy,
                                              transform=trans)
    glrm_h2o.train(x=irisH2O.names, training_frame=irisH2O)

    print("Impute original data from XY decomposition")  # and expect warnings
    warn_phrase = "UserWarning"
    warn_string_of_interest = "missing column"

    # Redirect stderr to a string buffer for later analysis. The previous
    # stream is always restored, even if predict raises, so later output is
    # not lost and a runner-installed stderr is not replaced.
    buffer = StringIO()
    previous_stderr = sys.stderr
    sys.stderr = buffer
    try:
        glrm_h2o.predict(irisTest)
    finally:
        sys.stderr = previous_stderr

    # getvalue() returns everything captured; checking the combined text
    # also fails when nothing was captured.
    warns = buffer.getvalue()
    print("*** captured warning message: {0}".format(warns))
    assert (warn_phrase in warns) and (warn_string_of_interest in warns), \
        "Expected a {0} mentioning '{1}' but captured: {2!r}".format(
            warn_phrase, warn_string_of_interest, warns)
def test_load_glrm():
    """Save a GLRM model, remove it, reload it and check the reloaded model matches the original.

    The X factor frame, the first prediction column and the first three
    archetypes of the reloaded model are compared with those recorded
    before the model was saved.
    """
    print("Importing iris_wheader.csv data...")
    irisH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    irisH2O.describe()

    g_model = H2OGeneralizedLowRankEstimator(k=3)
    g_model.train(x=irisH2O.names, training_frame=irisH2O)
    yarch_old = g_model.archetypes()
    x_old = h2o.get_frame(g_model._model_json["output"]["representation_name"])
    predOld = g_model.predict(irisH2O)

    # Default results directory relative to the working directory; replaced
    # by the located "results" folder when one can be found.
    TMPDIR = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')), "../..", "results"))
    try:
        TMPDIR = pyunit_utils.locate("results")  # find directory path to results folder
    except Exception:
        # Lookup failed: fall back to the default path, creating it only if
        # it does not exist yet so an existing directory is not an error.
        if not os.path.isdir(TMPDIR):
            os.makedirs(TMPDIR)

    h2o.save_model(g_model, path=TMPDIR, force=True)  # save model
    full_path_filename = os.path.join(TMPDIR, g_model._id)

    h2o.remove(g_model)
    model_reloaded = h2o.load_model(full_path_filename)
    pred = model_reloaded.predict(irisH2O)
    yarch = model_reloaded.archetypes()
    x = h2o.get_frame(
        model_reloaded._model_json["output"]["representation_name"])

    # Old and new results should be close; archetypes should be the same.
    pyunit_utils.compare_frames_local(x, x_old, tol=1e-6)
    pyunit_utils.compare_frames_local(pred[0], predOld[0], tol=1)
    for k in range(3):
        pyunit_utils.equal_two_arrays(yarch_old[k], yarch[k], eps=1e-4, tolerance=1e-10)

    print("glrm model successfully loaded...")
def glrm_arrests():
    """Train GLRM on USArrests with a user-supplied initial Y matrix and show the model."""
    # print() calls (rather than Python 2 print statements) keep this valid
    # on Python 3.
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrestsH2O.describe()

    print("H2O initial Y matrix:\n")
    initial_y = [[5.412, 65.24, -7.54, -0.032],
                 [2.212, 92.24, -17.54, 23.268],
                 [0.312, 123.24, 14.46, 9.768],
                 [1.012, 19.24, -15.54, -1.732]]
    initial_y_h2o = h2o.H2OFrame(initial_y)
    initial_y_h2o.show()

    print("H2O GLRM on de-meaned data with quadratic loss:\n")
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=4,
                                              transform="DEMEAN",
                                              loss="Quadratic",
                                              gamma_x=0,
                                              gamma_y=0,
                                              init="User",
                                              user_y=initial_y_h2o,
                                              recover_svd=True)
    glrm_h2o.train(x=arrestsH2O.names, training_frame=arrestsH2O)
    glrm_h2o.show()
def glrm_iris():
    """Train GLRM models on iris under each transform with random settings.

    For every transform in ``NONE``, ``DEMEAN``, ``DESCALE`` and
    ``STANDARDIZE`` this draws a random rank from 1 to 7 and random
    ``gamma_x``/``gamma_y`` values from [0, 1], trains a quadratic-loss GLRM
    on the iris frame, shows the model, and then describes the frame returned
    by ``predict`` on the same data.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Importing iris_wheader.csv data...")
    irisH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    irisH2O.describe()

    for trans in ["NONE", "DEMEAN", "DESCALE", "STANDARDIZE"]:
        rank = random.randint(1, 7)
        gx = random.uniform(0, 1)
        gy = random.uniform(0, 1)

        print("H2O GLRM with rank k = " + str(rank) + ", gamma_x = " + str(
            gx) + ", gamma_y = " + str(gy) + ", transform = " + trans)
        glrm_h2o = H2OGeneralizedLowRankEstimator(k=rank, loss="Quadratic",
                                                  gamma_x=gx, gamma_y=gy,
                                                  transform=trans)
        glrm_h2o.train(x=irisH2O.names, training_frame=irisH2O)
        glrm_h2o.show()

        print("Impute original data from XY decomposition")
        pred_h2o = glrm_h2o.predict(irisH2O)
        pred_h2o.describe()
def execute(h2o, params, config):
    """Train a GLRM model on a stored frame from raw parameter values.

    Args:
        h2o: object providing ``get_frame``; used to fetch the training frame.
        params: mapping of parameter names to raw values.  Each value is
            passed through the converter listed below before being handed to
            the estimator.  ``column_header``, when non-empty, is converted
            to an int and used as the start of a slice on the frame.
        config: mapping that supplies ``frame_id``.

    Returns:
        A dict with the input ``frame_id`` and the trained model's
        ``model_id``.
    """
    frame_id = config.get('frame_id')
    frame = h2o.get_frame(frame_id)

    header_spec = params.get('column_header')
    if len(header_spec):
        frame = frame[int(header_spec):]

    from h2o.estimators.glrm import H2OGeneralizedLowRankEstimator

    # (parameter name, converter) pairs, converted in this exact order.
    conversions = (
        ('expand_user_y', to_bool),
        ('gamma_x', float),
        ('gamma_y', float),
        ('ignore_const_cols', to_bool),
        ('impute_original', to_bool),
        ('init', str),
        ('init_step_size', float),
        ('k', int),
        ('loss', str),
        ('max_iterations', int),
        ('max_runtime_secs', float),
        ('max_updates', int),
        ('min_step_size', float),
        ('multi_loss', str),
        ('period', int),
        ('recover_svd', to_bool),
        ('regularization_x', str),
        ('regularization_y', str),
        ('score_each_iteration', to_bool),
        ('seed', int),
        ('svd_method', str),
    )
    estimator_args = {}
    for key, convert in conversions:
        estimator_args[key] = convert(params.get(key))

    glrm_model = H2OGeneralizedLowRankEstimator(**estimator_args)
    glrm_model.train(training_frame=frame)
    glrm_model.show()

    trained_id = glrm_model.model_id
    save_model(params, trained_id)

    return {'frame_id': frame_id, 'model_id': glrm_model.model_id}
# In[ ]:

# Import and parse WHD 2014-2015 labor violations data.
# The first 7 columns are read as enums and the remaining 97 as numerics.
whd_zcta = h2o.import_file(
    path=os.path.realpath("../data/whd_zcta_cleaned.zip"),
    col_types=(["enum"] * 7 + ["numeric"] * 97))
whd_zcta["zcta5_cd"] = whd_zcta["zcta5_cd"].asfactor()
whd_zcta.describe()

# In[ ]:

# Run GLRM to reduce ZCTA demographics to 10 archetypes
# NOTE(review): acs_full is not defined in the cells visible here --
# presumably it is loaded in an earlier cell; confirm.
acs_model = H2OGeneralizedLowRankEstimator(k=10, transform="STANDARDIZE",
                                           loss="Quadratic",
                                           regularization_x="Quadratic",
                                           regularization_y="L1",
                                           gamma_x=0.25, gamma_y=0.5,
                                           max_iterations=100)
acs_model.train(x=acs_full.names, training_frame=acs_full)
# print(...) with a single argument behaves the same on Python 2 and 3.
print(acs_model)

# In[ ]:

# Plot objective function value each iteration
acs_model_score = acs_model.score_history()
plt.xlabel("Iteration")
plt.ylabel("Objective")
plt.title("Objective Function Value per Iteration")
plt.plot(acs_model_score["iteration"], acs_model_score["objective"])
plt.show()
def algo_max_runtime_secs():
    """Exercise several estimators through ``grabRuntimeInfo`` in sequence.

    This pyunit test exists to make sure the various models do not crash
    when max_runtime_secs is set too short (see PUBDEV-4802).  Each
    estimator is built, handed to ``grabRuntimeInfo`` together with its
    training frame and column indices, and then passed to ``cleanUp``.
    At the end the process exits with status 1 if the sum of the
    module-level ``model_within_max_runtime`` is positive.
    """
    global model_within_max_runtime
    rng_seed = 12345

    # --- word2vec ---
    text_path = pyunit_utils.locate("bigdata/laptop/text8.gz")
    train = h2o.import_file(text_path, header=1, col_types=["string"])
    used = train[0:170000, 0]
    w2v_model = H2OWord2vecEstimator()
    grabRuntimeInfo(w2v_model, used, [], 0)
    cleanUp([train, used, w2v_model])

    # --- k-means ---
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/kmeans_8_centers_3_coords.csv"))
    predictors = list(range(training1_data.ncol))
    model = H2OKMeansEstimator(k=10)
    grabRuntimeInfo(model, training1_data, predictors)
    cleanUp([training1_data, model])

    # --- PCA with pca_method = Power, Randomized, GLRM (same frame) ---
    training1_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/pca1000by25.csv"))
    predictors = list(range(training1_data.ncol))
    pca_variants = (
        {"pca_method": "Power"},
        {"pca_method": "Randomized"},
        {"pca_method": "GLRM", "use_all_factor_levels": True},
    )
    for variant in pca_variants:
        model = H2OPCA(k=10, transform="STANDARDIZE", compute_metrics=True,
                       **variant)
        grabRuntimeInfo(model, training1_data, predictors)
        cleanUp([model])

    # --- deep learning ---
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/gaussian_training1_set.csv"))
    response = training1_data.ncol - 1  # last column is used as the response
    predictors = list(range(response))
    model = H2ODeepLearningEstimator(distribution='gaussian', seed=rng_seed,
                                     hidden=[10, 10, 10])
    grabRuntimeInfo(model, training1_data, predictors, response)
    cleanUp([training1_data, model])

    # stacked ensemble: the stacking part is not iterative, so it is skipped
    print(
        "******************** Skip testing stack ensemble. Not an iterative algo."
    )

    # --- GBM ---
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/multinomial_training1_set.csv"))
    response = training1_data.ncol - 1
    predictors = list(range(response))
    training1_data[response] = training1_data[response].round().asfactor()
    model = H2OGradientBoostingEstimator(distribution="multinomial",
                                         seed=rng_seed)
    grabRuntimeInfo(model, training1_data, predictors, response)
    cleanUp([model])

    # --- GLM (reuses the multinomial frame) ---
    model = H2OGeneralizedLinearEstimator(family='multinomial', seed=rng_seed)
    grabRuntimeInfo(model, training1_data, predictors, response)
    cleanUp([model])

    # naive Bayes is not iterative, so it is skipped
    print(
        "******************** Skip testing Naives Bayes. Not an iterative algo."
    )

    # --- random forest (reuses the multinomial frame) ---
    model = H2ORandomForestEstimator(ntrees=100, score_tree_interval=0)
    grabRuntimeInfo(model, training1_data, predictors)
    cleanUp([model, training1_data])

    # --- GLRM: it does not make sense to stop in the middle of an iteration ---
    training1_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/glrmdata1000x25.csv"))
    predictors = list(range(training1_data.ncol))
    model = H2OGeneralizedLowRankEstimator(k=10, loss="Quadratic",
                                           gamma_x=0.3, gamma_y=0.3,
                                           transform="STANDARDIZE",
                                           recover_svd=True)
    grabRuntimeInfo(model, training1_data, predictors)
    cleanUp([training1_data, model])

    flagged = sum(model_within_max_runtime)
    if flagged > 0:
        sys.exit(1)