def test_glrm_transform():
    # generate training and test frames
    m = 1000
    n = 100
    k = 8
    np.random.seed(12345)

    print("Uploading random uniform matrix with rows = " + str(m) +
          " and cols = " + str(n))
    Y = np.random.rand(k, n)
    X = np.random.rand(m, k)
    train = np.dot(X, Y)
    train_h2o = h2o.H2OFrame(train.tolist())
    frames = train_h2o.split_frame(ratios=[0.9])
    train = frames[0]
    test = frames[1]

    glrm_h2o = H2OGeneralizedLowRankEstimator(k=k,
                                              loss="Quadratic",
                                              seed=12345)
    glrm_h2o.train(x=train_h2o.names, training_frame=train)
    predFrame = glrm_h2o.predict(test)
    xFrame = glrm_h2o.transform_frame(test)

    glrm_h2o2 = H2OGeneralizedLowRankEstimator(k=k,
                                               loss="Quadratic",
                                               seed=12345)
    glrm_h2o2.train(x=train_h2o.names, training_frame=train)
    xFrame2 = glrm_h2o2.transform_frame(test)

    assert predFrame.nrows == xFrame.nrows, "predict frame rows: {0}, transform frame rows: " \
                                            "{1}".format(predFrame.nrows, xFrame.nrows)
    pyunit_utils.compare_frames_local(xFrame, xFrame2, prob=1.0, tol=1e-6)
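The examples on this page are excerpted from H2O's pyunit test suite and omit their shared imports. A minimal preamble they assume might look like the sketch below; pyunit_utils is H2O's internal test helper (its import path depends on the test checkout), and the exact import set varies per example.

# Hypothetical shared preamble for the excerpts on this page.
import sys
import os
import time
import random
import itertools
from random import randint
from collections import OrderedDict
from io import StringIO

import numpy as np
import h2o
from h2o.estimators.glrm import H2OGeneralizedLowRankEstimator
from h2o.grid.grid_search import H2OGridSearch
from h2o.transforms.decomposition import H2OPCA
from h2o.two_dim_table import H2OTwoDimTable
from tests import pyunit_utils  # H2O test helper; path is test-suite specific

h2o.init()  # the tests assume a running H2O cluster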
Example #2
def glrm_cancar():
    print("Importing cancar.csv data...")
    cancarH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/glrm_test/cancar.csv"))
    cancarH2O.describe()

    print("Building GLRM model with init = PlusPlus:\n")
    glrm_pp = H2OGeneralizedLowRankEstimator(k=4,
                                             transform="NONE",
                                             init="PlusPlus",
                                             loss="Quadratic",
                                             regularization_x="None",
                                             regularization_y="None",
                                             max_iterations=1000)
    glrm_pp.train(x=cancarH2O.names, training_frame=cancarH2O)
    glrm_pp.show()

    print("Building GLRM model with init = SVD:\n")
    glrm_svd = H2OGeneralizedLowRankEstimator(k=4,
                                              transform="NONE",
                                              init="SVD",
                                              loss="Quadratic",
                                              regularization_x="None",
                                              regularization_y="None",
                                              max_iterations=1000)
    glrm_svd.train(x=cancarH2O.names, training_frame=cancarH2O)
    glrm_svd.show()
Example #3
def glrm_long_run():
    run_time_ms = []
    iterations = []

    acs_orig = h2o.upload_file(path=pyunit_utils.locate(
        "bigdata/laptop/milsongs/milsongs-cls-train.csv.gz"))

    # run GLRM with max_runtime_ms restriction.
    acs_model = H2OGeneralizedLowRankEstimator(k=10,
                                               transform='STANDARDIZE',
                                               loss='Quadratic',
                                               multi_loss="Categorical",
                                               model_id="clients_core_glrm",
                                               regularization_x="L2",
                                               regularization_y="L1",
                                               gamma_x=0.2,
                                               gamma_y=0.5,
                                               init="SVD",
                                               seed=1234)
    acs_model.train(x=acs_orig.names,
                    training_frame=acs_orig,
                    max_runtime_secs=60)

    print("Run time in s with max_runtime_secs of 60 second: "
          "{0}".format(
              (acs_model._model_json['output']['end_time'] -
               acs_model._model_json['output']['start_time']) / 1000.0))
    print("number of iterations: {0}".format(
        acs_model._model_json['output']['iterations']))

    # let glrm run with restriction on iteration number.
    acs_model = H2OGeneralizedLowRankEstimator(k=10,
                                               transform='STANDARDIZE',
                                               loss='Quadratic',
                                               multi_loss="Categorical",
                                               model_id="clients_core_glrm",
                                               regularization_x="L2",
                                               regularization_y="L1",
                                               gamma_x=0.2,
                                               gamma_y=0.5,
                                               init="SVD",
                                               seed=1234)
    acs_model.train(x=acs_orig.names, training_frame=acs_orig)
    run_time_ms.append(acs_model._model_json['output']['end_time'] -
                       acs_model._model_json['output']['start_time'])
    iterations.append(acs_model._model_json['output']['iterations'])

    print("Run time in s with no max time restrication: "
          "{0}".format(
              (acs_model._model_json['output']['end_time'] -
               acs_model._model_json['output']['start_time']) / 1000.0))
    print("number of iterations: {0}".format(
        acs_model._model_json['output']['iterations']))

    sys.stdout.flush()
def glrm_export():
    print("###### GLRM ######")
    frame = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
    model = H2OGeneralizedLowRankEstimator(k=8, init="svd", recover_svd=True)
    model.train(x=frame.names, training_frame=frame)
    expect_error(model.download_pojo, model="GLRM", format='POJO')
    model.download_mojo(path=RESULT_DIR)
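expect_error and RESULT_DIR come from the surrounding test module and are not shown here. A minimal sketch under that assumption (GLRM can be exported as a MOJO but not as a POJO, so the POJO download is expected to raise):

import tempfile

RESULT_DIR = tempfile.mkdtemp()  # assumption: any writable output directory

def expect_error(action, model, format):
    # Hypothetical helper: the call must raise for this model/format combination.
    try:
        action()
    except Exception as e:
        print("Got expected error for %s %s export: %s" % (model, format, e))
        return
    assert False, "%s %s export should have raised an error" % (model, format)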
Example #5
def glrm_iris_error_message():
    print("Importing iris_wheader.csv data...")
    irisH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    rank = 3
    gx = 0.5
    gy = 0.5
    trans = "STANDARDIZE"
    print("H2O GLRM with rank k = " + str(rank) + ", gamma_x = " + str(gx) +
          ", gamma_y = " + str(gy) + ", transform = " + trans)
    try:
        glrm_h2o = H2OGeneralizedLowRankEstimator(k=rank,
                                                  loss="Quadratic",
                                                  gamma_x=gx,
                                                  gamma_y=gy,
                                                  transform=trans,
                                                  model_id="one",
                                                  representation_name="one")
        glrm_h2o.train(x=irisH2O.names, training_frame=irisH2O)
        assert False, "Should have thrown an exception!"
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("representation_name and model_id cannot use the same string"
                in temp), "Wrong exception was received."
Example #6
def grid_glrm_iris():
    print("Importing iris_wheader.csv data...")
    irisH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    irisH2O.describe()
    transform_opts = ["NONE", "DEMEAN", "DESCALE", "STANDARDIZE"]
    k_opts = random.sample(list(range(1, 8)), 3)
    size_of_hyper_space = len(transform_opts) * len(k_opts)
    hyper_parameters = OrderedDict()
    hyper_parameters["k"] = k_opts
    hyper_parameters["transform"] = transform_opts
    gx = random.uniform(0, 1)
    gy = random.uniform(0, 1)
    print("H2O GLRM with , gamma_x = " + str(gx) + ", gamma_y = " + str(gy) +\
          ", hyperparameters = " + str(hyper_parameters))

    gs = H2OGridSearch(H2OGeneralizedLowRankEstimator(loss="Quadratic",
                                                      gamma_x=gx,
                                                      gamma_y=gy),
                       hyper_params=hyper_parameters)
    gs.train(x=list(range(4)), y=4, training_frame=irisH2O)
    for model in gs:
        assert isinstance(model, H2OGeneralizedLowRankEstimator)
    print(gs.get_grid(sort_by="mse"))
    #print gs.hit_ratio_table()

    assert len(gs) == size_of_hyper_space
    total_grid_space = list(
        map(list, itertools.product(*list(hyper_parameters.values()))))
    for model in gs.models:
        combo = [model.parms['k']['actual_value']
                 ] + [model.parms['transform']['actual_value']]
        assert combo in total_grid_space
        total_grid_space.remove(combo)
Example #7
def glrm_mojo():
    h2o.remove_all()
    train = h2o.import_file(pyunit_utils.locate("smalldata/glrm_test/pubdev_5858_glrm_mojo_train.csv"))
    test = h2o.import_file(pyunit_utils.locate("smalldata/glrm_test/pubdev_5858_glrm_mojo_test.csv"))
    predict_10iter = h2o.import_file(pyunit_utils.locate("smalldata/glrm_test/pubdev_5858_glrm_predict_10iter.csv"))
    predict_1iter = h2o.import_file(pyunit_utils.locate("smalldata/glrm_test/pubdev_5858_glrm_predict_1iter.csv"))

    x = train.names
    transformN = "STANDARDIZE"

    # build a GLRM model on the imported training dataset
    glrmModel = H2OGeneralizedLowRankEstimator(k=3, transform=transformN, max_iterations=10, seed=1234, init="random")
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(glrmModel._model_json['output']['representation_name'])

    assert glrmTrainFactor.nrows==train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)
    save_GLRM_mojo(glrmModel) # save mojo model
    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    # test and make sure setting the iteration number did not screw up the prediction
    predID, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmIterNumber=100) # save mojo predict
    pred_h2o = h2o.get_frame("GLRMLoading_"+predID)
    print("Comparing mojo x Factor and model x Factor for 100 iterations")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)
    predID, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmIterNumber=1) # save mojo predict
    print("Comparing mojo x Factor and model x Factor for 1 iterations")
    pyunit_utils.compare_frames_local(predict_1iter, pred_mojo, 1, tol=1e-10)
    predID, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmIterNumber=10) # save mojo predict
    print("Comparing mojo x Factor and model x Factor for 10 iterations")
    pyunit_utils.compare_frames_local(predict_10iter, pred_mojo, 1, tol=1e-10)
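save_GLRM_mojo is defined elsewhere in the test module. A plausible sketch, assuming the standard download_mojo API and the same model-id sanitization convention that pyunit_utils.getMojoName relies on:

import os
import re

def save_GLRM_mojo(model):
    # Hypothetical helper: download the MOJO zip into the shared results
    # directory (named after the sanitized model id) and return that path.
    mojo_name = re.sub(r"[+\-* !@#$%^&()={}\[\]|;:'\"<>,.?/]", "_", model._id)
    tmpdir = os.path.normpath(os.path.join(
        os.path.dirname(os.path.realpath('__file__')), "..", "results", mojo_name))
    os.makedirs(tmpdir, exist_ok=True)
    model.download_mojo(path=tmpdir)  # writes the <model id>.zip into tmpdir
    return tmpdir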
Example #8
def glrm_iris():
  print("Importing iris.csv data...")
  irisH2O = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris.csv"))
  irisH2O.describe()

  print("@@@@@@  Building PCA with GramSVD...\n")
  glrmPCA = H2OPCA(k=5, transform="STANDARDIZE", pca_method="GLRM", use_all_factor_levels=True, seed=21)
  glrmPCA.train(x=irisH2O.names, training_frame=irisH2O)

  glrm_h2o = H2OGeneralizedLowRankEstimator(k=5, loss="Quadratic",transform="STANDARDIZE", recover_svd=True,  seed=21)
  glrm_h2o.train(x=irisH2O.names, training_frame=irisH2O)

  # compare eigenvalues and variance metrics between the two models
  print("@@@@@@  Comparing eigenvalues between GLRM-based PCA and GLRM...\n")
  pyunit_utils.assert_H2OTwoDimTable_equal(glrmPCA._model_json["output"]["importance"],
                                           glrm_h2o._model_json["output"]["importance"],
                                           ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"],
                                           tolerance=1e-6)
  print("@@@@@@  Comparing eigenvectors between GramSVD and GLRM...\n")

  # compare singular vectors
  pyunit_utils.assert_H2OTwoDimTable_equal(glrmPCA._model_json["output"]["eigenvectors"],
                                           glrm_h2o._model_json["output"]["eigenvectors"],
                                           glrm_h2o._model_json["output"]["names"], tolerance=1e-6,check_sign=True)

  # check to make sure maximum proportional variance <= 1
  assert glrmPCA._model_json["output"]["importance"].cell_values[1][1] <= 1, \
    "Expected value <= 1.0 but received {0}".format(glrmPCA._model_json["output"]["importance"].cell_values[1][1])
def glrm_mojo():
    h2o.remove_all()
    NTESTROWS = 200    # number of test dataset rows
    df = pyunit_utils.random_dataset("regression")       # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = df.names

    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types)-1)]
    # build a GLRM model with random dataset generated earlier
    glrmModel = H2OGeneralizedLowRankEstimator(k=3, transform=transformN, max_iterations=10)
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(glrmModel._model_json['output']['representation_name'])

    assert glrmTrainFactor.nrows==train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)
    save_GLRM_mojo(glrmModel) # save mojo model

    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))

    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=True) # save mojo predict
    for col in range(pred_h2o.ncols):
        if pred_h2o[col].isfactor():
            pred_h2o[col] = pred_h2o[col].asnumeric()
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)

    frameID, mojoXFactor = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=False) # save mojo XFactor
    glrmTestFactor = h2o.get_frame("GLRMLoading_"+frameID)   # store the x Factor for new test dataset
    print("Comparing mojo x Factor and model x Factor ...")
    pyunit_utils.compare_frames_local(glrmTestFactor, mojoXFactor, 1, tol=1e-10)
Example #10
def glrm_subset():
  acs_orig = h2o.upload_file(path=pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-cls-train.csv.gz"))

  seeds = [2297378124, 3849570216, 6733652048, 8915337442, 8344418400, 9416580152, 2598632624, 4977008454, 8273228579,
           8185554539, 3219125000, 2998879373, 7707012513, 5786923379, 5029788935, 935945790, 7092607078, 9305834745,
           6173975590, 5397294255]
  run_time_ms = []
  iterations = []
  objective = []
  num_runs = 10         # number of times to repeat experiments

  for ind in range(num_runs):
    acs_model = H2OGeneralizedLowRankEstimator(k=10,
                                               transform='STANDARDIZE',
                                               loss='Quadratic',
                                               multi_loss="Categorical",
                                               model_id="clients_core_glrm",
                                               regularization_x="L2",
                                               regularization_y="L1",
                                               gamma_x=0.2,
                                               gamma_y=0.5,
                                               init="SVD",
                                               max_iterations=1000,
                                               seed=seeds[ind % len(seeds)])
    # seed is fixed on the estimator; train() does not accept a seed argument
    acs_model.train(x=acs_orig.names, training_frame=acs_orig)
    run_time_ms.append(acs_model._model_json['output']['end_time'] - acs_model._model_json['output']['start_time'])
    iterations.append(acs_model._model_json['output']['iterations'])
    objective.append(acs_model._model_json['output']['objective'])
  
  print("Run time in ms: {0}".format(run_time_ms))
  print("number of iterations: {0}".format(iterations))
  print("objective function value: {0}".format(objective))
  sys.stdout.flush()
def glrm_arrests():
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))

    pca_h2o = H2OPCA(k=4, transform="STANDARDIZE")
    pca_h2o.train(x=list(range(4)), training_frame=arrestsH2O)
    pca_h2o.summary()
    pca_h2o.show()

    print("H2O GLRM on standardized data with quadratic loss:\n")
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=4,
                                              transform="STANDARDIZE",
                                              loss="Quadratic",
                                              gamma_x=0,
                                              gamma_y=0,
                                              init="SVD",
                                              recover_svd=True)
    glrm_h2o.train(x=arrestsH2O.names, training_frame=arrestsH2O)
    glrm_h2o.show()

    # compare table values and make sure they are the same between PCA and GLRM
    assert pyunit_utils.equal_2d_tables(pca_h2o._model_json["output"]["importance"]._cell_values,
                                        glrm_h2o._model_json["output"]["importance"]._cell_values, tolerance=1e-4), \
      "PCA and GLRM variance metrics do not agree.  Fix it please."

    sys.stdout.flush()
Example #12
def glrm_subset():
    acs_orig = h2o.upload_file(path=pyunit_utils.locate(
        "bigdata/laptop/census/ACS_13_5YR_DP02_cleaned.zip"),
                               col_types=(['enum'] + ['numeric'] * 149))

    acs_full = acs_orig.drop("ZCTA5")
    acs_model = H2OGeneralizedLowRankEstimator(k=10,
                                               transform='STANDARDIZE',
                                               loss='Quadratic',
                                               regularization_x='Quadratic',
                                               regularization_y='L1',
                                               gamma_x=0.25,
                                               gamma_y=0.5,
                                               max_iterations=1)

    acs_model.train(x=acs_full.names, training_frame=acs_full)
    zcta_arch_x = h2o.get_frame(
        acs_model._model_json['output']['representation_name'])
    print(zcta_arch_x)

    acs_zcta_col = acs_orig["ZCTA5"].asfactor()

    idx = ((acs_zcta_col == '10065') |  # Manhattan, NY (Upper East Side)
           (acs_zcta_col == '11219') |  # Manhattan, NY (East Harlem)
           (acs_zcta_col == '66753') |  # McCune, KS
           (acs_zcta_col == '84104') |  # Salt Lake City, UT
           (acs_zcta_col == '94086') |  # Sunnyvale, CA
           (acs_zcta_col == '95014'))   # Cupertino, CA

    print(zcta_arch_x[idx, [0, 1]])
def glrm_start(grid_id, export_dir, train, params, hyper_parameters):
    grid = H2OGridSearch(H2OGeneralizedLowRankEstimator(seed=42),
                         grid_id=grid_id,
                         hyper_params=hyper_parameters,
                         recovery_dir=export_dir,
                         parallelism=2)
    grid.start(x=train.names, training_frame=train, **params)
    return grid
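For context, a usage sketch for glrm_start; the grid id, hyper-parameter values, and export directory below are illustrative assumptions, not taken from the original test:

# Hypothetical driver for glrm_start; assumes `train` is an H2OFrame and
# `export_dir` is a writable directory used for grid checkpoint/recovery.
hyper_parameters = {"k": [2, 4], "transform": ["NONE", "STANDARDIZE"]}
params = {"max_runtime_secs": 60}
grid = glrm_start("glrm_recovery_grid", export_dir, train, params, hyper_parameters)
grid.join()  # grid.start() is asynchronous; wait for completion
print(grid.get_grid(sort_by="mse"))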
Example #14
def glrm_nnmf():
    m = 1000
    n = 100
    k = 10

    print("Uploading random uniform matrix with rows = " + str(m) +
          " and cols = " + str(n))
    Y = np.random.rand(k, n)
    X = np.random.rand(m, k)
    train = np.dot(X, Y)
    train_h2o = h2o.H2OFrame(train.tolist())

    print("Run GLRM with non-negative regularization")
    initial_y = np.random.rand(k, n)
    initial_y_h2o = h2o.H2OFrame(initial_y.tolist())

    glrm_h2o = H2OGeneralizedLowRankEstimator(k=k,
                                              init="User",
                                              user_y=initial_y_h2o,
                                              loss="Quadratic",
                                              regularization_x="NonNegative",
                                              regularization_y="NonNegative",
                                              gamma_x=1,
                                              gamma_y=1)
    glrm_h2o.train(x=train_h2o.names, training_frame=train_h2o)
    glrm_h2o.show()

    print("Check that X and Y matrices are non-negative")
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_x = h2o.get_frame(
        glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))
    assert np.all(fit_y_np >= 0), "Y must contain only non-negative elements"
    assert np.all(fit_x_np >= 0), "X must contain only non-negative elements"

    print("Check final objective function value")
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train - fit_xy))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(
        glrm_obj) + " but should equal " + str(sse)

    print("Impute XY and check error metrics")
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(
        pred_np, fit_xy
    ), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output'][
        'training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output'][
        'training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(
        glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(
        glrm_caterr) + " but should be zero"
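Because the non-negativity regularizers are indicator functions (zero at any feasible X, Y and infinite otherwise), the objective this test checks reduces, at a feasible solution, to the pure squared reconstruction error:

    obj = \sum_{i,j} \bigl(A_{ij} - (XY)_{ij}\bigr)^2
    \quad \text{subject to } X \ge 0,\; Y \ge 0,

which is why the sse computed above is compared directly against the model's reported objective.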
Example #15
def glrm_catagorical_bug_fix():
    trainData = h2o.import_file(
        pyunit_utils.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    testData = h2o.import_file(
        pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    glrmModel = H2OGeneralizedLowRankEstimator(k=4)
    glrmModel.train(x=trainData.names, training_frame=trainData)
    predV = glrmModel.predict(testData)
    print(predV)
Example #16
def glrm_benign():
  print "Importing benign.csv data..."
  benignH2O = h2o.upload_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))
  benignH2O.describe()

  for i in range(8,16,2):
    print "H2O GLRM with rank " + str(i) + " decomposition:\n"
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=i, init="SVD", recover_svd=True)
    glrm_h2o.train(x=benignH2O.names, training_frame=benignH2O)
    glrm_h2o.show()
def glrm_catagorical_bug_fix():
    print("Importing prostate.csv data...")

    tbl2 = H2OTwoDimTable(cell_values=[[1, 2, 4]] * 10,
                          col_header=["q1", "q2", "q3"],
                          row_header=range(10),
                          table_header="Table 2")

    # H2OTwoDimTable holding the correct archetype values, captured before Wendy's GLRM memory optimization
    cell_values = [[
        'Arch1', 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0,
        58.295918367346935, 8.810102040816325, 11.344897959183678,
        6.285714285714286
    ],
                   [
                       'Arch2', 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0,
                       1.0, 0.0, 69.35514018691589, 7.538224299065424,
                       10.087757009345797, 5.6168224299065415
                   ],
                   [
                       'Arch3', 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0,
                       1.0, 0.0, 64.68, 75.892, 10.812000000000001, 7.44
                   ],
                   [
                       'Arch4', 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0,
                       1.0, 0.0, 68.77083333333333, 13.368750000000002,
                       49.44583333333334, 5.9375
                   ],
                   [
                       'Arch5', 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0,
                       1.0, 0.0, 69.04901960784314, 16.140196078431373,
                       11.510000000000005, 7.235294117647059
                   ]]
    col_header = [
        'dprosboth', 'dprosleft', 'dprosnone', 'dprosright', 'raceblack',
        'racena', 'racewhite', 'capsuleno', 'capsuleyes', 'dcapsno',
        'dcapsyes', 'age', 'psa', 'vol', 'gleason'
    ]
    row_header = ['Arch1', 'Arch2', 'Arch3', 'Arch4', 'Arch5']
    table_header = "archetypes"
    correct_archetype = H2OTwoDimTable(cell_values=cell_values,
                                       col_header=col_header,
                                       row_header=row_header,
                                       table_header=table_header)

    prostateF = h2o.upload_file(
        pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))

    glrm_h2o = H2OGeneralizedLowRankEstimator(k=5, recover_svd=True, seed=1234)
    glrm_h2o.train(x=prostateF.names, training_frame=prostateF)
    glrm_h2o.show()

    assert pyunit_utils.equal_2d_tables(glrm_h2o._model_json["output"]["archetypes"]._cell_values,
                                        correct_archetype._cell_values, tolerance=1e-4), \
        "GLRM model archetypes generated from current model are not correct."
Example #18
def setupTrainModel(initM, seed):
    rank = 3
    gx = 0.25
    gy = 0.25
    trans = "STANDARDIZE"

    return H2OGeneralizedLowRankEstimator(k=rank,
                                          loss="Quadratic",
                                          gamma_x=gx,
                                          gamma_y=gy,
                                          transform=trans,
                                          init=initM,
                                          seed=seed)
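A usage sketch for setupTrainModel (the dataset and seed are illustrative assumptions): two models built with the same init and seed should reproduce the same X factor.

# Hypothetical usage: identical init and seed should be reproducible.
irisH2O = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
model_a = setupTrainModel("Random", seed=12345)
model_a.train(x=irisH2O.names, training_frame=irisH2O)
model_b = setupTrainModel("Random", seed=12345)
model_b.train(x=irisH2O.names, training_frame=irisH2O)
x_a = h2o.get_frame(model_a._model_json["output"]["representation_name"])
x_b = h2o.get_frame(model_b._model_json["output"]["representation_name"])
pyunit_utils.compare_frames_local(x_a, x_b, prob=1.0, tol=1e-6)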
Example #19
def hdfs_glrm():
    missing_ratios = np.arange(0.1, 1, 0.1).tolist()

    print("Importing USArrests.csv data and saving for validation...")
    arrests_full = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrests_full.describe()
    totobs = arrests_full.nrow * arrests_full.ncol
    train_err = [0]*len(missing_ratios)
    valid_err = [0]*len(missing_ratios)

    for i in range(len(missing_ratios)):
        ratio = missing_ratios[i]
        print("Importing USArrests.csv and inserting {0}% missing entries".format(100*ratio))
        arrests_miss = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
        arrests_miss = arrests_miss.insert_missing_values(fraction=ratio)
        arrests_miss.describe()

        print("H2O GLRM with {0}% missing entries".format(100*ratio))
        arrests_glrm = H2OGeneralizedLowRankEstimator(k=4,
                                                      ignore_const_cols=False,
                                                      loss="Quadratic",
                                                      regularization_x="None",
                                                      regularization_y="None",
                                                      init="PlusPlus",
                                                      max_iterations=10,
                                                      min_step_size=1e-6)
        arrests_glrm.train(x=arrests_miss.names,
                           training_frame=arrests_miss,
                           validation_frame=arrests_full)
        arrests_glrm.show()

        # Check imputed data and error metrics
        glrm_obj = arrests_glrm._model_json['output']['objective']
        train_numerr = arrests_glrm._model_json['output']['training_metrics']._metric_json['numerr']
        train_caterr = arrests_glrm._model_json['output']['training_metrics']._metric_json['caterr']
        valid_numerr = arrests_glrm._model_json['output']['validation_metrics']._metric_json['numerr']
        valid_caterr = arrests_glrm._model_json['output']['validation_metrics']._metric_json['caterr']
        assert abs(train_numerr - glrm_obj) < 1e-3, "Numeric error on training data was " + str(train_numerr) + " but should equal final objective " + str(glrm_obj)
        assert train_caterr == 0, "Categorical error on training data was " + str(train_caterr) + " but should be zero"
        assert valid_caterr == 0, "Categorical error on validation data was " + str(valid_caterr) + " but should be zero"

        train_numcnt = arrests_glrm._model_json['output']['training_metrics']._metric_json['numcnt']
        valid_numcnt = arrests_glrm._model_json['output']['validation_metrics']._metric_json['numcnt']
        assert valid_numcnt > train_numcnt, "Number of non-missing numerical entries in training data should be less than validation data"
        assert valid_numcnt == totobs, "Number of non-missing numerical entries in validation data was " + str(valid_numcnt) + " but should be " + str(totobs)

        train_err[i] = train_numerr
        valid_err[i] = valid_numerr
        
    for i in range(len(missing_ratios)):
        print("Missing ratio: {0}% --> Training error: {1}\tValidation error: {2}".format(missing_ratios[i]*100, train_err[i], valid_err[i]))
def get_glrm_xmatrix(train, test, K=3, compare_predict=True, tol=1e-1):
    x = train.names
    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types) - 1)]
    print("dataset transform is {0}.".format(transformN))
    # build a GLRM model with random dataset generated earlier
    glrmModel = H2OGeneralizedLowRankEstimator(k=K,
                                               transform=transformN,
                                               max_iterations=1000,
                                               seed=12345)
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(
        glrmModel._model_json['output']['representation_name'])

    # assert glrmTrainFactor.nrows==train.nrows, \
    #     "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)
    mojoDir = save_GLRM_mojo(glrmModel)  # save mojo model

    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    h2o.download_csv(test[x], os.path.join(
        mojoDir, 'in.csv'))  # save test file, h2o predict/mojo use same file

    frameID, mojoXFactor = pyunit_utils.mojo_predict(
        glrmModel, mojoDir, MOJONAME,
        glrmReconstruct=False)  # save mojo XFactor
    print("Comparing mojo x Factor and model x Factor ...")

    # bad performance with no transformation on dataset
    if transformN == "NONE" or not compare_predict:
        pyunit_utils.check_data_rows(mojoXFactor,
                                     glrmTrainFactor,
                                     num_rows=mojoXFactor.nrow)
    else:
        pyunit_utils.compare_data_rows(mojoXFactor,
                                       glrmTrainFactor,
                                       index_list=range(
                                           2, mojoXFactor.nrows - 1),
                                       tol=tol)

    if compare_predict:  # only compare reconstructed data frames with numerical data
        pred2 = glrmModel.predict(test)   # reconstruct the test rows
        pred1 = glrmModel.predict(train)  # reconstruct the training rows from A = X*Y

        predictDiff = pyunit_utils.compute_frame_diff(train, pred1)
        mojoDiff = pyunit_utils.compute_frame_diff(train, pred2)
        print("absolute difference of mojo predict and original frame is {0} and model "
              "predict and original frame is {1}".format(mojoDiff, predictDiff))
Example #21
def pca_wideDataset_rotterdam_glrm():
    tol = 2e-5
    h2o.remove_all()
    print("Importing Rotterdam.csv data...")
    rotterdamH2O = h2o.upload_file(
        pyunit_utils.locate("bigdata/laptop/jira/rotterdam.csv.zip"))
    y = set(["relapse"])
    x = list(set(rotterdamH2O.names) - y)

    # special test with GLRM.  Need use_all_factor_levels to be true
    print("------  Testing GLRM PCA --------")
    gramSVD = H2OPCA(k=8,
                     impute_missing=True,
                     transform="DEMEAN",
                     seed=12345,
                     use_all_factor_levels=True)
    gramSVD.train(x=x, training_frame=rotterdamH2O)

    glrmPCA = H2OGeneralizedLowRankEstimator(k=8,
                                             transform="DEMEAN",
                                             seed=12345,
                                             init="Random",
                                             recover_svd=True,
                                             regularization_x="None",
                                             regularization_y="None",
                                             max_iterations=11)
    glrmPCA.train(x=x, training_frame=rotterdamH2O)

    # compare singular values and stuff with GramSVD
    print(
        "@@@@@@  Comparing eigenvectors and eigenvalues between GramSVD and GLRM...\n"
    )
    pyunit_utils.assert_H2OTwoDimTable_equal(
        gramSVD._model_json["output"]["importance"],
        glrmPCA._model_json["output"]["importance"], [
            "Standard deviation", "Cumulative Proportion",
            "Cumulative Proportion"
        ],
        tolerance=1,
        check_all=False)

    # compare singular vectors
    pyunit_utils.assert_H2OTwoDimTable_equal(
        gramSVD._model_json["output"]["eigenvectors"],
        glrmPCA._model_json["output"]["eigenvectors"],
        glrmPCA._model_json["output"]["names"],
        tolerance=tol,
        check_sign=True,
        check_all=False)
def glrm_mojo():
    h2o.remove_all()
    NTESTROWS = 200  # number of test dataset rows
    df = pyunit_utils.random_dataset("regression",
                                     seed=1234)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = df.names

    transformN = "STANDARDIZE"

    # build a GLRM model with random dataset generated earlier
    glrmModel = H2OGeneralizedLowRankEstimator(k=3,
                                               transform=transformN,
                                               max_iterations=10,
                                               seed=1234)
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(
        glrmModel._model_json['output']['representation_name'])

    assert glrmTrainFactor.nrows==train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)
    save_GLRM_mojo(glrmModel)  # save mojo model

    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    TMPDIR = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')), "..",
                     "results", MOJONAME))
    h2o.download_csv(test[x], os.path.join(
        TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    # test and make sure setting the iteration number did not screw up the prediction
    predID, pred_mojo = pyunit_utils.mojo_predict(
        glrmModel, TMPDIR, MOJONAME, glrmIterNumber=100)  # save mojo predict
    pred_h2o = h2o.get_frame("GLRMLoading_" + predID)
    print("Comparing mojo x Factor and model x Factor for 100 iterations")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)

    # scoring with 2 iterations should be faster than scoring with 8000 iterations
    starttime = time.time()
    runMojoPredictOnly(TMPDIR, MOJONAME,
                       glrmIterNumber=8000)  # mojo predict, many iterations
    time8000 = time.time() - starttime
    starttime = time.time()
    runMojoPredictOnly(TMPDIR, MOJONAME, glrmIterNumber=2)  # mojo predict, few iterations
    time2 = time.time() - starttime
    print(
        "Time taken for 2 iterations: {0}s.  Time taken for 8000 iterations: {1}s."
        .format(time2, time8000))
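runMojoPredictOnly is defined outside this excerpt. A plausible sketch, assuming the h2o-genmodel PredictCsv tool with its --glrmIterNumber option; the jar location is an assumption about the test checkout layout:

import os
import subprocess

def runMojoPredictOnly(tmpdir, mojoname, glrmIterNumber):
    # Hypothetical helper: score in.csv with the saved MOJO via PredictCsv,
    # overriding the number of ALS iterations used to derive the X factor.
    input_csv = os.path.join(tmpdir, "in.csv")
    output_csv = os.path.join(tmpdir, "out_mojo.csv")
    mojo_zip = os.path.join(tmpdir, mojoname + ".zip")
    genmodel_jar = os.path.join(tmpdir, "..", "..", "h2o-assemblies",
                                "genmodel", "build", "libs", "genmodel.jar")  # assumed layout
    java_cmd = ["java", "-cp", genmodel_jar, "hex.genmodel.tools.PredictCsv",
                "--mojo", mojo_zip, "--input", input_csv, "--output", output_csv,
                "--decimal", "--glrmIterNumber", str(glrmIterNumber)]
    subprocess.check_call(java_cmd)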
def glrm_set_loss_by_col():
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrestsPy = np.array(h2o.as_list(arrestsH2O))
    arrestsH2O.describe()

    print(
        "H2O GLRM with loss by column = Absolute, Quadratic, Quadratic, Huber")
    glrm_h2o = H2OGeneralizedLowRankEstimator(
        k=3,
        loss="Quadratic",
        loss_by_col=["Absolute", "Huber"],
        loss_by_col_idx=[0, 3],
        regularization_x="None",
        regularization_y="None")
    glrm_h2o.train(x=arrestsH2O.names, training_frame=arrestsH2O)
    #   glrm_h2o = h2o.glrm(x=arrestsH2O, k=3, loss="Quadratic", loss_by_col=["Absolute","Huber"], loss_by_col_idx=[0,3], regularization_x="None", regularization_y="None")
    glrm_h2o.show()

    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_x = h2o.get_frame(
        glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))

    print("Check final objective function value")
    fit_xy = np.dot(fit_x_np, fit_y_np)
    fit_diff = arrestsPy - fit_xy
    obj_val = np.absolute(fit_diff[:, 0]) + np.square(
        fit_diff[:, 1]) + np.square(fit_diff[:, 2])

    def huber(a):
        return a * a / 2 if abs(a) <= 1 else abs(a) - 0.5

    huber = np.vectorize(huber)
    obj_val = obj_val + huber(fit_diff[:, 3])
    obj_val = np.sum(obj_val)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    assert abs(glrm_obj - obj_val) < 1e-6, "Final objective was " + str(
        glrm_obj) + " but should equal " + str(obj_val)
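Written out, the quantity the test reconstructs is a per-column mixture of losses over the residual d = A - XY: absolute loss on column 0, quadratic on columns 1 and 2, and Huber on column 3:

    obj = \sum_i \Bigl( |d_{i,0}| + d_{i,1}^2 + d_{i,2}^2 + h(d_{i,3}) \Bigr),
    \qquad h(a) = \begin{cases} a^2/2 & |a| \le 1 \\ |a| - 1/2 & \text{otherwise} \end{cases}

matching the loss_by_col and loss_by_col_idx arguments passed to the estimator.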
Example #24
def glrm_iris():
    print("Importing iris_wheader.csv data...")
    irisH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    irisTest = h2o.upload_file(
        pyunit_utils.locate("smalldata/iris/iris_wheader_bad_cnames.csv"))

    rank = 3
    gx = 0.5
    gy = 0.5
    trans = "STANDARDIZE"
    print("H2O GLRM with rank k = " + str(rank) + ", gamma_x = " + str(gx) +
          ", gamma_y = " + str(gy) + ", transform = " + trans)
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=rank,
                                              loss="Quadratic",
                                              gamma_x=gx,
                                              gamma_y=gy,
                                              transform=trans)
    glrm_h2o.train(x=irisH2O.names, training_frame=irisH2O)

    print("Impute original data from XY decomposition")  # and expect warnings
    buffer = StringIO()  # redirect warning messages to a string buffer for later analysis
    sys.stderr = buffer

    h2o_pred = glrm_h2o.predict(irisTest)

    warn_phrase = "UserWarning"
    warn_string_of_interest = "missing column"
    sys.stderr = sys.__stderr__  # restore the original stderr
    try:  # for python 2.7
        if len(buffer.buflist) > 0:
            for index in range(len(buffer.buflist)):
                print("*** captured warning message: {0}".format(
                    buffer.buflist[index]))
                assert (warn_phrase in buffer.buflist[index]) and (
                    warn_string_of_interest in buffer.buflist[index])
    except:  # for python 3.
        warns = buffer.getvalue()
        print("*** captured warning message: {0}".format(warns))
        assert (warn_phrase in warns) and (warn_string_of_interest in warns)
Example #25
def test_load_glrm():
    print("Importing iris_wheader.csv data...")
    irisH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    irisH2O.describe()

    g_model = H2OGeneralizedLowRankEstimator(k=3)
    g_model.train(x=irisH2O.names, training_frame=irisH2O)
    yarch_old = g_model.archetypes()
    x_old = h2o.get_frame(g_model._model_json["output"]["representation_name"])
    predOld = g_model.predict(irisH2O)
    TMPDIR = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')), "../..",
                     "results"))

    try:
        TMPDIR = pyunit_utils.locate(
            "results")  # find directory path to results folder
    except:
        os.makedirs(TMPDIR)
    h2o.save_model(g_model, path=TMPDIR, force=True)  # save model
    full_path_filename = os.path.join(TMPDIR, g_model._id)

    h2o.remove(g_model)
    model_reloaded = h2o.load_model(full_path_filename)
    pred = model_reloaded.predict(irisH2O)
    yarch = model_reloaded.archetypes()
    x = h2o.get_frame(
        model_reloaded._model_json["output"]["representation_name"])

    # assert difference between old and new are close, archetypes should be the same
    pyunit_utils.compare_frames_local(x, x_old, tol=1e-6)
    pyunit_utils.compare_frames_local(pred[0], predOld[0], tol=1)
    for k in range(3):
        pyunit_utils.equal_two_arrays(yarch_old[k],
                                      yarch[k],
                                      eps=1e-4,
                                      tolerance=1e-10)

    print("glrm model successfully loaded...")
Example #26
def glrm_arrests():
    print "Importing USArrests.csv data..."
    arrestsH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrestsH2O.describe()

    print "H2O initial Y matrix:\n"
    initial_y = [[5.412, 65.24, -7.54, -0.032], [2.212, 92.24, -17.54, 23.268],
                 [0.312, 123.24, 14.46, 9.768], [1.012, 19.24, -15.54, -1.732]]
    initial_y_h2o = h2o.H2OFrame(initial_y)
    initial_y_h2o.show()

    print "H2O GLRM on de-meaned data with quadratic loss:\n"
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=4,
                                              transform="DEMEAN",
                                              loss="Quadratic",
                                              gamma_x=0,
                                              gamma_y=0,
                                              init="User",
                                              user_y=initial_y_h2o,
                                              recover_svd=True)
    glrm_h2o.train(x=arrestsH2O.names, training_frame=arrestsH2O)
    glrm_h2o.show()
Example #27
def glrm_iris():
    print "Importing iris_wheader.csv data..."
    irisH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    irisH2O.describe()

    for trans in ["NONE", "DEMEAN", "DESCALE", "STANDARDIZE"]:
        rank = random.randint(1, 7)
        gx = random.uniform(0, 1)
        gy = random.uniform(0, 1)

        print "H2O GLRM with rank k = " + str(rank) + ", gamma_x = " + str(
            gx) + ", gamma_y = " + str(gy) + ", transform = " + trans
        glrm_h2o = H2OGeneralizedLowRankEstimator(k=rank,
                                                  loss="Quadratic",
                                                  gamma_x=gx,
                                                  gamma_y=gy,
                                                  transform=trans)
        glrm_h2o.train(x=irisH2O.names, training_frame=irisH2O)
        glrm_h2o.show()

        print "Impute original data from XY decomposition"
        pred_h2o = glrm_h2o.predict(irisH2O)
        pred_h2o.describe()
Example #28
def execute(h2o, params, config):
    frame_id = config.get('frame_id')

    df = h2o.get_frame(frame_id)
    column_header = params.get('column_header')
    if len(column_header) > 0:
        df = df[int(column_header):]  # H2OFrame row slice: drop the leading header rows

    from h2o.estimators.glrm import H2OGeneralizedLowRankEstimator
    glrm_model = H2OGeneralizedLowRankEstimator(
        expand_user_y=to_bool(params.get('expand_user_y')),
        gamma_x=float(params.get('gamma_x')),
        gamma_y=float(params.get('gamma_y')),
        ignore_const_cols=to_bool(params.get('ignore_const_cols')),
        impute_original=to_bool(params.get('impute_original')),
        init=str(params.get('init')),
        init_step_size=float(params.get('init_step_size')),
        k=int(params.get('k')),
        loss=str(params.get('loss')),
        max_iterations=int(params.get('max_iterations')),
        max_runtime_secs=float(params.get('max_runtime_secs')),
        max_updates=int(params.get('max_updates')),
        min_step_size=float(params.get('min_step_size')),
        multi_loss=str(params.get('multi_loss')),
        period=int(params.get('period')),
        recover_svd=to_bool(params.get('recover_svd')),
        regularization_x=str(params.get('regularization_x')),
        regularization_y=str(params.get('regularization_y')),
        score_each_iteration=to_bool(params.get('score_each_iteration')),
        seed=int(params.get('seed')),
        svd_method=str(params.get('svd_method')))
    glrm_model.train(training_frame=df)
    glrm_model.show()
    save_model(params, glrm_model.model_id)

    return {'frame_id': frame_id, 'model_id': glrm_model.model_id}
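to_bool and save_model come from the surrounding plugin code. A minimal sketch under that assumption (the binary-model export path is illustrative):

def to_bool(value):
    # Hypothetical helper: plugin parameters arrive as strings.
    return str(value).strip().lower() in ("true", "1", "yes")

def save_model(params, model_id):
    # Hypothetical helper: persist the trained model if a path is configured.
    export_path = params.get('export_path')
    if export_path:
        h2o.save_model(h2o.get_model(model_id), path=export_path, force=True)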
Example #29
# In[ ]:

# Import and parse WHD 2014-2015 labor violations data
whd_zcta = h2o.import_file(
    path=os.path.realpath("../data/whd_zcta_cleaned.zip"),
    col_types=(["enum"] * 7 + ["numeric"] * 97))
whd_zcta["zcta5_cd"] = whd_zcta["zcta5_cd"].asfactor()
whd_zcta.describe()

# In[ ]:

# Run GLRM to reduce ZCTA demographics to 10 archetypes
acs_model = H2OGeneralizedLowRankEstimator(k=10,
                                           transform="STANDARDIZE",
                                           loss="Quadratic",
                                           regularization_x="Quadratic",
                                           regularization_y="L1",
                                           gamma_x=0.25,
                                           gamma_y=0.5,
                                           max_iterations=100)
acs_model.train(x=acs_full.names, training_frame=acs_full)  # acs_full is prepared in an earlier notebook cell
print(acs_model)

# In[ ]:

# Plot objective function value each iteration
acs_model_score = acs_model.score_history()
plt.xlabel("Iteration")
plt.ylabel("Objective")
plt.title("Objective Function Value per Iteration")
plt.plot(acs_model_score["iteration"], acs_model_score["objective"])
plt.show()
Example #30
def algo_max_runtime_secs():
    '''
    This pyunit test is written to ensure that the various models will not crash if max_runtime_secs
    is set too short.  See PUBDEV-4802.
    '''
    global model_within_max_runtime
    seed = 12345

    # word2vec
    train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/text8.gz"),
                            header=1,
                            col_types=["string"])
    used = train[0:170000, 0]
    w2v_model = H2OWord2vecEstimator()
    grabRuntimeInfo(w2v_model, used, [], 0)
    cleanUp([train, used, w2v_model])

    # kmeans
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/kmeans_8_centers_3_coords.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OKMeansEstimator(k=10)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([training1_data, model])

    # PCA, pca_method=Power
    training1_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/pca1000by25.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OPCA(k=10,
                   transform="STANDARDIZE",
                   pca_method="Power",
                   compute_metrics=True)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([model])

    # PCA, pca_method=Randomized
    model = H2OPCA(k=10,
                   transform="STANDARDIZE",
                   pca_method="Randomized",
                   compute_metrics=True)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([model])

    # PCA, pca_method=GLRM
    model = H2OPCA(k=10,
                   transform="STANDARDIZE",
                   pca_method="GLRM",
                   compute_metrics=True,
                   use_all_factor_levels=True)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([model])

    # deeplearning
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/gaussian_training1_set.csv"))
    y_index = training1_data.ncol - 1
    x_indices = list(range(y_index))
    model = H2ODeepLearningEstimator(distribution='gaussian',
                                     seed=seed,
                                     hidden=[10, 10, 10])
    grabRuntimeInfo(model, training1_data, x_indices, y_index)
    cleanUp([training1_data, model])

    # stack ensemble, stacking part is not iterative
    print(
        "******************** Skip testing stack ensemble.  Not an iterative algo."
    )

    # GBM run
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/multinomial_training1_set.csv"))
    y_index = training1_data.ncol - 1
    x_indices = list(range(y_index))
    training1_data[y_index] = training1_data[y_index].round().asfactor()
    model = H2OGradientBoostingEstimator(distribution="multinomial", seed=seed)
    grabRuntimeInfo(model, training1_data, x_indices, y_index)
    cleanUp([model])

    # GLM run
    model = H2OGeneralizedLinearEstimator(family='multinomial', seed=seed)
    grabRuntimeInfo(model, training1_data, x_indices, y_index)
    cleanUp([model])

    # naivebayes, not iterative
    print(
        "******************** Skip testing Naives Bayes.  Not an iterative algo."
    )

    # random forest
    model = H2ORandomForestEstimator(ntrees=100, score_tree_interval=0)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([model, training1_data])

    # GLRM; it does not make sense to stop in the middle of an iteration
    training1_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/glrmdata1000x25.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OGeneralizedLowRankEstimator(k=10,
                                           loss="Quadratic",
                                           gamma_x=0.3,
                                           gamma_y=0.3,
                                           transform="STANDARDIZE",
                                           recover_svd=True)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([training1_data, model])

    if sum(model_within_max_runtime) > 0:
        sys.exit(1)
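grabRuntimeInfo, cleanUp, and model_within_max_runtime are module-level helpers not shown in this excerpt. A rough sketch of the contract implied by the calls above; the timing heuristic and the y-column handling are assumptions, not the original implementation:

model_within_max_runtime = []

def grabRuntimeInfo(model, training_data, x_indices, y_index=None):
    # Hypothetical helper: train once without a limit to measure the full
    # runtime, then retrain with a much smaller max_runtime_secs and record
    # whether the short run finished without crashing or badly overshooting.
    y = training_data.names[y_index] if y_index is not None else None
    model.train(x=x_indices, y=y, training_frame=training_data)
    full_run = (model._model_json['output']['end_time'] -
                model._model_json['output']['start_time']) / 1000.0
    short_limit = max(full_run / 10.0, 0.001)
    model.train(x=x_indices, y=y, training_frame=training_data,
                max_runtime_secs=short_limit)
    short_run = (model._model_json['output']['end_time'] -
                 model._model_json['output']['start_time']) / 1000.0
    model_within_max_runtime.append(0 if short_run <= 2 * short_limit else 1)

def cleanUp(objects):
    # Hypothetical helper: free cluster memory between sub-tests.
    for obj in objects:
        h2o.remove(obj)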