예제 #1
0
def gbm_on_hive():
    """Train a GBM on airlines data read through Hive JDBC and sanity-check it.

    The dataset is loaded twice, once from a public S3 URL and once through a
    Hive JDBC ``select``; the two copies must match before the model is built.
    Kerberos handling is driven by the ``KRB_ENABLED``, ``KRB_USE_TOKEN`` and
    ``HIVE_PRINCIPAL`` environment variables.
    """
    def _env_flag(name):
        # An unset variable is treated as "false".
        return os.getenv(name, 'false').lower() == 'true'

    jdbc_url = "jdbc:hive2://localhost:10000/default"
    kerberized = _env_flag('KRB_ENABLED')
    token_auth = _env_flag('KRB_USE_TOKEN')
    if kerberized and token_auth:
        jdbc_url += ";auth=delegationToken"
    elif kerberized:
        # NOTE(review): the default principal below (and the user name further
        # down) look like redacted placeholders -- confirm the real values.
        jdbc_url += ";principal=%s" % os.getenv('HIVE_PRINCIPAL', 'hive/[email protected]')

    query = "select * from airlinestest"
    user = "******"
    secret = ""

    # Reference copy fetched from S3.
    from_s3 = h2o.import_file(path="https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/AirlinesTest.csv.zip")
    # Same data pulled through the Hive JDBC connection, then adapted.
    from_hive = adapt_airlines(
        h2o.import_sql_select(jdbc_url, query, user, secret, fetch_mode="SINGLE"))

    # From the user's point of view both frames have to be identical.
    pyunit_utils.compare_frames_local(from_s3, from_hive, 1)

    # Last two columns are excluded from the predictors; the second to last
    # one is the response.
    predictors = from_hive.col_names[:-2]
    response = from_hive.col_names[-2]
    gbm_v1 = H2OGradientBoostingEstimator(model_id="gbm_airlines_v1", seed=2000000)
    gbm_v1.train(predictors, response,
                 training_frame=from_hive, validation_frame=from_hive)

    # Training and validation use the same frame, yet the metrics may differ
    # slightly because of different chunking on the backend.
    train_auc = gbm_v1.auc(train=True)
    valid_auc = gbm_v1.auc(valid=True)
    assert isclose(train_auc, valid_auc, rtol=1e-4)
def test_glm_multinomial_makeGLMModel():
    """Exercise ``makeGLMModel`` on a multinomial GLM.

    Steps:
      1. train a multinomial GLM on the covtype sample and check that the
         number of non-zero coefficients matches the rank reported by the model;
      2. rebuild a model from the regularization-path coefficients and verify
         that it predicts exactly like the original one;
      3. add a bogus coefficient and verify that ``makeGLMModel`` rejects it
         with the expected server-side error.

    Raises:
        AssertionError: if any of the checks fails, including the case where
            ``makeGLMModel`` accepts the bogus coefficient set.
    """
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    mL = glm(family='multinomial', alpha=[0.1], Lambda=[0.9])
    d[54] = d[54].asfactor()
    mL.train(training_frame=d, x=list(range(0, 54)), y=54)
    r = glm.getGLMRegularizationPath(mL)

    rank = check_nonzero_coefs(r['coefficients'][0])
    model_rank = mL._model_json["output"]["rank"]
    assert rank == model_rank, \
        "expected rank: {0}, actual rank: {1}.".format(rank, model_rank)

    # Model generated by setting the coefficients explicitly.
    m2 = glm.makeGLMModel(model=mL, coefs=r['coefficients'][0])
    f1 = mL.predict(d)
    f2 = m2.predict(d)
    pyunit_utils.compare_frames_local(f1, f2, prob=1)

    # Add an extra, unknown coefficient to the coefficient set.
    coefs = r['coefficients'][0]
    coefs["wendy_dreams"] = 0.123

    try:
        glm.makeGLMModel(model=mL, coefs=coefs)
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
            ("model coefficient length 371 is different from coefficient provided by user") in temp, \
            "Wrong exception was received."
        print("glm Multinomial makeGLMModel test completed!")
    else:
        # Raised outside the ``try`` body so the handler above cannot catch it
        # and report it as "Wrong exception was received.".
        raise AssertionError("Should have thrown an exception!")
예제 #3
0
def calibration_test():
    """Verify the calibrated probabilities produced by a random forest model.

    A random forest is trained with ``calibrate_model=True``.  Its ``cal_p1``
    output on the training frame is then compared with the prediction of a
    binomial GLM fitted by hand on the model's ``p1`` predictions for the
    calibration frame.
    """
    ecology = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
    ecology["Angaus"] = ecology["Angaus"].asfactor()
    # Random, strictly non-negative observation weights (one per row).
    random_weights = abs(np.random.randn(ecology.nrow, 1)).tolist()
    ecology["Weights"] = h2o.H2OFrame.from_python(random_weights)[0]
    print(ecology.col_names)
    train, calib = ecology.split_frame(
        ratios=[.8], destination_frames=["eco_train", "eco_calib"], seed=42)

    model = H2ORandomForestEstimator(
        ntrees=100,
        distribution="bernoulli",
        min_rows=10,
        max_depth=5,
        weights_column="Weights",
        calibrate_model=True,
        calibration_frame=calib,
    )
    predictor_indices = list(range(2, train.ncol))
    model.train(x=predictor_indices, y="Angaus", training_frame=train)

    train_preds = model.predict(train)

    # The calibrated probabilities must be appended to the output frame.
    assert train_preds.col_names == ["predict", "p0", "p1", "cal_p0", "cal_p1"]

    # Reproduce the calibration by hand: fit a GLM on the raw p1 predictions
    # for the calibration frame, using the same response and weights.
    calib_preds = model.predict(calib)
    glm_input = calib_preds["p1"].cbind(calib[["Angaus", "Weights"]])
    glm_input.col_names = ["p1", "response", "weights"]
    hand_made_calibrator = H2OGeneralizedLinearEstimator(
        family="binomial", weights_column="weights", lambda_=0, intercept=True
    )
    hand_made_calibrator.train(y="response", training_frame=glm_input)
    hand_made_output = hand_made_calibrator.predict(train_preds["p1"])

    pyunit_utils.compare_frames_local(train_preds["cal_p1"], hand_made_output["p1"], prob=1)
def checkCorrectSkips(originalFullFrame, csvfile, skipped_columns, uuidNames):
    """Parse ``csvfile`` with some columns skipped and validate the result.

    The file is loaded with both ``h2o.upload_file`` and ``h2o.import_file``;
    the two frames are compared with each other, then every non-skipped column
    of ``originalFullFrame`` is compared with the column at the matching
    position of the imported frame.  Finally the names of the skipped columns
    must be absent from the imported frame.

    :param originalFullFrame: frame whose columns serve as the reference.
    :param csvfile: path of the file to parse with ``skipped_columns``.
    :param skipped_columns: column indices passed to the parser as skipped.
    :param uuidNames: names appended to the reference frame's names before the
        skipped-name check (their contents are not compared).
    """
    uploaded = h2o.upload_file(csvfile, skipped_columns=skipped_columns)
    imported = h2o.import_file(csvfile, skipped_columns=skipped_columns)
    # Both loading paths are expected to yield the same frame.
    pyunit_utils.compare_frames_local(uploaded, imported, prob=0.5)

    column_types = originalFullFrame.types
    column_names = originalFullFrame.names
    kept = 0  # position of the current column inside the imported frame
    for position, column_name in enumerate(column_names):
        if position in skipped_columns:
            continue
        expected = originalFullFrame[position]
        actual = imported[kept]
        column_type = column_types[column_name]
        if column_type == u'enum':
            pyunit_utils.compare_frames_local_onecolumn_NA_enum(
                expected, actual, prob=1, tol=1e-10, returnResult=False)
        elif column_type == u'string':
            pyunit_utils.compare_frames_local_onecolumn_NA_string(
                expected, actual, prob=1, returnResult=False)
        else:
            pyunit_utils.compare_frames_local_onecolumn_NA(
                expected, actual, prob=1, tol=1e-10, returnResult=False)
        kept += 1

    # UUID contents cannot be checked, so at least make sure the returned
    # frame carries the correct column names.
    column_names.extend(uuidNames)
    imported_names = imported.names

    for skipIndex in skipped_columns:
        assert column_names[skipIndex] not in imported_names, \
            "This column: {0}/{1} should have been skipped but is not!".format(column_names[skipIndex], skipIndex)
def glm_multinomial_mojo_pojo():
    """Compare in-cluster, MOJO and POJO predictions of a multinomial GLM.

    A random multinomial dataset is generated; its first 200 rows form the
    test set and the remaining rows the training set.
    """
    problem_type = "multinomial"
    n_test_rows = 200
    params = set_params()  # model parameters from the module-level helper
    df = pyunit_utils.random_dataset(problem_type)  # random dataset
    train = df[n_test_rows:, :]
    test = df[:n_test_rows, :]
    predictors = list(set(df.names).difference({"response"}))

    # Build the model and save its mojo.
    model = pyunit_utils.build_save_model_GLM(params, predictors, train, "response")

    mojo_name = pyunit_utils.getMojoName(model._id)
    # NOTE(review): '__file__' is a quoted literal here, so the path is
    # resolved from the string "__file__" rather than this module's location --
    # presumably matching what build_save_model_GLM does; confirm before changing.
    base_dir = os.path.dirname(os.path.realpath('__file__'))
    work_dir = os.path.normpath(os.path.join(base_dir, "..", "results", mojo_name))

    # h2o predict and mojo predict read the same input file.
    input_csv = os.path.join(work_dir, 'in.csv')
    h2o.download_csv(test[predictors], input_csv)
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(model, work_dir, mojo_name)
    h2o.download_csv(pred_h2o, os.path.join(work_dir, "h2oPred.csv"))
    pred_pojo = pyunit_utils.pojo_predict(model, work_dir, mojo_name)

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
예제 #6
0
def hive_import():
    """Import the ``chicago`` table from Hive in two ways and compare to HDFS.

    The reference frame is read from a CSV on HDFS.  The same data is then
    loaded through ``h2o.import_sql_table`` (JDBC) and through
    ``h2o.import_hive_table``; each result must match the reference.
    """
    name_node = pyunit_utils.hadoop_namenode()
    jdbc_url = "jdbc:hive2://{0}:10000/default".format(os.getenv("HIVE_HOST"))
    if os.getenv('KRB_ENABLED', 'false').lower() == 'true':
        jdbc_url += ";auth=delegationToken"

    # Reference data read directly from HDFS.
    csv_on_hdfs = "hdfs://{0}{1}".format(
        name_node, "/user/jenkins/smalldata/chicago/chicagoCensus.csv")
    reference = h2o.import_file(csv_on_hdfs)

    # Table read through Hive JDBC; columns carry a "chicago." prefix that
    # adapt_frame is asked to handle.
    via_jdbc = h2o.import_sql_table(jdbc_url, "chicago", "", "", fetch_mode="SINGLE")
    via_jdbc = adapt_frame(via_jdbc, column_prefix="chicago.")
    pyunit_utils.compare_frames_local(reference, via_jdbc, prob=1)

    # Table read through import_hive_table.
    via_hive = adapt_frame(h2o.import_hive_table(jdbc_url, "chicago"))
    pyunit_utils.compare_frames_local(reference, via_hive, prob=1)
def testOrdinalLogit():
    """Exercise ``makeGLMModel`` on an ordinal GLM.

    A model rebuilt from the regularization-path coefficients must predict
    exactly like the trained model, and supplying an extra, unknown
    coefficient must be rejected with the expected server-side error.

    Raises:
        AssertionError: if a check fails, including the case where
            ``makeGLMModel`` accepts the bogus coefficient set.
    """
    Dtrain = h2o.import_file(
        pyunit_utils.locate(
            "bigdata/laptop/glm_ordinal_logit/ordinal_ordinal_20_training_set.csv"
        ))
    Dtrain["C21"] = Dtrain["C21"].asfactor()

    print("Fit model on dataset")
    model = glm(family="ordinal",
                alpha=[0.5],
                lambda_=[0.001],
                max_iterations=1000,
                beta_epsilon=1e-8,
                objective_epsilon=1e-8)
    model.train(x=list(range(0, 20)), y="C21", training_frame=Dtrain)
    predH2O = model.predict(Dtrain)
    r = glm.getGLMRegularizationPath(model)

    # Model generated by setting the coefficients explicitly.
    m2 = glm.makeGLMModel(model=model, coefs=r['coefficients'][0])
    f2 = m2.predict(Dtrain)
    pyunit_utils.compare_frames_local(predH2O, f2, prob=1)

    # Add an extra, unknown coefficient to the coefficient set.
    coefs = r['coefficients'][0]
    coefs['h2o_dream'] = 3.1415

    try:
        glm.makeGLMModel(model=model, coefs=coefs)
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
               ("model coefficient length 189 is different from coefficient provided by user ") in temp, \
            "Wrong exception was received."
        print("coefficient test passed!")
    else:
        # Raised outside the ``try`` body so the handler above cannot catch it
        # and report it as "Wrong exception was received.".
        raise AssertionError("Should have thrown an exception!")
def testFrameTransform():
    """Check the transformed frame saved by an ANOVA GLM model.

    The model is trained with ``save_transformed_framekeys=True``; the frame
    referenced by the model output is compared, column by column, with a
    stored answer frame.
    """
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/anovaGlm/Moore.csv"))
    answer = h2o.import_file(path=pyunit_utils.locate("smalldata/anovaGlm/MooreTransformed.csv"))
    response = 'conformity'
    predictors = ['fcategory', 'partner.status']

    model = H2OANOVAGLMEstimator(
        family='gaussian', lambda_=0, save_transformed_framekeys=True)
    model.train(x=predictors, y=response, training_frame=train)

    transformed_key = model._model_json['output']['transformed_columns_key']['name']
    transformFrame = h2o.get_frame(transformed_key)

    # Column names in the answer file and the names of the corresponding
    # columns in the frame produced by the model, in matching order.
    answer_columns = [
        'fcategory1', 'fcategory2', 'partner.status1',
        'fcategory1:partner.status1', 'fcategory2:partner.status1'
    ]
    transformed_columns = [
        'fcategory_high', 'fcategory_low',
        'partner.status_high',
        'fcategory_high:partner.status_high',
        'fcategory_low:partner.status_high'
    ]
    pyunit_utils.compare_frames_local(answer[answer_columns],
                                      transformFrame[transformed_columns],
                                      prob=1)
def checkCorrectSkips(originalFullFrame, csvfile, skipped_columns):
    """Parse ``csvfile`` with some columns skipped and compare with a reference frame.

    The file is loaded with both ``h2o.upload_file`` and ``h2o.import_file``
    and the two results are compared.  Then each non-skipped column of
    ``originalFullFrame`` of type enum, string or int is compared with the
    column at position ``skipCounter`` of the imported frame; columns of any
    other type are not compared.

    :param originalFullFrame: frame whose columns serve as the reference.
    :param csvfile: path of the file to parse with ``skipped_columns``.
    :param skipped_columns: column indices passed to the parser as skipped.
    """
    skippedFrameUF = h2o.upload_file(csvfile, skipped_columns=skipped_columns)
    skippedFrameIF = h2o.import_file(csvfile, skipped_columns=skipped_columns)  # these two frames should be the same
    pyunit_utils.compare_frames_local(skippedFrameUF, skippedFrameIF, prob=0.5)

    # skipCounter is the index into the column-skipped frame; cindex is the
    # index into the reference frame.
    skipCounter = 0
    typeDict = originalFullFrame.types
    frameNames = originalFullFrame.names
    for cindex in range(len(frameNames)):
        if cindex not in skipped_columns:
            print("Checking column {0}...".format(cindex))
            # NOTE(review): this `continue` also bypasses the skipCounter
            # increment at the bottom of the loop, so every later column is
            # compared against a position one lower than it would otherwise
            # be -- confirm this is intended for the dataset in use.
            if typeDict[frameNames[cindex]] == u'enum' and cindex==10: # look at original frame
                continue

            # NOTE(review): an enum column that lands on skipCounter == 10 is
            # not compared at all (it matches none of the branches below) --
            # presumably a dataset-specific exclusion; verify.
            elif typeDict[frameNames[cindex]] == u'enum' and not(skipCounter==10):
                pyunit_utils.compare_frames_local_onecolumn_NA_enum(originalFullFrame[cindex],
                                                                    skippedFrameIF[skipCounter], prob=1, tol=1e-10,
                                                                    returnResult=False)
            elif typeDict[frameNames[cindex]] == u'string':
                pyunit_utils.compare_frames_local_onecolumn_NA_string(originalFullFrame[cindex],
                                                         skippedFrameIF[skipCounter], prob=1,
                                                         returnResult=False)
            elif typeDict[frameNames[cindex]] == u'int':
                # The imported column is converted with asnumeric() before the comparison.
                pyunit_utils.compare_frames_local_onecolumn_NA(originalFullFrame[cindex], skippedFrameIF[skipCounter].asnumeric(),
                                                  prob=1, tol=1e-10, returnResult=False)
            skipCounter = skipCounter + 1
def test_makeGLMModel():
    """Exercise ``makeGLMModel`` on a binomial GLM trained on prostate data.

    A model rebuilt from the regularization-path coefficients must predict
    like the trained model (the second prediction column is compared), and
    supplying an extra, unknown coefficient must be rejected with the expected
    server-side error.

    Raises:
        AssertionError: if a check fails, including the case where
            ``makeGLMModel`` accepts the bogus coefficient set.
    """
    # Read in the dataset used for both training and scoring.
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    m = glm(family='binomial',
            Lambda=[0.001],
            alpha=[0.5],
            solver='COORDINATE_DESCENT')
    m.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    r = glm.getGLMRegularizationPath(m)
    m2 = glm.makeGLMModel(model=m, coefs=r['coefficients'][0])
    f1 = m.predict(d)  # predict with original model
    f2 = m2.predict(d)  # predict with model out of makeGLMModel
    pyunit_utils.compare_frames_local(f1[1], f2[1], prob=1)

    # Add an extra, unknown coefficient to the coefficient set.
    coefs = r['coefficients'][0]
    coefs['wendy_dreams'] = 8

    try:
        glm.makeGLMModel(model=m, coefs=coefs)
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
               ("model coefficient length 8 is different from coefficient provided by user ") in temp,\
            "Wrong exception was received."
        print("makeGLMModel test passed!")
    else:
        # Raised outside the ``try`` body so the handler above cannot catch it
        # and report it as "Wrong exception was received.".
        raise AssertionError(
            "Test failed: should have throw exception of bad coefficient length!")
def test_modelselection_gaussian_model_id():
    """Cross-check model ids exposed by ModelSelection in its different outputs.

    For every row of the ``allsubsets`` result frame, the model referenced by
    the frame's ``model_id`` column and the model listed in
    ``best_model_ids`` of the model output must predict identically.  The
    ``maxr`` model at the same row must predict the same within ``1e-6``.
    """
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    response = "GLEASON"
    predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    allsubsets_model = modelSelection(seed=12345, max_predictor_number=7, mode="allsubsets")
    allsubsets_model.train(training_frame=prostate, x=predictors, y=response)
    allsubsets_results = allsubsets_model.result()
    n_rows = allsubsets_results.nrows
    allsubsets_ids = allsubsets_model._model_json["output"]["best_model_ids"]

    maxr_model = modelSelection(seed=12345, max_predictor_number=7, mode="maxr")
    maxr_model.train(training_frame=prostate, x=predictors, y=response)
    maxr_results = maxr_model.result()

    for row in range(n_rows):
        # Model looked up through the result frame ...
        by_frame = h2o.get_model(allsubsets_results["model_id"][row, 0])
        pred_by_frame = by_frame.predict(prostate)
        # ... and the same model looked up through the model output.
        by_output = h2o.get_model(allsubsets_ids[row]['name'])
        pred_by_output = by_output.predict(prostate)
        pyunit_utils.compare_frames_local(pred_by_frame, pred_by_output, prob=1)

        # The maxr model at the same row, compared with a tolerance.
        maxr_by_frame = h2o.get_model(maxr_results["model_id"][row, 0])
        pred_maxr = maxr_by_frame.predict(prostate)
        pyunit_utils.compare_frames_local(pred_by_frame, pred_maxr, prob=1, tol=1e-6)
def test_modelselection_backward_serialization():
    """Check reproducibility and save/load round-trip of a backward ModelSelection model.

    Two models are trained with identical settings and their result frames
    (columns 2 to 4) must match.  One of the models is then downloaded, the
    cluster is cleared, the model is loaded back and the last sub-model from
    its result frame must reproduce the predictions saved before the
    round-trip.
    """
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    y = "GLEASON"
    x = ["ID","AGE","RACE","CAPSULE","DCAPS","PSA","VOL","DPROS"]
    # make sure duplicate runs produce the same results
    model_backward = modelSelection(seed=12345, mode="backward", family='negativebinomial', link="log",alpha=0.5,
                                      lambda_=0, theta=0.01)
    model_backward.train(training_frame=d, x=x, y=y)
    model_backward2 = modelSelection(seed=12345, mode="backward", family='negativebinomial', link="log",alpha=0.5,
                                    lambda_=0, theta=0.01)
    model_backward2.train(training_frame=d, x=x, y=y)
    result = model_backward.result()    # result frame of the first model
    # Read the SECOND model's result frame; reading the first model again
    # would make the comparison below trivially true.
    result2 = model_backward2.result()
    pyunit_utils.compare_frames_local(result[2:5], result2[2:5], prob=1.0) # results of both models should be the same

    num_models = result.nrows           # number of models built
    one_model = h2o.get_model(result["model_id"][num_models-1, 0])
    predict_frame = one_model.predict(d)
    tmpdir = tempfile.mkdtemp()
    file_dir = os.path.join(tmpdir, "predict.csv")
    h2o.download_csv(predict_frame, file_dir) # save one scoring frame
    model_path_backward = model_backward.download_model(tmpdir) # store the model

    # Clear the cluster, then reload the data and the stored model.
    h2o.remove_all()
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    loaded_backward_model = h2o.load_model(model_path_backward)
    result_frame_backward = loaded_backward_model.result()

    model_from_frame_backward = h2o.get_model(result_frame_backward["model_id"][num_models-1, 0])
    pred_frame_backward = model_from_frame_backward.predict(d)
    pred_frame_model = h2o.import_file(file_dir)
    pyunit_utils.compare_frames_local(pred_frame_backward, pred_frame_model, prob=1.0)
예제 #13
0
def test_gam_gamColumns():
    """Compare the centered GAM columns kept by a GAM model with stored answers.

    A multinomial GAM is trained with ``keep_gam_cols=True``; the frame
    referenced by ``gam_transformed_center_key`` (minus ``C1``, ``C2`` and
    ``C11``) must match the column-wise concatenation of three answer files.
    """
    data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"
    ))
    data["C1"] = data["C1"].asfactor()
    data["C2"] = data["C2"].asfactor()
    predictors = ["C1", "C2"]
    response = "C11"
    data["C11"] = data["C11"].asfactor()

    gam_model = H2OGeneralizedAdditiveEstimator(
        family="multinomial",
        gam_columns=["C6", "C7", "C8"],
        keep_gam_cols=True,
        scale=[1, 1, 1],
        num_knots=[5, 5, 5])
    gam_model.train(x=predictors, y=response, training_frame=data)

    centered_key = gam_model._model_json["output"]["gam_transformed_center_key"]
    gamFrame = h2o.get_frame(centered_key)
    # Strip the predictor and response columns before comparing.
    for column in ("C1", "C2", "C11"):
        gamFrame = gamFrame.drop(column)

    # One answer file per GAM column, concatenated in gam_columns order.
    gamFrameAns = h2o.import_file(
        pyunit_utils.locate(
            "smalldata/gam_test/multinomial_10_classes_10_cols_10000_Rows_train_C6Gam_center.csv"
        ))
    remaining_answers = (
        "smalldata/gam_test/multinomial_10_classes_10_cols_10000_Rows_train_C7Gam_center.csv",
        "smalldata/gam_test/multinomial_10_classes_10_cols_10000_Rows_train_C8Gam_center.csv",
    )
    for answer_path in remaining_answers:
        gamFrameAns = gamFrameAns.cbind(h2o.import_file(pyunit_utils.locate(answer_path)))

    pyunit_utils.compare_frames_local(gamFrameAns, gamFrame)
    print("gam gamcolumn test completed successfully")
예제 #14
0
def glm_binomial_mojo_pojo():
    """Compare in-cluster, MOJO and POJO predictions of a binomial GLM.

    A random binomial dataset is generated; its first 200 rows form the test
    set and the remaining rows the training set.  All artifacts are written to
    a fresh temporary directory.
    """
    h2o.remove_all()
    n_test_rows = 200  # number of test dataset rows
    problem_type = "binomial"
    params = set_params()  # model parameters from the module-level helper
    df = pyunit_utils.random_dataset(problem_type)  # random dataset
    train = df[n_test_rows:, :]
    test = df[:n_test_rows, :]
    predictors = list(set(df.names).difference({"response"}))
    work_dir = tempfile.mkdtemp()

    # Build the model and save its mojo.
    model = pyunit_utils.build_save_model_generic(
        params, predictors, train, "response", "glm", work_dir)
    mojo_name = pyunit_utils.getMojoName(model._id)

    # h2o predict and mojo predict read the same input file.
    input_csv = os.path.join(work_dir, 'in.csv')
    h2o.download_csv(test[predictors], input_csv)
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(model, work_dir, mojo_name)
    h2o.download_csv(pred_h2o, os.path.join(work_dir, "h2oPred.csv"))
    pred_pojo = pyunit_utils.pojo_predict(model, work_dir, mojo_name)

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
def test_glrm_transform():
    """Check that GLRM ``transform_frame`` is reproducible across identical models.

    A random matrix built as the product of two random factors is split into
    training and test frames.  Two GLRM models with the same settings and seed
    are trained; their ``transform_frame`` outputs on the test frame must
    agree, and the row count must match that of ``predict``.
    """
    n_rows = 1000
    n_cols = 100
    rank = 8
    np.random.seed(12345)

    print("Uploading random uniform matrix with rows = " + str(n_rows) +
          " and cols = " + str(n_cols))
    # Draw the right factor before the left one; the order fixes the data
    # produced under the seed above.
    right_factor = np.random.rand(rank, n_cols)
    left_factor = np.random.rand(n_rows, rank)
    full_h2o = h2o.H2OFrame(np.dot(left_factor, right_factor).tolist())
    parts = full_h2o.split_frame(ratios=[0.9])
    train, test = parts[0], parts[1]

    first_glrm = H2OGeneralizedLowRankEstimator(k=rank, loss="Quadratic", seed=12345)
    first_glrm.train(x=full_h2o.names, training_frame=train)
    predFrame = first_glrm.predict(test)
    xFrame = first_glrm.transform_frame(test)

    second_glrm = H2OGeneralizedLowRankEstimator(k=rank, loss="Quadratic", seed=12345)
    second_glrm.train(x=full_h2o.names, training_frame=train)
    xFrame2 = second_glrm.transform_frame(test)

    row_count_message = ("predictor frame number of row: {0}, transform frame number of row: "
                         "{1}")
    assert predFrame.nrows == xFrame.nrows, row_count_message.format(predFrame.nrows, xFrame.nrows)
    pyunit_utils.compare_frames_local(xFrame, xFrame2, prob=1.0, tol=1e-6)
예제 #16
0
def gam_binomial_mojo():
    """Compare in-cluster and MOJO predictions of a binomial GAM model.

    Training and test frames are both imported from the same CSV file; the
    response column ``C21`` is converted to a factor in each.
    """
    params = set_params()
    data_path = "smalldata/glm_test/binomial_20_cols_10KRows.csv"
    train = h2o.import_file(pyunit_utils.locate(data_path))
    test = h2o.import_file(pyunit_utils.locate(data_path))
    train["C21"] = train["C21"].asfactor()
    test["C21"] = test["C21"].asfactor()
    predictors = ["C1"]
    response = "C21"

    work_dir = tempfile.mkdtemp()
    # Build the model and save its mojo.
    model = pyunit_utils.build_save_model_generic(
        params, predictors, train, response, "gam", work_dir)
    mojo_name = pyunit_utils.getMojoName(model._id)

    # h2o predict and mojo predict read the same input file.
    h2o.download_csv(test, os.path.join(work_dir, 'in.csv'))
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(model, work_dir, mojo_name)
    h2o.download_csv(pred_h2o, os.path.join(work_dir, "h2oPred.csv"))

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)
def glm_fractional_binomial_mojo_pojo():
    """Compare in-cluster, MOJO and POJO predictions of a fractional binomial GLM.

    Training and test frames are both imported from the same CSV file.  The
    column at index 3 of the in-cluster prediction is dropped before the
    comparison with the MOJO prediction.
    """
    params = set_params()
    data_path = "smalldata/glm_test/fraction_binommialOrig.csv"
    train = h2o.import_file(pyunit_utils.locate(data_path))
    test = h2o.import_file(pyunit_utils.locate(data_path))
    predictors = ["log10conc"]
    response = "y"

    # Build the model and save its mojo.
    model = pyunit_utils.build_save_model_GLM(params, predictors, train, response)

    mojo_name = pyunit_utils.getMojoName(model._id)
    # NOTE(review): '__file__' is a quoted literal here, so the path is
    # resolved from the string "__file__" rather than this module's location --
    # presumably matching what build_save_model_GLM does; confirm before changing.
    base_dir = os.path.dirname(os.path.realpath('__file__'))
    work_dir = os.path.normpath(os.path.join(base_dir, "..", "results", mojo_name))

    # h2o predict and mojo predict read the same input file.
    h2o.download_csv(test[predictors], os.path.join(work_dir, 'in.csv'))
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(model, work_dir, mojo_name)
    h2o.download_csv(pred_h2o, os.path.join(work_dir, "h2oPred.csv"))
    pred_pojo = pyunit_utils.pojo_predict(model, work_dir, mojo_name)
    pred_h2o = pred_h2o.drop(3)

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
def test_arrange_OOM():
    """Sort a large sparse frame and verify the outcome against a stored answer.

    Background (PUBDEV-5990): a customer reported that h2o.arrange (sorting)
    needs far more memory than expected on sparse datasets of about 1G.
    Thanks to Lauren DiPerna for finding the dataset that reproduces the
    problem.
    """
    source = h2o.import_file(pyunit_utils.locate("bigdata/laptop/jira/sort_OOM.csv"))
    started = time.time()
    sorted_frame = source.sort("sort_col")
    # Reading a cell before stopping the clock, as the timing covers it too.
    print(sorted_frame[0,0])
    elapsed_time = time.time() - started
    print("time taken to perform sort is {0}".format(elapsed_time))

    # Expected outcome of the sort.
    expected = h2o.import_file(pyunit_utils.locate("bigdata/laptop/jira/sort_OOM_answer.csv"))

    # The sort column itself must match the answer frame.
    pyunit_utils.compare_frames_local(expected["sort_col"], sorted_frame["sort_col"])

    # Additionally compare 5 randomly picked columns; comparing all of them
    # would take too long.
    column_indices = list(range(0, source.ncols))
    random.shuffle(column_indices)
    sample = column_indices[0:5]
    pyunit_utils.compare_frames_local(expected[sample], sorted_frame[sample])
def test_negBinomial_makeGLMModel():
    """Exercise ``makeGLMModel`` on negative binomial GLMs for several thetas.

    For each theta a model is trained on the prostate data, rebuilt from its
    regularization-path coefficients, and the rebuilt model must predict
    exactly like the trained one.
    """
    print("Read in prostate data.")
    prostate = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/prostate/prostate_complete.csv.zip"))
    print("Testing for family: Negative Binomial")
    print("Set variables for h2o.")
    response = "GLEASON"
    predictors = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    for theta_value in (0.000000001, 0.01, 0.1, 0.5, 1):
        trained = H2OGeneralizedLinearEstimator(
            family="negativebinomial",
            link="log",
            alpha=0.5,
            Lambda=0.0001,
            theta=theta_value)
        trained.train(x=predictors, y=response, training_frame=prostate)
        trained_pred = trained.predict(prostate)

        reg_path = H2OGeneralizedLinearEstimator.getGLMRegularizationPath(trained)
        # Model generated by setting the coefficients explicitly.
        rebuilt = H2OGeneralizedLinearEstimator.makeGLMModel(
            model=trained, coefs=reg_path['coefficients'][0])
        rebuilt_pred = rebuilt.predict(prostate)
        pyunit_utils.compare_frames_local(trained_pred, rebuilt_pred, prob=1)
예제 #20
0
def test_arrange_OOM():
    """Time a sort of a large sparse frame and validate it against the answer file.

    Background (PUBDEV-5990): a customer reported that h2o.arrange (sorting)
    needs far more memory than expected on sparse datasets of about 1G.
    Thanks to Lauren DiPerna for finding the dataset that reproduces the
    problem.
    """
    input_path = pyunit_utils.locate("bigdata/laptop/jira/sort_OOM.csv")
    df = h2o.import_file(input_path)

    t_start = time.time()
    newFrame = df.sort("sort_col")
    # Reading a cell before stopping the clock, as the timing covers it too.
    print(newFrame[0, 0])
    elapsed_time = time.time() - t_start
    print("time taken to perform sort is {0}".format(elapsed_time))

    # Expected outcome of the sort.
    answer_path = pyunit_utils.locate("bigdata/laptop/jira/sort_OOM_answer.csv")
    answerFrame = h2o.import_file(answer_path)

    # The sort column itself must match the answer frame.
    pyunit_utils.compare_frames_local(answerFrame["sort_col"], newFrame["sort_col"])

    # Additionally compare 5 randomly picked columns; comparing all of them
    # would take too long.
    shuffled = list(range(0, df.ncols))
    random.shuffle(shuffled)
    picked = shuffled[0:5]
    pyunit_utils.compare_frames_local(answerFrame[picked], newFrame[picked])
def checkCorrectSkips(originalFullFrame, csvfile, skipped_columns, uuidNames):
    """Load ``csvfile`` with skipped columns and check it against a reference frame.

    Both ``h2o.upload_file`` and ``h2o.import_file`` are used and their results
    compared.  Each non-skipped column of ``originalFullFrame`` is then compared
    with the column at the matching position of the imported frame, using the
    comparison helper that fits the column type.  Last, none of the skipped
    column names may appear in the imported frame.

    :param originalFullFrame: frame whose columns serve as the reference.
    :param csvfile: path of the file to parse with ``skipped_columns``.
    :param skipped_columns: column indices passed to the parser as skipped.
    :param uuidNames: names appended to the reference frame's names before the
        skipped-name check (their contents are not compared).
    """
    via_upload = h2o.upload_file(csvfile, skipped_columns=skipped_columns)
    via_import = h2o.import_file(csvfile, skipped_columns=skipped_columns)
    # Both loading paths are expected to yield the same frame.
    pyunit_utils.compare_frames_local(via_upload, via_import, prob=0.5)

    types_by_name = originalFullFrame.types
    reference_names = originalFullFrame.names
    target_index = 0  # index of the current column in the imported frame
    for source_index, name in enumerate(reference_names):
        if source_index in skipped_columns:
            continue
        reference_column = originalFullFrame[source_index]
        imported_column = via_import[target_index]
        kind = types_by_name[name]
        if kind == u'enum':
            pyunit_utils.compare_frames_local_onecolumn_NA_enum(
                reference_column, imported_column, prob=1, tol=1e-10, returnResult=False)
        elif kind == u'string':
            pyunit_utils.compare_frames_local_onecolumn_NA_string(
                reference_column, imported_column, prob=1, returnResult=False)
        else:
            pyunit_utils.compare_frames_local_onecolumn_NA(
                reference_column, imported_column, prob=1, tol=1e-10, returnResult=False)
        target_index += 1

    # UUID contents cannot be checked, so at least make sure the returned
    # frame carries the correct column names.
    reference_names.extend(uuidNames)
    names_after_skip = via_import.names

    for skipIndex in skipped_columns:
        assert reference_names[skipIndex] not in names_after_skip, \
            "This column: {0}/{1} should have been skipped but is not!".format(reference_names[skipIndex], skipIndex)
예제 #22
0
def glm_ordinal_mojo_pojo():
    """Compare in-cluster, MOJO and POJO predictions of an ordinal GLM.

    Relies on the module-level ``PROBLEM``, ``NTESTROWS``, ``TMPDIR`` and
    ``MOJONAME`` settings and on the ``set_params``, ``random_dataset`` and
    ``build_save_model`` helpers.  Any exception is printed; only a failed
    comparison (``AssertionError``) terminates the process with exit code 1,
    all other errors are deliberately ignored.
    """
    h2o.remove_all()
    params = set_params()  # model parameters from the module-level helper
    df = random_dataset(PROBLEM)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})

    try:
        # Build the model and save its mojo.
        glmOrdinalModel = build_save_model(params, x, train, "response")
        # h2o predict and mojo predict read the same input file.
        h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))
        pred_h2o, pred_mojo = pyunit_utils.mojo_predict(
            glmOrdinalModel, TMPDIR, MOJONAME)
        h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
        pred_pojo = pyunit_utils.pojo_predict(glmOrdinalModel, TMPDIR,
                                              MOJONAME)
        print("Comparing mojo predict and h2o predict...")
        pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)
        print("Comparing pojo predict and h2o predict...")
        pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
    except Exception as ex:
        print("***************  ERROR and type is ")
        print(str(type(ex)))
        print(ex)
        # Only care about assertion failures; check the type itself rather
        # than searching for its name in the printed representation.
        if isinstance(ex, AssertionError):
            sys.exit(1)
def link_functions_tweedie_vpow():
    """Score a weighted tweedie GAM with and without the weight column present.

    A small seeded random frame is used for training; predictions on the
    training frame and on the same frame with ``W`` dropped must match.
    """
    np.random.seed(1234)
    row_count = 10

    # Columns are drawn in a fixed order so the seeded stream yields the same
    # values on every run.
    x1 = np.random.randn(row_count)
    x2 = np.random.randn(row_count)
    x3 = np.random.randn(row_count)
    weights = np.random.choice([10, 20], size=row_count)
    response = np.random.choice([0, 0, 0, 0, 0, 10, 20, 30], size=row_count)
    columns = {"X1": x1, "X2": x2, "X3": x3, "W": weights, "Y": response}

    train = h2o.H2OFrame(pd.DataFrame(columns))
    test = train.drop("W")
    print(train)

    gam_settings = dict(
        family="tweedie",
        gam_columns=["X3"],
        weights_column="W",
        lambda_=0,
        tweedie_variance_power=1.5,
        bs=[2],
        tweedie_link_power=0,
    )
    gam_model = H2OGeneralizedAdditiveEstimator(**gam_settings)
    gam_model.train(x=["X1", "X2"], y="Y", training_frame=train)

    scored_with_weights = gam_model.predict(train)
    scored_without_weights = gam_model.predict(test)  # scoring without weight column
    # both scoring runs should produce the same frame
    pyunit_utils.compare_frames_local(scored_with_weights,
                                      scored_without_weights,
                                      prob=1,
                                      tol=1e-6)
def buildModelCheckPredict(train_data, test_data, model_test_data, myy, gamX,
                           family):
    """Train a GAM and compare its predictions against a reference frame.

    :param train_data: frame used to train the model
    :param test_data: frame to score with the trained model
    :param model_test_data: reference prediction frame to compare against
    :param myy: response column passed to ``train``
    :param gamX: columns passed as ``gam_columns``
    :param family: GAM family; for anything other than ``'gaussian'`` the
        ``predict`` column is dropped from both frames before comparing
    """
    gam_settings = {
        "family": family,
        "gam_columns": gamX,
        "scale": [1, 1, 1],
        "num_knots": [5, 5, 5],
        "standardize": True,
        "Lambda": [0],
        "alpha": [0],
        "max_iterations": 3,
        "compute_p_values": False,
        "solver": "irlsm",
    }
    gam_model = H2OGeneralizedAdditiveEstimator(**gam_settings)
    gam_model.train(x=["C1", "C2"], y=myy, training_frame=train_data)
    predicted = gam_model.predict(test_data)

    # When the reference frame has more columns than the prediction, drop its
    # last column before comparing.
    if predicted.ncols < model_test_data.ncols:
        last_column = model_test_data.ncols - 1
        model_test_data = model_test_data.drop(last_column)

    if family != 'gaussian':
        predicted = predicted.drop('predict')
        model_test_data = model_test_data.drop('predict')
    pyunit_utils.compare_frames_local(predicted, model_test_data, prob=1)
예제 #25
0
def test_maxrglm_cross_validation_result_frame_model_id():
    """Models named in the maxrglm result frame and in ``best_model_ids`` must predict alike.

    For every row of the result frame, the model fetched through the frame's
    ``model_id`` column and the one fetched through the model JSON's
    ``best_model_ids`` entry are both used to score the training data, and the
    two prediction frames are compared.
    """
    data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    response = "C21"
    predictors = ["C{0}".format(number) for number in range(1, 21)]  # C1 .. C20
    # the first ten predictors (C1 .. C10) are treated as categoricals
    for column in predictors[:10]:
        data[column] = data[column].asfactor()

    model = maxrglm(seed=12345,
                    max_predictor_number=3,
                    nfolds=3,
                    fold_assignment="auto")
    model.train(training_frame=data, x=predictors, y=response)
    result_frame = model.result()
    row_count = result_frame.nrows
    best_model_ids = model._model_json["output"]["best_model_ids"]

    for row in range(row_count):
        model_from_frame = h2o.get_model(result_frame["model_id"][row, 0])
        pred_from_frame = model_from_frame.predict(data)
        model_from_json = h2o.get_model(best_model_ids[row]['name'])
        pred_from_json = model_from_json.predict(data)
        pyunit_utils.compare_frames_local(pred_from_frame,
                                          pred_from_json,
                                          prob=1)
예제 #26
0
def mergeOneEmptyFrame():
    """PUBDEV-6987: merge where one (or both) of the frames has no rows."""
    populated = h2o.H2OFrame({"A1": [1], "A2": [0]})
    empty = h2o.H2OFrame({"A1": [], "A2": []})

    # all_x = all_y = False: only rows that appear in both the right and the
    # left frame are merged, so each of these merges should come back empty.
    default_merges = [
        populated.merge(empty),  # right frame is empty
        empty.merge(populated),  # left frame is empty
        empty.merge(empty),      # both frames hold headers only
    ]
    for merged in default_merges:
        assert merged.nrows == 0, "Expected empty rows but actual number of row is {0}!".format(
            merged.nrows)

    # Keeping every row of the populated side should return its content.
    keep_left = populated.merge(empty, all_x=True)
    keep_right = empty.merge(populated, all_y=True)
    outer_merges = (keep_left, keep_right)

    for merged in outer_merges:
        assert merged.nrow == 1, "Expected one row  but actual number of row is {0}!".format(
            merged.nrows)
    for merged in outer_merges:
        pyunit_utils.compare_frames_local(merged, populated, prob=1)
def test_gam_transformed_frame_serialization():
    """The GAM transformed (centered) frame must survive a model save/load cycle.

    The frame referenced by ``gam_transformed_center_key`` is exported to CSV,
    the model is saved, the cluster is wiped, and the frame restored with the
    reloaded model is compared against the exported copy.
    """
    data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"
    ))
    predictors = ["C1", "C2"]
    response = "C11"
    # both predictors and the response are categorical
    for column in predictors + [response]:
        data[column] = data[column].asfactor()

    gam_model = H2OGeneralizedAdditiveEstimator(family="multinomial",
                                                gam_columns=["C6", "C7", "C8"],
                                                keep_gam_cols=True,
                                                scale=[1, 1, 1],
                                                num_knots=[5, 5, 5])
    gam_model.train(x=predictors, y=response, training_frame=data)

    center_key = gam_model._model_json["output"]["gam_transformed_center_key"]
    transformed = h2o.get_frame(center_key)
    work_dir = tempfile.mkdtemp()
    csv_path = os.path.join(work_dir, "gamXFrame.csv")
    h2o.download_csv(transformed, csv_path)
    saved_model_path = h2o.save_model(gam_model, work_dir)

    # wipe the cluster so the frame can only come back through the loaded model
    h2o.remove_all()
    reloaded = h2o.load_model(saved_model_path)
    reloaded_key = reloaded._model_json["output"]["gam_transformed_center_key"]
    transformed_reloaded = h2o.get_frame(reloaded_key)
    transformed_exported = h2o.import_file(csv_path)
    pyunit_utils.compare_frames_local(transformed_reloaded[2:15],
                                      transformed_exported[2:15],
                                      prob=1,
                                      tol=1e-6)
    print("Test completed.")
예제 #28
0
def test_glrm_seeds():
    """GLRM must be reproducible for a fixed seed and differ for another seed.

    Checked for every init method, using the helpers ``setupTrainModel`` and
    ``predGLRM`` defined elsewhere in this file.
    """
    print("Importing iris_wheader.csv data...")
    iris = h2o.upload_file(
        pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    iris.describe()
    first_seed, second_seed = 123456789, 987654321

    def reconstruct(init_method, seed):
        # train a model for the given init method/seed and score the iris frame
        return predGLRM(iris, setupTrainModel(init_method, seed))

    # user mode without init values is equivalent to randomized
    for init_method in ("random", "svd", "plus_plus", "user"):
        # two runs with the same seed: reconstructed frames must be identical
        baseline = reconstruct(init_method, first_seed)
        repeated = reconstruct(init_method, first_seed)
        pyunit_utils.compare_frames_local(baseline[0:4],
                                          repeated[0:4],
                                          prob=1.0)

        # a run with another seed: reconstructed frames must differ
        other = reconstruct(init_method, second_seed)
        assert not pyunit_utils.compare_frames_local(
            baseline[0:4], other[0:4], prob=1.0, returnResult=True), \
            "GLRM return same results with different random seed."
def glm_gamma_offset_mojo():
    """Compare H2O and MOJO predictions for a gamma GLM trained with an offset column."""
    frame = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/prostate/prostate_complete.csv.zip"))
    response = "DPROS"
    predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL"]
    scoring_columns = predictors + ["C1"]  # predictors plus the offset column
    model_params = {'family': "gamma", 'offset_column': "C1"}

    # one generated real-valued column is appended and used as the offset
    offset_frame = pyunit_utils.random_dataset_real_only(frame.nrow,
                                                         1,
                                                         realR=3,
                                                         misFrac=0,
                                                         randSeed=12345)
    frame = frame.cbind(offset_frame)

    work_dir = tempfile.mkdtemp()
    # build and save mojo model
    gamma_model = pyunit_utils.build_save_model_generic(
        model_params, predictors, frame, response, "glm", work_dir)
    mojo_name = pyunit_utils.getMojoName(gamma_model._id)

    # save test file, h2o predict/mojo use same file
    input_csv = os.path.join(work_dir, 'in.csv')
    h2o.download_csv(frame[scoring_columns], input_csv)
    # load model and perform predict
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(gamma_model, work_dir,
                                                    mojo_name)
    h2o.download_csv(pred_h2o, os.path.join(work_dir, "h2oPred.csv"))
    print("Comparing mojo predict and h2o predict...")
    # compare mojo and model predict
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)
def gam_gaussian_mojo():
    """Compare H2O and MOJO predictions for a gaussian GAM built on a random dataset.

    Up to three real-valued, non-response columns are used as GAM columns,
    each with a scale of 0.001.  ``set_params`` is defined elsewhere in this file.
    """
    h2o.remove_all()
    test_rows = 200  # number of test dataset rows
    problem = "gaussian"
    params = set_params()  # base model parameters
    data = pyunit_utils.random_dataset(problem, missing_fraction=0.001)  # generate random dataset
    column_names = data.names

    # add GAM specific parameters
    max_gam_columns = 3  # maximum number of gam columns
    gam_columns = []
    gam_scales = []
    for name in column_names:
        if name == 'response' or str(data.type(name)) != "real":
            continue
        gam_columns.append(name)
        gam_scales.append(0.001)
        if len(gam_columns) >= max_gam_columns:
            break
    params["gam_columns"] = gam_columns
    params["scale"] = gam_scales

    train = data[test_rows:, :]
    test = data[:test_rows, :]
    predictors = list(set(data.names) - {"response"})

    work_dir = tempfile.mkdtemp()
    # build and save mojo model
    gam_model = pyunit_utils.build_save_model_generic(params, predictors, train, "response", "gam", work_dir)
    mojo_name = pyunit_utils.getMojoName(gam_model._id)
    # save test file, h2o predict/mojo use same file
    h2o.download_csv(test[predictors], os.path.join(work_dir, 'in.csv'))
    # load model and perform predict
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(gam_model, work_dir, mojo_name)
    h2o.download_csv(pred_h2o, os.path.join(work_dir, "h2oPred.csv"))
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)
def glrm_mojo():
    """Compare GLRM MOJO output with in-cluster output for reconstruction and X factors.

    A GLRM model with a randomly chosen transform is trained on a random
    dataset; ``save_GLRM_mojo`` is defined elsewhere in this file.
    """
    h2o.remove_all()
    test_rows = 200  # number of test dataset rows
    data = pyunit_utils.random_dataset("regression")  # generate random dataset
    train = data[test_rows:, :]
    test = data[:test_rows, :]
    columns = data.names

    transforms = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    chosen_transform = transforms[randint(0, len(transforms)-1)]
    # build a GLRM model with the random dataset generated earlier
    model = H2OGeneralizedLowRankEstimator(k=3, transform=chosen_transform, max_iterations=10)
    model.train(x=columns, training_frame=train)
    train_factor = h2o.get_frame(model._model_json['output']['representation_name'])

    assert train_factor.nrows == train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(train_factor.nrows, train.nrows)
    save_GLRM_mojo(model)  # save mojo model

    mojo_name = pyunit_utils.getMojoName(model._id)
    results_root = os.path.dirname(os.path.realpath('__file__'))
    work_dir = os.path.normpath(os.path.join(results_root, "..", "results", mojo_name))

    # save test file, h2o predict/mojo use same file
    h2o.download_csv(test[columns], os.path.join(work_dir, 'in.csv'))
    # reconstruction through the model and through the mojo
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(model, work_dir, mojo_name, glrmReconstruct=True)
    for index in range(pred_h2o.ncols):
        if pred_h2o[index].isfactor():
            pred_h2o[index] = pred_h2o[index].asnumeric()
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)

    # X factor through the mojo; the first return value is used to look up the model's X factor frame
    frame_id, mojo_x_factor = pyunit_utils.mojo_predict(model, work_dir, mojo_name, glrmReconstruct=False)
    test_factor = h2o.get_frame("GLRMLoading_"+frame_id)  # x factor for the new test dataset
    print("Comparing mojo x Factor and model x Factor ...")
    pyunit_utils.compare_frames_local(test_factor, mojo_x_factor, 1, tol=1e-10)
def test_makeGLMModel():
    """Check ``glm.makeGLMModel`` with valid coefficients and with a bad coefficient set.

    A model rebuilt from the regularization-path coefficients must predict
    exactly like the original.  Passing a coefficient dict with an extra entry
    must raise a server error about the coefficient length.
    """
    # read in the dataset and train the reference model
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    m = glm(family='gaussian', Lambda=[0.001], alpha=[0.5])
    m.train(training_frame=d, x=myX, y=myY)
    r = glm.getGLMRegularizationPath(m)
    m2 = glm.makeGLMModel(model=m, coefs=r['coefficients'][0])
    f1 = m.predict(d)  # predict with original model
    f2 = m2.predict(d)  # predict with model out of makeGLMModel
    pyunit_utils.compare_frames_local(f1, f2, prob=1)

    # add a bogus coefficient so the coefficient count no longer matches the model
    coefs = r['coefficients'][0]
    coefs['wendy_dreams'] = 8

    try:
        glm.makeGLMModel(model=m, coefs=coefs)
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
               ("model coefficient length 9 is different from coefficient provided by user ") in temp, \
            "Wrong exception was received."
        print("coefficient test passed!")
    else:
        # Raised outside the try block so the handler above cannot catch it and
        # misreport the failure as "Wrong exception was received."
        raise AssertionError("Should have throw exception of bad coefficient length")
def test_glrm_seeds():
  """Same seed must reproduce the GLRM reconstruction; a different seed must not.

  Runs once per init method with the helpers ``setupTrainModel`` and
  ``predGLRM`` defined elsewhere in this file.
  """
  print("Importing iris_wheader.csv data...")
  iris_frame = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
  iris_frame.describe()
  # user mode without init values is equivalent to randomized
  init_choices = ["random", "svd", "plus_plus", "user"]
  seed_a, seed_b = [123456789, 987654321]

  for method in init_choices:
    # first two models share a seed, so their reconstructions should match
    model_a = setupTrainModel(method, seed_a)
    recon_a = predGLRM(iris_frame, model_a)

    model_a_again = setupTrainModel(method, seed_a)
    recon_a_again = predGLRM(iris_frame, model_a_again)

    pyunit_utils.compare_frames_local(recon_a[0:4], recon_a_again[0:4], prob=1.0)

    # third model uses the other seed, so its reconstruction should differ
    model_b = setupTrainModel(method, seed_b)
    recon_b = predGLRM(iris_frame, model_b)
    assert not pyunit_utils.compare_frames_local(recon_a[0:4], recon_b[0:4], prob=1.0, returnResult=True), \
      "GLRM return same results with different random seed."
def test_load_glrm():
  """Save a GLRM model, reload it and check it matches the original.

  The X representation frame, the first prediction column and the archetypes
  of the reloaded model are compared with those of the original model.
  """
  print("Importing iris_wheader.csv data...")
  irisH2O = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
  irisH2O.describe()

  g_model = H2OGeneralizedLowRankEstimator(k=3)
  g_model.train(x=irisH2O.names, training_frame=irisH2O)
  yarch_old = g_model.archetypes()
  x_old = h2o.get_frame(g_model._model_json["output"]["representation_name"])
  predOld = g_model.predict(irisH2O)
  # Fallback results directory.  '__file__' is a string literal here, so
  # realpath resolves it against the current working directory.
  TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "../..", "results"))

  try:
    TMPDIR = pyunit_utils.locate("results")    # find directory path to results folder
  except Exception:
    # Could not locate a results folder: fall back to the default path.  Only
    # Exception is caught so Ctrl-C / SystemExit still propagate, and the
    # directory is only created when missing so makedirs cannot fail on an
    # existing one.
    if not os.path.isdir(TMPDIR):
      os.makedirs(TMPDIR)
  h2o.save_model(g_model, path=TMPDIR, force=True)       # save model
  full_path_filename = os.path.join(TMPDIR, g_model._id)

  h2o.remove(g_model)
  model_reloaded = h2o.load_model(full_path_filename)
  pred = model_reloaded.predict(irisH2O)
  yarch = model_reloaded.archetypes()
  x = h2o.get_frame(model_reloaded._model_json["output"]["representation_name"])

  # assert difference between old and new are close, archetypes should be the same
  pyunit_utils.compare_frames_local(x, x_old, tol=1e-6)
  pyunit_utils.compare_frames_local(pred[0], predOld[0], tol=1)
  for k in range(3):
    pyunit_utils.equal_two_arrays(yarch_old[k], yarch[k], eps = 1e-4, tolerance=1e-10)

  print("glrm model successfully loaded...")
def test_parquet_parser_column_skip():
    """Verify that ``skipped_columns`` is honoured when parsing a Parquet file.

    The airlines CSV and the Parquet file are first checked to parse to the
    same frame.  Skipping every column must be rejected by both
    ``h2o.upload_file`` and ``h2o.import_file``; several partial skip patterns
    are then checked through ``pyunit_utils.checkCorrectSkips``.
    """
    csv = h2o.import_file(path=pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    path = pyunit_utils.locate("smalldata/parser/parquet/airlines-simple.snappy.parquet")
    parquetNoSkip = h2o.import_file(path=path)
    pyunit_utils.compare_frames_local(csv, parquetNoSkip, prob=1)  # should be the same here.

    ncol = csv.ncol
    skip_all = list(range(ncol))
    skip_even = list(range(0, ncol, 2))
    skip_odd = list(range(1, ncol, 2))
    skip_start_end = [0, ncol - 1]
    # NOTE(review): range(0, ncol - 2) leaves the last TWO columns unskipped,
    # although the name says "except last" - confirm this is intended.
    skip_except_last = list(range(0, ncol - 2))
    skip_except_first = list(range(1, ncol))
    # a random half of the column indices, in ascending order
    shuffled = list(range(ncol))
    random.shuffle(shuffled)
    skip_random = sorted(shuffled[:ncol // 2])

    # Skipping every column must be rejected by both entry points.  The
    # failure exit sits in the else clause: with a bare ``except`` around
    # ``sys.exit(1)`` the SystemExit itself was swallowed and the check could
    # never fail.
    for loader in (h2o.upload_file, h2o.import_file):
        try:
            loader(path, skipped_columns=skip_all)
        except Exception:
            pass  # expected: the parse request is rejected
        else:
            sys.exit(1)  # should have failed here

    # skip even columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_even)

    # skip odd columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_odd)

    # skip the very beginning and the very end.
    pyunit_utils.checkCorrectSkips(csv, path, skip_start_end)

    # skip the leading columns (see the note on skip_except_last above)
    pyunit_utils.checkCorrectSkips(csv, path, skip_except_last)

    # skip all except the very first column
    pyunit_utils.checkCorrectSkips(csv, path, skip_except_first)

    # randomly skipped half the columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_random)
def glm_multinomial_mojo_pojo():
    """Compare H2O, MOJO and POJO predictions for a multinomial GLM on a random dataset.

    ``set_params`` is defined elsewhere in this file.
    """
    problem = "multinomial"
    test_rows = 200
    params = set_params()  # model parameters
    data = pyunit_utils.random_dataset(problem)  # generate random dataset
    train = data[test_rows:, :]
    test = data[:test_rows, :]
    predictors = list(set(data.names) - {"response"})

    # build and save mojo model
    model = pyunit_utils.build_save_model_GLM(params, predictors, train, "response")

    mojo_name = pyunit_utils.getMojoName(model._id)
    base_dir = os.path.dirname(os.path.realpath('__file__'))
    work_dir = os.path.normpath(os.path.join(base_dir, "..", "results", mojo_name))

    # save test file, h2o predict/mojo use same file
    h2o.download_csv(test[predictors], os.path.join(work_dir, 'in.csv'))
    # load model and perform predict
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(model, work_dir, mojo_name)
    h2o.download_csv(pred_h2o, os.path.join(work_dir, "h2oPred.csv"))
    pred_pojo = pyunit_utils.pojo_predict(model, work_dir, mojo_name)

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    # the POJO output is compared against the MOJO output
    pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
def pubdev_3567():
    """Check inner, left (``all_x``) and right (``all_y``) merges on column ``A`` against stored answers."""

    def load(relative_path):
        # import a data file, treating its first line as the header
        return h2o.import_file(pyunit_utils.locate(relative_path), header=1)

    left_frame = load("smalldata/jira/frameA2.csv")
    right_frame = load("smalldata/jira/frameB2.csv")
    expected_inner = load("smalldata/jira/merged2.csv")
    expected_left = load("smalldata/jira/merged2Left.csv")
    expected_right = load("smalldata/jira/merged2Right.csv")

    key = ["A"]
    actual_inner = left_frame.merge(right_frame, by_x=key, by_y=key, method="auto")  # default is radix
    print(actual_inner[0, 0])
    actual_left = left_frame.merge(right_frame, by_x=key, by_y=key, all_x=True)
    print(actual_left[0, 0])
    actual_right = left_frame.merge(right_frame, by_x=key, by_y=key, all_y=True)  # new feature
    print(actual_right[0, 0])

    comparisons = (
        (expected_right, actual_right),
        (expected_inner, actual_inner),
        (expected_left, actual_left),
    )
    for expected, actual in comparisons:
        pyunit_utils.compare_frames_local(expected, actual, 1, tol=1e-10)
def checkCorrectSkips(csvfile, originalFrame):
  """Check that parsing with no, or an empty, ``skipped_columns`` list reproduces ``originalFrame``.

  :param csvfile: path of the file to parse with ``upload_file`` and ``import_file``
  :param originalFrame: frame the parsed result is expected to match
  """
  skippedFrameUF = h2o.upload_file(csvfile)
  skippedFrameIF = h2o.import_file(csvfile)  # these two frames should be the same
  pyunit_utils.compare_frames_local(skippedFrameUF, skippedFrameIF, prob=1)

  # test with an empty skipped_columns list
  skippedFrameUF2 = h2o.upload_file(csvfile, skipped_columns=[])
  skippedFrameIF2 = h2o.import_file(csvfile, skipped_columns=[])  # these two frames should be the same
  pyunit_utils.compare_frames_local(skippedFrameUF2, skippedFrameIF2, prob=1)

  # no skipped_columns argument and an empty skipped_columns list should give the same result
  pyunit_utils.compare_frames_local(skippedFrameUF2, skippedFrameIF, prob=1)

  # compare the parsed frame with originalFrame; the two counts are passed to
  # format() as separate arguments (a single tuple would raise IndexError
  # while building the failure message)
  assert originalFrame.ncol == skippedFrameUF.ncol, \
    "Expected return frame column number: {0}, actual frame column number: " \
    "{1}".format(originalFrame.ncol, skippedFrameUF.ncol)
  pyunit_utils.compare_frames_local_svm(originalFrame, skippedFrameIF2, prob=1)
def glrm_mojo():
    """Compare GLRM output from the live model, its MOJO and a saved-and-reloaded model.

    A seeded GLRM model with a randomly chosen transform is trained on a seeded
    random dataset; ``save_GLRM_mojo`` is defined elsewhere in this file.
    """
    h2o.remove_all()
    test_rows = 200  # number of test dataset rows
    data = pyunit_utils.random_dataset("regression", seed=1234)  # generate random dataset
    train = data[test_rows:, :]
    test = data[:test_rows, :]
    columns = data.names

    transforms = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    chosen_transform = transforms[randint(0, len(transforms)-1)]

    # build a GLRM model with the random dataset generated earlier
    model = H2OGeneralizedLowRankEstimator(k=3, transform=chosen_transform, max_iterations=10, seed=1234)
    model.train(x=columns, training_frame=train)
    train_factor = h2o.get_frame(model._model_json['output']['representation_name'])

    assert train_factor.nrows == train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(train_factor.nrows, train.nrows)
    save_GLRM_mojo(model)  # save mojo model

    mojo_name = pyunit_utils.getMojoName(model._id)
    base_dir = os.path.dirname(os.path.realpath('__file__'))
    work_dir = os.path.normpath(os.path.join(base_dir, "..", "results", mojo_name))
    # save test file, h2o predict/mojo use same file
    h2o.download_csv(test[columns], os.path.join(work_dir, 'in.csv'))
    # reconstruction through the model and through the mojo
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(model, work_dir, mojo_name, glrmReconstruct=True)

    h2o.save_model(model, work_dir)  # save GLRM model
    reloaded_model = h2o.load_model(os.path.join(work_dir, mojo_name))
    pred_reloaded = reloaded_model.predict(test)
    for index in range(pred_h2o.ncols):
        if pred_h2o[index].isfactor():
            pred_h2o[index] = pred_h2o[index].asnumeric()
            pred_reloaded[index] = pred_reloaded[index].asnumeric()

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)
    print("Comparing mojo predict and h2o predict from saved model...")
    pyunit_utils.compare_frames_local(pred_mojo, pred_reloaded, 1, tol=1e-10)

    # X factor through the mojo; the first return value is used to look up the model's X factor frame
    frame_id, mojo_x_factor = pyunit_utils.mojo_predict(model, work_dir, mojo_name, glrmReconstruct=False)
    test_factor = h2o.get_frame("GLRMLoading_"+frame_id)  # x factor for the new test dataset
    print("Comparing mojo x Factor and model x Factor ...")
    pyunit_utils.compare_frames_local(test_factor, mojo_x_factor, 1, tol=1e-10)
def random_seeds_test():
    """Check that XGBoost results repeat for a fixed seed and change for a new one.

    Only runs on a single-node cluster.  Two H2O XGBoost models are trained
    with the same parameters and seed and must agree; a third, trained with a
    freshly drawn seed (and different tree settings), must differ.  The same
    exercise is then repeated with native XGBoost, using ``genDMatrix``
    (defined elsewhere in this file) to convert the frames.
    """
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) != 1:
        print("********  Test skipped.  This test cannot be performed in multinode environment.")
        return

    # train H2O XGBoost first
    higgs_h2o_train = h2o.import_file(pyunit_utils.locate('bigdata/laptop/higgs_train_imbalance_100k.csv'))
    higgs_h2o_train[0] = higgs_h2o_train[0].asfactor()
    higgs_h2o_test = h2o.import_file(pyunit_utils.locate('bigdata/laptop/higgs_test_imbalance_100k.csv'))
    higgs_h2o_test[0] = higgs_h2o_test[0].asfactor()
    myX = list(higgs_h2o_train.names)
    y = "response"
    myX.remove(y)

    # models 1 and 1_2: identical parameters and a fixed seed
    h2oParams = {"ntrees":10, "max_depth":4, "learn_rate":0.7, "col_sample_rate_per_tree" : 0.9,
                 "min_rows" : 5, "score_tree_interval": 100, "seed":-12345}
    print("Model 1 trainged with old seed {0}.".format(h2oParams['seed']))
    h2oModel1 = H2OXGBoostEstimator(**h2oParams)
    h2oModel1.train(x=myX, y=y, training_frame=higgs_h2o_train)
    h2oPredict1 = h2oModel1.predict(higgs_h2o_test)

    h2oModel1_2 = H2OXGBoostEstimator(**h2oParams)
    h2oModel1_2.train(x=myX, y=y, training_frame=higgs_h2o_train)
    h2oPredict1_2 = h2oModel1_2.predict(higgs_h2o_test)

    # model 2: new random seed
    seed2 = random.randint(1, 1073741824) # seed cannot be long, must be int size
    h2oParams2 = {"ntrees":100, "max_depth":10, "learn_rate":0.7, "col_sample_rate_per_tree" : 0.9,
                  "min_rows" : 5, "score_tree_interval": 100, "seed":seed2}
    print("Model 2 trainged with new seed {0}.".format(h2oParams2['seed']))
    h2oModel2 = H2OXGBoostEstimator(**h2oParams2)
    h2oModel2.train(x=myX, y=y, training_frame=higgs_h2o_train)
    h2oPredict2 = h2oModel2.predict(higgs_h2o_test)

    # compare the training logloss: same seed must agree, different seed must not
    logloss1 = h2oModel1._model_json["output"]["training_metrics"]._metric_json["logloss"]
    logloss1_2 = h2oModel1_2._model_json["output"]["training_metrics"]._metric_json["logloss"]
    logloss2 = h2oModel2._model_json["output"]["training_metrics"]._metric_json["logloss"]
    assert abs(logloss1 - logloss1_2) < 1e-10, \
        "Model outputs should be the same with same seeds but are not!  Expected: {0}, actual: " \
        "{1}".format(logloss1, logloss1_2)
    assert abs(logloss1 - logloss2) > 1e-10, \
        "Model outputs should be different with different seeds but are not!"

    # compare some prediction probabilities
    model1Pred = [h2oPredict1[row, "p1"] for row in range(4)]
    model1_2Pred = [h2oPredict1_2[row, "p1"] for row in range(4)]
    assert model1Pred==model1_2Pred, "Model 1 should have same predictions as previous with same seed but do not."

    # The comparison below is expected to fail.  The "frames matched" failure
    # is raised from the else clause: inside the try block a bare ``except``
    # swallowed it and the check could never fail.
    try:
        pyunit_utils.compare_frames_local(h2oPredict1[['p0', 'p1']], h2oPredict2[['p0', 'p1']], prob=0.1, tol=1e-6)
    except Exception:
        pass  # expected: the frames differ
    else:
        raise AssertionError("Predict frames from two different seeds should be different but is not.  FAIL!")

    # train multiple native XGBoost
    nativeTrain = genDMatrix(higgs_h2o_train, myX, y)
    nativeTest = genDMatrix(higgs_h2o_test, myX, y)
    h2o.remove_all()
    nativeParam = {'eta': h2oParams["learn_rate"], 'objective': 'binary:logistic', 'booster': 'gbtree',
                   'max_depth': h2oParams["max_depth"], 'seed': h2oParams["seed"],
                   'min_child_weight':h2oParams["min_rows"],
                   'colsample_bytree':h2oParams["col_sample_rate_per_tree"],'alpha':0.0, 'nrounds':h2oParams["ntrees"]}
    nativeModel1 = xgb.train(params=nativeParam, dtrain=nativeTrain)
    nativePred1 = nativeModel1.predict(data=nativeTest)
    nativeModel1_2 = xgb.train(params=nativeParam, dtrain=nativeTrain)
    nativePred1_2 = nativeModel1_2.predict(data=nativeTest)

    # same native settings as above, only the seed differs
    nativeParam2 = dict(nativeParam, seed=h2oParams2["seed"])
    nativeModel2 = xgb.train(params=nativeParam2,
                             dtrain=nativeTrain,
                             num_boost_round=h2oParams["ntrees"])
    nativePred2 = nativeModel2.predict(data=nativeTest)

    # nativeModel1 and nativeModel1_2 share a seed and should agree on the first
    # four predictions, while nativeModel2 should give different results
    for ind in range(4):
        assert abs(nativePred1_2[ind]-nativePred1[ind])<1e-7, "Native XGBoost Model 1 should have same predictions" \
                                                                " as previous with same seed but do not."
    for ind in range(4):
        assert abs(nativePred1[ind]-nativePred2[ind])>=1e-6, \
            "Native XGBoost model 1 prediction prob: {0} and native XGBoost model 3 prediction prob: {1}.  " \
            "They are too similar.".format(nativePred1[ind], nativePred2[ind])
def import_folder_orc():
    """Exercise the ``skipped_columns`` option when loading an ORC file from HDFS.

    The test first imports ``prostate_NA`` as both CSV and ORC from HDFS and
    compares the two frames.  It then confirms that asking to skip *every*
    column is rejected by both ``h2o.upload_file`` and ``h2o.import_file``, and
    finally hands several partial skip lists to
    ``pyunit_utils.checkCorrectSkips`` together with the CSV frame and the ORC
    path.

    The test returns early (after printing a message) when the canary ORC
    check reports that the hive-exec version is too old.

    Raises:
        EnvironmentError: if the Hadoop namenode cannot be reached, since none
            of the HDFS paths can be built without it.
    """
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    if not pyunit_utils.hadoop_namenode_is_accessible():
        raise EnvironmentError("Hadoop namenode is not accessible; the HDFS ORC import test cannot run.")

    hdfs_name_node = pyunit_utils.hadoop_namenode()

    if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
        print("Your hive-exec version is too old.  Orc parser test {0} is "
              "skipped.".format("pyunit_INTERNAL_HDFS_airlines_orc.py"))
        # Nothing below can run without a working ORC parser.
        return

    hdfs_orc_file = "/datasets/orc_parser/prostate_NA.orc"
    hdfs_csv_file = "/datasets/orc_parser/prostate_NA.csv"

    url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)
    url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
    csv = h2o.import_file(url_csv, na_strings=['\\N'])
    multi_file_orc1 = h2o.import_file(url_orc)
    pyunit_utils.compare_frames_local(csv, multi_file_orc1, prob=1)  # should be the same here.

    path = url_orc
    ncol = csv.ncol
    skip_all = list(range(ncol))
    skip_even = list(range(0, ncol, 2))
    skip_odd = list(range(1, ncol, 2))
    skip_start_end = [0, ncol - 1]
    # NOTE(review): this range stops at ncol - 3, so the last TWO columns are
    # kept even though the name says "except last" -- confirm this is intended.
    skip_except_last = list(range(0, ncol - 2))
    skip_except_first = list(range(1, ncol))

    # Pick a random half of the column indices.  Floor division keeps the
    # count an integer on Python 3 (``/`` would hand ``range`` a float).
    shuffled = list(range(0, ncol))
    random.shuffle(shuffled)
    skip_random = sorted(shuffled[:ncol // 2])

    # Skipping every column must be rejected by both loaders.  The exit call
    # lives in the ``else`` branch so that it cannot be swallowed by the
    # handler that absorbs the expected failure.
    for loader in (h2o.upload_file, h2o.import_file):
        try:
            loader(path, skipped_columns=skip_all)
        except Exception:
            pass  # expected: the request was refused
        else:
            sys.exit(1)  # should have failed here

    # skip even columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_even)

    # skip odd columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_odd)

    # skip the very beginning and the very end.
    pyunit_utils.checkCorrectSkips(csv, path, skip_start_end)

    # skip all except the last column
    pyunit_utils.checkCorrectSkips(csv, path, skip_except_last)

    # skip all except the very first column
    pyunit_utils.checkCorrectSkips(csv, path, skip_except_first)

    # randomly skipped half the columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_random)