def test_deprecated_noise_level_param_is_alias_for_noise():
    ds = load_dataset(incl_test=True)
    te = H2OTargetEncoderEstimator()
    te.train(y=ds.target, training_frame=ds.train)
    encoded = te.predict(ds.test)
    # print(encoded)
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        te_nl = H2OTargetEncoderEstimator(noise_level=0)
        assert len(w) == 1
        assert issubclass(w[0].category, H2ODeprecationWarning)
        assert "``noise_level`` param of ``{}`` is deprecated".format(
            te_init_name) in str(w[0].message)

    te_nl.train(y=ds.target, training_frame=ds.train)
    encoded_nl = te_nl.predict(ds.test)
    # print(encoded_nl)
    te_n = H2OTargetEncoderEstimator(noise=0)
    te_n.train(y=ds.target, training_frame=ds.train)
    encoded_n = te_n.predict(ds.test)
    # print(encoded_n)
    try:
        pu.compare_frames(encoded_nl, encoded, 0, tol_numeric=1e-5)
        assert False, "should have raised"
    except AssertionError as ae:
        assert "should have raised" not in str(ae)
    assert pu.compare_frames(encoded_nl, encoded_n, 0, tol_numeric=1e-5)
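# A minimal sketch (not H2O's actual implementation) of the alias mechanism the test
# above exercises: a helper that maps a deprecated keyword onto its replacement and
# emits a deprecation warning. All names here are illustrative assumptions.
def _rename_deprecated_param(kwargs, old, new, cls_name):
    import warnings
    if old in kwargs:
        warnings.warn("``{}`` param of ``{}`` is deprecated, please use ``{}`` instead."
                      .format(old, cls_name, new),
                      category=DeprecationWarning,  # H2O raises its own H2ODeprecationWarning subclass
                      stacklevel=2)
        kwargs.setdefault(new, kwargs.pop(old))
    return kwargs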
def test_transform_can_override_blending_parameters():
    ds = load_dataset(incl_test=True)
    te = H2OTargetEncoderEstimator(noise=0)
    te.train(y=ds.target, training_frame=ds.train)
    transformed = te.transform(ds.test)
    transformed_blending = te.transform(ds.test, blending=True)
    try:
        assert pu.compare_frames(transformed,
                                 transformed_blending,
                                 0,
                                 tol_numeric=1e-5)
        assert False, "should have raised"
    except AssertionError as ae:
        assert "should have raised" not in str(ae)

    transformed_blending_custom = te.transform(ds.test,
                                               blending=True,
                                               inflection_point=3,
                                               smoothing=17)
    try:
        assert pu.compare_frames(transformed_blending_custom,
                                 transformed_blending,
                                 0,
                                 tol_numeric=1e-5)
        assert False, "should have raised"
    except AssertionError as ae:
        assert "should have raised" not in str(ae)
def gbm_on_hive():
    connection_url = "jdbc:hive2://localhost:10000/default"
    krb_enabled = os.getenv('KRB_ENABLED', 'false')
    if krb_enabled.lower() == 'true':
        connection_url += ";principal=%s" % os.getenv('HIVE_PRINCIPAL', 'hive/[email protected]')

    select_query = "select * from airlinestest"
    username = "******"
    password = ""
    airlines_dataset_original = h2o.import_file(path="https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/AirlinesTest.csv.zip")
    airlines_dataset = h2o.import_sql_select(connection_url, select_query, username, password)
    pyunit_utils.compare_frames(airlines_dataset_original, airlines_dataset, 100, tol_numeric=0)
    airlines_dataset["table_for_h2o_import.origin"] = airlines_dataset["table_for_h2o_import.origin"].asfactor()
    airlines_dataset["table_for_h2o_import.fdayofweek"] = airlines_dataset["table_for_h2o_import.fdayofweek"].asfactor()
    airlines_dataset["table_for_h2o_import.uniquecarrier"] = airlines_dataset["table_for_h2o_import.uniquecarrier"].asfactor()
    airlines_dataset["table_for_h2o_import.dest"] = airlines_dataset["table_for_h2o_import.dest"].asfactor()
    airlines_dataset["table_for_h2o_import.fyear"] = airlines_dataset["table_for_h2o_import.fyear"].asfactor()
    airlines_dataset["table_for_h2o_import.fdayofmonth"] = airlines_dataset["table_for_h2o_import.fdayofmonth"].asfactor()
    airlines_dataset["table_for_h2o_import.isdepdelayed"] = airlines_dataset["table_for_h2o_import.isdepdelayed"].asfactor()
    airlines_dataset["table_for_h2o_import.fmonth"] = airlines_dataset["table_for_h2o_import.fmonth"].asfactor()
    airlines_X_col_names = airlines_dataset.col_names[:-2]
    airlines_y_col_name = airlines_dataset.col_names[-2]
    train, valid, test = airlines_dataset.split_frame([0.6, 0.2], seed=1234)
    from h2o.estimators.gbm import H2OGradientBoostingEstimator
    gbm_v1 = H2OGradientBoostingEstimator(model_id="gbm_airlines_v1", seed=2000000)
    gbm_v1.train(airlines_X_col_names, airlines_y_col_name, training_frame=train, validation_frame=valid)
    gbm_v1.predict(test)
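# The Kerberos handling above can be factored into a small helper; a sketch using the
# same environment variables as the test (the helper name itself is an assumption):
def hive_jdbc_url(base="jdbc:hive2://localhost:10000/default"):
    import os
    url = base
    if os.getenv('KRB_ENABLED', 'false').lower() == 'true':
        url += ";principal=%s" % os.getenv('HIVE_PRINCIPAL', 'hive/[email protected]')
    return url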
def h2o_H2OFrame_rep_len():
    """
    Python API test: h2o.frame.H2OFrame.rep_len(length_out)
    """
    row_num = randrange(1, 10)
    col_num = randrange(1, 10)
    length_out_r = math.ceil(0.78 * row_num)
    python_lists = np.random.randint(-5, 5, (row_num, col_num))
    h2oframe = h2o.H2OFrame(python_obj=python_lists)

    one_column = h2oframe[0].rep_len(
        length_out=(length_out_r + row_num))  # one column, duplicate row
    assert_is_type(one_column, H2OFrame)  # check return type
    # check shape
    assert one_column.shape == (
        length_out_r + row_num,
        1), "h2o.H2OFrame.rep_len() command is not working."

    # check values
    repeat_row_start = row_num
    repeat_row_end = row_num + length_out_r
    pyunit_utils.compare_frames(h2oframe[0:length_out_r, 0],
                                one_column[repeat_row_start:repeat_row_end, 0],
                                length_out_r,
                                tol_time=0,
                                tol_numeric=1e-6,
                                strict=False,
                                compare_NA=True)
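# rep_len recycles the column's values until the requested length is reached, which is
# why rows [row_num, row_num + length_out_r) of the output can be compared against the
# first length_out_r source rows. A pure-Python sketch of that recycling contract:
def _rep_len_reference(values, length_out):
    return [values[i % len(values)] for i in range(length_out)]

assert _rep_len_reference([1, 2, 3], 7) == [1, 2, 3, 1, 2, 3, 1]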
def h2o_H2OFrame_top_bottomN():
    """
    PUBDEV-3624: test the h2o.frame.H2OFrame.topN() and h2o.frame.H2OFrame.bottomN() functions.
    Given an H2O frame, a column index or column name, and a double denoting the percentage of
    top/bottom rows to return, topN will return an H2OFrame containing two columns: one holds
    the topN (or bottomN) values of the specified column, and the other records the row indices
    into the original frame where the topN (bottomN) values came from.  This lets users grab
    the corresponding rows and do whatever they want with them.
    """
    dataFrame = h2o.import_file(
        pyunit_utils.locate("bigdata/laptop/jira/TopBottomNRep4.csv.zip"))
    topAnswer = h2o.import_file(
        pyunit_utils.locate("smalldata/jira/Top20Per.csv.zip"))
    bottomAnswer = h2o.import_file(
        pyunit_utils.locate("smalldata/jira/Bottom20Per.csv.zip"))
    nPercentages = [1, 2, 3,
                    4]  # multiples of 4 since dataset is repeated 4 times.
    frameNames = dataFrame.names  # get data column names
    tolerance = 1e-12
    nsample = 100
    nP = nPercentages[randint(0,
                              len(nPercentages) -
                              1)]  # pick a random percentage
    colIndex = randint(0, len(frameNames) - 1)  # pick a random column

    if randint(0, 2) == 0:
        print(
            "For topN: Percentage chosen is {0}.  Column index chosen is {1}".
            format(nP, colIndex))
        newTopFrame = dataFrame.topN(frameNames[colIndex],
                                     nP)  # call topN with column names
        newTopFrameC = dataFrame.topN(colIndex,
                                      nP)  # call topN with same column index

        # the two return frames should be the same for this case; compare nsample (100) rows chosen randomly
        pyunit_utils.compare_frames(newTopFrame,
                                    newTopFrameC,
                                    nsample,
                                    tol_numeric=tolerance)

        # compare one of the return frames with known answer
        compare_rep_frames(topAnswer, newTopFrame, tolerance, colIndex, 0)
    else:
        # test bottomN here
        print(
            "For bottomN: Percentage chosen is {0}.  Column index chosen is {1}"
            .format(nP, colIndex))
        newBottomFrame = dataFrame.bottomN(frameNames[colIndex],
                                           nP)  # call bottomN with column name
        newBottomFrameC = dataFrame.bottomN(
            colIndex, nP)  # call bottomN with same column index

        # the two return frames should be the same for this case
        pyunit_utils.compare_frames(newBottomFrame,
                                    newBottomFrameC,
                                    nsample,
                                    tol_numeric=tolerance)
        # compare one of the return frames with known answer
        compare_rep_frames(bottomAnswer, newBottomFrame, tolerance, colIndex,
                           1)
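# As the docstring notes, the second column of the topN/bottomN result records row
# indices into the original frame, so the matching rows can be pulled back out. A
# hedged sketch (the helper name and the string-to-int conversion are assumptions,
# not part of the test above):
def rows_for_topN(original_frame, topn_result):
    raw = topn_result[:, 1].as_data_frame(use_pandas=False, header=False)
    indices = [int(float(r[0])) for r in raw]
    return original_frame[indices, :]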
def test_deprecated_k_param_is_alias_for_inflection_point():
    ds = load_dataset(incl_test=True)
    te = H2OTargetEncoderEstimator(noise=0)
    te.train(y=ds.target, training_frame=ds.train)
    encoded = te.predict(ds.test)
    # print(encoded)
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        te_k = H2OTargetEncoderEstimator(noise=0, k=5, blending=True)
        assert len(w) == 1
        assert issubclass(w[0].category, H2ODeprecationWarning)
        assert "``k`` param of ``{}`` is deprecated".format(
            te_init_name) in str(w[0].message)

    te_k.train(y=ds.target, training_frame=ds.train)
    encoded_k = te_k.predict(ds.test)
    # print(encoded_k)
    te_ip = H2OTargetEncoderEstimator(noise=0,
                                      inflection_point=5,
                                      blending=True)
    te_ip.train(y=ds.target, training_frame=ds.train)
    encoded_ip = te_ip.predict(ds.test)
    # print(encoded_ip)
    try:
        pu.compare_frames(encoded_k, encoded, 0, tol_numeric=1e-5)
        assert False, "should have raised"
    except AssertionError as ae:
        assert "should have raised" not in str(ae)
    assert pu.compare_frames(encoded_k, encoded_ip, 0, tol_numeric=1e-5)
def test_target_encoding_transform_none_blending():
    print("Check none strategy with and without blending")
    targetColumnName = "survived"

    teColumns = ["home.dest", "cabin", "embarked"]
    teColumnsEncoded = list(map(lambda x: x+"_te", teColumns))
    trainingFrame = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)
    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor()
    
    targetEncoderWithBlending = TargetEncoder(x=teColumns, y=targetColumnName,
                                              blended_avg=True, inflection_point=3, smoothing=1)
    
    targetEncoderWithBlending.fit(frame=trainingFrame)

    encodedFrameWithBlending = targetEncoderWithBlending.transform(frame=trainingFrame, holdout_type="none", seed=1234)

    frameWithBlendedEncodingsOnly = encodedFrameWithBlending[teColumnsEncoded]

    targetEncoderWithoutBlending = TargetEncoder(x=teColumns, y=targetColumnName,
                                                 blended_avg=False, inflection_point=3, smoothing=1)

    targetEncoderWithoutBlending.fit(frame=trainingFrame)

    encodedFrameWithoutBlending = targetEncoderWithoutBlending.transform(frame=trainingFrame, holdout_type="none", seed=1234)
    encodedFrameWithoutBlendingOnly = encodedFrameWithoutBlending[teColumnsEncoded]

    try:
        pyunit_utils.compare_frames(frameWithBlendedEncodingsOnly, encodedFrameWithoutBlendingOnly, 10, tol_time=0, tol_numeric=1e-6)
        assert False, "should have raised"
    except AssertionError as ae:
        # make sure we caught the mismatch from compare_frames, not our own sentinel assert
        assert "should have raised" not in str(ae)
        print('Good, encodings are different as expected. Hopefully because of the blending.')
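# For reference, the blended average toggled by blended_avg above follows the usual
# target-encoding shrinkage formula (parameter roles inferred from the
# inflection_point/smoothing naming; this is a sketch, not H2O's exact code):
#   lambda(n) = 1 / (1 + exp(-(n - inflection_point) / smoothing))
#   blended   = lambda(n) * level_mean + (1 - lambda(n)) * global_mean
# where n is the number of rows observed for a categorical level.
import math

def blended_encoding(level_mean, global_mean, n, inflection_point=3, smoothing=1):
    lam = 1.0 / (1.0 + math.exp(-(float(n) - inflection_point) / smoothing))
    return lam * level_mean + (1.0 - lam) * global_mean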
def gbm_on_hive():
    connection_url = "jdbc:hive2://localhost:10000/default"
    krb_enabled = os.getenv('KRB_ENABLED', 'false').lower() == 'true'
    use_token = os.getenv('KRB_USE_TOKEN', 'false').lower() == 'true'
    if krb_enabled:
        if use_token:
            connection_url += ";auth=delegationToken"
        else:
            connection_url += ";principal=%s" % os.getenv('HIVE_PRINCIPAL', 'hive/[email protected]')

    select_query = "select * from airlinestest"
    username = "******"
    password = ""

    # read from S3
    airlines_dataset_original = h2o.import_file(path="https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/AirlinesTest.csv.zip")
    # read from Hive Streaming
    airlines_dataset_streaming = h2o.import_sql_select(connection_url, select_query, username, password, fetch_mode="SINGLE")
    airlines_dataset_streaming = adapt_airlines(airlines_dataset_streaming)

    # datasets should be identical from user's point of view
    pyunit_utils.compare_frames(airlines_dataset_original, airlines_dataset_streaming, 100, tol_numeric=0)

    from h2o.estimators.gbm import H2OGradientBoostingEstimator
    from numpy import isclose  # the rtol keyword below implies numpy's isclose rather than math.isclose
    airlines_X_col_names = airlines_dataset_streaming.col_names[:-2]
    airlines_y_col_name = airlines_dataset_streaming.col_names[-2]
    gbm_v1 = H2OGradientBoostingEstimator(model_id="gbm_airlines_v1", seed=2000000)
    gbm_v1.train(airlines_X_col_names, airlines_y_col_name,
                 training_frame=airlines_dataset_streaming, validation_frame=airlines_dataset_streaming)
    print(gbm_v1)
    # demonstrates that metrics can be slightly different due to different chunking on the backend
    assert isclose(gbm_v1.auc(train=True), gbm_v1.auc(valid=True), rtol=1e-4)
def test_transform_can_be_applied_to_training_frame_with_special_flag():
    ds = load_dataset()
    te = H2OTargetEncoderEstimator()
    te.train(y=ds.target, training_frame=ds.train)
    transformed_as_training = te.transform(ds.train, as_training=True)
    transformed = te.transform(ds.train)

    assert pu.compare_frames(transformed,
                             transformed_as_training,
                             0,
                             tol_numeric=1e-5)

    # now with non default params
    te_nd = H2OTargetEncoderEstimator(data_leakage_handling="leave_one_out",
                                      blending=True,
                                      inflection_point=5,
                                      smoothing=17,
                                      seed=seed,
                                      noise=0.01)
    te_nd.train(y=ds.target, training_frame=ds.train)
    transformed_as_training = te_nd.transform(ds.train, as_training=True)
    transformed = te_nd.transform(ds.train)
    try:
        assert pu.compare_frames(transformed,
                                 transformed_as_training,
                                 0,
                                 tol_numeric=1e-5)
        assert False, "should have raised"
    except AssertionError as ae:
        assert "should have raised" not in str(ae)
def sortOrMerge():
    # PUBDEV-5266 sort/merge with string columns but not on string columns
    # test either the merge or the sort part
    name1 = "bigdata/laptop/jira/PUBDEV_5266_merge_with_string_columns/PUBDEV_5266_f1.csv"
    name2 = "bigdata/laptop/jira/PUBDEV_5266_merge_with_string_columns/PUBDEV_5266_f2.csv"
    c1names = ["stringf1-1", "stringf1-2", "int1", "intf1-1"]
    c2names = ["stringf2-1","intf2-1", "iintf2-2", "stringf2-2","intf2-3",  "stringf2-3", "stringf2-4",  "int1"]
    f1names = [name1, name1, name1]
    f2names = [name2, name2, name2]
    ansNames = ["bigdata/laptop/jira/PUBDEV_5266_merge_with_string_columns/sortedF1_R_C3_C4.csv",
                "bigdata/laptop/jira/PUBDEV_5266_merge_with_string_columns/mergedf1_f2unique.csv",
                "bigdata/laptop/jira/PUBDEV_5266_merge_with_string_columns/mergedf1_f2unique_x_T.csv"]
    xvals = [False,False,True]
    yvals = [False,False,False]
    f1colnames = [c1names, c1names, c1names]
    f2colnames = [c2names, c2names, c2names]
    numTests = len(xvals)-1
    runIndex = random.randint(0,numTests)

    if runIndex==0: # perform sorting first
        f1 = h2o.import_file(pyunit_utils.locate(f1names[runIndex]))
        sorted_column_indices = [2, 3]
        h2oSortf1 = f1.sort(sorted_column_indices)
        coltypes = getTypes(h2oSortf1)
        f1sortedR = h2o.import_file(pyunit_utils.locate(ansNames[runIndex]), col_types=coltypes, header=1)
        assert pyunit_utils.compare_frames(f1sortedR, h2oSortf1, 100, tol_numeric=0)
    else:   # test merging here
        f1 = h2o.import_file(pyunit_utils.locate(f1names[runIndex]),header=1)
        f1.set_names(f1colnames[runIndex])
        f2 = h2o.import_file(pyunit_utils.locate(f2names[runIndex]),header=1)
        f2.set_names(f2colnames[runIndex])
        mergedh2o = f1.merge(f2,all_x=xvals[runIndex],all_y=yvals[runIndex], method='auto')
        coltypes = getTypes(mergedh2o)
        f1mergedf2 = h2o.import_file(pyunit_utils.locate(ansNames[runIndex]), col_types=coltypes, header=1)
        assert pyunit_utils.compare_frames(f1mergedf2, mergedh2o, 100, tol_numeric=0)
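# getTypes is not shown in this snippet; a plausible stand-in that preserves column
# order, built on H2OFrame's .types dict and .names list (an assumption about the
# helper, not its verbatim source):
def getTypes(frame):
    return [frame.types[name] for name in frame.names]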
def sort():
    df = h2o.H2OFrame({"A": ["another", "set", "of", "bad", "string"], "B": [10, 1, 2, 5, 7],
                       "C": ["what", "is", "this", "thing", "doing"]})
    dfIntSorted = h2o.H2OFrame({"B": [1, 2, 5, 7, 10]})
    try:
        dfSortedIntCN = df.sort("B")
        pyunit_utils.compare_frames(dfIntSorted, dfSortedIntCN, df.nrow)
    except Exception:
        return  # expected error here, as sort will not work with String columns in the frame
    assert False, "Sort should have thrown an error on a frame with String columns, but it did not."
def tf_idf_small_data(preprocess, case_sens, cols=None):
    if cols is None:
        cols = [0, 1]
    input_fr = get_simple_input_test_frame() if preprocess else get_simple_preprocessed_input_test_frame()
    expected_fr = get_expected_output_frame_case_sens() if case_sens else get_expected_output_frame_case_insens()
    out_frame = tf_idf(input_fr, cols[0], cols[1], preprocess, case_sens)
    pyunit_utils.compare_frames(expected_fr,
                                out_frame,
                                len(out_frame),
                                tol_numeric=1e-5,
                                compare_NA=False)
def parquet_parse_dates():
    parquet_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/parser/parquet/parquet-file-with-date-column.snappy.parquet"
    ))

    parquet_data.summary()
    parquet_summary = h2o.frame(parquet_data.frame_id)["frames"][0]["columns"]
    date_converted_column_type = parquet_summary[2]['type']
    assert date_converted_column_type == "time"

    date_string_rows = parquet_data[:, "date_string"]
    date_converted_rows = parquet_data[:, "date_converted"]
    pyunit_utils.compare_frames(date_string_rows, date_converted_rows, 1)
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        tol_time = 200              # comparing in ms or ns
        tol_numeric = 1e-5          # tolerance for comparing other numeric fields
        numElements2Compare = 100   # choose number of elements per column to compare.  Save test time.

        allOrcFiles = ["/datasets/orc_parser/orc/TestOrcFile.testDate1900.orc",
                       "/datasets/orc_parser/orc/TestOrcFile.testDate2038.orc",
                       "/datasets/orc_parser/orc/orc_split_elim.orc"]

        allCsvFiles = ["/datasets/orc_parser/csv/TestOrcFile.testDate1900.csv",
                       "/datasets/orc_parser/csv/TestOrcFile.testDate2038.csv",
                       "/datasets/orc_parser/csv/orc_split_elim.csv"]

        for fIndex in range(len(allOrcFiles)):
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, allOrcFiles[fIndex])
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, allCsvFiles[fIndex])
            h2oOrc = h2o.import_file(url_orc)
            h2oCsv = h2o.import_file(url_csv)

            # compare the two frames
            assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \
                "H2O frame parsed from orc and csv files are different!"
    else:
        raise EnvironmentError
def orc_parser_timestamp_date():
    """
    This test parses orc files containing timestamp and date information into
    H2O frames.  Next, it takes the .csv files generated from the orc files by
    Hive and parses them into H2O frames.  Finally, we compare the two frames
    and make sure that they are equal.

    We want to make sure that we are parsing the date and timestamp
    data correctly from an orc file.  Thanks to Nidhi, who imported an orc file
    containing timestamp/date into Spark and later into Hive and wrote it out as
    csv.

    :return: None
    """

    tol_time = 200              # comparing in ms or ns
    tol_numeric = 1e-5          # tolerance for comparing other numeric fields
    numElements2Compare = 100   # choose number of elements per column to compare.  Save test time.

    allOrcFiles = ["smalldata/parser/orc/TestOrcFile.testDate1900.orc",
                   "smalldata/parser/orc/TestOrcFile.testDate2038.orc",
                   "smalldata/parser/orc/orc_split_elim.orc"]

    allCsvFiles = ["smalldata/parser/orc/orc2csv/TestOrcFile.testDate1900.csv",
                   "smalldata/parser/orc/orc2csv/TestOrcFile.testDate2038.csv",
                   "smalldata/parser/orc/orc2csv/orc_split_elim.csv"]

    for fIndex in range(len(allOrcFiles)):

        h2oOrc = h2o.import_file(path=pyunit_utils.locate(allOrcFiles[fIndex]))
        h2oCsv = h2o.import_file(path=pyunit_utils.locate(allCsvFiles[fIndex]))

        # compare the two frames
        assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \
            "H2O frame parsed from orc and csv files are different!"
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        tol_time = 200  # comparing in ms or ns
        tol_numeric = 1e-5  # tolerance for comparing other numeric fields
        numElements2Compare = 0  # choose number of elements per column to compare.  Save test time.

        hdfs_csv_file = "/datasets/orc_parser/synthetic_perfect_separation_csv"
        hdfs_orc_file = "/datasets/orc_parser/synthetic_perfect_separation_orc"

        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
        url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

        multi_file_csv = h2o.import_file(url_csv)
        multi_file_orc = h2o.import_file(url_orc)

        # make sure the orc multi-file directory and the csv directory create the same H2O frame
        assert pyunit_utils.compare_frames(multi_file_orc, multi_file_csv, numElements2Compare, tol_time,
                                           tol_numeric, True), "H2O frames parsed from multiple orc files and " \
                                                               "multiple csv files are different!"
    else:
        raise EnvironmentError
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        tol_time = 200              # comparing in ms or ns
        tol_numeric = 1e-5          # tolerance for comparing other numeric fields
        numElements2Compare = 10   # choose number of elements per column to compare.  Save test time.

        hdfs_orc_file = "/datasets/orc_parser/orc/prostate_NA.orc"
        hdfs_csv_file = "/datasets/orc_parser/csv/prostate_NA.csv"
        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
        url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

        h2oOrc = h2o.import_file(url_orc)
        h2oCsv = h2o.import_file(url_csv)

        # compare the two frames
        assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \
            "H2O frame parsed from orc and csv files are different!"
    else:
        raise EnvironmentError
def import_folder():

    tol_time = 200  # comparing in ms or ns for timestamp columns
    tol_numeric = 1e-5  # tolerance for comparing other numeric fields
    numElements2Compare = 0  # choose number of elements per column to compare.  Save test time.

    multi_file_csv = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/parser/hexdev_497/airlines_first_header"))
    multi_file_gzip_comp = \
      h2o.import_file(path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_first_header.zip"))

    try:
        # make sure the two agree
        assert pyunit_utils.compare_frames(multi_file_csv, multi_file_gzip_comp, numElements2Compare, tol_time,
                                           tol_numeric, True), "H2O frames parsed from the csv directory and the " \
                                                               "zipped directory are different!"
    except AssertionError:  # in case the files are listed differently, we can always just check whether the summaries agree.
        multi_file_gzip_comp.summary()
        zip_summary = h2o.frame(
            multi_file_gzip_comp.frame_id)["frames"][0]["columns"]

        multi_file_csv.summary()
        csv_summary = h2o.frame(
            multi_file_csv.frame_id)["frames"][0]["columns"]
        pyunit_utils.compare_frame_summary(zip_summary, csv_summary)
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        numElements2Compare = 100
        tol_time = 200
        tol_numeric = 1e-5

        hdfs_orc_file = "/datasets/orc_parser/orc/iris.orc"
        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
        hdfs_csv_file = "/datasets/orc_parser/csv/iris.csv"
        url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

        h2oframe_csv = h2o.import_file(url_csv)
        data_types = ['real', 'real', 'real', 'real', 'enum']
        h2oframe_orc = h2o.import_file(url_orc, col_types = data_types)

        # compare the two frames
        assert pyunit_utils.compare_frames(h2oframe_orc, h2oframe_csv, numElements2Compare, tol_time, tol_numeric,
                                           True), "H2O frame parsed from orc and csv files are different!"
    else:
        raise EnvironmentError
def import_folder():

    tol_time = 200  # comparing in ms or ns
    tol_numeric = 1e-5  # tolerance for comparing other numeric fields
    numElements2Compare = 100  # choose number of elements per column to compare.  Save test time.

    # compressed the whole directory of files.
    multi_file_gzip_comp = h2o.import_file(path=pyunit_utils.locate(
        "bigdata/laptop/parser/hexdev_497/milsongs_csv.zip"))

    # directory containing the gzip version of csv files here.
    multi_file_csv = h2o.import_file(path=pyunit_utils.locate(
        "bigdata/laptop/parser/hexdev_497/milsongs_csv_gzip"))

    try:
        # make sure the two agree
        assert pyunit_utils.compare_frames(multi_file_csv, multi_file_gzip_comp, numElements2Compare, tol_time,
                                           tol_numeric, True), "H2O frames parsed from the gzipped csv directory " \
                                                               "and the zip archive are different!"
    except AssertionError:  # in case the files are listed differently, we can always just check whether the summaries agree.
        multi_file_gzip_comp.summary()
        zip_summary = h2o.frame(
            multi_file_gzip_comp.frame_id)["frames"][0]["columns"]

        multi_file_csv.summary()
        csv_summary = h2o.frame(
            multi_file_csv.frame_id)["frames"][0]["columns"]
        pyunit_utils.compare_frame_summary(zip_summary, csv_summary)
def titanic():
    df = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"),
                         col_types={'pclass': "enum", 'survived': "enum"})
    x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]

    # Split the dataset into train and test
    train, test = df.split_frame(ratios=[.8], seed=1234)

    rfit = H2ORuleFitEstimator(min_rule_length=4, max_rule_length=5, max_num_rules=3, seed=1234, model_type="rules")
    rfit.train(training_frame=train, x=x, y="survived", validation_frame=test)

    assert rfit.rmse(valid=True) is not None, "validation metrics should be present"


    print(rfit.rule_importance())
    assert rfit._model_json["output"]["model_summary"] is not None, "model_summary should be present"
    assert len(rfit._model_json["output"]["model_summary"]._cell_values) > 0, "model_summary's content should be present"

    rfit_predictions = rfit.predict(test)

    import tempfile
    tmpdir = tempfile.mkdtemp()

    try:
        mojo_path = rfit.save_mojo(tmpdir)
        mojo_model = h2o.upload_mojo(mojo_path)
    finally:
        import shutil
        shutil.rmtree(tmpdir)

    mojo_predictions = mojo_model.predict(test)

    assert pyunit_utils.compare_frames(rfit_predictions, mojo_predictions, 0)
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        # run a quick test to determine if the hive-exec is too old.

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_prostate_orc.py"))
            pass
        else:

            tol_time = 200              # comparing in ms or ns
            tol_numeric = 1e-5          # tolerance for comparing other numeric fields
            numElements2Compare = 10   # choose number of elements per column to compare.  Save test time.

            hdfs_orc_file = "/datasets/orc_parser/orc/prostate_NA.orc"
            hdfs_csv_file = "/datasets/orc_parser/csv/prostate_NA.csv"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            h2oOrc = h2o.import_file(url_orc)
            h2oCsv = h2o.import_file(url_csv)

            # compare the two frames
            assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \
                "H2O frame parsed from orc and csv files are different!"
    else:
        raise EnvironmentError
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_iris_import_types_orc.py"))
            pass
        else:

            numElements2Compare = 100
            tol_time = 200
            tol_numeric = 1e-5

            hdfs_orc_file = "/datasets/orc_parser/orc/iris.orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/csv/iris.csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            h2oframe_csv = h2o.import_file(url_csv)
            data_types = ['real', 'real', 'real', 'real', 'enum']
            h2oframe_orc = h2o.import_file(url_orc, col_types = data_types)

            # compare the two frames
            assert pyunit_utils.compare_frames(h2oframe_orc, h2oframe_csv, numElements2Compare, tol_time, tol_numeric,
                                               True), "H2O frame parsed from orc and csv files are different!"
    else:
        raise EnvironmentError
def test_transform_seed_param_raise_warning():
    ds = load_dataset(incl_test=True)
    te = H2OTargetEncoderEstimator(seed=42)
    te.train(y=ds.target, training_frame=ds.train)
    encoded = te.predict(ds.test)

    transformed_1 = te.transform(ds.test)
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        transformed_2 = te.transform(ds.test, seed=24)
        assert len(w) == 1
        assert issubclass(w[0].category, H2ODeprecationWarning)
        assert "`seed` is deprecated in `transform` method and will be ignored" in str(
            w[0].message)

    assert pu.compare_frames(encoded, transformed_1, 0, tol_numeric=1e-5)
    assert pu.compare_frames(encoded, transformed_2, 0, tol_numeric=1e-5)
def import_folder():

  tol_time = 200              # comparing in ms or ns
  tol_numeric = 1e-5          # tolerance for comparing other numeric fields
  numElements2Compare = 0   # choose number of elements per column to compare.  Save test time.

  multi_file_csv1 = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/orc/synthetic_perfect_seperation_csv/balunbal.csv"))
  multi_file_csv2 = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/orc/synthetic_perfect_seperation_csv/unbalbal.csv"))
  multi_file_orc = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/orc/synthetic_perfect_separation"))

  # make sure orc multi-file and single big file create same H2O frame
  try:
    assert pyunit_utils.compare_frames(multi_file_orc, multi_file_csv1, numElements2Compare, tol_time, tol_numeric,
                                       True), "H2O frames parsed from the orc directory and the csv file are different!"
  except AssertionError:
    assert pyunit_utils.compare_frames(multi_file_orc, multi_file_csv2, numElements2Compare, tol_time, tol_numeric,
                                       True), "H2O frames parsed from the orc directory and the csv file are different!"
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(
                hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print(
                "Your hive-exec version is too old.  Orc parser test {0} is "
                "skipped.".format("pyunit_INTERNAL_HDFS_import_folder_orc.py"))
            pass
        else:
            tol_time = 200  # comparing in ms or ns
            tol_numeric = 1e-5  # tolerance for comparing other numeric fields
            numElements2Compare = 0  # choose number of elements per column to compare.  Save test time.

            hdfs_csv_file1 = "/datasets/orc_parser/csv/balunbal.csv"
            url_csv1 = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file1)
            multi_file_csv1 = h2o.import_file(url_csv1)

            hdfs_csv_file2 = "/datasets/orc_parser/csv/unbalbal.csv"
            url_csv2 = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file2)
            multi_file_csv2 = h2o.import_file(url_csv2)

            hdfs_orc_file = "/datasets/orc_parser/synthetic_perfect_separation_orc"

            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            multi_file_orc = h2o.import_file(url_orc)

            # make sure orc multi-file and single big file create same H2O frame
            try:
                assert pyunit_utils.compare_frames(multi_file_orc, multi_file_csv1, numElements2Compare,
                                                   tol_time=tol_time, tol_numeric=tol_numeric, strict=True), \
                    "H2O frames parsed from the orc directory and the csv file are different!"
            except AssertionError:
                assert pyunit_utils.compare_frames(multi_file_orc, multi_file_csv2, numElements2Compare,
                                                   tol_time=tol_time, tol_numeric=tol_numeric, strict=True), \
                    "H2O frames parsed from the orc directory and the csv file are different!"
    else:
        raise EnvironmentError
def iris():
    df = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_train.csv"),
                         col_types={'species': "enum"})
    x = df.columns
    y = "species"
    x.remove(y)

    # Split the dataset into train and test
    train, test = df.split_frame(ratios=[.8], seed=1234)

    rfit = H2ORuleFitEstimator(min_rule_length=4,
                               max_rule_length=5,
                               max_num_rules=3,
                               seed=1234,
                               model_type="rules")
    rfit.train(training_frame=train, x=x, y=y, validation_frame=test)

    assert rfit.rmse(
        valid=True) is not None, "validation metrics should be present"

    print(rfit.rule_importance())
    assert rfit._model_json["output"][
        "model_summary"] is not None, "model_summary should be present"
    assert len(rfit._model_json["output"]["model_summary"]._cell_values
               ) > 0, "model_summary's content should be present"

    rfit_predictions = rfit.predict(test)

    frame = rfit.predict_rules(train, ['M0T38N5_Iris-virginica'])
    assert frame.sum().getrow()[0] == 49.0

    import tempfile
    tmpdir = tempfile.mkdtemp()

    try:
        mojo_path = rfit.save_mojo(tmpdir)
        mojo_model = h2o.upload_mojo(mojo_path)
    finally:
        import shutil
        shutil.rmtree(tmpdir)

    mojo_predictions = mojo_model.predict(test)

    assert pyunit_utils.compare_frames(rfit_predictions, mojo_predictions, 0)

    # test predict_rules also on linear variable input
    rfit = H2ORuleFitEstimator(min_rule_length=4,
                               max_rule_length=5,
                               max_num_rules=3,
                               seed=1234,
                               model_type="rules_and_linear")
    rfit.train(training_frame=train, x=x, y=y, validation_frame=test)
    print(rfit.rule_importance())
    frame = rfit.predict_rules(
        train,
        ['linear.petal_len_Iris-setosa', 'linear.petal_wid_Iris-virginica'])
    assert frame.sum().getrow()[0] == train.nrows
def test_transform_produces_the_same_result_as_predict_by_default():
    ds = load_dataset(incl_test=True)
    te = H2OTargetEncoderEstimator()
    te.train(y=ds.target, training_frame=ds.train)
    encoded = te.predict(ds.test)
    transformed = te.transform(ds.test)
    assert pu.compare_frames(encoded, transformed, 0, tol_numeric=1e-5)

    # now with non default params
    te_nd = H2OTargetEncoderEstimator(data_leakage_handling="leave_one_out",
                                      blending=True,
                                      inflection_point=5,
                                      smoothing=17,
                                      seed=seed,
                                      noise=0.01)
    te_nd.train(y=ds.target, training_frame=ds.train)
    encoded = te_nd.predict(ds.test)
    transformed = te_nd.transform(ds.test)
    assert pu.compare_frames(encoded, transformed, 0, tol_numeric=1e-5)
def hive_jdbc_import():
    connection_url = "jdbc:hive2://localhost:10000/default"
    krb_enabled = os.getenv('KRB_ENABLED', 'false')
    if krb_enabled.lower() == 'true':
        connection_url += ";principal=%s" % os.getenv('HIVE_PRINCIPAL',
                                                      'hive/[email protected]')

    hive_dist_enabled = os.getenv('HIVE_DIST_ENABLED',
                                  'true').lower() == 'true'

    select_query = "select * from airlinestest"
    username = "******"
    password = ""

    # read from S3
    airlines_dataset_original = h2o.import_file(
        path="https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/AirlinesTest.csv.zip")

    # read from Hive Distributed
    if hive_dist_enabled:
        airlines_dataset_dist = h2o.import_sql_select(connection_url,
                                                      select_query, username,
                                                      password)
        airlines_dataset_dist = adapt_airlines(airlines_dataset_dist)
        pyunit_utils.compare_frames(airlines_dataset_original,
                                    airlines_dataset_dist,
                                    100,
                                    tol_numeric=0)

    # read from Hive Streaming
    airlines_dataset_streaming = h2o.import_sql_select(connection_url,
                                                       select_query,
                                                       username,
                                                       password,
                                                       fetch_mode="SINGLE")
    airlines_dataset_streaming = adapt_airlines(airlines_dataset_streaming)
    pyunit_utils.compare_frames(airlines_dataset_original,
                                airlines_dataset_streaming,
                                100,
                                tol_numeric=0)
def test_default_strategy_is_none():
    ds = load_dataset(incl_test=True)
    te = H2OTargetEncoderEstimator(noise=0)
    te.train(y=ds.target, training_frame=ds.train)
    encoded = te.predict(ds.test)

    te_none = H2OTargetEncoderEstimator(data_leakage_handling="none", noise=0)
    te_none.train(y=ds.target, training_frame=ds.train)
    encoded_none = te_none.predict(ds.test)

    assert pu.compare_frames(encoded, encoded_none, 0, tol_numeric=1e-5)
def test_transform_can_override_noise():
    ds = load_dataset(incl_test=True)
    noise = 1e-3
    te = H2OTargetEncoderEstimator(noise=noise, seed=seed)
    te.train(y=ds.target, training_frame=ds.train)
    transformed = te.transform(ds.test)
    transformed_no_noise = te.transform(ds.test, noise=0)
    try:
        assert pu.compare_frames(transformed,
                                 transformed_no_noise,
                                 0,
                                 tol_numeric=noise / 10)
        assert False, "should have raised"
    except AssertionError as ae:
        assert "should have raised" not in str(ae)

    assert pu.compare_frames(transformed,
                             transformed_no_noise,
                             0,
                             tol_numeric=noise)
def continuous_or_categorical():
    numElements2Compare = 0
    tol_time = 200
    tol_numeric = 1e-5

    ctypes = ["enum"]*3
    h2oframe_csv = h2o.import_file(pyunit_utils.locate("smalldata/jira/hexdev_29.csv"), col_types=ctypes)
    h2oframe_orc = h2o.import_file(pyunit_utils.locate("smalldata/parser/orc/hexdev_29.orc"), col_types=ctypes)

    # compare the two frames
    assert pyunit_utils.compare_frames(h2oframe_orc, h2oframe_csv, numElements2Compare, tol_time, tol_numeric, True), \
        "H2O frame parsed from orc and csv files are different!"
def h2o_H2OFrame_transpose():
    """
    Python API test: h2o.frame.H2OFrame.transpose()
    """
    row_num = randrange(1, 10)
    col_num = randrange(1, 10)
    python_lists = np.random.randint(-5, 5, (row_num, col_num))
    h2oframe = h2o.H2OFrame(python_obj=python_lists)
    newFrame = h2oframe.transpose()

    assert_is_type(newFrame, H2OFrame)  # check return type
    # check shape
    assert newFrame.shape == (
        h2oframe.ncol,
        h2oframe.nrow), "h2o.H2OFrame.transpose() command is not working."
    # check content
    pyunit_utils.compare_frames(h2oframe,
                                newFrame.transpose(),
                                h2oframe.nrow,
                                tol_time=0,
                                tol_numeric=1e-6)
def continuous_or_categorical_orc():
    numElements2Compare = 100
    tol_time = 200
    tol_numeric = 1e-5

    h2oframe_csv = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris.csv"))
    data_types = ['real', 'real', 'real', 'real', 'enum']
    h2oframe_orc = h2o.import_file(pyunit_utils.locate("smalldata/parser/orc/iris.orc"), col_types = data_types)

    # compare the two frames
    assert pyunit_utils.compare_frames(h2oframe_orc, h2oframe_csv, numElements2Compare, tol_time, tol_numeric, True), \
        "H2O frame parsed from orc and csv files are different!"
def h2o_H2OFrame_top_bottomN():
    """
    PUBDEV-3624: test the h2o.frame.H2OFrame.topN() and h2o.frame.H2OFrame.bottomN() functions.
    Given an H2O frame, a column index or column name, and a double denoting the percentage of
    top/bottom rows to return, topN will return an H2OFrame containing two columns: one holds
    the topN (or bottomN) values of the specified column, and the other records the row indices
    into the original frame where the topN (bottomN) values came from.  This lets users grab
    the corresponding rows and do whatever they want with them.
    """
    dataFrame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/jira/TopBottomNRep4.csv.zip"))
    topAnswer = h2o.import_file(pyunit_utils.locate("smalldata/jira/Top20Per.csv.zip"))
    bottomAnswer = h2o.import_file(pyunit_utils.locate("smalldata/jira/Bottom20Per.csv.zip"))
    nPercentages = [1,2,3,4]  # multiples of 4 since dataset is repeated 4 times.
    frameNames = dataFrame.names    # get data column names
    tolerance=1e-12
    nsample = 100
    nP = nPercentages[randint(0, len(nPercentages)-1)]  # pick a random percentage
    colIndex = randint(0, len(frameNames)-1)    # pick a random column

    if randint(0, 2) == 0:
        print("For topN: Percentage chosen is {0}.  Column index chosen is {1}".format(nP, colIndex))
        newTopFrame = dataFrame.topN(frameNames[colIndex], nP)  # call topN with column name
        newTopFrameC = dataFrame.topN(colIndex, nP)             # call topN with same column index

        # the two return frames should be the same for this case; compare nsample (100) rows chosen randomly
        pyunit_utils.compare_frames(newTopFrame, newTopFrameC, nsample, tol_numeric=tolerance)

        # compare one of the return frames with known answer
        compare_rep_frames(topAnswer, newTopFrame, tolerance, colIndex, 1)
    else:
        # test bottomN here
        print("For bottomN: Percentage chosen is {0}.  Column index chosen is {1}".format(nP, colIndex))
        newBottomFrame = dataFrame.bottomN(frameNames[colIndex], nP)  # call bottomN with column name
        newBottomFrameC = dataFrame.bottomN(colIndex, nP)             # call bottomN with same column index

        # the two return frames should be the same for this case
        pyunit_utils.compare_frames(newBottomFrame, newBottomFrameC, nsample, tol_numeric=tolerance)
        # compare one of the return frames with known answer
        compare_rep_frames(bottomAnswer, newBottomFrame, tolerance, colIndex, -1)
def import_folder():

  tol_time = 200              # comparing in ms or ns for timestamp columns
  tol_numeric = 1e-5          # tolerance for comparing other numeric fields
  numElements2Compare = 0   # choose number of elements per column to compare.  Save test time.

  multi_file_gzip_comp = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_small_csv.zip"))
  multi_file_csv = \
    h2o.import_file(path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_small_csv/all_airlines.csv"))

  # make sure H2O frames built from a zip file of a directory and the original files are the same.
  assert pyunit_utils.compare_frames(multi_file_csv, multi_file_gzip_comp, numElements2Compare, tol_time, tol_numeric,
                                     True), "H2O frame parsed from zip directory and unzipped directory are different!"
def test_target_encoding_default_noise_is_applied():
    print("Check that seed is applied when we use noise. Noise is set to the same values. Only seed is different.")

    targetColumnName = "survived"

    teColumns = ["home.dest", "cabin", "embarked"]
    teColumnsEncoded = list(map(lambda x: x+"_te", teColumns))
    trainingFrame = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)
    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor()

    targetEncoder = TargetEncoder(x=teColumns, y=targetColumnName,
                                  blended_avg=True, inflection_point=3, smoothing=1)

    targetEncoder.fit(frame=trainingFrame)

    seedTest = 1234
    encodedFrame = targetEncoder.transform(frame=trainingFrame, holdout_type="none", noise=0.0, seed=seedTest)

    encodingsOnly = encodedFrame[teColumnsEncoded]

    # Second transformation without specifying noise. Default will be applied.
    encodedFrame2 = targetEncoder.transform(frame=trainingFrame, holdout_type="none", seed=seedTest)
    encodingsOnly2 = encodedFrame2[teColumnsEncoded]

    # Third transformation with zero noise
    encodedFrame3 = targetEncoder.transform(frame=trainingFrame, holdout_type="none", noise=0.0, seed=seedTest)
    encodingsOnly3 = encodedFrame3[teColumnsEncoded]

    # Comparing results
    # Third encoding should be equal to the first one since no noise is applied in both cases
    assert pyunit_utils.compare_frames(encodingsOnly, encodingsOnly3, 10, tol_time=0, tol_numeric=1e-6)
    # The first two encodings should differ, since default noise is applied to the second transformation
    try:
        pyunit_utils.compare_frames(encodingsOnly, encodingsOnly2, 10, tol_time=0, tol_numeric=1e-6)
        assert False, "should have raised"
    except AssertionError as ae:
        assert "should have raised" not in str(ae)
        print('Good, encodings are different as expected. Default noise is working')
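    # A hedged follow-up check (an illustrative addition, not part of the original
    # test): with a fixed seed the default noise itself should be deterministic, so
    # repeating the transformation with the same seed and no explicit noise is
    # expected to reproduce the second encoding exactly.
    encodedFrameRepeat = targetEncoder.transform(frame=trainingFrame, holdout_type="none", seed=seedTest)
    assert pyunit_utils.compare_frames(encodingsOnly2, encodedFrameRepeat[teColumnsEncoded],
                                       10, tol_time=0, tol_numeric=1e-6), \
        "default noise with a fixed seed should be reproducible"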
def orc_parser_timestamp_date():
    """
    To verify that the orc parser is parsing correctly, we want to take a file we know (prostate_NA.csv), convert
    it to an Orc file (prostate_NA.orc) and build two H2O frames out of them.   We compare them and verified that
    they are the same.

    Nidhi did this manually in Hive and verified that the parsing is correct.  I am automating the test here.

    :return: None
    """

    tol_time = 200              # comparing in ms or ns
    tol_numeric = 1e-5          # tolerance for comparing other numeric fields
    numElements2Compare = 10   # number of elements per column to compare, to save test time

    h2oOrc = h2o.import_file(path=pyunit_utils.locate('smalldata/parser/orc/prostate_NA.orc'))
    h2oCsv = h2o.import_file(path=pyunit_utils.locate('smalldata/parser/csv2orc/prostate_NA.csv'))

    # compare the two frames
    assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \
        "H2O frames parsed from the ORC and CSV files differ!"
def import_folder():

  tol_time = 200              # comparing in ms or ns for timestamp columns
  tol_numeric = 1e-5          # tolerance for comparing other numeric fields
  numElements2Compare = 0   # number of elements per column to compare, to save test time

  multi_file_csv = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_first_header"))
  multi_file_gzip_comp = \
    h2o.import_file(path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_first_header.zip"))

  try:
    # make sure the two imports agree
    assert pyunit_utils.compare_frames(multi_file_csv, multi_file_gzip_comp, numElements2Compare, tol_time,
                                       tol_numeric, True), "H2O frames parsed from the CSV directory and its " \
                                                           "zip archive differ!"
  except AssertionError:  # if the files are listed in a different order, fall back to checking that the summaries agree.
    multi_file_gzip_comp.summary()
    zip_summary = h2o.frame(multi_file_gzip_comp.frame_id)["frames"][0]["columns"]

    multi_file_csv.summary()
    csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]
    pyunit_utils.compare_frame_summary(zip_summary, csv_summary)
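# A hedged alternative to the summary fallback above (a sketch, not part of the
# suite): sort both frames on a shared key column before the element-wise
# comparison, so differences in file listing order cannot cause a spurious
# mismatch. The `key` column name is a hypothetical parameter.
def frames_equal_ignoring_row_order(frame_a, frame_b, key, tol_time=200, tol_numeric=1e-5):
    sorted_a = frame_a.sort(by=key)  # H2OFrame.sort returns a new frame ordered by the key column(s)
    sorted_b = frame_b.sort(by=key)
    return pyunit_utils.compare_frames(sorted_a, sorted_b, 0, tol_time, tol_numeric, True)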
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_timestamp_date_orc.py"))
            pass
        else:
            tol_time = 200              # comparing in ms or ns
            tol_numeric = 1e-5          # tolerance for comparing other numeric fields
            numElements2Compare = 100   # number of elements per column to compare, to save test time

            allOrcFiles = ["/datasets/orc_parser/orc/TestOrcFile.testDate1900.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testDate2038.orc",
                           "/datasets/orc_parser/orc/orc_split_elim.orc"]

            allCsvFiles = ["/datasets/orc_parser/csv/TestOrcFile.testDate1900.csv",
                           "/datasets/orc_parser/csv/TestOrcFile.testDate2038.csv",
                           "/datasets/orc_parser/csv/orc_split_elim.csv"]

            for orc_file, csv_file in zip(allOrcFiles, allCsvFiles):
                url_orc = "hdfs://{0}{1}".format(hdfs_name_node, orc_file)
                url_csv = "hdfs://{0}{1}".format(hdfs_name_node, csv_file)
                h2oOrc = h2o.import_file(url_orc)
                h2oCsv = h2o.import_file(url_csv)

                # compare the two frames
                assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \
                    "H2O frames parsed from the ORC and CSV files differ!"
    else:
        raise EnvironmentError("Hadoop namenode is not accessible")
def import_folder():

  tol_time = 200              # comparing in ms or ns
  tol_numeric = 1e-5          # tolerance for comparing other numeric fields
  numElements2Compare = 100   # number of elements per column to compare, to save test time

  # zip archive of the whole directory of files.
  multi_file_gzip_comp = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/hexdev_497/milsongs_csv.zip"))

  # directory containing the gzipped versions of the csv files.
  multi_file_csv = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/hexdev_497/milsongs_csv_gzip"))

  try:
    # make sure the two imports agree
    assert pyunit_utils.compare_frames(multi_file_csv, multi_file_gzip_comp, numElements2Compare, tol_time,
                                       tol_numeric, True), "H2O frames parsed from the zip archive and the " \
                                                           "gzipped csv directory differ!"
  except AssertionError:  # if the files are listed in a different order, fall back to checking that the summaries agree.
    multi_file_gzip_comp.summary()
    zip_summary = h2o.frame(multi_file_gzip_comp.frame_id)["frames"][0]["columns"]

    multi_file_csv.summary()
    csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]
    pyunit_utils.compare_frame_summary(zip_summary, csv_summary)
Example #47
def group_by_all():
    """
    This is a comprehenisve test that will test all aggregations in the groupBy class.
    """
    generate_dict_answers() # generate answer dictionary

    # perform group-by on a dataset containing no NAs. All three NA modes should produce the same results
    h2o_iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    result_all = perform_group_by(h2o_iris, 'all')
    result_ignore = perform_group_by(h2o_iris, 'ignore')
    result_rm = perform_group_by(h2o_iris, 'rm')

    # make sure return type of get_frame() is H2OFrame
    assert_is_type(result_all, H2OFrame)
    assert_is_type(result_ignore, H2OFrame)
    assert_is_type(result_rm, H2OFrame)

    # make sure the result frame contains the correct number of rows and columns
    assert result_all.shape == result_ignore.shape == result_rm.shape == (3, 30), "H2O group_by() command is not working."

    # check all group by results are the same
    assert pyunit_utils.compare_frames(result_all, result_ignore, 0, 0, 1e-6, strict=True, compare_NA=False), \
        "H2O group_by() command is not working."
    assert pyunit_utils.compare_frames(result_ignore, result_rm, 0, 0, 1e-6, strict=True, compare_NA=False), \
        "H2O group_by() command is not working."

    # check group by result with known correct result
    assert_group_by_result(result_all, g_iris_setosa_sepal_len, "Iris-setosa")
    assert_group_by_result(result_rm, g_iris_versicolor_sepal_wid, "Iris-versicolor")
    assert_group_by_result(result_ignore, g_iris_virginica_petal_wid, "Iris-virginica")

    # perform group-by on a dataset containing NAs.
    h2o_iris_NA = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader_NA_2.csv"))
    result_all_NA = perform_group_by(h2o_iris_NA, 'all')
    result_ignore_NA = perform_group_by(h2o_iris_NA, 'ignore')
    result_rm_NA = perform_group_by(h2o_iris_NA, 'rm')

    # make sure return type of get_frame() is H2OFrame
    assert_is_type(result_all_NA, H2OFrame)
    assert_is_type(result_ignore_NA, H2OFrame)
    assert_is_type(result_rm_NA, H2OFrame)

    # make sure the result frame contains the correct number of rows and columns
    assert result_all_NA.shape == result_ignore_NA.shape == result_rm_NA.shape == (3, 30), \
        "H2O group_by() command is not working."

    # the petal_wid column contains no NAs and hence should give the same result as before, independent of NA treatment
    assert pyunit_utils.compare_frames(result_all_NA[list(g_iris_virginica_petal_wid.keys())],
                                       result_rm_NA[list(g_iris_virginica_petal_wid.keys())], 0, 0, 1e-6,
                                       strict=False, compare_NA=False), "H2O group_by() command is not working."
    assert pyunit_utils.compare_frames(result_all_NA[list(g_iris_virginica_petal_wid.keys())],
                                       result_ignore_NA[list(g_iris_virginica_petal_wid.keys())], 0, 0, 1e-6,
                                       strict=False, compare_NA=False), "H2O group_by() command is not working."
    assert_group_by_result(result_all_NA, g_iris_virginica_petal_wid, "Iris-virginica")

    # check to make sure result_all_NA columns for sepal_len, sepal_wid, petal_len are all NAs for na='all'
    assert_all_NAs(result_all_NA, list(g_iris_setosa_sepal_len.keys()))  # check sepal_len
    assert_all_NAs(result_all_NA, list(g_iris_versicolor_sepal_wid.keys()))  # check sepal_wid
    assert_all_NAs(result_all_NA, list(g_iris_versicolor_petal_len_NA_ignore.keys()))  # check petal_len

    # check to make sure the na="ignore" and na="rm" results are calculated correctly against known answers
    assert_group_by_result(result_ignore_NA, g_iris_versicolor_petal_len_NA_ignore, "Iris-versicolor")
    assert_group_by_result(result_rm_NA, g_iris_versicolor_petal_len_NA_rm, "Iris-versicolor")
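# `perform_group_by` is defined elsewhere in the suite; a minimal sketch of what
# such a helper might look like (an assumption, not the actual implementation):
# group the iris frame by class, chain a few aggregations under the given
# NA-handling mode, and materialize the result as an H2OFrame.
def perform_group_by_sketch(frame, na_mode):
    grouped = frame.group_by("class")
    for col in ["sepal_len", "sepal_wid", "petal_len", "petal_wid"]:
        grouped.mean(col, na=na_mode).min(col, na=na_mode).max(col, na=na_mode)
    return grouped.get_frame()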