def test_csv_parser_column_skip():
    """Check that ``skipped_columns`` is honoured when importing a CSV file.

    A synthetic frame is built with ``h2o.create_frame``, downloaded to
    ``in.csv`` inside a ``results`` directory and re-imported with several
    skip lists.  Importing with every column skipped is expected to raise;
    the remaining skip lists are verified by
    ``pyunit_utils.checkCorrectSkips``.

    Raises:
        AssertionError: if the import with all columns skipped succeeds.
    """
    nrow = 10000
    ncol = 100
    seed = 12345
    frac1 = 0.16
    frac2 = 0.2
    # Five column types at frac1 plus strings at frac2 add up to 1.0.
    f1 = h2o.create_frame(rows=nrow,
                          cols=ncol,
                          real_fraction=frac1,
                          categorical_fraction=frac1,
                          integer_fraction=frac1,
                          binary_fraction=frac1,
                          time_fraction=frac1,
                          string_fraction=frac2,
                          missing_fraction=0.1,
                          has_response=False,
                          seed=seed)
    # NOTE(review): '__file__' is a string literal, so realpath() resolves it
    # against the current working directory, not this module's location.
    # Kept unchanged in case the test runner relies on it -- confirm.
    tmpdir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')), "..",
                     "results"))
    if not os.path.isdir(tmpdir):
        os.mkdir(tmpdir)
    savefilenamewithpath = os.path.join(tmpdir, 'in.csv')
    h2o.download_csv(f1, savefilenamewithpath)

    num_cols = f1.ncol
    skip_all = list(range(num_cols))
    skip_start_end = [0, num_cols - 1]
    # range() stops before num_cols - 1, so exactly one column (the last)
    # is kept.
    skip_except_last = list(range(num_cols - 1))
    skip_except_first = list(range(1, num_cols))
    # Random half of the column indices, in ascending order.
    shuffled = list(range(num_cols))
    random.shuffle(shuffled)
    skip_random = sorted(shuffled[:num_cols // 2])

    # Skipping every column must be rejected.  The failure is raised from the
    # ``else`` branch so that it cannot be swallowed by the handler that
    # catches the expected import error.
    try:
        h2o.import_file(savefilenamewithpath, skipped_columns=skip_all)
    except Exception as ex:  # exact h2o exception type is not visible here
        print(ex)
    else:
        raise AssertionError(
            "import_file should have failed: all columns were skipped")

    # skip the very beginning and the very end.
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_start_end)

    # skip all except the last column
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_except_last)

    # skip all except the very first column
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_except_first)

    # randomly skipped half the columns
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_random)
def test_parquet_parser_column_skip():
    """Check that ``skipped_columns`` is honoured by the Parquet parser.

    The airlines data is loaded from its zipped CSV and from the Parquet
    copy, and the two frames are compared with
    ``pyunit_utils.compare_frames_local``.  The Parquet file is then loaded
    again with several skip lists: loading with every column skipped is
    expected to raise, and the other skip lists are verified by
    ``pyunit_utils.checkCorrectSkips`` against the CSV frame.

    Raises:
        AssertionError: if a load with all columns skipped succeeds.
    """
    csv = h2o.import_file(path=pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    path = pyunit_utils.locate("smalldata/parser/parquet/airlines-simple.snappy.parquet")
    parquetNoSkip = h2o.import_file(path=path)
    pyunit_utils.compare_frames_local(csv, parquetNoSkip, prob=1)  # should be the same here.

    num_cols = csv.ncol
    skip_all = list(range(num_cols))
    skip_even = list(range(0, num_cols, 2))
    skip_odd = list(range(1, num_cols, 2))
    skip_start_end = [0, num_cols - 1]
    # range() stops before num_cols - 1, so exactly one column (the last)
    # is kept.
    skip_except_last = list(range(num_cols - 1))
    skip_except_first = list(range(1, num_cols))
    # Random half of the column indices, in ascending order.
    shuffled = list(range(num_cols))
    random.shuffle(shuffled)
    skip_random = sorted(shuffled[:num_cols // 2])

    # Skipping every column must be rejected by both loaders.  The failure is
    # raised from the ``else`` branch so it cannot be swallowed by the handler
    # that catches the expected error.
    for loader in (h2o.upload_file, h2o.import_file):
        try:
            loader(path, skipped_columns=skip_all)
        except Exception as ex:  # exact h2o exception type is not visible here
            print(ex)
        else:
            raise AssertionError(
                "{0} should have failed: all columns were skipped".format(loader.__name__))

    # skip even columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_even)

    # skip odd columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_odd)

    # skip the very beginning and the very end.
    pyunit_utils.checkCorrectSkips(csv, path, skip_start_end)

    # skip all except the last column
    pyunit_utils.checkCorrectSkips(csv, path, skip_except_last)

    # skip all except the very first column
    pyunit_utils.checkCorrectSkips(csv, path, skip_except_first)

    # randomly skipped half the columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_random)
# Example #3
0
def test_parquet_parser_column_skip():
    """Verify column skipping when loading the airlines Parquet file.

    Steps:
      1. Load the airlines data from the zipped CSV and from the Parquet
         file and compare them with ``pyunit_utils.compare_frames_local``.
      2. Confirm that ``h2o.upload_file`` and ``h2o.import_file`` both raise
         when every column is listed in ``skipped_columns``.
      3. Hand several partial skip lists to
         ``pyunit_utils.checkCorrectSkips`` together with the CSV frame.

    Raises:
        AssertionError: if a load with all columns skipped succeeds.
    """
    csv = h2o.import_file(
        path=pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    path = pyunit_utils.locate(
        "smalldata/parser/parquet/airlines-simple.snappy.parquet")
    parquetNoSkip = h2o.import_file(path=path)
    pyunit_utils.compare_frames_local(csv, parquetNoSkip,
                                      prob=1)  # should be the same here.

    num_cols = csv.ncol
    skip_all = list(range(num_cols))
    skip_even = list(range(0, num_cols, 2))
    skip_odd = list(range(1, num_cols, 2))
    skip_start_end = [0, num_cols - 1]
    # Stop before num_cols - 1 so that only the last column survives.
    skip_except_last = list(range(num_cols - 1))
    skip_except_first = list(range(1, num_cols))
    # Random half of the column indices, in ascending order.
    shuffled = list(range(num_cols))
    random.shuffle(shuffled)
    skip_random = sorted(shuffled[:num_cols // 2])

    # The "must fail" checks raise from ``else`` so that the failure signal
    # is not caught by the handler meant for the expected loader error.
    for loader in (h2o.upload_file, h2o.import_file):
        try:
            loader(path, skipped_columns=skip_all)
        except Exception as ex:  # exact h2o exception type is not visible here
            print(ex)
        else:
            raise AssertionError(
                "{0} should have failed: all columns were skipped".format(
                    loader.__name__))

    # skip even columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_even)

    # skip odd columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_odd)

    # skip the very beginning and the very end.
    pyunit_utils.checkCorrectSkips(csv, path, skip_start_end)

    # skip all except the last column
    pyunit_utils.checkCorrectSkips(csv, path, skip_except_last)

    # skip all except the very first column
    pyunit_utils.checkCorrectSkips(csv, path, skip_except_first)

    # randomly skipped half the columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_random)
# Example #4
0
def import_zip_skipped_columns():
    """Check that ``skipped_columns`` is honoured for ``smalldata/jira/adult.gz``.

    The file is first imported in full to obtain a reference frame.  Loading
    it with every column skipped is expected to raise for both
    ``h2o.import_file`` and ``h2o.upload_file``; the partial skip lists are
    verified by ``pyunit_utils.checkCorrectSkips``.

    Raises:
        AssertionError: if a load with all columns skipped succeeds.
    """
    filePath = pyunit_utils.locate("smalldata/jira/adult.gz")
    full_frame = h2o.import_file(path=filePath)

    num_cols = full_frame.ncol
    skip_all = list(range(num_cols))
    skip_even = list(range(0, num_cols, 2))
    skip_odd = list(range(1, num_cols, 2))
    skip_start_end = [0, num_cols - 1]
    # range() stops before num_cols - 1, so exactly one column (the last)
    # is kept.
    skip_except_last = list(range(num_cols - 1))
    skip_except_first = list(range(1, num_cols))
    # Random half of the column indices, in ascending order.
    shuffled = list(range(num_cols))
    random.shuffle(shuffled)
    skip_random = sorted(shuffled[:num_cols // 2])

    # The failure is raised from ``else`` rather than asserted inside ``try``:
    # an AssertionError raised in the ``try`` body would itself be caught by
    # ``except Exception`` and the check could never fail.
    for loader in (h2o.import_file, h2o.upload_file):
        try:
            loader(filePath, skipped_columns=skip_all)  # skipped all
        except Exception as ex:  # exact h2o exception type is not visible here
            print(ex)
        else:
            raise AssertionError(
                "Test should have thrown an exception because all columns "
                "are skipped ({0})".format(loader.__name__))

    # skip odd columns
    pyunit_utils.checkCorrectSkips(full_frame, filePath, skip_odd)

    # skip even columns
    pyunit_utils.checkCorrectSkips(full_frame, filePath, skip_even)

    # skip the very beginning and the very end.
    pyunit_utils.checkCorrectSkips(full_frame, filePath, skip_start_end)

    # skip all except the last column
    pyunit_utils.checkCorrectSkips(full_frame, filePath, skip_except_last)

    # skip all except the very first column
    pyunit_utils.checkCorrectSkips(full_frame, filePath, skip_except_first)

    # randomly skipped half the columns
    pyunit_utils.checkCorrectSkips(full_frame, filePath, skip_random)
# Example #5
0
def import_folder_orc():
    """Check that ``skipped_columns`` is honoured for an ORC file on HDFS.

    The test needs a reachable Hadoop namenode.  When
    ``pyunit_utils.cannaryHDFSTest`` returns a true value the test prints a
    message and returns without running.  Otherwise ``prostate_NA`` is
    imported from HDFS as CSV and as ORC, the two frames are compared, and the
    ORC file is reloaded with several skip lists that are verified by
    ``pyunit_utils.checkCorrectSkips``.

    Raises:
        EnvironmentError: if the Hadoop namenode is not accessible.
        AssertionError: if a load with all columns skipped succeeds.
    """
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.  Without it none of the HDFS paths below can be built, so
    # fail explicitly instead of hitting an undefined name further down.
    if not pyunit_utils.hadoop_namenode_is_accessible():
        raise EnvironmentError(
            "Hadoop namenode is not accessible; this test must run inside "
            "the H2O network")

    hdfs_name_node = pyunit_utils.hadoop_namenode()

    if pyunit_utils.cannaryHDFSTest(
            hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
        print("Your hive-exec version is too old.  Orc parser test {0} is "
              "skipped.".format("pyunit_INTERNAL_HDFS_airlines_orc.py"))
        return

    hdfs_orc_file = "/datasets/orc_parser/prostate_NA.orc"
    hdfs_csv_file = "/datasets/orc_parser/prostate_NA.csv"

    url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)
    url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
    csv = h2o.import_file(url_csv, na_strings=['\\N'])
    multi_file_orc1 = h2o.import_file(url_orc)
    pyunit_utils.compare_frames_local(csv, multi_file_orc1,
                                      prob=1)  # should be the same here.

    path = url_orc
    num_cols = csv.ncol
    skip_all = list(range(num_cols))
    skip_even = list(range(0, num_cols, 2))
    skip_odd = list(range(1, num_cols, 2))
    skip_start_end = [0, num_cols - 1]
    # range() stops before num_cols - 1, so exactly one column (the last)
    # is kept.
    skip_except_last = list(range(num_cols - 1))
    skip_except_first = list(range(1, num_cols))
    # Random half of the column indices, in ascending order.  Floor division
    # keeps the slice bound an int on Python 3 as well as Python 2.
    shuffled = list(range(num_cols))
    random.shuffle(shuffled)
    skip_random = sorted(shuffled[:num_cols // 2])

    # Skipping every column must be rejected.  The failure is raised from the
    # ``else`` branch so it cannot be swallowed by the handler that catches
    # the expected error.
    # NOTE(review): upload_file receives an hdfs:// URL here; confirm that it
    # fails because of the skipped columns and not merely because the path is
    # not a local file.
    for loader in (h2o.upload_file, h2o.import_file):
        try:
            loader(path, skipped_columns=skip_all)
        except Exception as ex:  # exact h2o exception type is not visible here
            print(ex)
        else:
            raise AssertionError(
                "{0} should have failed: all columns were skipped".format(
                    loader.__name__))

    # skip even columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_even)

    # skip odd columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_odd)

    # skip the very beginning and the very end.
    pyunit_utils.checkCorrectSkips(csv, path, skip_start_end)

    # skip all except the last column
    pyunit_utils.checkCorrectSkips(csv, path, skip_except_last)

    # skip all except the very first column
    pyunit_utils.checkCorrectSkips(csv, path, skip_except_first)

    # randomly skipped half the columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_random)
def import_gzip_skipped_columns():
    """Check that ``skipped_columns`` is honoured for a compressed CSV file.

    ``AirlinesTrain.csv`` is imported uncompressed as the reference frame and
    ``AirlinesTrain.csv.zip`` is the file loaded with skip lists.  Loading
    with every column skipped is expected to raise for both
    ``h2o.import_file`` and ``h2o.upload_file``; the partial skip lists are
    verified by ``pyunit_utils.checkCorrectSkips``.

    Raises:
        AssertionError: if a load with all columns skipped succeeds.
    """
    airlineCSV = h2o.import_file(
        path=pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv"))
    filePath = pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip")

    num_cols = airlineCSV.ncol
    skip_all = list(range(num_cols))
    skip_even = list(range(0, num_cols, 2))
    skip_odd = list(range(1, num_cols, 2))
    skip_start_end = [0, num_cols - 1]
    # range() stops before num_cols - 1, so exactly one column (the last)
    # is kept.
    skip_except_last = list(range(num_cols - 1))
    skip_except_first = list(range(1, num_cols))
    # Random half of the column indices, in ascending order.
    shuffled = list(range(num_cols))
    random.shuffle(shuffled)
    skip_random = sorted(shuffled[:num_cols // 2])

    # Skipping every column must be rejected by both loaders; a load that
    # unexpectedly succeeds is reported with a descriptive failure.
    for loader in (h2o.import_file, h2o.upload_file):
        try:
            loader(filePath, skipped_columns=skip_all)  # skipped all
        except Exception as ex:  # exact h2o exception type is not visible here
            print(ex)
        else:
            raise AssertionError(
                "{0} should have failed: all columns were skipped".format(
                    loader.__name__))

    # skip even columns
    pyunit_utils.checkCorrectSkips(airlineCSV, filePath, skip_even)

    # skip odd columns
    pyunit_utils.checkCorrectSkips(airlineCSV, filePath, skip_odd)

    # skip the very beginning and the very end.
    pyunit_utils.checkCorrectSkips(airlineCSV, filePath, skip_start_end)

    # skip all except the last column
    pyunit_utils.checkCorrectSkips(airlineCSV, filePath, skip_except_last)

    # skip all except the very first column
    pyunit_utils.checkCorrectSkips(airlineCSV, filePath, skip_except_first)

    # randomly skipped half the columns
    pyunit_utils.checkCorrectSkips(airlineCSV, filePath, skip_random)
def import_folder():
    """
    This test builds one H2O frame from smalldata/parser/csv2orc/prostate_NA.csv
    and another from smalldata/parser/orc/prostate_NA.orc and compares the two
    with pyunit_utils.compare_frames_local.  It then reloads the ORC file with
    several skipped_columns settings: loading with every column skipped is
    expected to raise, and the partial skip lists are verified by
    pyunit_utils.checkCorrectSkips against the CSV frame.
    :return: None if passed.  Otherwise, an exception will be thrown.
    """
    csv = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/csv2orc/prostate_NA.csv"),
                          na_strings=['\\N'])
    path = pyunit_utils.locate("smalldata/parser/orc/prostate_NA.orc")
    multi_file_orc1 = h2o.import_file(path=path)
    pyunit_utils.compare_frames_local(csv, multi_file_orc1, prob=0.01)  # should be the same here.

    num_cols = csv.ncol
    skip_all = list(range(num_cols))
    skip_even = list(range(0, num_cols, 2))
    skip_odd = list(range(1, num_cols, 2))
    skip_start_end = [0, num_cols - 1]
    # range() stops before num_cols - 1, so exactly one column (the last)
    # is kept.
    skip_except_last = list(range(num_cols - 1))
    skip_except_first = list(range(1, num_cols))
    # Random half of the column indices, in ascending order.
    shuffled = list(range(num_cols))
    random.shuffle(shuffled)
    skip_random = sorted(shuffled[:num_cols // 2])

    # Skipping every column must be rejected by both loaders; a load that
    # unexpectedly succeeds is reported with a descriptive failure.
    for loader in (h2o.upload_file, h2o.import_file):
        try:
            loader(path, skipped_columns=skip_all)
        except Exception as ex:  # exact h2o exception type is not visible here
            print(ex)
        else:
            raise AssertionError(
                "{0} should have failed: all columns were skipped".format(loader.__name__))

    # skip even columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_even)

    # skip odd columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_odd)

    # skip the very beginning and the very end.
    pyunit_utils.checkCorrectSkips(csv, path, skip_start_end)

    # skip all except the last column
    pyunit_utils.checkCorrectSkips(csv, path, skip_except_last)

    # skip all except the very first column
    pyunit_utils.checkCorrectSkips(csv, path, skip_except_first)

    # randomly skipped half the columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_random)
def import_folder_orc():
    """Verify column skipping for ``prostate_NA.orc`` stored on HDFS.

    Preconditions checked up front:
      * the Hadoop namenode must be reachable, otherwise ``EnvironmentError``
        is raised;
      * if ``pyunit_utils.cannaryHDFSTest`` returns a true value, a message
        is printed and the test returns without running.

    The CSV and ORC copies of ``prostate_NA`` are then imported from HDFS and
    compared, and the ORC file is reloaded with several skip lists that are
    verified by ``pyunit_utils.checkCorrectSkips``.

    Raises:
        EnvironmentError: if the Hadoop namenode is not accessible.
        AssertionError: if a load with all columns skipped succeeds.
    """
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.  The HDFS URLs below cannot be built without it, so fail
    # explicitly rather than with an undefined-name error later on.
    if not pyunit_utils.hadoop_namenode_is_accessible():
        raise EnvironmentError(
            "Hadoop namenode is not accessible; this test must run inside "
            "the H2O network")

    hdfs_name_node = pyunit_utils.hadoop_namenode()

    if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
        print("Your hive-exec version is too old.  Orc parser test {0} is "
              "skipped.".format("pyunit_INTERNAL_HDFS_airlines_orc.py"))
        return

    hdfs_orc_file = "/datasets/orc_parser/prostate_NA.orc"
    hdfs_csv_file = "/datasets/orc_parser/prostate_NA.csv"

    url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)
    url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
    csv = h2o.import_file(url_csv, na_strings=['\\N'])
    multi_file_orc1 = h2o.import_file(url_orc)
    pyunit_utils.compare_frames_local(csv, multi_file_orc1, prob=1)  # should be the same here.

    path = url_orc
    num_cols = csv.ncol
    skip_all = list(range(num_cols))
    skip_even = list(range(0, num_cols, 2))
    skip_odd = list(range(1, num_cols, 2))
    skip_start_end = [0, num_cols - 1]
    # Stop before num_cols - 1 so that only the last column survives.
    skip_except_last = list(range(num_cols - 1))
    skip_except_first = list(range(1, num_cols))
    # Random half of the column indices, in ascending order.  Floor division
    # keeps the slice bound an int on Python 3 as well as Python 2.
    shuffled = list(range(num_cols))
    random.shuffle(shuffled)
    skip_random = sorted(shuffled[:num_cols // 2])

    # The "must fail" checks raise from ``else`` so that the failure signal
    # is not caught by the handler meant for the expected loader error.
    # NOTE(review): upload_file receives an hdfs:// URL here; confirm that it
    # fails because of the skipped columns and not merely because the path is
    # not a local file.
    for loader in (h2o.upload_file, h2o.import_file):
        try:
            loader(path, skipped_columns=skip_all)
        except Exception as ex:  # exact h2o exception type is not visible here
            print(ex)
        else:
            raise AssertionError(
                "{0} should have failed: all columns were skipped".format(loader.__name__))

    # skip even columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_even)

    # skip odd columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_odd)

    # skip the very beginning and the very end.
    pyunit_utils.checkCorrectSkips(csv, path, skip_start_end)

    # skip all except the last column
    pyunit_utils.checkCorrectSkips(csv, path, skip_except_last)

    # skip all except the very first column
    pyunit_utils.checkCorrectSkips(csv, path, skip_except_first)

    # randomly skipped half the columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_random)
def test_csv_parser_column_skip():
    """Verify column skipping when loading a generated CSV file.

    A synthetic frame is built with ``h2o.create_frame``, downloaded to
    ``in.csv`` inside a ``results`` directory and loaded back with several
    skip lists.  Loading with every column skipped is expected to raise for
    both ``h2o.upload_file`` and ``h2o.import_file``; the partial skip lists
    are verified by ``pyunit_utils.checkCorrectSkips``.

    Raises:
        AssertionError: if a load with all columns skipped succeeds.
    """
    nrow = 10000
    ncol = 100
    seed = 12345
    frac1 = 0.16
    frac2 = 0.2
    # Five column types at frac1 plus strings at frac2 add up to 1.0.
    f1 = h2o.create_frame(rows=nrow, cols=ncol, real_fraction=frac1, categorical_fraction=frac1, integer_fraction=frac1,
                          binary_fraction=frac1, time_fraction=frac1, string_fraction=frac2, missing_fraction=0.1,
                          has_response=False, seed=seed)
    # NOTE(review): '__file__' is a string literal, so realpath() resolves it
    # against the current working directory, not this module's location.
    # Kept unchanged in case the test runner relies on it -- confirm.
    tmpdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results"))
    if not os.path.isdir(tmpdir):
        os.mkdir(tmpdir)
    savefilenamewithpath = os.path.join(tmpdir, 'in.csv')
    h2o.download_csv(f1, savefilenamewithpath)

    num_cols = f1.ncol
    skip_all = list(range(num_cols))
    skip_even = list(range(0, num_cols, 2))
    skip_odd = list(range(1, num_cols, 2))
    skip_start_end = [0, num_cols - 1]
    # range() stops before num_cols - 1, so exactly one column (the last)
    # is kept.
    skip_except_last = list(range(num_cols - 1))
    skip_except_first = list(range(1, num_cols))
    # Random half of the column indices, in ascending order.
    shuffled = list(range(num_cols))
    random.shuffle(shuffled)
    skip_random = sorted(shuffled[:num_cols // 2])

    # Skipping every column must be rejected by both loaders.  The failure is
    # raised from the ``else`` branch so it cannot be swallowed by the handler
    # that catches the expected error.
    for loader in (h2o.upload_file, h2o.import_file):
        try:
            loader(savefilenamewithpath, skipped_columns=skip_all)
        except Exception as ex:  # exact h2o exception type is not visible here
            print(ex)
        else:
            raise AssertionError(
                "{0} should have failed: all columns were skipped".format(loader.__name__))

    # skip even columns
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_even)

    # skip odd columns
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_odd)

    # skip the very beginning and the very end.
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_start_end)

    # skip all except the last column
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_except_last)

    # skip all except the very first column
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_except_first)

    # randomly skipped half the columns
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_random)
def import_gzip_skipped_columns():
    """Verify column skipping when loading ``AirlinesTrain.csv.zip``.

    The uncompressed ``AirlinesTrain.csv`` is imported as the reference
    frame.  The compressed file is then loaded with every column skipped,
    which is expected to raise for both ``h2o.import_file`` and
    ``h2o.upload_file``, and with several partial skip lists that are
    verified by ``pyunit_utils.checkCorrectSkips``.

    Raises:
        AssertionError: if a load with all columns skipped succeeds.
    """
    airlineCSV = h2o.import_file(path=pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv"))
    filePath = pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip")

    num_cols = airlineCSV.ncol
    skip_all = list(range(num_cols))
    skip_even = list(range(0, num_cols, 2))
    skip_odd = list(range(1, num_cols, 2))
    skip_start_end = [0, num_cols - 1]
    # Stop before num_cols - 1 so that only the last column survives.
    skip_except_last = list(range(num_cols - 1))
    skip_except_first = list(range(1, num_cols))
    # Random half of the column indices, in ascending order.
    shuffled = list(range(num_cols))
    random.shuffle(shuffled)
    skip_random = sorted(shuffled[:num_cols // 2])

    # Both loaders must reject a request that skips every column; a load that
    # unexpectedly succeeds is reported with a descriptive failure.
    for loader in (h2o.import_file, h2o.upload_file):
        try:
            loader(filePath, skipped_columns=skip_all)  # skipped all
        except Exception as ex:  # exact h2o exception type is not visible here
            print(ex)
        else:
            raise AssertionError(
                "{0} should have failed: all columns were skipped".format(loader.__name__))

    # skip even columns
    pyunit_utils.checkCorrectSkips(airlineCSV, filePath, skip_even)

    # skip odd columns
    pyunit_utils.checkCorrectSkips(airlineCSV, filePath, skip_odd)

    # skip the very beginning and the very end.
    pyunit_utils.checkCorrectSkips(airlineCSV, filePath, skip_start_end)

    # skip all except the last column
    pyunit_utils.checkCorrectSkips(airlineCSV, filePath, skip_except_last)

    # skip all except the very first column
    pyunit_utils.checkCorrectSkips(airlineCSV, filePath, skip_except_first)

    # randomly skipped half the columns
    pyunit_utils.checkCorrectSkips(airlineCSV, filePath, skip_random)