def test_csv_parser_column_skip():
    """Round-trip a synthetic frame through CSV and re-import it with skipped columns.

    A frame with mixed column types is created, downloaded to ``in.csv`` and
    then imported again with several ``skipped_columns`` settings. Importing
    with every column skipped is expected to raise.

    Raises:
        AssertionError: if importing with all columns skipped succeeds.
    """
    # generate a big frame with all datatypes and save it to csv
    nrow = 10000
    ncol = 100
    seed = 12345
    frac1 = 0.16
    frac2 = 0.2
    f1 = h2o.create_frame(rows=nrow, cols=ncol, real_fraction=frac1,
                          categorical_fraction=frac1, integer_fraction=frac1,
                          binary_fraction=frac1, time_fraction=frac1,
                          string_fraction=frac2, missing_fraction=0.1,
                          has_response=False, seed=seed)

    # NOTE(review): '__file__' is a string literal, so this resolves against
    # the current working directory, not this module's location - confirm
    # that is intended before changing it.
    tmpdir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results"))
    if not os.path.isdir(tmpdir):
        os.mkdir(tmpdir)
    savefilenamewithpath = os.path.join(tmpdir, 'in.csv')
    h2o.download_csv(f1, savefilenamewithpath)

    skip_all = list(range(f1.ncol))
    skip_start_end = [0, f1.ncol - 1]
    # NOTE(review): this keeps the last TWO columns although the name says
    # "except last"; left unchanged to preserve the existing test coverage.
    skip_except_last = list(range(0, f1.ncol - 2))
    skip_except_first = list(range(1, f1.ncol))
    temp = list(range(0, f1.ncol))
    random.shuffle(temp)
    skip_random = sorted(temp[:f1.ncol // 2])  # random half of the columns

    # Skipping every column must fail. The failure signal lives in the
    # ``else`` branch so it cannot be swallowed by the handler (the previous
    # bare ``except:`` also caught the SystemExit raised by sys.exit(1)).
    try:
        h2o.import_file(savefilenamewithpath, skipped_columns=skip_all)
    except Exception as ex:
        print(ex)
    else:
        raise AssertionError("import_file should have failed because all columns are skipped")

    # In order: first+last skipped, all but the tail skipped, all but the
    # first skipped, random half skipped.
    for skipped in (skip_start_end, skip_except_last, skip_except_first, skip_random):
        pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skipped)
def test_parquet_parser_column_skip():
    """Import a Parquet file with various ``skipped_columns`` settings.

    The airlines CSV archive is imported as the reference frame and first
    compared with the Parquet file imported without skipping. The Parquet
    file is then re-imported with several skip lists. Uploading or importing
    with every column skipped is expected to raise.

    Raises:
        AssertionError: if loading with all columns skipped succeeds.
    """
    csv = h2o.import_file(path=pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    path = pyunit_utils.locate("smalldata/parser/parquet/airlines-simple.snappy.parquet")
    parquetNoSkip = h2o.import_file(path=path)
    pyunit_utils.compare_frames_local(csv, parquetNoSkip, prob=1)  # should be the same here.

    skip_all = list(range(csv.ncol))
    skip_even = list(range(0, csv.ncol, 2))
    skip_odd = list(range(1, csv.ncol, 2))
    skip_start_end = [0, csv.ncol - 1]
    # NOTE(review): this keeps the last TWO columns although the name says
    # "except last"; left unchanged to preserve the existing test coverage.
    skip_except_last = list(range(0, csv.ncol - 2))
    skip_except_first = list(range(1, csv.ncol))
    temp = list(range(0, csv.ncol))
    random.shuffle(temp)
    skip_random = sorted(temp[:csv.ncol // 2])  # random half of the columns

    # Skipping every column must fail for both loaders. The failure signal
    # lives in the ``else`` branch so it cannot be swallowed by the handler
    # (the previous bare ``except:`` also caught SystemExit from sys.exit(1)).
    for loader in (h2o.upload_file, h2o.import_file):
        try:
            loader(path, skipped_columns=skip_all)
        except Exception as ex:
            print(ex)
        else:
            raise AssertionError(
                "{0} should have failed because all columns are skipped".format(loader.__name__))

    # In order: even, odd, first+last, all but the tail, all but the first,
    # random half.
    for skipped in (skip_even, skip_odd, skip_start_end, skip_except_last,
                    skip_except_first, skip_random):
        pyunit_utils.checkCorrectSkips(csv, path, skipped)
def test_parquet_parser_column_skip():
    """Check ``skipped_columns`` handling of the Parquet parser.

    The reference frame comes from the airlines CSV archive. The Parquet file
    imported without skipping is compared against it, and the Parquet file is
    then re-imported with several skip lists. Uploading or importing with
    every column skipped is expected to raise.

    Raises:
        AssertionError: if loading with all columns skipped succeeds.
    """
    csv = h2o.import_file(
        path=pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    path = pyunit_utils.locate(
        "smalldata/parser/parquet/airlines-simple.snappy.parquet")
    parquetNoSkip = h2o.import_file(path=path)
    pyunit_utils.compare_frames_local(csv, parquetNoSkip, prob=1)  # should be the same here.

    ncol = csv.ncol
    skip_all = list(range(ncol))
    skip_even = list(range(0, ncol, 2))
    skip_odd = list(range(1, ncol, 2))
    skip_start_end = [0, ncol - 1]
    # NOTE(review): this keeps the last TWO columns although the name says
    # "except last"; left unchanged to preserve the existing test coverage.
    skip_except_last = list(range(0, ncol - 2))
    skip_except_first = list(range(1, ncol))
    temp = list(range(0, ncol))
    random.shuffle(temp)
    skip_random = sorted(temp[:ncol // 2])  # random half of the columns

    # Skipping every column must fail. The previous ``sys.exit(1)`` inside a
    # bare ``except:`` was itself caught (SystemExit), so the check could
    # never fail; raising from ``else`` keeps it outside the handler.
    try:
        h2o.upload_file(path, skipped_columns=skip_all)
    except Exception as ex:
        print(ex)
    else:
        raise AssertionError("upload_file should have failed because all columns are skipped")

    try:
        h2o.import_file(path, skipped_columns=skip_all)
    except Exception as ex:
        print(ex)
    else:
        raise AssertionError("import_file should have failed because all columns are skipped")

    pyunit_utils.checkCorrectSkips(csv, path, skip_even)  # skip even columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_odd)  # skip odd columns
    # skip the very beginning and the very end.
    pyunit_utils.checkCorrectSkips(csv, path, skip_start_end)
    # skip everything except the tail of the frame
    pyunit_utils.checkCorrectSkips(csv, path, skip_except_last)
    # skip all except the very first column
    pyunit_utils.checkCorrectSkips(csv, path, skip_except_first)
    # randomly skipped half the columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_random)
def import_zip_skipped_columns():
    """Check ``skipped_columns`` handling when parsing ``smalldata/jira/adult.gz``.

    The archive is imported in full as the reference frame, then re-imported
    with several skip lists. Importing or uploading with every column skipped
    is expected to raise.

    Raises:
        AssertionError: if loading with all columns skipped succeeds.
    """
    # checking out zip file
    # NOTE(review): the local is called airlineFull but the file is adult.gz.
    filePath = pyunit_utils.locate("smalldata/jira/adult.gz")
    airlineFull = h2o.import_file(path=filePath)

    skip_all = list(range(airlineFull.ncol))
    skip_even = list(range(0, airlineFull.ncol, 2))
    skip_odd = list(range(1, airlineFull.ncol, 2))
    skip_start_end = [0, airlineFull.ncol - 1]
    # NOTE(review): this keeps the last TWO columns although the name says
    # "except last"; left unchanged to preserve the existing test coverage.
    skip_except_last = list(range(0, airlineFull.ncol - 2))
    skip_except_first = list(range(1, airlineFull.ncol))
    temp = list(range(0, airlineFull.ncol))
    random.shuffle(temp)
    skip_random = sorted(temp[:airlineFull.ncol // 2])  # random half of the columns

    # Skipping every column must fail. The previous ``assert False`` lived
    # inside the ``try`` and its AssertionError was swallowed by
    # ``except Exception``; raising from ``else`` keeps it out of the handler.
    try:
        h2o.import_file(filePath, skipped_columns=skip_all)  # skipped all
    except Exception as ex:
        print(ex)
    else:
        raise AssertionError("Test should have thrown an exception due to all columns are skipped")

    try:
        h2o.upload_file(filePath, skipped_columns=skip_all)  # skipped all
    except Exception as ex:
        print(ex)
    else:
        raise AssertionError("Test should have thrown an exception due to all columns are skipped")

    # In order: odd, even, first+last, all but the tail, all but the first,
    # random half.
    for skipped in (skip_odd, skip_even, skip_start_end, skip_except_last,
                    skip_except_first, skip_random):
        pyunit_utils.checkCorrectSkips(airlineFull, filePath, skipped)
def import_folder_orc():
    """Check ``skipped_columns`` handling of the ORC parser on HDFS.

    Does nothing when the Hadoop namenode is not accessible, and only prints
    a notice when ``pyunit_utils.cannaryHDFSTest`` reports a problem with the
    canary ORC file. Otherwise the CSV and ORC versions of ``prostate_NA``
    are imported and compared, and the ORC file is re-imported with several
    skip lists. Loading with every column skipped is expected to raise.

    Raises:
        AssertionError: if loading with all columns skipped succeeds.
    """
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    if not pyunit_utils.hadoop_namenode_is_accessible():
        return

    hdfs_name_node = pyunit_utils.hadoop_namenode()
    if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
        print("Your hive-exec version is too old. Orc parser test {0} is "
              "skipped.".format("pyunit_INTERNAL_HDFS_airlines_orc.py"))
        return

    hdfs_orc_file = "/datasets/orc_parser/prostate_NA.orc"
    hdfs_csv_file = "/datasets/orc_parser/prostate_NA.csv"
    url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)
    url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
    csv = h2o.import_file(url_csv, na_strings=['\\N'])
    multi_file_orc1 = h2o.import_file(url_orc)
    pyunit_utils.compare_frames_local(csv, multi_file_orc1, prob=1)  # should be the same here.

    path = url_orc
    skip_all = list(range(csv.ncol))
    skip_even = list(range(0, csv.ncol, 2))
    skip_odd = list(range(1, csv.ncol, 2))
    skip_start_end = [0, csv.ncol - 1]
    # NOTE(review): this keeps the last TWO columns although the name says
    # "except last"; left unchanged to preserve the existing test coverage.
    skip_except_last = list(range(0, csv.ncol - 2))
    skip_except_first = list(range(1, csv.ncol))
    temp = list(range(0, csv.ncol))
    random.shuffle(temp)
    # Integer division: range()/slicing need an int, and ``csv.ncol / 2`` is
    # a float on Python 3.
    skip_random = sorted(temp[:csv.ncol // 2])

    # Skipping every column must fail for both loaders. The failure signal
    # lives in the ``else`` branch so it cannot be swallowed by the handler
    # (the previous bare ``except:`` also caught SystemExit from sys.exit(1)).
    for loader in (h2o.upload_file, h2o.import_file):
        try:
            loader(path, skipped_columns=skip_all)
        except Exception as ex:
            print(ex)
        else:
            raise AssertionError(
                "{0} should have failed because all columns are skipped".format(loader.__name__))

    # In order: even, odd, first+last, all but the tail, all but the first,
    # random half.
    for skipped in (skip_even, skip_odd, skip_start_end, skip_except_last,
                    skip_except_first, skip_random):
        pyunit_utils.checkCorrectSkips(csv, path, skipped)
def import_gzip_skipped_columns():
    """Exercise ``skipped_columns`` against the zipped airlines training file.

    The plain ``AirlinesTrain.csv`` is imported as the reference frame; the
    ``AirlinesTrain.csv.zip`` archive is the file parsed with each skip list.
    If loading with every column skipped succeeds, the process exits with
    status 1.
    """
    # checking out zip file
    reference = h2o.import_file(
        path=pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv"))
    archive = pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip")

    width = reference.ncol
    everything = list(range(width))
    evens = list(range(0, width, 2))
    odds = list(range(1, width, 2))
    both_ends = [0, width - 1]
    all_but_tail = list(range(0, width - 2))
    all_but_first = list(range(1, width))

    # pick a random half of the column indices, kept in ascending order
    shuffled = list(range(0, width))
    random.shuffle(shuffled)
    random_half = sorted(shuffled[:width // 2])

    # skipped all: import must raise, otherwise bail out
    try:
        h2o.import_file(archive, skipped_columns=everything)
    except Exception as err:
        print(err)
    else:
        sys.exit(1)

    # skipped all: upload must raise, otherwise bail out
    try:
        h2o.upload_file(archive, skipped_columns=everything)
    except Exception as err:
        print(err)
    else:
        sys.exit(1)

    # even, odd, first+last, all but the tail, all but the first, random half
    for skip_list in (evens, odds, both_ends, all_but_tail, all_but_first, random_half):
        pyunit_utils.checkCorrectSkips(reference, archive, skip_list)
def import_folder():
    """
    Build one H2O frame from smalldata/parser/csv2orc/prostate_NA.csv and
    another from smalldata/parser/orc/prostate_NA.orc, and compare the two.
    The ORC file is then parsed again with a range of skipped_columns lists.
    If loading with every column skipped succeeds, the process exits with
    status 1.

    :return: None if passed.  Otherwise, an exception will be thrown.
    """
    csv = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/csv2orc/prostate_NA.csv"),
                          na_strings=['\\N'])
    path = pyunit_utils.locate("smalldata/parser/orc/prostate_NA.orc")
    multi_file_orc1 = h2o.import_file(path=path)
    pyunit_utils.compare_frames_local(csv, multi_file_orc1, prob=0.01)  # should be the same here.

    width = csv.ncol
    every_column = list(range(width))
    evens = list(range(0, width, 2))
    odds = list(range(1, width, 2))
    both_ends = [0, width - 1]
    all_but_tail = list(range(0, width - 2))
    all_but_first = list(range(1, width))

    # a random half of the column indices, in ascending order
    shuffled = list(range(0, width))
    random.shuffle(shuffled)
    random_half = sorted(shuffled[:width // 2])

    # upload with everything skipped: should have failed here
    try:
        h2o.upload_file(path, skipped_columns=every_column)
    except Exception as err:
        print(err)
    else:
        sys.exit(1)

    # import with everything skipped: should have failed here
    try:
        h2o.import_file(path, skipped_columns=every_column)
    except Exception as err:
        print(err)
    else:
        sys.exit(1)

    # even, odd, first+last, all but the tail, all but the first, random half
    for skip_list in (evens, odds, both_ends, all_but_tail, all_but_first, random_half):
        pyunit_utils.checkCorrectSkips(csv, path, skip_list)
def import_folder_orc():
    """Check ``skipped_columns`` handling of the ORC parser on HDFS.

    Returns without testing when the Hadoop namenode is not accessible, or
    (after printing a notice) when ``pyunit_utils.cannaryHDFSTest`` flags the
    canary ORC file. Otherwise the CSV and ORC versions of ``prostate_NA``
    are imported and compared, and the ORC file is re-imported with several
    skip lists. Loading with every column skipped is expected to raise.

    Raises:
        AssertionError: if loading with all columns skipped succeeds.
    """
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if not hadoop_namenode_is_accessible:
        return

    hdfs_name_node = pyunit_utils.hadoop_namenode()
    if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
        print("Your hive-exec version is too old. Orc parser test {0} is "
              "skipped.".format("pyunit_INTERNAL_HDFS_airlines_orc.py"))
        return

    hdfs_orc_file = "/datasets/orc_parser/prostate_NA.orc"
    hdfs_csv_file = "/datasets/orc_parser/prostate_NA.csv"
    url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)
    url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
    csv = h2o.import_file(url_csv, na_strings=['\\N'])
    multi_file_orc1 = h2o.import_file(url_orc)
    pyunit_utils.compare_frames_local(csv, multi_file_orc1, prob=1)  # should be the same here.

    path = url_orc
    ncol = csv.ncol
    skip_all = list(range(ncol))
    skip_even = list(range(0, ncol, 2))
    skip_odd = list(range(1, ncol, 2))
    skip_start_end = [0, ncol - 1]
    # NOTE(review): this keeps the last TWO columns although the name says
    # "except last"; left unchanged to preserve the existing test coverage.
    skip_except_last = list(range(0, ncol - 2))
    skip_except_first = list(range(1, ncol))
    temp = list(range(0, ncol))
    random.shuffle(temp)
    # ``//`` instead of ``/``: true division yields a float on Python 3,
    # which range() rejects with a TypeError.
    skip_random = sorted(temp[:ncol // 2])

    # Skipping every column must fail. The previous ``sys.exit(1)`` inside a
    # bare ``except:`` was itself caught (SystemExit), so the check could
    # never fail; raising from ``else`` keeps it outside the handler.
    try:
        h2o.upload_file(path, skipped_columns=skip_all)
    except Exception as ex:
        print(ex)
    else:
        raise AssertionError("upload_file should have failed because all columns are skipped")

    try:
        h2o.import_file(path, skipped_columns=skip_all)
    except Exception as ex:
        print(ex)
    else:
        raise AssertionError("import_file should have failed because all columns are skipped")

    pyunit_utils.checkCorrectSkips(csv, path, skip_even)  # skip even columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_odd)  # skip odd columns
    # skip the very beginning and the very end.
    pyunit_utils.checkCorrectSkips(csv, path, skip_start_end)
    # skip everything except the tail of the frame
    pyunit_utils.checkCorrectSkips(csv, path, skip_except_last)
    # skip all except the very first column
    pyunit_utils.checkCorrectSkips(csv, path, skip_except_first)
    # randomly skipped half the columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_random)
def test_csv_parser_column_skip():
    """Round-trip a synthetic frame through CSV and re-load it with skipped columns.

    A frame with mixed column types is created, downloaded to ``in.csv`` and
    then loaded again with several ``skipped_columns`` settings. Uploading or
    importing with every column skipped is expected to raise.

    Raises:
        AssertionError: if loading with all columns skipped succeeds.
    """
    # generate a big frame with all datatypes and save it to csv
    nrow = 10000
    ncol = 100
    seed = 12345
    frac1 = 0.16
    frac2 = 0.2
    f1 = h2o.create_frame(rows=nrow, cols=ncol, real_fraction=frac1,
                          categorical_fraction=frac1, integer_fraction=frac1,
                          binary_fraction=frac1, time_fraction=frac1,
                          string_fraction=frac2, missing_fraction=0.1,
                          has_response=False, seed=seed)

    # NOTE(review): '__file__' is a string literal, so this resolves against
    # the current working directory, not this module's location - confirm
    # that is intended before changing it.
    tmpdir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results"))
    if not os.path.isdir(tmpdir):
        os.mkdir(tmpdir)
    savefilenamewithpath = os.path.join(tmpdir, 'in.csv')
    h2o.download_csv(f1, savefilenamewithpath)

    skip_all = list(range(f1.ncol))
    skip_even = list(range(0, f1.ncol, 2))
    skip_odd = list(range(1, f1.ncol, 2))
    skip_start_end = [0, f1.ncol - 1]
    # NOTE(review): this keeps the last TWO columns although the name says
    # "except last"; left unchanged to preserve the existing test coverage.
    skip_except_last = list(range(0, f1.ncol - 2))
    skip_except_first = list(range(1, f1.ncol))
    temp = list(range(0, f1.ncol))
    random.shuffle(temp)
    skip_random = sorted(temp[:f1.ncol // 2])  # random half of the columns

    # Skipping every column must fail for both loaders. The failure signal
    # lives in the ``else`` branch so it cannot be swallowed by the handler
    # (the previous bare ``except:`` also caught SystemExit from sys.exit(1)).
    for loader in (h2o.upload_file, h2o.import_file):
        try:
            loader(savefilenamewithpath, skipped_columns=skip_all)
        except Exception as ex:
            print(ex)
        else:
            raise AssertionError(
                "{0} should have failed because all columns are skipped".format(loader.__name__))

    # In order: even, odd, first+last, all but the tail, all but the first,
    # random half.
    for skipped in (skip_even, skip_odd, skip_start_end, skip_except_last,
                    skip_except_first, skip_random):
        pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skipped)
def import_gzip_skipped_columns():
    """Run the skipped-columns checks on ``AirlinesTrain.csv.zip``.

    The uncompressed ``AirlinesTrain.csv`` provides the reference frame and
    the column count from which the skip lists are built. Each list is handed
    to ``pyunit_utils.checkCorrectSkips`` together with the zipped file. If
    loading with every column skipped succeeds, the process exits with
    status 1.
    """
    # checking out zip file
    baseline = h2o.import_file(path=pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv"))
    zipped = pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip")
    n = baseline.ncol

    # draw the random half of the column indices, sorted ascending
    order = list(range(0, n))
    random.shuffle(order)
    random_half = order[:n // 2]
    random_half.sort()

    # skip lists in the order they are checked below
    scenarios = [
        list(range(0, n, 2)),      # skip even columns
        list(range(1, n, 2)),      # skip odd columns
        [0, n - 1],                # skip the very beginning and the very end
        list(range(0, n - 2)),     # skip everything except the tail
        list(range(1, n)),         # skip all except the very first column
        random_half,               # randomly skipped half the columns
    ]
    skip_everything = list(range(n))

    # skipped all: both loaders are expected to raise; exit if one does not
    try:
        h2o.import_file(zipped, skipped_columns=skip_everything)
    except Exception as exc:
        print(exc)
    else:
        sys.exit(1)
    try:
        h2o.upload_file(zipped, skipped_columns=skip_everything)
    except Exception as exc:
        print(exc)
    else:
        sys.exit(1)

    for skip_list in scenarios:
        pyunit_utils.checkCorrectSkips(baseline, zipped, skip_list)