def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_milsongs_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/milsongs_orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/milsongs_csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            multi_file_csv = h2o.import_file(url_csv)
            multi_file_orc = h2o.import_file(url_orc)

            multi_file_csv.summary()
            csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

            multi_file_orc.summary()
            orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError
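# ---------------------------------------------------------------------------
# Hedged sketch, not part of the test above: in the h2o-3 repository each of
# these functions lives in its own pyunit file, which supplies the imports the
# tests rely on and a small runner at the bottom. The relative sys.path fix-up
# and the standalone_test() helper are assumptions based on that convention.
import sys
sys.path.insert(1, "../../../")     # repo-layout assumption: make the tests/ package importable
import time
import random

import h2o
from tests import pyunit_utils      # h2o-3 test-harness helpers used by every test below

if __name__ == "__main__":
    # standalone_test() is assumed to set up the H2O connection before invoking the test.
    pyunit_utils.standalone_test(hdfs_orc_parser)
# ---------------------------------------------------------------------------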
def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        # run a quick test to determine if the hive-exec is too old.
        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_prostate_orc.py"))
            pass
        else:
            tol_time = 200              # comparing in ms or ns
            tol_numeric = 1e-5          # tolerance for comparing other numeric fields
            numElements2Compare = 10    # choose number of elements per column to compare.  Save test time.

            hdfs_orc_file = "/datasets/orc_parser/orc/prostate_NA.orc"
            hdfs_csv_file = "/datasets/orc_parser/csv/prostate_NA.csv"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            h2oOrc = h2o.import_file(url_orc)
            h2oCsv = h2o.import_file(url_csv)

            # compare the two frames
            assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \
                "H2O frame parsed from orc and csv files are different!"
    else:
        raise EnvironmentError
def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_iris_import_types_orc.py"))
            pass
        else:
            numElements2Compare = 100
            tol_time = 200
            tol_numeric = 1e-5

            hdfs_orc_file = "/datasets/orc_parser/orc/iris.orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/csv/iris.csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            h2oframe_csv = h2o.import_file(url_csv)
            data_types = ['real', 'real', 'real', 'real', 'enum']
            h2oframe_orc = h2o.import_file(url_orc, col_types=data_types)

            # compare the two frames
            assert pyunit_utils.compare_frames(h2oframe_orc, h2oframe_csv, numElements2Compare,
                                               tol_time, tol_numeric, True), \
                "H2O frame parsed from orc and csv files are different!"
    else:
        raise EnvironmentError
def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_import_folder_airline_05_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/air05_orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/air05_csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            startcsv = time.time()
            multi_file_csv = h2o.import_file(url_csv, na_strings=['\\N'])
            endcsv = time.time()

            csv_type_dict = multi_file_csv.types

            multi_file_csv.summary()
            csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

            # build col_types ordered by column index so the ORC import is forced
            # to use the same column types as the CSV parse
            col_ind_name = dict()
            for key_name in list(csv_type_dict):
                col_ind = key_name.split('C')
                new_ind = int(col_ind[1]) - 1
                col_ind_name[new_ind] = key_name

            col_types = []
            for ind in range(len(col_ind_name)):
                col_types.append(csv_type_dict[col_ind_name[ind]])

            startorc1 = time.time()
            multi_file_orc1 = h2o.import_file(url_orc)
            endorc1 = time.time()
            h2o.remove(multi_file_orc1)

            startorc = time.time()
            multi_file_orc = h2o.import_file(url_orc, col_types=col_types)
            endorc = time.time()

            multi_file_orc.summary()
            orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

            print("************** CSV parse time is {0}".format(endcsv - startcsv))
            print("************** ORC (without column type forcing) parse time is {0}".format(endorc1 - startorc1))
            print("************** ORC (with column type forcing) parse time is {0}".format(endorc - startorc))

            # compare the frame parsed from ORC (with forced column types) against the CSV frame
            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError
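# Hedged sketch, not part of the original test: the index-ordering loop above can
# be written as one small helper. It assumes only H2O's default "C1".."Cn" column
# names, which is what the test itself relies on; the helper is plain Python.
def ordered_col_types(type_dict):
    """Return column types sorted by the numeric part of H2O's C<i> column names."""
    return [type_dict[name] for name in sorted(type_dict, key=lambda n: int(n.lstrip('C')))]

# Example: ordered_col_types({'C2': 'enum', 'C10': 'real', 'C1': 'real'})
# returns ['real', 'enum', 'real'] (C1, C2, C10 in numeric order).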
def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        numElements2Compare = 10
        tol_time = 200
        tol_numeric = 1e-5

        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_orc_parser.py"))
            pass
        else:
            allOrcFiles = ["/datasets/orc_parser/orc/TestOrcFile.columnProjection.orc",
                           "/datasets/orc_parser/orc/bigint_single_col.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.emptyFile.orc",
                           "/datasets/orc_parser/orc/bool_single_col.orc",
                           "/datasets/orc_parser/orc/demo-11-zlib.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testDate1900.orc",
                           "/datasets/orc_parser/orc/demo-12-zlib.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testDate2038.orc",
                           "/datasets/orc_parser/orc/double_single_col.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testMemoryManagementV11.orc",
                           "/datasets/orc_parser/orc/float_single_col.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testMemoryManagementV12.orc",
                           "/datasets/orc_parser/orc/int_single_col.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testPredicatePushdown.orc",
                           "/datasets/orc_parser/orc/nulls-at-end-snappy.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testSnappy.orc",
                           "/datasets/orc_parser/orc/orc_split_elim.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testStringAndBinaryStatistics.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testStripeLevelStats.orc",
                           "/datasets/orc_parser/orc/smallint_single_col.orc",
                           "/datasets/orc_parser/orc/string_single_col.orc",
                           "/datasets/orc_parser/orc/tinyint_single_col.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testWithoutIndex.orc"]

            for fIndex in range(len(allOrcFiles)):
                url_orc = "hdfs://{0}{1}".format(hdfs_name_node, allOrcFiles[fIndex])
                tab_test = h2o.import_file(url_orc)  # each file should import without error
    else:
        raise EnvironmentError
def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_timestamp_date_orc.py"))
            pass
        else:
            origTZ = h2o.cluster().timezone
            newZone = 'America/Los_Angeles'
            h2o.cluster().timezone = newZone

            tol_time = 200              # comparing in ms or ns
            tol_numeric = 1e-5          # tolerance for comparing other numeric fields
            numElements2Compare = 100   # choose number of elements per column to compare.  Save test time.

            allOrcFiles = ["/datasets/orc_parser/orc/TestOrcFile.testDate1900.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testDate2038.orc",
                           "/datasets/orc_parser/orc/orc_split_elim.orc"]
            allCsvFiles = ["/datasets/orc_parser/csv/TestOrcFile.testDate1900.csv",
                           "/datasets/orc_parser/csv/TestOrcFile.testDate2038.csv",
                           "/datasets/orc_parser/csv/orc_split_elim.csv"]

            for fIndex in range(len(allOrcFiles)):
                url_orc = "hdfs://{0}{1}".format(hdfs_name_node, allOrcFiles[fIndex])
                url_csv = "hdfs://{0}{1}".format(hdfs_name_node, allCsvFiles[fIndex])
                h2oOrc = h2o.import_file(url_orc)
                h2oCsv = h2o.import_file(url_csv)

                # compare the two frames
                assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \
                    "H2O frame parsed from orc and csv files are different!"

            h2o.cluster().timezone = origTZ
    else:
        raise EnvironmentError
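# Hedged sketch, not part of the original test: if a comparison above fails, the
# cluster is left in the new timezone. A try/finally wrapper guarantees origTZ is
# restored; it uses only the h2o.cluster().timezone property already used above.
def with_timezone(zone, body):
    """Run body() with the H2O cluster timezone temporarily set to zone."""
    orig = h2o.cluster().timezone
    h2o.cluster().timezone = zone
    try:
        body()
    finally:
        h2o.cluster().timezone = orig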
def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_baddata_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/orc/TestOrcFile.testStringAndBinaryStatistics.orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            print("Parsing the orc file {0}".format(url_orc))
            assert pyunit_utils.expect_warnings(url_orc, warn_phrase="UserWarning:",
                                                warn_string_of_interest="Skipping field:",
                                                in_hdfs=True, number_of_times=1), \
                "Expect warnings from orc parser for file " + url_orc + "!"

            hdfs_orc_file = "/datasets/orc_parser/orc/TestOrcFile.emptyFile.orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            print("Parsing the orc file {0}".format(url_orc))
            assert pyunit_utils.expect_warnings(url_orc, warn_phrase="UserWarning:",
                                                warn_string_of_interest="Skipping field:",
                                                in_hdfs=True, number_of_times=1), \
                "Expect warnings from orc parser for file " + url_orc + "!"

            hdfs_orc_file = "/datasets/orc_parser/orc/nulls-at-end-snappy.orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            print("Parsing the orc file {0}".format(url_orc))
            assert pyunit_utils.expect_warnings(url_orc, warn_phrase="UserWarning:",
                                                warn_string_of_interest="Long.MIN_VALUE:",
                                                in_hdfs=True, number_of_times=1), \
                "Expect warnings from orc parser for file " + url_orc + "!"
    else:
        raise EnvironmentError
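# Hedged sketch, not part of the original test: roughly the kind of check that
# expect_warnings() performs, written here with the standard warnings module.
# The real pyunit_utils helper may capture warnings differently; only
# h2o.import_file() and the warning phrases come from the test above.
import warnings

def count_parser_warnings(url, phrase):
    """Import a file and count UserWarnings whose message contains phrase."""
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        frame = h2o.import_file(url)
    return frame, sum(phrase in str(w.message) for w in caught)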
def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_import_folder_orc.py"))
            pass
        else:
            tol_time = 200              # comparing in ms or ns
            tol_numeric = 1e-5          # tolerance for comparing other numeric fields
            numElements2Compare = 0     # choose number of elements per column to compare.  Save test time.

            hdfs_csv_file1 = "/datasets/orc_parser/csv/balunbal.csv"
            url_csv1 = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file1)
            multi_file_csv1 = h2o.import_file(url_csv1)

            hdfs_csv_file2 = "/datasets/orc_parser/csv/unbalbal.csv"
            url_csv2 = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file2)
            multi_file_csv2 = h2o.import_file(url_csv2)

            hdfs_orc_file = "/datasets/orc_parser/synthetic_perfect_separation_orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            multi_file_orc = h2o.import_file(url_orc)

            # the frame parsed from the ORC folder should match one of the two CSV files
            try:
                assert pyunit_utils.compare_frames(multi_file_orc, multi_file_csv1, numElements2Compare,
                                                   tol_time=tol_time, tol_numeric=tol_numeric, strict=True), \
                    "H2O frame parsed from the orc folder and the csv file are different!"
            except Exception:
                assert pyunit_utils.compare_frames(multi_file_orc, multi_file_csv2, numElements2Compare,
                                                   tol_time=tol_time, tol_numeric=tol_numeric, strict=True), \
                    "H2O frame parsed from the orc folder and the csv file are different!"
    else:
        raise EnvironmentError
def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_import_folder_orc.py"))
            pass
        else:
            # import a folder that mixes ORC and CSV copies of the milsongs dataset
            mix_folder = "/datasets/orc_csv_same_milsongs"
            url_csv1 = "hdfs://{0}{1}".format(hdfs_name_node, mix_folder)
            multi_file_mixed = h2o.import_file(url_csv1)
    else:
        raise EnvironmentError
def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_import_folder_orc.py"))
            pass
        else:
            # import a folder containing a mix of milsongs ORC files and airlines CSV files
            mix_folder = "/datasets/milsongs_orc_air_csv"
            url_csv1 = "hdfs://{0}{1}".format(hdfs_name_node, mix_folder)
            multi_file_mixed = h2o.import_file(url_csv1)
    else:
        raise EnvironmentError
def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_timestamp_date_orc.py"))
            pass
        else:
            tol_time = 200              # comparing in ms or ns
            tol_numeric = 1e-5          # tolerance for comparing other numeric fields
            numElements2Compare = 100   # choose number of elements per column to compare.  Save test time.

            allOrcFiles = ["/datasets/orc_parser/orc/TestOrcFile.testDate1900.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testDate2038.orc",
                           "/datasets/orc_parser/orc/orc_split_elim.orc"]
            allCsvFiles = ["/datasets/orc_parser/csv/TestOrcFile.testDate1900.csv",
                           "/datasets/orc_parser/csv/TestOrcFile.testDate2038.csv",
                           "/datasets/orc_parser/csv/orc_split_elim.csv"]

            for fIndex in range(len(allOrcFiles)):
                url_orc = "hdfs://{0}{1}".format(hdfs_name_node, allOrcFiles[fIndex])
                url_csv = "hdfs://{0}{1}".format(hdfs_name_node, allCsvFiles[fIndex])
                h2oOrc = h2o.import_file(url_orc)
                h2oCsv = h2o.import_file(url_csv)

                # compare the two frames
                assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \
                    "H2O frame parsed from orc and csv files are different!"
    else:
        raise EnvironmentError
def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        numElements2Compare = 10
        tol_time = 200
        tol_numeric = 1e-5

        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_airlines_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/airlines_all_orc_parts"
            hdfs_csv_file = "/datasets/air_csv_part"

            col_types = ['real', 'real', 'real', 'real', 'real', 'real', 'real', 'real', 'enum', 'real',
                         'enum', 'real', 'real', 'enum', 'real', 'real', 'enum', 'enum', 'real', 'enum',
                         'enum', 'real', 'real', 'real', 'enum', 'enum', 'enum', 'enum', 'enum', 'enum',
                         'enum']

            # import CSV file
            print("Import airlines 116M dataset in original csv format from HDFS")
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            startcsv = time.time()
            multi_file_csv = h2o.import_file(url_csv, na_strings=['\\N'], col_types=col_types)
            endcsv = time.time()

            startcsv1 = time.time()
            multi_file_csv1 = h2o.import_file(url_csv)
            endcsv1 = time.time()
            h2o.remove(multi_file_csv1)

            multi_file_csv.summary()
            csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

            # import ORC file with same column types as CSV file
            print("Import airlines 116M dataset in ORC format from HDFS")
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)

            startorc1 = time.time()
            multi_file_orc1 = h2o.import_file(url_orc)
            endorc1 = time.time()
            h2o.remove(multi_file_orc1)

            startorc = time.time()
            multi_file_orc = h2o.import_file(url_orc, col_types=col_types)
            endorc = time.time()

            multi_file_orc.summary()
            orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

            print("************** CSV (without column type forcing) parse time is {0}".format(endcsv1 - startcsv1))
            print("************** CSV (with column type forcing) parse time is {0}".format(endcsv - startcsv))
            print("************** ORC (without column type forcing) parse time is {0}".format(endorc1 - startorc1))
            print("************** ORC (with column type forcing) parse time is {0}".format(endorc - startorc))

            # compare the frame parsed from ORC (with forced column types) against the CSV frame
            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError
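# Hedged sketch, not part of the original test: the repeated start/stop timing
# above can be wrapped in a small context manager. Plain Python; no H2O calls.
import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    """Print a parse-time line in the same format the test uses."""
    start = time.time()
    yield
    print("************** {0} parse time is {1}".format(label, time.time() - start))

# Usage (illustrative):
#   with timed("ORC (with column type forcing)"):
#       multi_file_orc = h2o.import_file(url_orc, col_types=col_types)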
def import_folder_orc():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_airlines_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/prostate_NA.orc"
            hdfs_csv_file = "/datasets/orc_parser/prostate_NA.csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)

            csv = h2o.import_file(url_csv, na_strings=['\\N'])
            multi_file_orc1 = h2o.import_file(url_orc)
            pyunit_utils.compare_frames_local(csv, multi_file_orc1, prob=1)  # should be the same here.

            path = url_orc
            skip_all = list(range(csv.ncol))
            skip_even = list(range(0, csv.ncol, 2))
            skip_odd = list(range(1, csv.ncol, 2))
            skip_start_end = [0, csv.ncol - 1]
            skip_except_last = list(range(0, csv.ncol - 2))
            skip_except_first = list(range(1, csv.ncol))
            temp = list(range(0, csv.ncol))
            random.shuffle(temp)
            skip_random = []
            for index in range(0, csv.ncol // 2):   # integer division; csv.ncol / 2 is a float in Python 3
                skip_random.append(temp[index])
            skip_random.sort()

            # skipping all columns must fail; catch Exception (not a bare except) so the
            # SystemExit raised by sys.exit(1) is not swallowed when no error occurs.
            try:
                loadFileSkipAll = h2o.upload_file(path, skipped_columns=skip_all)
                sys.exit(1)  # should have failed here
            except Exception:
                pass

            try:
                importFileSkipAll = h2o.import_file(path, skipped_columns=skip_all)
                sys.exit(1)  # should have failed here
            except Exception:
                pass

            # skip even columns
            pyunit_utils.checkCorrectSkips(csv, path, skip_even)

            # skip odd columns
            pyunit_utils.checkCorrectSkips(csv, path, skip_odd)

            # skip the very beginning and the very end.
            pyunit_utils.checkCorrectSkips(csv, path, skip_start_end)

            # skip all except the last column
            pyunit_utils.checkCorrectSkips(csv, path, skip_except_last)

            # skip all except the very first column
            pyunit_utils.checkCorrectSkips(csv, path, skip_except_first)

            # randomly skipped half the columns
            pyunit_utils.checkCorrectSkips(csv, path, skip_random)
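# Hedged sketch, not part of the original test: a minimal manual spot-check of
# column skipping, independent of pyunit_utils.checkCorrectSkips (whose exact
# checks are not shown here). It uses only h2o.import_file(skipped_columns=...)
# and Frame.ncol, both of which appear in the test above.
def assert_skip_count(path, full_frame, skipped):
    """Verify that skipping `skipped` columns removes exactly that many columns."""
    skipped_frame = h2o.import_file(path, skipped_columns=skipped)
    expected = full_frame.ncol - len(skipped)
    assert skipped_frame.ncol == expected, \
        "expected {0} columns after skipping {1}, got {2}".format(expected, len(skipped), skipped_frame.ncol)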