def sql_table():
    """Check SQL imports of the ``citibike20k`` table against its CSV copy.

    The JDBC connection URL is read from the ``SQLCONNURL`` environment
    variable. Example values::

        jdbc:mysql://172.16.2.178:3306/ingestSQL?&useSSL=false
        jdbc:postgresql://mr-0xf2/ingestSQL

    The second ``:``-separated field of the URL (``mysql`` or
    ``postgresql``) selects the credentials; any other value leaves the
    username and password empty.

    Raises:
        ValueError: if ``SQLCONNURL`` is unset, empty, or contains no ``:``
            from which the database type could be read.
        AssertionError: if an imported frame does not match expectations.
    """
    conn_url = os.getenv("SQLCONNURL")
    if not conn_url:
        # os.getenv returns None when the variable is missing; fail with an
        # actionable message instead of an AttributeError on None.split.
        raise ValueError(
            "SQLCONNURL environment variable must be set to a JDBC connection URL")
    url_parts = conn_url.split(":", 3)
    if len(url_parts) < 2:
        # The URL value is deliberately not echoed; it may embed credentials.
        raise ValueError(
            "SQLCONNURL must have the form 'jdbc:<db_type>:...'")
    db_type = url_parts[1]
    table = "citibike20k"

    # Figure out username and password from the database type.
    username = password = ""
    if db_type == "mysql":
        username = "******"
        password = "******"
    elif db_type == "postgresql":
        username = password = "******"

    # Whole-table import, compared with the CSV copy via first_1000_equal.
    citi_sql = h2o.import_sql_table(conn_url, table, username, password)
    citi_csv = h2o.import_file(pyunit_utils.locate("smalldata/demos/citibike_20k.csv"))
    py_citi_sql = citi_sql.as_data_frame(False)[1:]  # don't compare headers
    py_citi_csv = citi_csv.as_data_frame(False)[1:]
    assert first_1000_equal(py_citi_sql, py_citi_csv)

    # Import restricted to two named columns.
    citi_sql = h2o.import_sql_table(conn_url, table, username, password, ["starttime", "bikeid"])
    assert citi_sql.nrow == 2e4
    assert citi_sql.ncol == 2

    # Import through an explicit SELECT statement.
    sql_select = h2o.import_sql_select(conn_url, "SELECT starttime FROM citibike20k", username, password)
    assert sql_select.nrow == 2e4
    assert sql_select.ncol == 1
def sql_table():
    """Verify that importing ``citibike20k`` over JDBC matches the CSV data.

    The connection URL comes from the ``SQLCONNURL`` environment variable,
    e.g. ``jdbc:mysql://172.16.2.178:3306/ingestSQL?&useSSL=false`` or
    ``jdbc:postgresql://mr-0xf2/ingestSQL``. The database type is taken from
    the second ``:``-separated field of the URL and decides which
    credentials are used; unrecognised types use empty credentials.

    Raises:
        ValueError: if ``SQLCONNURL`` is unset, empty, or has no ``:``
            separator to read the database type from.
        AssertionError: if an imported frame does not match expectations.
    """
    conn_url = os.getenv("SQLCONNURL")
    if not conn_url:
        # Without this guard a missing variable surfaces as
        # "'NoneType' object has no attribute 'split'".
        raise ValueError(
            "SQLCONNURL environment variable must be set to a JDBC connection URL")
    url_parts = conn_url.split(":", 3)
    if len(url_parts) < 2:
        # Do not include the URL in the message; it may carry credentials.
        raise ValueError(
            "SQLCONNURL must have the form 'jdbc:<db_type>:...'")
    db_type = url_parts[1]
    table = "citibike20k"

    # Figure out username and password.
    username = password = ""
    if db_type == "mysql":
        username = "******"
        password = "******"
    elif db_type == "postgresql":
        username = password = "******"

    # Full table versus the CSV copy.
    citi_sql = h2o.import_sql_table(conn_url, table, username, password)
    citi_csv = h2o.import_file(
        pyunit_utils.locate("smalldata/demos/citibike_20k.csv"))
    py_citi_sql = citi_sql.as_data_frame(False)[1:]  # don't compare headers
    py_citi_csv = citi_csv.as_data_frame(False)[1:]
    assert first_1000_equal(py_citi_sql, py_citi_csv)

    # Two named columns only.
    citi_sql = h2o.import_sql_table(conn_url, table, username, password,
                                    ["starttime", "bikeid"])
    assert citi_sql.nrow == 2e4
    assert citi_sql.ncol == 2

    # Single column through a SELECT statement.
    sql_select = h2o.import_sql_select(conn_url,
                                       "SELECT starttime FROM citibike20k",
                                       username, password)
    assert sql_select.nrow == 2e4
    assert sql_select.ncol == 1
def hive_save_frame():
    """Save a frame to Hive three ways and read each table back over JDBC.

    Connection URLs target ``localhost:10000``, one with the ``default``
    database in the URL and one without. When ``KRB_ENABLED`` is ``true``
    an auth suffix is appended to both: a delegation-token suffix if
    ``KRB_USE_TOKEN`` is ``true``, otherwise a ``principal=`` suffix taken
    from ``HIVE_PRINCIPAL``.

    Each round saves the prostate frame with ``save_to_hive``, re-imports
    the table with ``h2o.import_sql_table`` and compares it with the
    original through ``pyunit_utils.compare_frames_local``.
    """
    connection_url = "jdbc:hive2://localhost:10000/default"
    connection_url_nodb = "jdbc:hive2://localhost:10000/"

    krb_enabled = os.getenv('KRB_ENABLED', 'false').lower() == 'true'
    use_token = os.getenv('KRB_USE_TOKEN', 'false').lower() == 'true'

    # Both URLs receive the same authentication suffix (empty without Kerberos).
    auth_suffix = ""
    if krb_enabled and use_token:
        auth_suffix = ";auth=delegationToken"
    elif krb_enabled:
        auth_suffix = ";principal=%s" % os.getenv('HIVE_PRINCIPAL', 'hive/[email protected]')
    connection_url += auth_suffix
    connection_url_nodb += auth_suffix

    username = "******"
    password = ""

    print("import data")
    prostate_hex = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))

    # (message, JDBC URL, table name, extra save_to_hive keyword arguments)
    rounds = [
        ("save as csv, managed, custom tmp",
         connection_url, "prostate_hex_py_csv",
         {"tmp_path": "/tmp"}),
        ("save as parquet, managed, db in table name",
         connection_url_nodb, "default.prostate_hex_py_parquet",
         {"format": "parquet"}),
        ("save as parquet, external",
         connection_url, "prostate_hex_py_parquet_ext",
         {"format": "parquet", "table_path": "/user/hive/ext/prostate_hex_py_parquet"}),
    ]
    for message, url, table_name, save_options in rounds:
        print(message)
        prostate_hex.save_to_hive(url, table_name, **save_options)
        prostate_hive = h2o.import_sql_table(url, table_name, username, password, fetch_mode="SINGLE")
        pyunit_utils.compare_frames_local(prostate_hex, prostate_hive, prob=1)
def sql_table():
    """Import ``citibike20k`` over JDBC and compare it with the CSV copy.

    The connection URL is read from ``os.environ['SQLCONNURL']`` (a missing
    variable raises ``KeyError``). A second import passes a two-name list
    as the fifth argument and checks that the result has two columns.
    """
    conn_url = os.environ['SQLCONNURL']
    table = "citibike20k"
    username, password = "root", "0xdata"

    sql_frame = h2o.import_sql_table(conn_url, table, username, password)
    csv_frame = h2o.import_file(pyunit_utils.locate("smalldata/demos/citibike_20k.csv"))

    # Drop the first row of each dump so headers are not compared.
    sql_rows = sql_frame.as_data_frame(False)[1:]
    csv_rows = csv_frame.as_data_frame(False)[1:]
    assert is_equal(sql_rows, csv_rows)

    two_columns = h2o.import_sql_table(conn_url, table, username, password, ["starttime", "bikeid"])
    assert two_columns.ncol == 2
def sql_table():
    """Import ``citibike20k`` from a fixed MySQL URL and compare with the CSV.

    The table is imported once in full and compared with the CSV copy via
    ``is_equal``, then imported again with a two-name list as the fifth
    argument, after which the frame is expected to have two columns.
    """
    conn_url = "jdbc:mysql://172.16.2.178:3306/ingestSQL?&useSSL=false"
    table = "citibike20k"
    username, password = "root", "0xdata"

    sql_frame = h2o.import_sql_table(conn_url, table, username, password)
    csv_frame = h2o.import_file(pyunit_utils.locate("smalldata/demos/citibike_20k.csv"))

    # Skip the first row of each dump: headers are not compared.
    sql_rows = sql_frame.as_data_frame(False)[1:]
    csv_rows = csv_frame.as_data_frame(False)[1:]
    assert is_equal(sql_rows, csv_rows)

    two_columns = h2o.import_sql_table(conn_url, table, username, password, ["starttime", "bikeid"])
    assert two_columns.ncol == 2
def sql_table():
    """Compare a JDBC import of ``citibike20k`` with the CSV copy.

    Reads the connection URL from ``os.environ['SQLCONNURL']``; a missing
    variable raises ``KeyError``. After the full comparison, the table is
    imported again with ``["starttime", "bikeid"]`` as the fifth argument
    and the column count is checked.
    """
    import_args = (os.environ['SQLCONNURL'], "citibike20k", "root", "0xdata")

    from_sql = h2o.import_sql_table(*import_args)
    from_csv = h2o.import_file(
        pyunit_utils.locate("smalldata/demos/citibike_20k.csv"))

    # The first row of each dump is the header; leave it out of the comparison.
    sql_body = from_sql.as_data_frame(False)[1:]
    csv_body = from_csv.as_data_frame(False)[1:]
    assert is_equal(sql_body, csv_body)

    # The original looks the variable up again for the second import.
    import_args = (os.environ['SQLCONNURL'], "citibike20k", "root", "0xdata")
    subset = h2o.import_sql_table(*import_args + (["starttime", "bikeid"],))
    assert subset.ncol == 2
def hive_import():
    """Compare the ``chicago`` Hive table with its source CSV on HDFS.

    The CSV is read from HDFS as the reference frame. The table is then
    read twice, once with ``h2o.import_sql_table`` over JDBC and once with
    ``h2o.import_hive_table``. Each result is passed through
    ``adapt_frame`` and compared with the reference by
    ``pyunit_utils.compare_frames_local``.
    """
    namenode = pyunit_utils.hadoop_namenode()

    # NOTE(review): if HIVE_HOST is unset, os.getenv returns None and the
    # URL will contain the literal text "None" as its host.
    host = os.getenv("HIVE_HOST")
    kerberos = os.getenv('KRB_ENABLED', 'false').lower() == 'true'
    base_url = "jdbc:hive2://{0}:10000/default".format(host)
    connection_url = base_url + ";auth=delegationToken" if kerberos else base_url

    # Reference data: the original CSV on HDFS.
    csv_path = "/user/jenkins/smalldata/chicago/chicagoCensus.csv"
    expected = h2o.import_file("hdfs://{0}{1}".format(namenode, csv_path))

    # Table read through Hive JDBC.
    via_jdbc = adapt_frame(
        h2o.import_sql_table(connection_url, "chicago", "", "", fetch_mode="SINGLE"),
        column_prefix="chicago.")
    pyunit_utils.compare_frames_local(expected, via_jdbc, prob=1)

    # Table read through import_hive_table.
    via_hive = adapt_frame(h2o.import_hive_table(connection_url, "chicago"))
    pyunit_utils.compare_frames_local(expected, via_hive, prob=1)
def sql_table():
    """Check a fixed-URL MySQL import of ``citibike20k`` against the CSV copy.

    The full table is compared with the CSV copy via ``is_equal``. A second
    import with ``["starttime", "bikeid"]`` as the fifth argument must
    produce a frame with two columns.
    """
    import_args = (
        "jdbc:mysql://172.16.2.178:3306/ingestSQL?&useSSL=false",
        "citibike20k",
        "root",
        "0xdata",
    )

    from_sql = h2o.import_sql_table(*import_args)
    from_csv = h2o.import_file(
        pyunit_utils.locate("smalldata/demos/citibike_20k.csv"))

    # Headers (the first row of each dump) are excluded from the comparison.
    sql_body = from_sql.as_data_frame(False)[1:]
    csv_body = from_csv.as_data_frame(False)[1:]
    assert is_equal(sql_body, csv_body)

    subset = h2o.import_sql_table(*import_args + (["starttime", "bikeid"],))
    assert subset.ncol == 2
def sql_table():
    """Call ``h2o.import_sql_table`` with fixed arguments and print the result."""
    # Positional arguments are passed exactly as the call site expects them.
    import_args = ("mysql", "menagerie", "pet", "root", "ludi")
    print(h2o.import_sql_table(*import_args))