def test_get_table_def(self):
    data = [
        (1001, 1, "Jane", "Doe", "2000-05-01", 29.0, False),
        (1002, 2, "John", "Doe", "1988-05-03", 33.0, False),
        (2201, 3, "Elonzo", "Smith", "1990-05-03", 21.0, True),
        (None, None, None, None, None, None, None)  # Test nulls
    ]
    # createOrReplaceTempView returns None, so register the view without
    # assigning its result
    get_spark_session()\
        .createDataFrame(data, ["id", "dept_id", "first_name", "last_name", "dob", "age", "is_temp"])\
        .createOrReplaceTempView("employees")
    df = get_spark_session().sql(
        "select id, cast(dept_id as short), first_name, "
        "last_name, dob, age, is_temp from employees")
    table_def = get_table_def(df, "Extract", "Extract")

    # Ensure that the table name matches
    assert (table_def.table_name.name == Name("Extract"))

    # Ensure that the TableDefinition column names match
    assert (table_def.get_column(0).name == Name("id"))
    assert (table_def.get_column(1).name == Name("dept_id"))
    assert (table_def.get_column(2).name == Name("first_name"))
    assert (table_def.get_column(3).name == Name("last_name"))
    assert (table_def.get_column(4).name == Name("dob"))
    assert (table_def.get_column(5).name == Name("age"))
    assert (table_def.get_column(6).name == Name("is_temp"))

    # Ensure that the column data types were converted correctly
    assert (table_def.get_column(0).type == SqlType.big_int())
    assert (table_def.get_column(1).type == SqlType.small_int())
    assert (table_def.get_column(2).type == SqlType.text())
    assert (table_def.get_column(3).type == SqlType.text())
    assert (table_def.get_column(4).type == SqlType.text())
    assert (table_def.get_column(5).type == SqlType.double())
    assert (table_def.get_column(6).type == SqlType.bool())
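# For reference, the type assertions above amount to a Spark-to-Hyper mapping
# along these lines. This is an illustrative sketch only: the real conversion
# happens inside hyperleaup's get_table_def, and the dict below is a
# hypothetical name, not part of the library.
SPARK_TO_HYPER_TYPE_SKETCH = {
    "bigint": SqlType.big_int(),      # Spark LongType
    "smallint": SqlType.small_int(),  # Spark ShortType
    "string": SqlType.text(),         # Spark StringType (dates arrive as strings here)
    "double": SqlType.double(),       # Spark DoubleType
    "boolean": SqlType.bool(),        # Spark BooleanType
}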
def test_print_table_definition(self):
    data = [(1001, "Jane", "Doe", "2000-05-01", 29, False),
            (1002, "John", "Doe", "1988-05-03", 29, False),
            (2201, "Elonzo", "Smith", "1990-05-03", 29, True)]
    df = get_spark_session().createDataFrame(
        data, ["id", "first_name", "last_name", "dob", "age", "is_temp"])
    hf = HyperFile(name="employees", df=df)
    hf.print_table_def()
def test_get_rows(self):
    data = [(1001, "Jane", "Doe", "2000-05-01", 29.0, False),
            (1002, "John", "Doe", "1988-05-03", 33.0, False),
            (2201, "Elonzo", "Smith", "1990-05-03", 21.0, True)]
    df = get_spark_session().createDataFrame(
        data, ["id", "first_name", "last_name", "dob", "age", "is_temp"])
    rows = get_rows(df)
    expected_row = [1001, "Jane", "Doe", "2000-05-01", 29.0, False]
    assert (len(rows) == 3)
    assert (rows[0] == expected_row)
def test_write_parquet_to_local_file_system(self):
    data = [(1001, "Jane", "Doe", "2000-05-01", 29.0, False),
            (1002, "John", "Doe", "1988-05-03", 33.0, False),
            (2201, "Elonzo", "Smith", "1990-05-03", 21.0, True),
            (2202, "James", "Towdry", "1980-05-03", 45.0, False),
            (2235, "Susan", "Sanders", "1980-05-03", 43.0, True)]
    df = get_spark_session().createDataFrame(
        data, ["id", "first_name", "last_name", "dob", "age", "is_temp"])
    parquet_file = write_parquet_to_local_file_system(df, "employees")
    assert (parquet_file.startswith("/tmp/hyperleaup/employees/"))
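# A minimal sketch of what a local parquet writer like the one under test
# could look like: write a single parquet file under /tmp/hyperleaup/<name>/
# and return the part-file path. This is an assumed implementation for
# illustration, not hyperleaup's actual write_parquet_to_local_file_system.
def write_parquet_sketch(df: DataFrame, name: str) -> str:
    import glob
    target_dir = f"/tmp/hyperleaup/{name}"
    # Coalesce to one partition so Spark emits a single part file
    df.coalesce(1).write.mode("overwrite").parquet(target_dir)
    return glob.glob(f"{target_dir}/part-*.parquet")[0]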
def test_print_rows(self):
    # Ensure that a HyperFile can be created from a Spark DataFrame
    data = [(1001, "Jane", "Doe", "2000-05-01", 29, False),
            (1002, "John", "Doe", "1988-05-03", 29, False),
            (2201, "Elonzo", "Smith", "1990-05-03", 29, True)]
    df = get_spark_session().createDataFrame(
        data, ["id", "first_name", "last_name", "dob", "age", "is_temp"])
    hf = HyperFile(name="employees", df=df)
    hf.print_rows()

    # Ensure that a HyperFile can be created from Spark SQL
    data = [(101, "IT"), (103, "Engineering"), (104, "Management"), (105, "HR")]
    get_spark_session()\
        .createDataFrame(data, ["id", "department"])\
        .createOrReplaceGlobalTempView("departments")
    sql = "SELECT * FROM global_temp.departments"
    hf = HyperFile(name="employees", sql=sql)
    hf.print_rows()
def test_creation_mode(self):
    data = [(1001, "Jane", "Doe", "2000-05-01", 29, False),
            (1002, "John", "Doe", "1988-05-03", 29, False),
            (2201, "Elonzo", "Smith", "1990-05-03", 29, True)]
    df = get_spark_session().createDataFrame(
        data, ["id", "first_name", "last_name", "dob", "age", "is_temp"])
    hf = HyperFile(name="employees", df=df,
                   is_dbfs_enabled=False, creation_mode="insert")
    assert (hf.path == "/tmp/hyperleaup/employees/employees.hyper")
def test_append(self):
    # Ensure that new data can be appended to an existing Hyper File
    existing_hf_path = '/tmp/save/employees.hyper'
    hf = HyperFile.load(path=existing_hf_path, is_dbfs_enabled=False)
    num_rows = TestUtils.get_row_count("Extract", "Extract",
                                       "/tmp/save/employees.hyper")
    assert (num_rows == 3)

    # Create new data
    data = [(3001, "Will", "Girten", "1990-05-01", 31, True),
            (3002, "Sammy", "Smith", "1988-05-03", 29, True),
            (3003, "Gregory", "Denver", "1990-05-03", 29, True)]
    df = get_spark_session().createDataFrame(
        data, ["id", "first_name", "last_name", "dob", "age", "is_temp"])
    hf.append(df=df)
    num_rows = TestUtils.get_row_count("Extract", "Extract",
                                       "/tmp/save/employees.hyper")
    assert (num_rows == 6)
def test_creation_mode_str(self):
    data = [
        (1001, "Jane", "Doe", "2000-05-01", 29.0, False),
        (1002, "John", "Doe", "1988-05-03", 33.0, False),
        (2201, "Elonzo", "Smith", "1990-05-03", 21.0, True),
        (2202, None, None, "1980-05-03", 45.0, False),  # Add a few nulls
        (2235, "", "", "1980-05-03", 43.0, True)
    ]
    df = get_spark_session().createDataFrame(
        data, ["id", "first_name", "last_name", "dob", "age", "is_temp"])

    # creation_mode using a str
    creator = Creator(df=df, name='employees',
                      is_dbfs_enabled=False, creation_mode="Insert")
    hyper_file_path = creator.create()
    assert (hyper_file_path == "/tmp/hyperleaup/employees/employees.hyper")
    num_rows = TestUtils.get_row_count(
        "Extract", "Extract", "/tmp/hyperleaup/employees/employees.hyper")
    assert (num_rows == 5)
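# The row-count checks in these tests go through TestUtils.get_row_count.
# A minimal sketch of such a helper, using the Tableau Hyper API directly,
# might look like this. This is an assumed implementation for illustration,
# not the project's actual TestUtils code.
def get_row_count_sketch(schema: str, table: str, hyper_file_path: str) -> int:
    from tableauhyperapi import HyperProcess, Telemetry, Connection, TableName
    with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper:
        with Connection(endpoint=hyper.endpoint, database=hyper_file_path) as connection:
            # COUNT(*) over the fully qualified "schema"."table"
            return connection.execute_scalar_query(
                f"SELECT COUNT(*) FROM {TableName(schema, table)}")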
def test_save(self):
    data = [(1001, "Jane", "Doe", "2000-05-01", 29, False),
            (1002, "John", "Doe", "1988-05-03", 29, False),
            (2201, "Elonzo", "Smith", "1990-05-03", 29, True)]
    df = get_spark_session().createDataFrame(
        data, ["id", "first_name", "last_name", "dob", "age", "is_temp"])
    hf = HyperFile(name="employees", df=df,
                   is_dbfs_enabled=False, creation_mode="insert")

    # Ensure that the Hyper File can be saved to an alternative location
    current_path = hf.path
    new_path = '/tmp/save/'
    expected_path = '/tmp/save/employees.hyper'
    hf.save(new_path)

    # Save operation should not update the current Hyper File's path
    assert (current_path == hf.path)
    assert (os.path.exists(expected_path))
    assert (os.path.isfile(expected_path))
def test_create(self):
    data = [
        (1001, "Jane", "Doe", "2000-05-01", 29.0, False),
        (1002, "John", "Doe", "1988-05-03", 33.0, False),
        (2201, "Elonzo", "Smith", "1990-05-03", 21.0, True),
        (2202, None, None, "1980-05-03", 45.0, False),  # Add a few nulls
        (2235, "", "", "1980-05-03", 43.0, True)
    ]
    df = get_spark_session().createDataFrame(
        data, ["id", "first_name", "last_name", "dob", "age", "is_temp"])

    # Ensure that a Hyper File can be created with date and timestamp columns.
    # withColumn returns a new DataFrame, so the result must be reassigned.
    df = df.withColumn("hire_date", current_date())
    df = df.withColumn("last_updated", current_timestamp())

    creator = Creator(df, 'employees', False)
    hyper_file_path = creator.create()
    assert (hyper_file_path == "/tmp/hyperleaup/employees/employees.hyper")
    tables = TestUtils.get_tables(
        "Extract", "/tmp/hyperleaup/employees/employees.hyper")
    assert (len(tables) == 1)
    num_rows = TestUtils.get_row_count(
        "Extract", "Extract", "/tmp/hyperleaup/employees/employees.hyper")
    assert (num_rows == 5)
def get_spark_dataframe(sql) -> DataFrame:
    return get_spark_session().sql(sql)
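# Example usage of the helper above (the view name is illustrative):
#
#   get_spark_session()\
#       .createDataFrame([(101, "IT")], ["id", "department"])\
#       .createOrReplaceTempView("departments")
#   df = get_spark_dataframe("SELECT * FROM departments")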