def test_basic_sqlite_pipeline():
    @sqlite_table
    def TableOne(context):
        context.resources.conn.execute('''CREATE TABLE TableOne AS SELECT 1 as num''')
        context.resources.conn.commit()

    @sqlite_table
    def TableTwo(context):
        context.resources.conn.execute('''CREATE TABLE TableTwo AS SELECT 2 as num''')
        context.resources.conn.commit()

    # input_tables declares the upstream dependencies, so TableThree only runs
    # after TableOne and TableTwo have been created.
    @sqlite_table(
        input_tables=[input_table('table_one', TableOne), input_table('table_two', TableTwo)]
    )
    def TableThree(context, **_kwargs):
        context.resources.conn.execute(
            'CREATE TABLE TableThree AS SELECT num from TableOne UNION SELECT num from TableTwo'
        )
        context.resources.conn.commit()

    conn = sqlite3.connect(':memory:')
    pipeline_def = construct_lakehouse_pipeline(
        name='sqllite_lakehouse_pipeline',
        lakehouse_tables=[TableOne, TableTwo, TableThree],
        resources={'conn': conn, 'lakehouse': SqlLiteLakehouse()},
    )
    result = execute_pipeline(pipeline_def)
    assert result.success
    assert conn.cursor().execute('SELECT * FROM TableThree').fetchall() == [(1,), (2,)]
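# A rough mental model of what @sqlite_table produces (hypothetical -- the real
# decorator is defined alongside SqlLiteLakehouse and is not shown in this
# excerpt): it wraps the decorated function in a lakehouse table definition that
# records the table's name and its input_tables, so construct_lakehouse_pipeline
# can wire the generated solids together in dependency order. Used bare it
# declares a source table with no inputs; called with input_tables=... it
# declares the upstream tables to materialize first.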
def test_file_based_sqlite_pipeline():
    def path_for_table(table_name):
        return file_relative_path(
            __file__, 'basic_sqllite_test_files/{table_name}.sql'.format(table_name=table_name)
        )

    TableOne = create_sqllite_table_from_file(path_for_table('TableOne'))
    TableTwo = create_sqllite_table_from_file(path_for_table('TableTwo'))
    TableThree = create_sqllite_table_from_file(
        path_for_table('TableThree'),
        input_tables=[input_table('table_one', TableOne), input_table('table_two', TableTwo)],
    )

    conn = sqlite3.connect(':memory:')
    pipeline_def = construct_lakehouse_pipeline(
        name='sqllite_lakehouse_pipeline',
        lakehouse_tables=[TableOne, TableTwo, TableThree],
        resources={'conn': conn, 'lakehouse': SqlLiteLakehouse()},
    )
    result = execute_pipeline(pipeline_def)
    assert result.success
    assert conn.cursor().execute('SELECT * FROM TableThree').fetchall() == [(1,), (2,)]
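# The .sql files under basic_sqllite_test_files/ are not included in this
# excerpt. Given the final assertion, they presumably mirror the inline SQL in
# test_basic_sqlite_pipeline above (contents assumed, not confirmed):
#
#   TableOne.sql:   CREATE TABLE TableOne AS SELECT 1 as num
#   TableTwo.sql:   CREATE TABLE TableTwo AS SELECT 2 as num
#   TableThree.sql: CREATE TABLE TableThree AS
#                   SELECT num from TableOne UNION SELECT num from TableTwo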
# FEATURE_AREA and FEATURE_ONE are defined earlier in the original module; the
# values below are inferred from how they are used in this excerpt.
FEATURE_AREA = 'feature_area'
FEATURE_ONE = 'feature_one'
FEATURE_TWO = 'feature_two'


@this_pyspark_table(feature_area=FEATURE_ONE)
def TableOne(context) -> SparkDF:
    return context.resources.spark.createDataFrame([Row(num=1)])


@this_pyspark_table(feature_area=FEATURE_ONE)
def TableTwo(context) -> SparkDF:
    return context.resources.spark.createDataFrame([Row(num=2)])


@this_pyspark_table(
    input_tables=[input_table('table_one', TableOne), input_table('table_two', TableTwo)],
    feature_area=FEATURE_TWO,
)
def TableThree(_, table_one: SparkDF, table_two: SparkDF) -> SparkDF:
    return table_one.union(table_two)


class ByFeatureParquetLakehouse(Lakehouse):
    def __init__(self, root_dir):
        self.lakehouse_path = check.str_param(root_dir, 'root_dir')

    def _path_for_table(self, table_type, table_metadata):
        # Tables are laid out on disk by feature area:
        # <root_dir>/<feature_area>/<table_name>.
        return os.path.join(self.lakehouse_path, table_metadata[FEATURE_AREA], table_type.name)
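    # The class is cut off here in the excerpt. A plausible completion, assuming
    # the Lakehouse interface boils down to a load hook and a save hook (the
    # hydrate/materialize names and signatures below are assumptions, not
    # confirmed by this excerpt); both sides route through _path_for_table so
    # every table reads from and writes to its feature-area-scoped location.
    def hydrate(self, context, table_type, table_metadata, _table_handle, _dest_metadata):
        # Load an upstream table as a DataFrame for a downstream computation.
        return context.resources.spark.read.parquet(
            self._path_for_table(table_type, table_metadata)
        )

    def materialize(self, context, table_type, table_metadata, value):
        # Persist a computed DataFrame under its feature-area-scoped path.
        value.write.parquet(
            path=self._path_for_table(table_type, table_metadata), mode='overwrite'
        )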
STRING_TABLE_STRUCT_TYPE = spark_type_from_kwargs(id=int, string=str)


@typed_pyspark_table(spark_type=STRING_TABLE_STRUCT_TYPE)
def StringTable(context) -> SparkDF:
    return context.resources.spark.createDataFrame(
        [Row(id=1, string='23')], STRING_TABLE_STRUCT_TYPE
    )


JOIN_TABLE_STRUCT_TYPE = spark_type_from_kwargs(id=int, number=int, string=str)


@typed_pyspark_table(
    input_tables=[input_table('number_df', NumberTable), input_table('string_df', StringTable)],
    spark_type=JOIN_TABLE_STRUCT_TYPE,
    description='Joining together of the number and the string.',
)
def JoinTable(_context, number_df: NumberTable, string_df: StringTable) -> SparkDF:
    # Inner-join on id, dropping the duplicate join column from the right side.
    return number_df.join(string_df, number_df.id == string_df.id, 'inner').drop(string_df.id)


def test_execute_typed_in_mem_lakehouse():
    lakehouse = TypedPySparkMemLakehouse()
    pipeline_result = execute_spark_lakehouse_build(
        tables=[NumberTable, StringTable, JoinTable], lakehouse=lakehouse
    )
    assert pipeline_result.success
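# NumberTable is defined earlier in the original module and does not appear in
# this excerpt. By analogy with StringTable, and matching the id/number columns
# of JOIN_TABLE_STRUCT_TYPE, it presumably looks something like the following
# (the row values are assumptions):
#
# NUMBER_TABLE_STRUCT_TYPE = spark_type_from_kwargs(id=int, number=int)
#
# @typed_pyspark_table(spark_type=NUMBER_TABLE_STRUCT_TYPE)
# def NumberTable(context) -> SparkDF:
#     return context.resources.spark.createDataFrame(
#         [Row(id=1, number=2)], NUMBER_TABLE_STRUCT_TYPE
#     )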