def test_missing_resource():
    with pytest.raises(DagsterInvalidDefinitionError):

        @pyspark_table
        def missing(_):
            pass

        construct_lakehouse_pipeline('test', lakehouse_tables=[missing], resources={})
def test_missing_resource():
    with pytest.raises(DagsterInvalidDefinitionError):

        @lakehouse_table(required_resource_keys={'foo'})
        def missing(_):
            pass

        construct_lakehouse_pipeline('test', lakehouse_tables=[missing])
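# A minimal counterpart sketch (not from the original tests): supplying the
# required resource key should let construction succeed rather than raise.
# The test name and the object() resource value are hypothetical stand-ins;
# test_basic_sqlite_pipeline below passes a raw sqlite connection as a resource
# value in the same way.
def test_present_resource_sketch():
    @lakehouse_table(required_resource_keys={'foo'})
    def present(_):
        pass

    construct_lakehouse_pipeline('test', lakehouse_tables=[present], resources={'foo': object()})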
def test_snowflake():
    construct_lakehouse_pipeline(
        name='snowflake_lake',
        lakehouse_tables=[TableOne],
        resources={'snowflake': snowflake_resource, 'lakehouse': SnowflakeLakehouse()},
    )
def test_execute_byfeature_parquet_lakehouse():
    with get_temp_dir() as temp_dir:
        lakehouse = ByFeatureParquetLakehouse(temp_dir)
        pipeline_def = construct_lakehouse_pipeline(
            name='test',
            lakehouse_tables=[TableOne, TableTwo, TableThree],
            mode_defs=[
                ModeDefinition(
                    resource_defs={
                        'spark': spark_session_resource,
                        'lakehouse': ResourceDefinition.hardcoded_resource(lakehouse),
                    }
                )
            ],
        )
        pipeline_result = execute_pipeline(pipeline_def)
        assert pipeline_result.success

        def get_table(table_def):
            spark = spark_session_from_config()
            return spark.read.parquet(
                os.path.join(temp_dir, table_def.metadata[FEATURE_AREA], table_def.name)
            ).collect()

        assert get_table(TableOne) == [Row(num=1)]
        assert get_table(TableTwo) == [Row(num=2)]
        assert set(get_table(TableThree)) == set([Row(num=1), Row(num=2)])
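# A hedged sketch of what the three pyspark tables exercised above might look
# like; they are not defined in this section. Everything here is inferred from
# the test's assertions (TableOne yields Row(num=1), TableTwo yields Row(num=2),
# TableThree unions them) and from the decorator patterns shown elsewhere in
# these tests. The metadata values, decorator arguments, and kwarg-passing of
# input tables are all assumptions.
@pyspark_table(metadata={FEATURE_AREA: 'core'})
def TableOne(context) -> SparkDF:
    # Assumes the 'spark' resource is a SparkSession, as spark_session_resource suggests.
    return context.resources.spark.createDataFrame([Row(num=1)])


@pyspark_table(metadata={FEATURE_AREA: 'core'})
def TableTwo(context) -> SparkDF:
    return context.resources.spark.createDataFrame([Row(num=2)])


@pyspark_table(
    metadata={FEATURE_AREA: 'core'},
    input_tables=[input_table('table_one', TableOne), input_table('table_two', TableTwo)],
)
def TableThree(_context, table_one: SparkDF, table_two: SparkDF) -> SparkDF:
    return table_one.union(table_two)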
def test_basic_sqlite_pipeline():
    @sqlite_table
    def TableOne(context):
        context.resources.conn.execute('''CREATE TABLE TableOne AS SELECT 1 as num''')
        context.resources.conn.commit()

    @sqlite_table
    def TableTwo(context):
        context.resources.conn.execute('''CREATE TABLE TableTwo AS SELECT 2 as num''')
        context.resources.conn.commit()

    @sqlite_table(
        input_tables=[input_table('table_one', TableOne), input_table('table_two', TableTwo)]
    )
    def TableThree(context, **_kwargs):
        context.resources.conn.execute(
            'CREATE TABLE TableThree AS SELECT num from TableOne UNION SELECT num from TableTwo'
        )
        context.resources.conn.commit()

    conn = sqlite3.connect(':memory:')
    pipeline_def = construct_lakehouse_pipeline(
        name='sqllite_lakehouse_pipeline',
        lakehouse_tables=[TableOne, TableTwo, TableThree],
        resources={'conn': conn, 'lakehouse': SqlLiteLakehouse()},
    )
    result = execute_pipeline(pipeline_def)
    assert result.success
    assert conn.cursor().execute('SELECT * FROM TableThree').fetchall() == [(1,), (2,)]
def test_file_based_sqlite_pipeline():
    def path_for_table(table_name):
        return file_relative_path(
            __file__, 'basic_sqllite_test_files/{table_name}.sql'.format(table_name=table_name)
        )

    TableOne = create_sqllite_table_from_file(path_for_table('TableOne'))
    TableTwo = create_sqllite_table_from_file(path_for_table('TableTwo'))
    TableThree = create_sqllite_table_from_file(
        path_for_table('TableThree'),
        input_tables=[input_table('table_one', TableOne), input_table('table_two', TableTwo)],
    )

    conn = sqlite3.connect(':memory:')
    pipeline_def = construct_lakehouse_pipeline(
        name='sqllite_lakehouse_pipeline',
        lakehouse_tables=[TableOne, TableTwo, TableThree],
        resources={'conn': conn, 'lakehouse': SqlLiteLakehouse()},
    )
    result = execute_pipeline(pipeline_def)
    assert result.success
    assert conn.cursor().execute('SELECT * FROM TableThree').fetchall() == [(1,), (2,)]
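# The .sql files under basic_sqllite_test_files/ are not shown in this section.
# Given the assertion on TableThree, plausible contents (assumed to mirror the
# inline SQL in test_basic_sqlite_pipeline above) would be:
#
#   TableOne.sql:
#     CREATE TABLE TableOne AS SELECT 1 as num
#
#   TableTwo.sql:
#     CREATE TABLE TableTwo AS SELECT 2 as num
#
#   TableThree.sql:
#     CREATE TABLE TableThree AS SELECT num from TableOne UNION SELECT num from TableTwo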
def _execute_spark_lakehouse_build(tables, lakehouse, environment_dict=None):
    return execute_pipeline(
        construct_lakehouse_pipeline(
            name='spark_lakehouse_pipeline',
            lakehouse_tables=tables,
            resources={'lakehouse': lakehouse, 'spark': spark_session_resource},
        ),
        environment_dict=environment_dict,
    )
def execute_spark_lakehouse_build(tables, lakehouse, environment_dict=None):
    return execute_pipeline(
        construct_lakehouse_pipeline(
            name='spark_lakehouse_pipeline',
            lakehouse_tables=tables,
            mode_defs=[
                ModeDefinition(
                    resource_defs={
                        'spark': spark_session_resource,
                        'lakehouse': ResourceDefinition.hardcoded_resource(lakehouse),
                    }
                )
            ],
        ),
        environment_dict=environment_dict,
    )
@pyspark_table(
    input_tables=[
        input_table('number_df', NumberTable),
        input_table('string_df', StringTable),
    ],
    spark_type=JOIN_TABLE_STRUCT_TYPE,
    description='Joining together of the number and the string.',
)
def JoinTable(_context, number_df: NumberTable, string_df: StringTable) -> SparkDF:
    return number_df.join(string_df, number_df.id == string_df.id, 'inner').drop(string_df.id)


def test_execute_typed_in_mem_lakehouse():
    lakehouse = TypedPySparkMemLakehouse()
    pipeline_result = execute_spark_lakehouse_build(
        tables=[NumberTable, StringTable, JoinTable], lakehouse=lakehouse
    )
    assert pipeline_result.success
    # Row ordering varies on 3.5 - compare as dicts
    assert (
        lakehouse.collected_tables['JoinTable'][0].asDict()
        == Row(id=1, number=2, string='23').asDict()
    )


# for dagit
typed_lakehouse_pipeline = construct_lakehouse_pipeline(
    name='typed_lakehouse_pipeline',
    lakehouse_tables=[NumberTable, StringTable, JoinTable],
    resources={'lakehouse': typed_pyspark_mem_lakehouse, 'spark': spark_session_resource},
)
def test_execute_typed_in_mem_lakehouse():
    lakehouse = TypedPySparkMemLakehouse()
    pipeline_result = execute_spark_lakehouse_build(
        tables=[NumberTable, StringTable, JoinTable], lakehouse=lakehouse
    )
    assert pipeline_result.success
    # Row ordering varies on 3.5 - compare as dicts
    assert (
        lakehouse.collected_tables['JoinTable'][0].asDict()
        == Row(id=1, number=2, string='23').asDict()
    )


# for dagit
typed_lakehouse_pipeline = construct_lakehouse_pipeline(
    name='typed_lakehouse_pipeline',
    lakehouse_tables=[NumberTable, StringTable, JoinTable],
    mode_defs=[
        ModeDefinition(
            resource_defs={
                'lakehouse': typed_pyspark_mem_lakehouse,
                'spark': spark_session_resource,
            }
        )
    ],
)
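# JOIN_TABLE_STRUCT_TYPE is referenced above but not defined in this section.
# A minimal sketch inferred from the asserted Row(id=1, number=2, string='23');
# the concrete field types and nullability are assumptions.
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

JOIN_TABLE_STRUCT_TYPE = StructType(
    [
        StructField('id', IntegerType()),
        StructField('number', IntegerType()),
        StructField('string', StringType()),
    ]
)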