Example #1
def test_basic_sqlite_pipeline():
    @sqlite_table
    def TableOne(context):
        context.resources.conn.execute('''CREATE TABLE TableOne AS SELECT 1 as num''')
        context.resources.conn.commit()

    @sqlite_table
    def TableTwo(context):
        context.resources.conn.execute('''CREATE TABLE TableTwo AS SELECT 2 as num''')
        context.resources.conn.commit()

    @sqlite_table(
        input_tables=[input_table('table_one', TableOne), input_table('table_two', TableTwo)]
    )
    def TableThree(context, **_kwargs):
        context.resources.conn.execute(
            'CREATE TABLE TableThree AS SELECT num from TableOne UNION SELECT num from TableTwo'
        )
        context.resources.conn.commit()

    conn = sqlite3.connect(':memory:')
    pipeline_def = construct_lakehouse_pipeline(
        name='sqllite_lakehouse_pipeline',
        lakehouse_tables=[TableOne, TableTwo, TableThree],
        resources={'conn': conn, 'lakehouse': SqlLiteLakehouse()},
    )

    result = execute_pipeline(pipeline_def)
    assert result.success

    assert conn.cursor().execute('SELECT * FROM TableThree').fetchall() == [(1,), (2,)]
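
Because the test and the lakehouse pipeline share the same in-memory connection, a couple of extra assertions (a hedged addition, not part of the original test) can confirm that the upstream tables were materialized before TableThree was built:

    # Optional sanity checks: TableOne and TableTwo were created on the same
    # in-memory connection, so they can be queried directly after the run.
    assert conn.cursor().execute('SELECT * FROM TableOne').fetchall() == [(1,)]
    assert conn.cursor().execute('SELECT * FROM TableTwo').fetchall() == [(2,)]
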
Example #2
def test_file_based_sqlite_pipeline():
    def path_for_table(table_name):
        return file_relative_path(
            __file__, 'basic_sqllite_test_files/{table_name}.sql'.format(
                table_name=table_name))

    TableOne = create_sqllite_table_from_file(path_for_table('TableOne'))
    TableTwo = create_sqllite_table_from_file(path_for_table('TableTwo'))
    TableThree = create_sqllite_table_from_file(
        path_for_table('TableThree'),
        input_tables=[
            input_table('table_one', TableOne),
            input_table('table_two', TableTwo)
        ],
    )

    conn = sqlite3.connect(':memory:')
    pipeline_def = construct_lakehouse_pipeline(
        name='sqllite_lakehouse_pipeline',
        lakehouse_tables=[TableOne, TableTwo, TableThree],
        resources={
            'conn': conn,
            'lakehouse': SqlLiteLakehouse()
        },
    )

    result = execute_pipeline(pipeline_def)
    assert result.success

    assert conn.cursor().execute('SELECT * FROM TableThree').fetchall() == [
        (1, ), (2, )
    ]
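
The SQL files under basic_sqllite_test_files are not shown in this listing; given the final assertion and Example #1, they presumably contain the same CREATE TABLE ... AS SELECT statements used there. As a rough sketch only (an assumption, not the library's actual implementation), a helper like create_sqllite_table_from_file could be built on the same decorator from Example #1:

def sqlite_table_from_file_sketch(sql_path, input_tables=None):
    # Hypothetical sketch: read the CREATE TABLE statement from the file up front.
    with open(sql_path) as f:
        sql = f.read()

    # sqlite_table is used bare for tables with no inputs (Example #1) and with an
    # input_tables kwarg otherwise, so mirror both forms here.
    decorator = sqlite_table if input_tables is None else sqlite_table(input_tables=input_tables)

    @decorator
    def _table(context, **_kwargs):
        context.resources.conn.execute(sql)
        context.resources.conn.commit()

    # The real helper presumably also derives the table name from the file name;
    # that detail is elided in this sketch.
    return _table
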
Example #3
FEATURE_TWO = 'feature_two'


@this_pyspark_table(feature_area=FEATURE_ONE)
def TableOne(context) -> SparkDF:
    return context.resources.spark.createDataFrame([Row(num=1)])


@this_pyspark_table(feature_area=FEATURE_ONE)
def TableTwo(context) -> SparkDF:
    return context.resources.spark.createDataFrame([Row(num=2)])


@this_pyspark_table(
    input_tables=[
        input_table('table_one', TableOne),
        input_table('table_two', TableTwo)
    ],
    feature_area=FEATURE_TWO,
)
def TableThree(_, table_one: SparkDF, table_two: SparkDF) -> SparkDF:
    return table_one.union(table_two)


class ByFeatureParquetLakehouse(Lakehouse):
    def __init__(self, root_dir):
        self.lakehouse_path = check.str_param(root_dir, 'root_dir')

    def _path_for_table(self, table_type, table_metadata):
        return os.path.join(self.lakehouse_path, table_metadata[FEATURE_AREA],
                            table_type.name)
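
This excerpt uses FEATURE_ONE and FEATURE_AREA without showing their definitions; presumably they are module-level constants alongside FEATURE_TWO, along these lines (an assumption, not taken from the listing):

# Assumed companion constants, mirroring FEATURE_TWO above (not shown in the excerpt):
FEATURE_AREA = 'feature_area'  # metadata key looked up in table_metadata
FEATURE_ONE = 'feature_one'    # feature area tag applied to TableOne and TableTwo
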
Example #4
STRING_TABLE_STRUCT_TYPE = spark_type_from_kwargs(id=int, string=str)


@typed_pyspark_table(spark_type=STRING_TABLE_STRUCT_TYPE)
def StringTable(context) -> SparkDF:
    return context.resources.spark.createDataFrame([Row(id=1, string='23')],
                                                   STRING_TABLE_STRUCT_TYPE)


JOIN_TABLE_STRUCT_TYPE = spark_type_from_kwargs(id=int, number=int, string=str)


@typed_pyspark_table(
    input_tables=[
        input_table('number_df', NumberTable),
        input_table('string_df', StringTable)
    ],
    spark_type=JOIN_TABLE_STRUCT_TYPE,
    description='Joining together of the number and the string.',
)
def JoinTable(_context, number_df: NumberTable,
              string_df: StringTable) -> SparkDF:
    return number_df.join(string_df, number_df.id == string_df.id,
                          'inner').drop(string_df.id)


def test_execute_typed_in_mem_lakehouse():
    lakehouse = TypedPySparkMemLakehouse()
    pipeline_result = execute_spark_lakehouse_build(
        tables=[NumberTable, StringTable, JoinTable], lakehouse=lakehouse)
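
NumberTable is referenced but not defined in this excerpt, and the listing is cut off before the test's assertions. Given JOIN_TABLE_STRUCT_TYPE and the join on id, a plausible definition (an assumption, mirroring StringTable above) would be:

# Hypothetical definition of the missing NumberTable, patterned on StringTable.
NUMBER_TABLE_STRUCT_TYPE = spark_type_from_kwargs(id=int, number=int)


@typed_pyspark_table(spark_type=NUMBER_TABLE_STRUCT_TYPE)
def NumberTable(context) -> SparkDF:
    return context.resources.spark.createDataFrame([Row(id=1, number=2)],
                                                   NUMBER_TABLE_STRUCT_TYPE)
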