Exemplo n.º 1
0
def get_assets():
    """Build two disjoint asset graphs and return every asset in them.

    The first graph is a diamond: source ``a`` feeds both ``b`` and ``c``,
    and ``d`` takes ``b`` and ``c`` as inputs. The second graph is a single
    edge: source ``e`` feeds ``f``.

    Returns:
        The tuple ``(a, b, c, d, e, f)``.
    """
    # --- diamond graph: a -> {b, c} -> d ---
    a = source_asset(path="a")

    # The compute bodies below are intentionally no-ops; only the
    # dependency wiring declared in the decorators is of interest here.
    @computed_asset(input_assets=[a])
    def b(_):
        return None

    @computed_asset(input_assets=[a])
    def c(_):
        return None

    @computed_asset(input_assets=[b, c])
    def d(_b, _c):
        return None

    # --- linear graph: e -> f ---
    e = source_asset(path="e")

    @computed_asset(input_assets=[e])
    def f(_):
        return None

    all_assets = (a, b, c, d, e, f)
    return all_assets
Exemplo n.º 2
0
def test_get_computed_asset_solid_def_with_source_deps_multiple_storages(basic_lakehouse):
    """Check the solid def built for an asset whose two source deps use different storages.

    The computed asset lives on ``storage1`` and reads one source asset from
    ``storage1`` and one from ``storage2``. The resulting solid definition is
    expected to require both storage keys as resources, to be checked against
    an empty list of expected input defs, and to expose a ``result`` output
    typed with the asset's ``dagster_type``.
    """
    first_source = source_asset(storage_key='storage1', path=('a', 'b'))
    second_source = source_asset(storage_key='storage2', path=('a', 'c'))

    @computed_asset(storage_key='storage1', input_assets=[first_source, second_source])
    def some_asset(source1: int, source2: int) -> int:
        return source1 + source2

    # No computed-asset dependencies are handed to the lakehouse here.
    solid_def = basic_lakehouse.get_computed_asset_solid_def(some_asset, [])

    expected_resource_keys = {'storage1', 'storage2'}
    assert solid_def.required_resource_keys == expected_resource_keys

    _assert_input_defs(solid_def, [])
    _assert_output_def(solid_def, some_asset.dagster_type, 'result')
Exemplo n.º 3
0
def test_get_computed_asset_solid_def_with_source_deps_multiple_storages(basic_lakehouse):
    """Check the solid def built for an asset whose two source deps use different storages.

    The computed asset lives on ``storage1`` and reads one source asset from
    ``storage1`` and one from ``storage2``. The resulting solid definition is
    expected to require both storage keys as resources, to be checked against
    an empty list of expected input defs, and to expose a ``result`` output
    typed with the asset's ``dagster_type``.
    """
    first_source = source_asset(storage_key="storage1", path=("a", "b"))
    second_source = source_asset(storage_key="storage2", path=("a", "c"))

    @computed_asset(storage_key="storage1", input_assets=[first_source, second_source])
    def some_asset(source1: int, source2: int) -> int:
        return source1 + source2

    # No computed-asset dependencies are handed to the lakehouse here.
    solid_def = basic_lakehouse.get_computed_asset_solid_def(some_asset, [])

    expected_resource_keys = {"storage1", "storage2"}
    assert solid_def.required_resource_keys == expected_resource_keys

    _assert_input_defs(solid_def, [])
    _assert_output_def(solid_def, some_asset.dagster_type, "result")
Exemplo n.º 4
0
def test_computed_asset_multiple_deps_list():
    """Check a computed asset declared with a *list* of two input assets.

    With ``input_assets`` given as a list, the test expects the dependency
    keys to be the decorated function's parameter names (``b_`` and ``c_``),
    each carrying the parameter's annotated in-memory type and the source
    asset at the matching list position. The asset path is expected to
    default to the function name and the output type to the return annotation.
    """
    source_asset1 = source_asset(storage_key='filesystem', path=('a', 'b'))
    source_asset2 = source_asset(storage_key='filesystem', path=('a', 'c'))

    @computed_asset(storage_key='filesystem', input_assets=[source_asset1, source_asset2])
    def casset(b_: int, c_: float) -> str:
        return str(b_) + str(c_)

    # Guard first: everything below dereferences the computation.
    assert casset.computation
    assert casset.path == ('casset',)

    computation = casset.computation
    assert computation.output_in_memory_type == str

    deps = computation.deps
    # Set literal instead of set([...]): no throwaway list is built.
    assert set(deps.keys()) == {'b_', 'c_'}

    b_dep = deps['b_']
    assert b_dep.in_memory_type == int
    assert b_dep.asset == source_asset1

    c_dep = deps['c_']
    assert c_dep.in_memory_type == float
    assert c_dep.asset == source_asset2
Exemplo n.º 5
0
def test_computed_asset_multiple_deps_list():
    """Check a computed asset declared with a *list* of two input assets.

    With ``input_assets`` given as a list, the test expects the dependency
    keys to be the decorated function's parameter names (``b_`` and ``c_``),
    each carrying the parameter's annotated in-memory type and the source
    asset at the matching list position. The asset path is expected to
    default to the function name and the output type to the return annotation.
    """
    source_asset1 = source_asset(storage_key="filesystem", path=("a", "b"))
    source_asset2 = source_asset(storage_key="filesystem", path=("a", "c"))

    @computed_asset(storage_key="filesystem", input_assets=[source_asset1, source_asset2])
    def casset(b_: int, c_: float) -> str:
        return str(b_) + str(c_)

    # Guard first: everything below dereferences the computation.
    assert casset.computation
    assert casset.path == ("casset",)

    computation = casset.computation
    assert computation.output_in_memory_type == str

    deps = computation.deps
    # Set literal instead of set([...]): no throwaway list is built.
    assert set(deps.keys()) == {"b_", "c_"}

    b_dep = deps["b_"]
    assert b_dep.in_memory_type == int
    assert b_dep.asset == source_asset1

    c_dep = deps["c_"]
    assert c_dep.in_memory_type == float
    assert c_dep.asset == source_asset2
Exemplo n.º 6
0
def test_build_pipeline_definition_missing_input_policy(basic_lakehouse):
    """Building a pipeline for this asset must raise ``CheckError``.

    The computed asset annotates its single input as ``str``.
    NOTE(review): per the test name, the ``basic_lakehouse`` fixture
    presumably has no policy for that input type -- confirm against the
    fixture definition.
    """
    upstream = source_asset(storage_key='storage1', path=('a', 'b'))

    @computed_asset(storage_key='storage1', input_assets=[upstream])
    def some_asset(source: str) -> int:
        return int(source)

    assets_to_build = [some_asset]
    with pytest.raises(CheckError):
        basic_lakehouse.build_pipeline_definition('some_pipeline', assets_to_build)
Exemplo n.º 7
0
def test_computed_asset_one_dep():
    """Check a computed asset declared with a single, explicitly keyed input asset.

    ``input_assets`` is given as a dict mapping the parameter name ``a_`` to
    the source asset. The test expects exactly that one dependency key, with
    the parameter's annotated in-memory type and the mapped source asset, the
    asset path defaulting to the function name, and the output type taken
    from the return annotation.
    """
    upstream = source_asset(storage_key='filesystem', path=('a', 'b'))

    @computed_asset(storage_key='filesystem', input_assets={'a_': upstream})
    def casset(a_: int) -> str:
        return str(a_)

    # Guard first: everything below dereferences the computation.
    assert casset.computation
    assert casset.path == ('casset',)

    computation = casset.computation
    assert computation.output_in_memory_type == str

    deps = computation.deps
    assert list(deps.keys()) == ['a_']

    only_dep = deps['a_']
    assert only_dep.in_memory_type == int
    assert only_dep.asset == upstream
Exemplo n.º 8
0
def test_computed_asset_one_dep():
    """Check a computed asset declared with a single, explicitly keyed input asset.

    ``input_assets`` is given as a dict mapping the parameter name ``a_`` to
    the source asset. The test expects exactly that one dependency key, with
    the parameter's annotated in-memory type and the mapped source asset, the
    asset path defaulting to the function name, and the output type taken
    from the return annotation.
    """
    upstream = source_asset(storage_key="filesystem", path=("a", "b"))

    @computed_asset(storage_key="filesystem", input_assets={"a_": upstream})
    def casset(a_: int) -> str:
        return str(a_)

    # Guard first: everything below dereferences the computation.
    assert casset.computation
    assert casset.path == ("casset",)

    computation = casset.computation
    assert computation.output_in_memory_type == str

    deps = computation.deps
    assert list(deps.keys()) == ["a_"]

    only_dep = deps["a_"]
    assert only_dep.in_memory_type == int
    assert only_dep.asset == upstream
Exemplo n.º 9
0
'''Asset definitions for the simple_lakehouse example.'''
import pandas as pd
from lakehouse import computed_asset, source_asset
from pandas import DataFrame as PandasDF
from pyspark.sql import DataFrame as SparkDF
from pyspark.sql import Window
from pyspark.sql import functions as f

# Source asset registered under the 'filesystem' storage key; the path tuple
# identifies the weather sample within that storage.
sfo_q2_weather_sample_asset = source_asset(
    storage_key='filesystem',
    path=('dagster_examples', 'simple_lakehouse', 'sfo_q2_weather_sample'),
)


@computed_asset(storage_key='filesystem',
                input_assets=[sfo_q2_weather_sample_asset])
def daily_temperature_highs_asset(sfo_q2_weather_sample: PandasDF) -> PandasDF:
    '''Computes the temperature high for each day.

    Parses the ``valid`` column into datetimes as a new ``valid_date``
    column, groups by it, takes the per-group maximum of every remaining
    column, and renames ``tmpf`` to ``max_tmpf``.

    The input frame is not modified: ``DataFrame.assign`` returns a new
    frame, so the caller's object does not gain a ``valid_date`` column as
    a side effect.

    NOTE(review): grouping is on the full parsed value of ``valid``; if that
    column carries a time-of-day component the groups are per-timestamp
    rather than per-day -- confirm against the sample data.

    Args:
        sfo_q2_weather_sample: Frame with at least a ``valid`` column that
            ``pd.to_datetime`` can parse.

    Returns:
        A frame indexed by ``valid_date`` holding the column-wise maxima.
    '''
    with_valid_date = sfo_q2_weather_sample.assign(
        valid_date=pd.to_datetime(sfo_q2_weather_sample['valid']))
    return with_valid_date.groupby('valid_date').max().rename(
        columns={'tmpf': 'max_tmpf'})


@computed_asset(storage_key='filesystem',
                input_assets=[daily_temperature_highs_asset])
def daily_temperature_high_diffs_asset(
        daily_temperature_highs: SparkDF) -> SparkDF:
    '''Computes the difference between each day's high and the previous day's high'''
    window = Window.orderBy('valid_date')
    return daily_temperature_highs.select(