Exemplo n.º 1
0
def make_simple_lakehouse():
    """Build a Lakehouse with a "dev" mode and a "prod" mode.

    Both modes bind the same pyspark resource under the "pyspark" key. They
    differ in the resource bound to "filesystem": dev uses the local file
    system storage configured with root ".", prod uses the S3 storage
    configured with a bucket and a prefix.

    Returns:
        The Lakehouse built from the two modes, with SparkDF mapped to the
        "pyspark" resource key.
    """
    # Development: files are placed relative to the current directory.
    local_filesystem = local_file_system_storage.configured({"root": "."})
    dev_resources = {"pyspark": pyspark_resource, "filesystem": local_filesystem}
    dev_mode = ModeDefinition(name="dev", resource_defs=dev_resources)

    # Production: same keys, but the filesystem resource is S3-backed.
    s3_filesystem = s3_storage.configured(
        {"bucket": "some_bucket", "prefix": "some_prefix"}
    )
    prod_resources = {"pyspark": pyspark_resource, "filesystem": s3_filesystem}
    prod_mode = ModeDefinition(name="prod", resource_defs=prod_resources)

    spark_resource_keys = {SparkDF: ["pyspark"]}
    return Lakehouse(
        mode_defs=[dev_mode, prod_mode],
        in_memory_type_resource_keys=spark_resource_keys,
    )
Exemplo n.º 2
0
def make_simple_lakehouse():
    """Build a Lakehouse with "dev" and "prod" modes, each paired with a preset.

    The filesystem resources are left unconfigured on the modes; their config
    is supplied through the run_config of the matching preset instead.

    Returns:
        The Lakehouse built from both presets, both modes, the SparkDF ->
        "pyspark" resource mapping and four type storage policies.
    """
    # Run config handed to the filesystem resource by each preset.
    dev_run_config = {'resources': {'filesystem': {'config': {'root': '.'}}}}
    # NOTE(review): prod passes the same 'root' config as dev even though its
    # filesystem resource is s3_storage -- confirm this matches that
    # resource's config schema.
    prod_run_config = {'resources': {'filesystem': {'config': {'root': '.'}}}}

    dev_resources = {'pyspark': pyspark_resource, 'filesystem': local_file_system_storage}
    dev_mode = ModeDefinition(name='dev', resource_defs=dev_resources)
    dev = PresetDefinition(
        name='dev', mode='dev', run_config=dev_run_config, solid_selection=None,
    )

    prod_resources = {'pyspark': pyspark_resource, 'filesystem': s3_storage}
    prod_mode = ModeDefinition(name='prod', resource_defs=prod_resources)
    prod = PresetDefinition(
        name='prod', mode='prod', run_config=prod_run_config, solid_selection=None,
    )

    storage_policies = [
        SparkDFLocalFileSystemPolicy,
        PandasDFLocalFileSystemPolicy,
        SparkDFS3Policy,
        PandasDFS3Policy,
    ]
    return Lakehouse(
        preset_defs=[dev, prod],
        mode_defs=[dev_mode, prod_mode],
        in_memory_type_resource_keys={SparkDF: ['pyspark']},
        type_storage_policies=storage_policies,
    )
Exemplo n.º 3
0
def basic_lakehouse_and_storages():
    """Create a Lakehouse wired to two DictStorage instances.

    Returns:
        A 3-tuple ``(lakehouse, storage1, storage2)`` where the two storages
        are the exact objects returned by the "storage1" and "storage2"
        resources of the lakehouse's "dev" mode.
    """
    first_storage = DictStorage()
    second_storage = DictStorage()

    # Each resource ignores its init context and hands back the shared
    # instance created above, so the caller can inspect what was stored.
    @resource()
    def some_storage(_):
        return first_storage

    @resource()
    def some_other_storage(_):
        return second_storage

    storage_resources = {
        "storage1": some_storage,
        "storage2": some_other_storage,
    }
    dev_mode = ModeDefinition(name="dev", resource_defs=storage_resources)
    dev_preset = PresetDefinition(
        name="dev",
        mode="dev",
        run_config={},
        solid_selection=None,
    )

    lakehouse = Lakehouse(mode_defs=[dev_mode], preset_defs=[dev_preset])
    return lakehouse, first_storage, second_storage
Exemplo n.º 4
0
def make_multi_type_lakehouse():
    """Build a single-mode Lakehouse using pyspark and a local default storage.

    The mode is created without an explicit name. Its "default_storage"
    resource is the local file system storage configured with root ".".

    Returns:
        The Lakehouse, with SparkDF mapped to the "pyspark" resource key.
    """
    default_storage = local_file_system_storage.configured({"root": "."})
    mode_resources = {
        "pyspark": pyspark_resource,
        "default_storage": default_storage,
    }
    dev_mode = ModeDefinition(resource_defs=mode_resources)

    return Lakehouse(
        mode_defs=[dev_mode],
        in_memory_type_resource_keys={SparkDF: ["pyspark"]},
    )
Exemplo n.º 5
0
def make_simple_lakehouse():
    """Build a Lakehouse with one "dev" mode backed by local pandas storage.

    Returns:
        The Lakehouse whose only mode binds "filesystem" to
        pandas_df_local_filesystem_storage configured with root ".".
    """
    filesystem = pandas_df_local_filesystem_storage.configured({'root': '.'})
    dev_mode = ModeDefinition(name='dev', resource_defs={'filesystem': filesystem})
    return Lakehouse(mode_defs=[dev_mode])
Exemplo n.º 6
0
def make_simple_lakehouse():
    """Build a Lakehouse with a local "dev" mode and an S3-backed "prod" mode.

    Returns:
        The Lakehouse built from both modes, with SparkDF mapped to the
        "pyspark" resource key.
    """
    # dev: filesystem rooted at the current directory.
    dev_filesystem = local_file_system_storage.configured({'root': '.'})
    dev_mode = ModeDefinition(
        name='dev',
        resource_defs={'pyspark': pyspark_resource, 'filesystem': dev_filesystem},
    )

    # prod: same resource keys, filesystem pointed at an S3 bucket/prefix.
    prod_filesystem = s3_storage.configured(
        {'bucket': 'some_bucket', 'prefix': 'some_prefix'}
    )
    prod_mode = ModeDefinition(
        name='prod',
        resource_defs={'pyspark': pyspark_resource, 'filesystem': prod_filesystem},
    )

    all_modes = [dev_mode, prod_mode]
    return Lakehouse(
        mode_defs=all_modes,
        in_memory_type_resource_keys={SparkDF: ['pyspark']},
    )
Exemplo n.º 7
0
def basic_lakehouse_and_storages():
    """Create a Lakehouse wired to two in-memory, dict-backed asset storages.

    Returns:
        A 3-tuple ``(lakehouse, storage1, storage2)`` where the two storages
        are the exact objects returned by the "storage1" and "storage2"
        resources of the lakehouse's "dev" mode.
    """

    class DictStorage(AssetStorage):
        # Asset storage that keeps every saved object in a dict keyed by path.

        def __init__(self):
            self.the_dict = dict()

        def save(self, obj, path, _resources):
            # Store (or overwrite) the object under its path.
            self.the_dict[path] = obj

        def load(self, _python_type, path, _resources):
            # Raises KeyError when nothing was saved under this path.
            stored = self.the_dict[path]
            return stored

    first_storage = DictStorage()
    second_storage = DictStorage()

    # Each storage definition ignores its init context and hands back the
    # shared instance created above.
    @asset_storage()
    def some_storage(_):
        return first_storage

    @asset_storage()
    def some_other_storage(_):
        return second_storage

    storage_resources = {
        "storage1": some_storage,
        "storage2": some_other_storage,
    }
    dev_mode = ModeDefinition(name="dev", resource_defs=storage_resources)
    dev_preset = PresetDefinition(
        name="dev",
        mode="dev",
        run_config={},
        solid_selection=None,
    )

    lakehouse = Lakehouse(mode_defs=[dev_mode], preset_defs=[dev_preset])
    return lakehouse, first_storage, second_storage
Exemplo n.º 8
0
from dagster import ModeDefinition, resource
from lakehouse import Lakehouse, computed_asset
from lakehouse_tests.conftest import DictStorage


# Computed asset declared with no arguments to the decorator; the compute
# function takes no parameters and its body is intentionally empty.
@computed_asset()
def asset1():
    pass


# Computed asset that lists asset1 as its only input asset. The single
# positional parameter is ignored and the body is intentionally empty.
@computed_asset(input_assets=[asset1])
def asset2(_):
    pass


@resource()
def a_storage(_):
    # The init context is unused; each invocation builds a new DictStorage.
    storage = DictStorage()
    return storage


# Lakehouse with a single "dev" mode whose "default_storage" resource is
# a_storage, holding the two assets defined in this module.
lakehouse_def = Lakehouse(
    mode_defs=[
        ModeDefinition(
            name="dev",
            resource_defs={"default_storage": a_storage},
        ),
    ],
    assets=[asset1, asset2],
)
Exemplo n.º 9
0
def basic_lakehouse_and_storages():
    """Create a Lakehouse with two dict-backed storages and int storage policies.

    Returns:
        A 3-tuple ``(lakehouse, storage1, storage2)`` where the two storages
        are the exact objects returned by the "storage1" and "storage2"
        resources of the lakehouse's "dev" mode.
    """

    class DictStorage:
        # Plain holder for a dict; the policies below read and write it.
        def __init__(self):
            self.the_dict = dict()

    first_storage = DictStorage()
    second_storage = DictStorage()

    # Each resource ignores its init context and hands back the shared
    # instance created above.
    @resource
    def some_storage(_):
        return first_storage

    @resource
    def some_other_storage(_):
        return second_storage

    storage_resources = {
        'storage1': some_storage,
        'storage2': some_other_storage,
    }
    dev_mode = ModeDefinition(name='dev', resource_defs=storage_resources)
    dev_preset = PresetDefinition(
        name='dev', mode='dev', run_config={}, solid_selection=None,
    )

    class IntSomeStoragePolicy(TypeStoragePolicy):
        # Policy for int values kept in the storage produced by some_storage.

        @classmethod
        def in_memory_type(cls):
            return int

        @classmethod
        def storage_definition(cls):
            return some_storage

        @classmethod
        def save(cls, obj, storage, path, _resources):
            target = storage.the_dict
            target[path] = obj

        @classmethod
        def load(cls, storage, path, _resources):
            source = storage.the_dict
            return source[path]

    class IntSomeOtherStoragePolicy(TypeStoragePolicy):
        # Policy for int values kept in the storage produced by
        # some_other_storage.

        @classmethod
        def in_memory_type(cls):
            return int

        @classmethod
        def storage_definition(cls):
            return some_other_storage

        @classmethod
        def save(cls, obj, storage, path, _resources):
            target = storage.the_dict
            target[path] = obj

        @classmethod
        def load(cls, storage, path, _resources):
            source = storage.the_dict
            return source[path]

    lakehouse = Lakehouse(
        mode_defs=[dev_mode],
        preset_defs=[dev_preset],
        type_storage_policies=[IntSomeStoragePolicy, IntSomeOtherStoragePolicy],
    )
    return lakehouse, first_storage, second_storage
Exemplo n.º 10
0
    def __init__(self, root):
        """Remember the base path that every asset path is joined onto.

        Args:
            root: Directory prefix placed in front of each asset path when
                the file system location is computed.
        """
        # Kept private; only read when building file system paths.
        self._root = root

    def _get_fs_path(self, path: Tuple[str, ...]) -> str:
        """Translate an asset path into an absolute ".csv" file location.

        Args:
            path: Path components that are joined onto the stored root.

        Returns:
            The absolute path of the joined components with ".csv" appended.
        """
        joined = os.path.join(self._root, *path)
        # The suffix is appended to the joined string, not to each component.
        with_suffix = joined + ".csv"
        return os.path.abspath(with_suffix)

    def save(self, obj: pd.DataFrame, path: Tuple[str, ...], _resources) -> None:
        """Save the dataframe as a CSV file at the location derived from *path*.

        Args:
            obj: The dataframe to write.
            path: Asset path components; turned into a file location by
                ``_get_fs_path``.
            _resources: Unused.
        """
        fpath = self._get_fs_path(path)
        # A multi-component asset path maps to nested directories that may not
        # exist yet; to_csv does not create them, so do it up front.
        parent_dir = os.path.dirname(fpath)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        obj.to_csv(fpath)

    def load(self, _python_type, path: Tuple[str, ...], _resources):
        """Read the CSV file derived from *path* and return it as a dataframe.

        Args:
            _python_type: Unused.
            path: Asset path components; turned into a file location by
                ``_get_fs_path``.
            _resources: Unused.

        Returns:
            The result of ``pd.read_csv`` on the computed file location.
        """
        csv_location = self._get_fs_path(path)
        frame = pd.read_csv(csv_location)
        return frame


@resource(config_schema={"root": StringSource})
def local_fs_storage(init_context):
    # Builds the storage from the "root" entry of this resource's config.
    root_dir = init_context.resource_config["root"]
    return LocalFileSystemStorage(root_dir)


# Lakehouse with one unnamed mode whose "default_storage" resource is the
# local file system storage configured with root ".".
simple_lakehouse = Lakehouse(
    mode_defs=[
        ModeDefinition(
            resource_defs={
                "default_storage": local_fs_storage.configured({"root": "."}),
            },
        ),
    ]
)