Exemplo n.º 1
0
def scope_dbt_cli_profile_modes():
    # start_marker_dbt_cli_profile_modes
    from dagster_dbt import dbt_cli_resource, dbt_run_op

    from dagster import graph

    @graph
    def my_dbt():
        dbt_run_op()

    # Build one job per dbt profile: both jobs share the same graph but the
    # dbt resource is configured against a different profile.
    project_dir = "path/to/dbt/project"

    my_dbt_graph_dev = my_dbt.to_job(
        resource_defs={
            "dbt": dbt_cli_resource.configured(
                {"project_dir": project_dir, "profile": "dev"}
            )
        }
    )

    my_dbt_graph_prod = my_dbt.to_job(
        resource_defs={
            "dbt": dbt_cli_resource.configured(
                {"project_dir": project_dir, "profile": "prod"}
            )
        }
    )
Exemplo n.º 2
0
def scope_dbt_cli_profile_modes():
    # start_marker_dbt_cli_profile_modes
    from dagster import pipeline, solid, ModeDefinition
    from dagster_dbt import dbt_cli_resource

    @solid(required_resource_keys={"dbt"})
    def run_all_models(context):
        context.resources.dbt.run()

    def _profile_mode(name):
        # One mode per dbt profile; both point at the same project directory
        # and the mode name doubles as the profile name.
        return ModeDefinition(
            name,
            resource_defs={
                "dbt": dbt_cli_resource.configured(
                    {"project_dir": "path/to/dbt/project", "profile": name}
                )
            },
        )

    @pipeline(mode_defs=[_profile_mode("dev"), _profile_mode("prod")])
    def my_dbt_pipeline():
        run_all_models()
Exemplo n.º 3
0
def get_dbt_resource(project_dir, profiles_dir, **kwargs):
    """Build a ``dbt_cli_resource`` configured for the given project.

    Args:
        project_dir: Path to the dbt project directory.
        profiles_dir: Path to the directory containing ``profiles.yml``.
        **kwargs: Extra config entries forwarded verbatim into the resource
            config (e.g. ``target``, ``vars``); they override nothing here
            since they are merged after the two path keys.

    Returns:
        The configured dbt CLI resource definition.
    """
    # NOTE: the previous `kwargs = kwargs or {}` guard was dead code — a
    # parameter collected via ** is always a dict, never None.
    return dbt_cli_resource.configured({
        "project_dir": project_dir,
        "profiles_dir": profiles_dir,
        **kwargs,
    })
Exemplo n.º 4
0
def test_seed_op(conn_string, test_project_dir, dbt_config_dir):  # pylint: disable=unused-argument
    # Execute the seed op against the test project and expect exactly one
    # seed result back from dbt.
    context = build_op_context(
        resources={
            "dbt": dbt_cli_resource.configured(
                {"project_dir": test_project_dir, "profiles_dir": dbt_config_dir}
            )
        }
    )
    seed_output = dbt_seed_op(context)
    assert len(seed_output.result["results"]) == 1
Exemplo n.º 5
0
def test_select_from_project(dbt_seed, conn_string, test_project_dir,
                             dbt_config_dir):  # pylint: disable=unused-argument
    # Load only the two models named in the select string.
    dbt_assets = load_assets_from_dbt_project(
        test_project_dir,
        dbt_config_dir,
        select="sort_by_calories subdir.least_caloric",
    )

    assert dbt_assets[0].op.name == "run_dbt_dagster_dbt_test_project_e4753"

    assets_job = build_assets_job(
        "test_job",
        dbt_assets,
        resource_defs={
            "dbt": dbt_cli_resource.configured(
                {"project_dir": test_project_dir, "profiles_dir": dbt_config_dir}
            )
        },
    )
    result = assets_job.execute_in_process()

    assert result.success
    # Collect the materializations emitted by the dbt op.
    materializations = []
    for event in result.events_for_node(dbt_assets[0].op.name):
        if event.event_type_value == "ASSET_MATERIALIZATION":
            materializations.append(event.event_specific_data.materialization)
    assert len(materializations) == 2
Exemplo n.º 6
0
def test_node_info_to_asset_key(dbt_seed, conn_string, test_project_dir,
                                dbt_config_dir):  # pylint: disable=unused-argument
    # Prefix every generated asset key with "foo".
    def _to_asset_key(node_info):
        return AssetKey(["foo", node_info["name"]])

    dbt_assets = load_assets_from_dbt_project(
        test_project_dir,
        dbt_config_dir,
        node_info_to_asset_key=_to_asset_key,
    )

    result = build_assets_job(
        "test_job",
        dbt_assets,
        resource_defs={
            "dbt": dbt_cli_resource.configured(
                {"project_dir": test_project_dir, "profiles_dir": dbt_config_dir}
            )
        },
    ).execute_in_process()

    assert result.success
    events = result.events_for_node(dbt_assets[0].op.name)
    materializations = [
        e.event_specific_data.materialization
        for e in events
        if e.event_type_value == "ASSET_MATERIALIZATION"
    ]
    assert len(materializations) == 4
    assert materializations[0].asset_key == AssetKey(["foo", "sort_by_calories"])
Exemplo n.º 7
0
def test_select_from_manifest(dbt_seed, conn_string, test_project_dir,
                              dbt_config_dir):  # pylint: disable=unused-argument
    # Load two models, identified by dbt unique id, out of a pre-generated
    # manifest file that sits next to this test module.
    with open(file_relative_path(__file__, "sample_manifest.json"), "r") as f:
        manifest_json = json.load(f)

    selected_ids = {
        "model.dagster_dbt_test_project.sort_by_calories",
        "model.dagster_dbt_test_project.least_caloric",
    }
    dbt_assets = load_assets_from_dbt_manifest(
        manifest_json, selected_unique_ids=selected_ids
    )

    result = build_assets_job(
        "test_job",
        dbt_assets,
        resource_defs={
            "dbt": dbt_cli_resource.configured(
                {"project_dir": test_project_dir, "profiles_dir": dbt_config_dir}
            )
        },
    ).execute_in_process()

    assert result.success
    materializations = []
    for event in result.events_for_node(dbt_assets[0].op.name):
        if event.event_type_value == "ASSET_MATERIALIZATION":
            materializations.append(event.event_specific_data.materialization)
    assert len(materializations) == 2
Exemplo n.º 8
0
def scope_dbt_cli_resource_config():
    # start_marker_dbt_cli_resource_config
    from dagster_dbt import dbt_cli_resource

    # Point the CLI resource at both the dbt project and the profiles dir.
    my_dbt_resource = dbt_cli_resource.configured(
        {
            "project_dir": "path/to/dbt/project",
            "profiles_dir": "path/to/dbt/profiles",
        }
    )
Exemplo n.º 9
0
def scope_dbt_cli_run():
    # start_marker_dbt_cli_run_preconfig
    from dagster import job
    from dagster_dbt import dbt_cli_resource, dbt_run_op

    # Pre-configure the resource once; the job just invokes the run op.
    preconfigured_dbt = dbt_cli_resource.configured(
        {"project_dir": "path/to/dbt/project"}
    )

    @job(resource_defs={"dbt": preconfigured_dbt})
    def my_dbt_job():
        dbt_run_op()
Exemplo n.º 10
0
def scope_dbt_cli_run_after_another_op():
    # start_marker_dbt_cli_run_after_another_op
    from dagster_dbt import dbt_cli_resource, dbt_run_op, dbt_test_op

    from dagster import job

    dbt_resource = dbt_cli_resource.configured({"project_dir": "path/to/dbt/project"})

    @job(resource_defs={"dbt": dbt_resource})
    def my_dbt_job():
        # `start_after` sequences the test op behind the run op.
        dbt_test_op(start_after=dbt_run_op())
Exemplo n.º 11
0
def scope_dbt_cli_config_vars():
    # start_marker_dbt_cli_config_vars
    from dagster import job
    from dagster_dbt import dbt_cli_resource

    # dbt vars passed through to every CLI invocation made by the resource.
    dbt_vars_config = {"vars": {"key": "value"}}

    @job(resource_defs={"dbt": dbt_cli_resource.configured(dbt_vars_config)})
    def my_job():
        # ...
        # end_marker_dbt_cli_config_vars
        pass
Exemplo n.º 12
0
def scope_dbt_cli_config_exclude_models():
    # start_marker_dbt_cli_config_exclude_models
    from dagster import job
    from dagster_dbt import dbt_cli_resource

    # Selector patterns that every dbt invocation should exclude.
    exclude_config = {"exclude": ["my_dbt_model+", "path.to.models", "tag:nightly"]}

    @job(resource_defs={"dbt": dbt_cli_resource.configured(exclude_config)})
    def my_job():
        # ...
        # end_marker_dbt_cli_config_exclude_models
        pass
Exemplo n.º 13
0
def scope_dbt_cli_config_executable():
    # start_marker_dbt_cli_config_executable
    from dagster import job
    from dagster_dbt import dbt_cli_resource

    # Use a non-default dbt executable.
    executable_config = {"dbt_executable": "path/to/dbt/executable"}

    @job(resource_defs={"dbt": dbt_cli_resource.configured(executable_config)})
    def my_job():
        # ...
        # end_marker_dbt_cli_config_executable
        pass
Exemplo n.º 14
0
def scope_dbt_cli_run_specific_models():
    # start_marker_dbt_cli_run_specific_models_preconfig
    from dagster import job
    from dagster_dbt import dbt_cli_resource, dbt_run_op

    # Restrict runs to models tagged "staging".
    staging_dbt = dbt_cli_resource.configured({
        "project_dir": "path/to/dbt/project",
        "models": ["tag:staging"],
    })

    @job(resource_defs={"dbt": staging_dbt})
    def my_dbt_job():
        dbt_run_op()
Exemplo n.º 15
0
def test_run_op(
    dbt_seed, conn_string, test_project_dir, dbt_config_dir
):  # pylint: disable=unused-argument
    # Invoke the run op directly against a configured resource.
    context = build_op_context(
        resources={
            "dbt": dbt_cli_resource.configured(
                {"project_dir": test_project_dir, "profiles_dir": dbt_config_dir}
            )
        }
    )
    dbt_results = list(dbt_run_op(context))

    # The op yields asset materializations in addition to its output.
    assert len(dbt_results) == 5
    assert len(dbt_results[-1].value.result["results"]) == 4
Exemplo n.º 16
0
def scope_dbt_cli_config_profile_and_target():
    PROFILE_NAME, TARGET_NAME = "", ""

    # start_marker_dbt_cli_config_profile_and_target
    from dagster import job
    from dagster_dbt import dbt_cli_resource

    # Choose both the dbt profile and the target within that profile.
    profile_config = {"profile": PROFILE_NAME, "target": TARGET_NAME}

    @job(resource_defs={"dbt": dbt_cli_resource.configured(profile_config)})
    def my_job():
        # ...
        # end_marker_dbt_cli_config_profile_and_target
        pass
Exemplo n.º 17
0
def scope_dbt_cli_config_exclude_models():
    # start_marker_dbt_cli_config_exclude_models
    from dagster import pipeline, ModeDefinition
    from dagster_dbt import dbt_cli_resource

    # Selector patterns that every dbt invocation should exclude.
    exclude_config = {"exclude": ["my_dbt_model+", "path.to.models", "tag:nightly"]}

    @pipeline(
        mode_defs=[
            ModeDefinition(
                resource_defs={"dbt": dbt_cli_resource.configured(exclude_config)}
            )
        ]
    )
    def my_pipeline():
        # ...
        # end_marker_dbt_cli_config_exclude_models
        pass
Exemplo n.º 18
0
def scope_dbt_cli_config_vars():
    # start_marker_dbt_cli_config_vars
    from dagster import pipeline, ModeDefinition
    from dagster_dbt import dbt_cli_resource

    # dbt vars passed through to every CLI invocation made by the resource.
    dbt_vars_config = {"vars": {"key": "value"}}

    @pipeline(
        mode_defs=[
            ModeDefinition(
                resource_defs={"dbt": dbt_cli_resource.configured(dbt_vars_config)}
            )
        ]
    )
    def my_pipeline():
        # ...
        # end_marker_dbt_cli_config_vars
        pass
Exemplo n.º 19
0
def scope_dbt_cli_config_executable():
    # start_marker_dbt_cli_config_executable
    from dagster import pipeline, ModeDefinition
    from dagster_dbt import dbt_cli_resource

    # Use a non-default dbt executable.
    executable_config = {"dbt_executable": "path/to/dbt/executable"}

    @pipeline(
        mode_defs=[
            ModeDefinition(
                resource_defs={"dbt": dbt_cli_resource.configured(executable_config)}
            )
        ]
    )
    def my_pipeline():
        # ...
        # end_marker_dbt_cli_config_executable
        pass
Exemplo n.º 20
0
def scope_dbt_cli_run():
    # start_marker_dbt_cli_run_preconfig
    from dagster import pipeline, solid, ModeDefinition
    from dagster_dbt import dbt_cli_resource

    # Pre-configure the resource once; the solid just triggers `dbt run`.
    preconfigured_dbt = dbt_cli_resource.configured(
        {"project_dir": "path/to/dbt/project"}
    )

    @solid(required_resource_keys={"dbt"})
    def run_all_models(context):
        context.resources.dbt.run()

    @pipeline(mode_defs=[ModeDefinition(resource_defs={"dbt": preconfigured_dbt})])
    def my_dbt_pipeline():
        run_all_models()
Exemplo n.º 21
0
def scope_dbt_cli_config_profile_and_target():
    PROFILE_NAME, TARGET_NAME = "", ""

    # start_marker_dbt_cli_config_profile_and_target
    from dagster import pipeline, ModeDefinition
    from dagster_dbt import dbt_cli_resource

    # Choose both the dbt profile and the target within that profile.
    profile_config = {"profile": PROFILE_NAME, "target": TARGET_NAME}

    @pipeline(
        mode_defs=[
            ModeDefinition(
                resource_defs={"dbt": dbt_cli_resource.configured(profile_config)}
            )
        ]
    )
    def my_pipeline():
        # ...
        # end_marker_dbt_cli_config_profile_and_target
        pass
Exemplo n.º 22
0
def test_run_test_job(dbt_seed, conn_string, test_project_dir, dbt_config_dir):  # pylint: disable=unused-argument
    # Wire the run and test ops into one job and execute it in process.
    dbt_resource = dbt_cli_resource.configured(
        {"project_dir": test_project_dir, "profiles_dir": dbt_config_dir}
    )

    @job(resource_defs={"dbt": dbt_resource})
    def run_test_job():
        dbt_test_op(start_after=dbt_run_op())

    exec_result = run_test_job.execute_in_process()

    run_output = exec_result.output_for_node("dbt_run_op")
    test_output = exec_result.output_for_node("dbt_test_op")

    assert len(run_output.result["results"]) == 4
    assert len(test_output.result["results"]) == 15
Exemplo n.º 23
0
def scope_dbt_cli_run_after_another_solid():
    # start_marker_dbt_cli_run_after_another_solid
    from dagster import pipeline, solid, ModeDefinition
    from dagster_dbt import dbt_cli_resource, DbtCliOutput

    dbt_resource = dbt_cli_resource.configured({"project_dir": "path/to/dbt/project"})

    @solid(required_resource_keys={"dbt"})
    def run_models(context) -> DbtCliOutput:
        return context.resources.dbt.run()

    @solid(required_resource_keys={"dbt"})
    def test_models(context, run_result: DbtCliOutput):
        context.log.info(f"testing result of `{run_result.command}`!")
        context.resources.dbt.test()

    # The data dependency on run_models' output sequences test_models after it.
    @pipeline(mode_defs=[ModeDefinition(resource_defs={"dbt": dbt_resource})])
    def my_dbt_pipeline():
        test_models(run_models())
Exemplo n.º 24
0
    SHARED_SNOWFLAKE_CONF,
    connect_snowflake,
    snowflake_io_manager_dev,
    snowflake_io_manager_prod,
)

# dbt project layout: profiles.yml lives in a `config` subdirectory of the
# project itself.
DBT_PROJECT_DIR = file_relative_path(__file__, "../../hacker_news_dbt")
DBT_PROFILES_DIR = DBT_PROJECT_DIR + "/config"

# We define two sets of resources, one for the prod mode, which writes to production schemas and
# one for dev mode, which writes to alternate schemas
PROD_RESOURCES = {
    "dbt":
    # dbt CLI resource pointed at the "prod" target of the shared project.
    dbt_cli_resource.configured({
        "profiles_dir": DBT_PROFILES_DIR,
        "project_dir": DBT_PROJECT_DIR,
        "target": "prod"
    }),
    "warehouse_io_manager":
    snowflake_io_manager_prod,
    # "parquet_io_manager": parquet_io_manager.configured({"base_path": get_system_temp_directory()}),
    "pyspark":
    pyspark_resource,
}

DEV_RESOURCES = {
    "dbt":
    dbt_cli_resource.configured({
        "profiles-dir": DBT_PROFILES_DIR,
        "project-dir": DBT_PROJECT_DIR,
        "target": "dev"
Exemplo n.º 25
0
)
from ops.gdelt_mining_ops import enhance_articles, materialize_gdelt_mining_asset, materialize_enhanced_articles_asset
from ops.ml_enrichment_ops import classify_protest_relevancy, get_ml_enrichment_files, store_ml_enrichment_files
from resources.novacene_ml_resource import novacene_ml_api_client


# Resources
#################
# Both the dbt profiles and the dbt project live in the local ./dw directory.
DBT_PROFILES_DIR = file_relative_path(__file__, "./dw")
DBT_PROJECT_DIR = file_relative_path(__file__, "./dw")

# Run configuration loaded from YAML files checked in under environments/.
snowflake_env_variables = config_from_files(['environments/snowflake_env_variables.yaml'])
novacene_env_variables = config_from_files(['environments/novacene_env_variables.yaml'])

# dbt CLI resource preconfigured with the warehouse project/profiles paths.
my_dbt_resource = dbt_cli_resource.configured({
    "profiles_dir": DBT_PROFILES_DIR, 
    "project_dir": DBT_PROJECT_DIR})

# Novacene ML API client configured from the YAML env file above.
my_novacene_client_client = novacene_ml_api_client.configured(novacene_env_variables)


#Jobs
################
@job(
    resource_defs = {
        'snowflake': snowflake_resource
    },
    config = snowflake_env_variables
)
def mine_gdelt_data():
    # Mine data from GDELT
Exemplo n.º 26
0
        optimize.curve_fit(f=model_func,
                           xdata=df.order_date.astype(np.int64),
                           ydata=df.num_orders,
                           p0=[10, 100])[0])


@asset(compute_kind="python", io_manager_key="pandas_io_manager")
def predicted_orders(
        daily_order_summary: pd.DataFrame,
        order_forecast_model: Tuple[float, float]) -> pd.DataFrame:
    """Predicted orders for the next 30 days based on the fit parameters.

    Args:
        daily_order_summary: Historical daily orders; must have an
            ``order_date`` column (assumed datetime-like — TODO confirm
            against the upstream asset).
        order_forecast_model: The ``(a, b)`` parameters fit for
            ``model_func``.

    Returns:
        A DataFrame with ``order_date`` and predicted ``num_orders`` for the
        31 days starting at the last observed order date.
    """
    a, b = order_forecast_model
    # Forecast horizon begins at the most recent observed order date.
    start_date = daily_order_summary.order_date.max()
    future_dates = pd.date_range(start=start_date,
                                 end=start_date + pd.DateOffset(days=30))
    # Evaluate the model at int64-cast timestamps, matching how the model
    # was fit (xdata was order_date cast to np.int64).
    predicted_data = model_func(x=future_dates.astype(np.int64), a=a, b=b)
    return pd.DataFrame({
        "order_date": future_dates,
        "num_orders": predicted_data
    })


# Single job ("Assets") that materializes the Airbyte syncs, the dbt models,
# and the two Python-defined forecast assets with shared resources.
analytics_assets = AssetGroup(
    airbyte_assets + dbt_assets + [order_forecast_model, predicted_orders],
    resource_defs={
        "airbyte": airbyte_resource.configured(AIRBYTE_CONFIG),
        "dbt": dbt_cli_resource.configured(DBT_CONFIG),
        "pandas_io_manager": pandas_io_manager.configured(PANDAS_IO_CONFIG),
    },
).build_job("Assets")
Exemplo n.º 27
0
import pandas as pd
from dagster import MetadataValue, build_assets_job
from dagster.utils import file_relative_path
from dagster_dbt import dbt_cli_resource
from dagster_dbt.asset_defs import load_assets_from_dbt_manifest
from hacker_news_assets.resources import RESOURCES_PROD, RESOURCES_STAGING
from hacker_news_assets.resources.snowflake_io_manager import (
    SHARED_SNOWFLAKE_CONF,
    connect_snowflake,
)

# dbt project layout: profiles.yml lives in a `config` subdirectory of the
# project itself.
DBT_PROJECT_DIR = file_relative_path(__file__, "../../hacker_news_dbt")
DBT_PROFILES_DIR = DBT_PROJECT_DIR + "/config"
# The staging and prod resources differ only in the dbt target they select.
# FIX: the staging resource previously used hyphenated keys ("profiles-dir",
# "project-dir"), which do not match dbt_cli_resource's config schema
# ("profiles_dir"/"project_dir") and would fail config validation; they now
# use underscores, consistent with dbt_prod_resource below.
dbt_staging_resource = dbt_cli_resource.configured({
    "profiles_dir": DBT_PROFILES_DIR,
    "project_dir": DBT_PROJECT_DIR,
    "target": "staging"
})
dbt_prod_resource = dbt_cli_resource.configured({
    "profiles_dir": DBT_PROFILES_DIR,
    "project_dir": DBT_PROJECT_DIR,
    "target": "prod"
})


def asset_metadata(_context, model_info):
    config = dict(SHARED_SNOWFLAKE_CONF)
    config["schema"] = model_info["schema"]
    with connect_snowflake(config=config) as con:
        df = pd.read_sql(f"SELECT * FROM {model_info['name']} LIMIT 5",
                         con=con)