@pipeline  # assumed decorator: the composition calls below require a pipeline context
def dynamic_pipeline():
    @solid
    def multiply_by_two(context, y):
        context.log.info("multiply_by_two is returning " + str(y * 2))
        return y * 2

    @solid
    def multiply_inputs(context, y, ten, should_fail):
        current_run = context.instance.get_run_by_id(context.run_id)
        if should_fail:
            if y == 2 and current_run.parent_run_id is None:
                raise Exception()
        context.log.info("multiply_inputs is returning " + str(y * ten))
        return y * ten

    @solid
    def emit_ten(_):
        return 10

    @solid(output_defs=[DynamicOutputDefinition()])
    def emit(_):
        for i in range(3):
            yield DynamicOutput(value=i, mapping_key=str(i))

    @solid
    def sum_numbers(_, nums):
        return sum(nums)

    # pylint: disable=no-member
    multiply_by_two.alias("double_total")(
        sum_numbers(
            emit()
            .map(lambda n: multiply_by_two(multiply_inputs(n, emit_ten())))
            .collect(),
        )
    )

def test_fails_with_wrong_output():
    @solid(output_defs=[DynamicOutputDefinition()])
    def should_fail(_):
        yield Output(1)

    with pytest.raises(DagsterInvariantViolationError, match="must yield DynamicOutput"):
        execute_solid(should_fail)

    @solid(output_defs=[DynamicOutputDefinition()])
    def should_also_fail(_):
        return 1

    with pytest.raises(DagsterInvariantViolationError, match="must yield DynamicOutput"):
        execute_solid(should_also_fail)

def test_dynamic(gcs_bucket):
    @solid(output_defs=[DynamicOutputDefinition()])
    def numbers(_):
        for i in range(3):
            yield DynamicOutput(i, mapping_key=str(i))

    @solid
    def echo(_, x):
        return x

    @pipeline(
        mode_defs=[
            ModeDefinition(
                resource_defs={
                    "io_manager": gcs_pickle_io_manager,
                    "gcs": gcs_resource,
                }
            )
        ]
    )
    def dynamic():
        numbers().map(echo)

    result = execute_pipeline(
        dynamic,
        run_config={"resources": {"io_manager": {"config": {"gcs_bucket": gcs_bucket}}}},
    )
    assert result.success

def test_solid_outputs_access():
    called = {}

    @success_hook
    def my_success_hook(context):
        called[context.step_key] = context.solid_output_values

    @failure_hook
    def my_failure_hook(context):
        called[context.step_key] = context.solid_output_values

    @solid(
        output_defs=[
            OutputDefinition(name="one"),
            OutputDefinition(name="two"),
            OutputDefinition(name="three"),
        ]
    )
    def a_solid(_):
        yield Output(1, "one")
        yield Output(2, "two")
        yield Output(3, "three")

    @solid(
        output_defs=[
            OutputDefinition(name="one"),
            OutputDefinition(name="two"),
        ]
    )
    def failed_solid(_):
        yield Output(1, "one")
        raise SomeUserException()
        yield Output(3, "two")  # pylint: disable=unreachable

    @solid(output_defs=[DynamicOutputDefinition()])
    def dynamic_solid(_):
        yield DynamicOutput(1, mapping_key="mapping_1")
        yield DynamicOutput(2, mapping_key="mapping_2")

    @solid
    def echo(_, x):
        return x

    @my_success_hook
    @my_failure_hook
    @pipeline
    def a_pipeline():
        a_solid()
        failed_solid()
        dynamic_solid().map(echo)

    result = execute_pipeline(a_pipeline, raise_on_error=False)
    assert not result.success
    assert called.get("a_solid") == {"one": 1, "two": 2, "three": 3}
    assert called.get("failed_solid") == {"one": 1}
    assert called.get("dynamic_solid") == {"result": {"mapping_1": 1, "mapping_2": 2}}
    assert called.get("echo[mapping_1]") == {"result": 1}
    assert called.get("echo[mapping_2]") == {"result": 2}

def test_fails_dupe_keys():
    @solid(output_defs=[DynamicOutputDefinition()])
    def should_fail(_):
        yield DynamicOutput(True, mapping_key="dunk")
        yield DynamicOutput(True, mapping_key="dunk")

    with pytest.raises(DagsterInvariantViolationError, match='mapping_key "dunk" multiple times'):
        execute_solid(should_fail)

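# A hedged sketch (not from the source) of the fix the test above implies:
# every DynamicOutput yielded from one output must carry a unique mapping_key,
# so derive each key from something unique such as an enumeration index.
# The solid name emit_unique and the sample values are hypothetical.
@solid(output_defs=[DynamicOutputDefinition()])
def emit_unique(_):
    for idx, value in enumerate(["dunk", "dunk"]):
        # suffixing the index keeps keys distinct even for repeated values
        yield DynamicOutput(value, mapping_key=f"dunk_{idx}")
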
def test_dynamic_output_solid():
    @solid(output_defs=[DynamicOutputDefinition()])
    def should_work(_):
        yield DynamicOutput(1, mapping_key="1")
        yield DynamicOutput(2, mapping_key="2")

    result = execute_in_process(should_work)
    assert result.success
    assert result.output_values["result"]["1"] == 1
    assert result.output_values["result"]["2"] == 2

def test_dynamic_output_definition_single_partition_materialization():
    entry1 = EventMetadataEntry.int(123, "nrows")
    entry2 = EventMetadataEntry.float(3.21, "some value")

    @solid(output_defs=[OutputDefinition(name="output1", asset_key=AssetKey("table1"))])
    def solid1(_):
        return Output(None, "output1", metadata_entries=[entry1])

    @solid(
        output_defs=[
            DynamicOutputDefinition(
                name="output2",
                asset_key=lambda context: AssetKey(context.mapping_key),
            )
        ]
    )
    def solid2(_, _input1):
        for i in range(4):
            yield DynamicOutput(
                7,
                mapping_key=str(i),
                output_name="output2",
                metadata_entries=[entry2],
            )

    @solid
    def do_nothing(_, _input1):
        pass

    @pipeline
    def my_pipeline():
        solid2(solid1()).map(do_nothing)

    result = execute_pipeline(my_pipeline)
    events = result.step_event_list
    materializations = [
        event for event in events if event.event_type_value == "ASSET_MATERIALIZATION"
    ]
    assert len(materializations) == 5

    check_materialization(materializations[0], AssetKey(["table1"]), metadata_entries=[entry1])

    seen_paths = set()
    for i in range(1, 5):
        path = materializations[i].asset_key.path
        seen_paths.add(tuple(path))
        check_materialization(
            materializations[i],
            AssetKey(path),
            metadata_entries=[entry2],
            parent_assets=[AssetLineageInfo(AssetKey(["table1"]))],
        )
    assert len(seen_paths) == 4

def test_dynamic():
    @solid(output_defs=[DynamicOutputDefinition(dagster_type=int)])
    def dyn_desc(_) -> Iterator[DynamicOutput]:
        """
        Returns:
            numbers
        """
        yield DynamicOutput(4, "4")

    assert dyn_desc.output_defs[0].description == "numbers"
    assert dyn_desc.output_defs[0].is_dynamic

def test_basic():
    @solid(output_defs=[DynamicOutputDefinition()])
    def should_work(_):
        yield DynamicOutput(1, mapping_key="1")
        yield DynamicOutput(2, mapping_key="2")

    result = execute_solid(should_work)
    assert result.success
    assert len(result.get_output_events_for_compute()) == 2
    assert len(result.compute_output_events_dict["result"]) == 2
    assert result.output_values == {"result": {"1": 1, "2": 2}}
    assert result.output_value() == {"1": 1, "2": 2}

def test_must_unpack_composite():
    with pytest.raises(
        DagsterInvalidDefinitionError,
        match="Dynamic output must be unpacked by invoking map",
    ):

        @composite_solid(output_defs=[DynamicOutputDefinition()])
        def composed():
            return dynamic_numbers()

        @pipeline
        def _should_fail():
            echo(composed())

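# For contrast, a hedged sketch of what the error above asks for: the dynamic
# output is unpacked with .map inside the composite before being returned,
# mirroring the composed_echo pattern used later in these tests. composed_ok
# is a hypothetical name; dynamic_numbers and echo are the helpers this file
# already assumes.
@composite_solid(output_defs=[DynamicOutputDefinition()])
def composed_ok():
    return dynamic_numbers().map(echo)
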
def test_multi_output():
    @solid(
        output_defs=[
            DynamicOutputDefinition(int, "numbers"),
            DynamicOutputDefinition(str, "letters"),
            OutputDefinition(str, "wildcard"),
        ]
    )
    def should_work(_):
        yield DynamicOutput(1, output_name="numbers", mapping_key="1")
        yield DynamicOutput(2, output_name="numbers", mapping_key="2")
        yield DynamicOutput("a", output_name="letters", mapping_key="a")
        yield DynamicOutput("b", output_name="letters", mapping_key="b")
        yield DynamicOutput("c", output_name="letters", mapping_key="c")
        yield Output("*", "wildcard")

    result = execute_solid(should_work)
    assert result.success
    assert len(result.get_output_events_for_compute("numbers")) == 2
    assert len(result.get_output_events_for_compute("letters")) == 3
    assert result.get_output_event_for_compute("wildcard")
    assert len(result.compute_output_events_dict["numbers"]) == 2
    assert len(result.compute_output_events_dict["letters"]) == 3
    assert len(result.compute_output_events_dict["wildcard"]) == 1
    assert result.output_values == {
        "numbers": {"1": 1, "2": 2},
        "letters": {"a": "a", "b": "b", "c": "c"},
        "wildcard": "*",
    }
    assert result.output_value("numbers") == {"1": 1, "2": 2}
    assert result.output_value("letters") == {"a": "a", "b": "b", "c": "c"}
    assert result.output_value("wildcard") == "*"

def test_multi_composite_out():
    with pytest.raises(
        DagsterInvalidDefinitionError,
        match="cannot be downstream of more than one dynamic output",
    ):

        @composite_solid(output_defs=[DynamicOutputDefinition()])
        def composed_echo():
            return dynamic_solid().map(echo)

        @pipeline
        def _should_fail():
            def _complex(item):
                composed_echo().map(lambda y: add(y, item))

            dynamic_solid().map(_complex)

def test_composite_multi_out():
    @composite_solid(
        output_defs=[OutputDefinition(Any, "one"), DynamicOutputDefinition(Any, "numbers")]
    )
    def multi_out():
        one = emit_one()
        numbers = dynamic_numbers()
        return {"one": one, "numbers": numbers}

    @pipeline
    def composite_multi():
        one, numbers = multi_out()
        echo(one)
        numbers.map(echo)

    result = execute_pipeline(composite_multi)
    assert result.success

def test_temp_fail_on_dep():
    # to be removed in upcoming diff
    @solid(output_defs=[DynamicOutputDefinition()])
    def should_work(_):
        yield DynamicOutput(1, mapping_key="1")
        yield DynamicOutput(2, mapping_key="2")

    @solid
    def echo(_, x):
        return x

    with pytest.raises(DagsterInvalidDefinitionError, match="not yet supported"):

        @pipeline
        def _uh_oh():
            echo(should_work())

def test_direct_dep():
    @solid(output_defs=[DynamicOutputDefinition()])
    def dynamic_add(_, x):
        yield DynamicOutput(x + 1, mapping_key="1")
        yield DynamicOutput(x + 2, mapping_key="2")

    @pipeline
    def _is_fine():
        def _add(item):
            dynamic_add(item)

        dynamic_solid().map(_add)

    with pytest.raises(
        DagsterInvalidDefinitionError,
        match="cannot be downstream of more than one dynamic output",
    ):

        @pipeline
        def _should_fail():
            def _add_echo(item):
                dynamic_add(item).map(echo)

            dynamic_solid().map(_add_echo)

    @pipeline
    def _is_fine():  # pylint: disable=function-redefined
        dynamic_solid().map(dynamic_add)

    with pytest.raises(
        DagsterInvalidDefinitionError,
        match="cannot be downstream of more than one dynamic output",
    ):

        @pipeline
        def _should_fail():  # pylint: disable=function-redefined
            echo(dynamic_solid().map(dynamic_add).collect())

)
def fn_save_treated_local(context, df, file_path, mode="staging"):
    _file_path = file_path.format(mode=mode, filetype="csv")
    _file_path = Path(_file_path)
    _file_path.parent.mkdir(parents=True, exist_ok=True)
    _file_path = str(_file_path)
    context.log.info(f"Saving df to {_file_path}")
    df.to_csv(_file_path, index=False)
    return _file_path


@solid(
    output_defs=[DynamicOutputDefinition(dict)],
    retry_policy=RetryPolicy(max_retries=3, delay=30),
)
def get_runs(context, execution_date):
    execution_date = datetime.strptime(execution_date, "%Y-%m-%d")
    now = execution_date + timedelta(hours=11, minutes=30)
    this_time_yesterday = now - timedelta(days=1)
    min_timestamp = convert_datetime_to_unix_time(this_time_yesterday)
    max_timestamp = convert_datetime_to_unix_time(now)
    context.log.info(f"{execution_date} of type {type(execution_date)}")
    ftp_client = connect_ftp(
        os.getenv("FTPS_HOST"), os.getenv("FTPS_USERNAME"), os.getenv("FTPS_PWD")
    )
    # Change to working directory
    ftp_client.cwd("/")
    for folder in ftp_client.mlsd():
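        # The excerpt ends mid-loop. A hedged sketch of a body that would
        # satisfy DynamicOutputDefinition(dict): yield one dict per file whose
        # modification time falls inside the window. Every field name and key
        # below is an illustrative assumption, not the original code.
        for filename, facts in ftp_client.mlsd(folder[0]):
            modified = datetime.strptime(facts["modify"], "%Y%m%d%H%M%S")
            if min_timestamp <= convert_datetime_to_unix_time(modified) < max_timestamp:
                yield DynamicOutput(
                    {"folder": folder[0], "filename": filename},
                    mapping_key=filename.replace(".", "_"),
                )
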
context.log.info("Success!") else: context.log.info("View not found, skipping...") rp.set(constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value, materialized_views) except Exception as e: try: materialization_lock.release() except: pass raise e @solid( retry_policy=RetryPolicy(max_retries=3, delay=5), output_defs=[DynamicOutputDefinition(dict)], ) def update_managed_views( context, blob_names, materialization_locked: bool, materialization_lock: Redlock, ): try: # Setup Redis and Redlock r = Redis(constants.REDIS_HOST.value) rp = RedisPal(constants.REDIS_HOST.value) views_lock = Redlock( key=constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS_LOCK.value, masters=[r], auto_release_time=constants.REDIS_LOCK_AUTO_RELEASE_TIME.value,
@solid
def save_blob_to_tempfile(context, blob_path: str, bucket_name: str) -> str:
    tempfile_name: str = f"/tmp/{uuid4()}.zip"
    context.log.debug(f"Saving {blob_path} to temporary file with name {tempfile_name}")
    blob: Blob = get_blob(blob_path, bucket_name, mode="staging")
    # the with-block closes the file on exit, so no explicit close() is needed
    with open(tempfile_name, "wb") as tempfile:
        tempfile.write(blob.download_as_bytes())
    return tempfile_name


@solid(
    output_defs=[
        DynamicOutputDefinition(name="filename"),
    ],
)
def get_gtfs_files(context, original_filepath):
    feed_files = gk.list_feed(original_filepath)["file_name"]
    for item in feed_files:
        filename = Path(item).stem
        yield DynamicOutput(filename, mapping_key=filename, output_name="filename")


@solid
def create_gtfs_version_partition(context, feed, original_filepath, bucket_name):
    # If feed_info.txt is available, use GTFS version as partition
    if feed.feed_info is not None:
from typing import Iterator

from dagster import Any, Field, String, solid
from dagster.core.execution.context.compute import AbstractComputeExecutionContext
from dagster.experimental import DynamicOutput, DynamicOutputDefinition

from hca_manage.common import JobId
from hca_orchestration.support.typing import HcaScratchDatasetName, MetadataTypeFanoutResult


@solid(
    config_schema={
        "metadata_types": Field(Any, is_required=True),
        "prefix": Field(str, is_required=True),
    },
    output_defs=[
        DynamicOutputDefinition(name="table_fanout_result", dagster_type=MetadataTypeFanoutResult)
    ],
)
def ingest_metadata_type(
    context: AbstractComputeExecutionContext,
    result: list[JobId],
    scratch_dataset_name: HcaScratchDatasetName,
) -> Iterator[MetadataTypeFanoutResult]:
    """For each metadata type, yield a dynamic output over which we can later map.

    This saves us from hardcoding solids for each type.
    """
    for metadata_type in context.solid_config["metadata_types"]:
        yield DynamicOutput(
            value=MetadataTypeFanoutResult(
                scratch_dataset_name,
                metadata_type.value,
                context.solid_config["prefix"],
            ),
            mapping_key=metadata_type.value,
            output_name="table_fanout_result",
        )

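# A hedged sketch of how this fanout is typically consumed; the downstream
# solid and the wiring comment below are illustrative assumptions, not part of
# the source. The field name fanout.metadata_type is also assumed from the
# positional MetadataTypeFanoutResult(...) constructor call above.
@solid
def process_metadata_type(
    context: AbstractComputeExecutionContext, fanout: MetadataTypeFanoutResult
):
    context.log.info(f"Processing metadata type {fanout.metadata_type}")


# Inside a @pipeline body, each fanned-out metadata type would then be mapped:
#     ingest_metadata_type(result, scratch_dataset_name).map(process_metadata_type)
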
def test_tags_to_dynamic_plan():
    @solid(
        tags={
            USER_DEFINED_K8S_CONFIG_KEY: {
                "container_config": {
                    "resources": {
                        "requests": {"cpu": "500m", "memory": "128Mi"},
                        "limits": {"cpu": "1000m", "memory": "1Gi"},
                    }
                }
            }
        }
    )
    def multiply_inputs(_, x):
        return 2 * x

    @solid(
        tags={
            USER_DEFINED_K8S_CONFIG_KEY: {
                "container_config": {
                    "resources": {
                        "requests": {"cpu": "250m", "memory": "64Mi"},
                        "limits": {"cpu": "500m", "memory": "2560Mi"},
                    }
                }
            }
        },
        output_defs=[DynamicOutputDefinition()],
    )
    def emit(_):
        for i in range(3):
            yield DynamicOutput(value=i, mapping_key=str(i))

    @pipeline
    def k8s_ready():
        return emit().map(multiply_inputs)

    known_state = KnownExecutionState(
        {},
        {
            emit.name: {"result": ["0", "1", "2"]},
        },
    )
    plan = create_execution_plan(k8s_ready, known_state=known_state)

    emit_step = plan.get_step_by_key(emit.name)
    user_defined_k8s_config = get_user_defined_k8s_config(emit_step.tags)
    assert user_defined_k8s_config.container_config
    assert user_defined_k8s_config.container_config["resources"]
    resources = user_defined_k8s_config.container_config["resources"]
    assert resources["requests"]["cpu"] == "250m"
    assert resources["requests"]["memory"] == "64Mi"
    assert resources["limits"]["cpu"] == "500m"
    assert resources["limits"]["memory"] == "2560Mi"

    for mapping_key in range(3):
        multiply_inputs_step = plan.get_step_by_key(f"{multiply_inputs.name}[{mapping_key}]")
        dynamic_step_user_defined_k8s_config = get_user_defined_k8s_config(
            multiply_inputs_step.tags
        )
        assert dynamic_step_user_defined_k8s_config.container_config
        assert dynamic_step_user_defined_k8s_config.container_config["resources"]
        resources = dynamic_step_user_defined_k8s_config.container_config["resources"]
        assert resources["requests"]["cpu"] == "500m"
        assert resources["requests"]["memory"] == "128Mi"
        assert resources["limits"]["cpu"] == "1000m"
        assert resources["limits"]["memory"] == "1Gi"

@solid
def multiply_inputs(context, y, ten):
    # current_run = context.instance.get_run_by_id(context.run_id)
    # if y == 2 and current_run.parent_run_id is None:
    #     raise Exception()
    context.log.info("multiply_inputs is returning " + str(y * ten))
    return y * ten


@solid
def emit_ten(_):
    return 10


@solid(output_defs=[DynamicOutputDefinition()])
def emit(_):
    for i in range(3):
        yield DynamicOutput(value=i, mapping_key=str(i))


@pipeline
def dynamic_pipeline():
    # pylint: disable=no-member
    emit().map(lambda n: multiply_by_two(multiply_inputs(n, emit_ten())))


def test_map():
    result = execute_pipeline(
        dynamic_pipeline,
    )

# start_marker
import os
from typing import List

from dagster import Field, pipeline, solid
from dagster.experimental import DynamicOutput, DynamicOutputDefinition
from dagster.utils import file_relative_path


@solid(
    config_schema={"path": Field(str, default_value=file_relative_path(__file__, "sample"))},
    output_defs=[DynamicOutputDefinition(str)],
)
def files_in_directory(context):
    path = context.solid_config["path"]
    dirname, _, filenames = next(os.walk(path))
    for file in filenames:
        yield DynamicOutput(
            value=os.path.join(dirname, file),
            # create a mapping key from the file name
            mapping_key=file.replace(".", "_").replace("-", "_"),
        )


@solid
def process_file(path: str) -> int:
    # simple example of calculating size
    return os.path.getsize(path)  # assumed completion, consistent with the comment and -> int
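
# A hedged sketch of how files_in_directory and process_file would typically
# be wired together with map and collect; the solid and pipeline names below
# are illustrative, not necessarily those of the original example.
@solid
def summarize_directory(sizes: List[int]) -> int:
    # fan-in: collect() gathers every mapped process_file result into one list
    return sum(sizes)


@pipeline
def process_directory():
    file_results = files_in_directory().map(process_file)
    summarize_directory(file_results.collect())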