@solid
def nonce_solid(_):
    return


@pipeline
def nonce_pipeline():
    return nonce_solid()


nonce_pipeline_snapshot = nonce_pipeline.get_pipeline_snapshot()

nonce_execution_plan_snapshot = snapshot_from_execution_plan(
    create_execution_plan(nonce_pipeline), nonce_pipeline.get_pipeline_snapshot_id()
)


def test_init_modified_docker_operator(dagster_docker_image):
    dagster_operator_parameters = DagsterOperatorParameters(
        task_id='nonce',
        run_config={'storage': {'filesystem': {}}},
        pipeline_name='',
        mode='default',
        op_kwargs={'image': dagster_docker_image, 'api_version': 'auto'},
        pipeline_snapshot=nonce_pipeline_snapshot,
        execution_plan_snapshot=nonce_execution_plan_snapshot,
    )
    DagsterDockerOperator(dagster_operator_parameters)
def test_gcs_pickle_io_manager_execution(gcs_bucket):
    pipeline_def = define_inty_pipeline()

    run_config = {"resources": {"io_manager": {"config": {"gcs_bucket": gcs_bucket}}}}

    run_id = make_new_run_id()

    execution_plan = create_execution_plan(pipeline_def, run_config=run_config)

    assert execution_plan.get_step_by_key("return_one")

    step_keys = ["return_one"]
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(
        pipeline_name=pipeline_def.name, run_id=run_id, run_config=run_config
    )

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    assert get_step_output(return_one_step_events, "return_one")

    io_manager = PickledObjectGCSIOManager(gcs_bucket, storage.Client())
    step_output_handle = StepOutputHandle("return_one")
    context = InputContext(
        pipeline_name=pipeline_def.name,
        solid_def=pipeline_def.solid_def_named("return_one"),
        upstream_output=OutputContext(
            step_key=step_output_handle.step_key,
            name=step_output_handle.output_name,
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            solid_def=pipeline_def.solid_def_named("return_one"),
        ),
        log_manager=DagsterLogManager(run_id=pipeline_run.run_id, logging_tags={}, loggers=[]),
    )
    assert io_manager.load_input(context) == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one"]),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    step_output_handle = StepOutputHandle("add_one")
    context = InputContext(
        pipeline_name=pipeline_def.name,
        solid_def=pipeline_def.solid_def_named("add_one"),
        upstream_output=OutputContext(
            step_key=step_output_handle.step_key,
            name=step_output_handle.output_name,
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            solid_def=pipeline_def.solid_def_named("add_one"),
        ),
        log_manager=DagsterLogManager(run_id=pipeline_run.run_id, logging_tags={}, loggers=[]),
    )

    assert get_step_output(add_one_step_events, "add_one")
    assert io_manager.load_input(context) == 2
def _make_airflow_dag( recon_repo, pipeline_name, run_config=None, mode=None, instance=None, dag_id=None, dag_description=None, dag_kwargs=None, op_kwargs=None, operator=DagsterPythonOperator, ): check.inst_param(recon_repo, "recon_repo", ReconstructableRepository) check.str_param(pipeline_name, "pipeline_name") run_config = check.opt_dict_param(run_config, "run_config", key_type=str) mode = check.opt_str_param(mode, "mode") # Default to use the (persistent) system temp directory rather than a TemporaryDirectory, # which would not be consistent between Airflow task invocations. if instance is None: if is_dagster_home_set(): instance = DagsterInstance.get() else: instance = DagsterInstance.local_temp( tempdir=seven.get_system_temp_directory()) check.inst_param(instance, "instance", DagsterInstance) # Only used for Airflow; internally we continue to use pipeline.name dag_id = check.opt_str_param(dag_id, "dag_id", _rename_for_airflow(pipeline_name)) dag_description = check.opt_str_param(dag_description, "dag_description", _make_dag_description(pipeline_name)) check.subclass_param(operator, "operator", BaseOperator) dag_kwargs = dict( {"default_args": DEFAULT_ARGS}, **check.opt_dict_param(dag_kwargs, "dag_kwargs", key_type=str), ) op_kwargs = check.opt_dict_param(op_kwargs, "op_kwargs", key_type=str) dag = DAG(dag_id=dag_id, description=dag_description, **dag_kwargs) pipeline = recon_repo.get_definition().get_pipeline(pipeline_name) if mode is None: mode = pipeline.get_default_mode_name() execution_plan = create_execution_plan(pipeline, run_config, mode=mode) tasks = {} coalesced_plan = coalesce_execution_steps(execution_plan) for solid_handle, solid_steps in coalesced_plan.items(): step_keys = [step.key for step in solid_steps] operator_parameters = DagsterOperatorParameters( recon_repo=recon_repo, pipeline_name=pipeline_name, run_config=run_config, mode=mode, task_id=solid_handle, step_keys=step_keys, dag=dag, instance_ref=instance.get_ref(), op_kwargs=op_kwargs, pipeline_snapshot=pipeline.get_pipeline_snapshot(), execution_plan_snapshot=snapshot_from_execution_plan( execution_plan, pipeline_snapshot_id=pipeline.get_pipeline_snapshot_id()), ) task = operator(operator_parameters) tasks[solid_handle] = task for solid_step in solid_steps: for step_input in solid_step.step_inputs: for key in step_input.dependency_keys: prev_solid_handle = execution_plan.get_step_by_key( key).solid_handle.to_string() if solid_handle != prev_solid_handle: tasks[prev_solid_handle].set_downstream(task) return (dag, [tasks[solid_handle] for solid_handle in coalesced_plan.keys()])
def test_all_step_events(): # pylint: disable=too-many-locals handle = ExecutionTargetHandle.for_pipeline_fn(define_test_events_pipeline) pipeline = handle.build_pipeline_definition() mode = pipeline.get_default_mode_name() execution_plan = create_execution_plan(pipeline, {}, mode=mode) step_levels = execution_plan.topological_step_levels() run_config = RunConfig( executor_config=InProcessExecutorConfig(raise_on_error=False), storage_mode=RunStorageMode.FILESYSTEM, ) unhandled_events = STEP_EVENTS.copy() # Exclude types that are not step events ignored_events = { 'LogMessageEvent', 'PipelineStartEvent', 'PipelineSuccessEvent', 'PipelineInitFailureEvent', 'PipelineFailureEvent', } step_event_fragment = get_step_event_fragment() log_message_event_fragment = get_log_message_event_fragment() query = '\n'.join( ( PIPELINE_EXECUTION_QUERY_TEMPLATE.format( step_event_fragment=step_event_fragment.include_key, log_message_event_fragment=log_message_event_fragment.include_key, ), step_event_fragment.fragment, log_message_event_fragment.fragment, ) ) event_counts = defaultdict(int) for step_level in step_levels: for step in step_level: variables = { 'executionParams': { 'selector': {'name': pipeline.name}, 'environmentConfigData': {'storage': {'filesystem': {}}}, 'mode': mode, 'executionMetadata': {'runId': run_config.run_id}, 'stepKeys': [step.key], } } pipeline_run_storage = PipelineRunStorage() res = execute_query(handle, query, variables, pipeline_run_storage=pipeline_run_storage) # go through the same dict, decrement all the event records we've seen from the GraphQL # response if not res.get('errors'): run_logs = res['data']['startPipelineExecution']['run']['logs']['nodes'] events = [ dagster_event_from_dict(e, pipeline.name) for e in run_logs if e['__typename'] not in ignored_events ] for event in events: key = event.step_key + '.' + event.event_type_value event_counts[key] -= 1 unhandled_events -= {DagsterEventType(e.event_type_value) for e in events} # build up a dict, incrementing all the event records we've produced in the run storage logs = pipeline_run_storage.get_run_by_id(run_config.run_id).all_logs() for log in logs: if not log.dagster_event or ( DagsterEventType(log.dagster_event.event_type_value) not in STEP_EVENTS ): continue key = log.dagster_event.step_key + '.' + log.dagster_event.event_type_value event_counts[key] += 1 # Ensure we've processed all the events that were generated in the run storage assert sum(event_counts.values()) == 0 # Ensure we've handled the universe of event types assert not unhandled_events
def default_mode_output_versions(pipeline_def):
    return resolve_step_output_versions(
        create_execution_plan(pipeline_def),
        EnvironmentConfig.build(pipeline_def, {}, "default"),
        pipeline_def.get_mode_definition("default"),
    )
def reconstitute_pipeline_context( self, output_log_path=None, marshal_dir=None, environment_dict=None, handle_kwargs=None, pipeline_run_dict=None, solid_subset=None, solid_handle_kwargs=None, instance_ref_dict=None, ): '''Reconstitutes a context for dagstermill-managed execution. You'll see this function called to reconstruct a pipeline context within the ``injected parameters`` cell of a dagstermill output notebook. Users should not call this function interactively except when debugging output notebooks. Use :func:`dagstermill.get_context` in the ``parameters`` cell of your notebook to define a context for interactive exploration and development. This call will be replaced by one to :func:`dagstermill.reconstitute_pipeline_context` when the notebook is executed by dagstermill. ''' check.opt_str_param(output_log_path, 'output_log_path') check.opt_str_param(marshal_dir, 'marshal_dir') environment_dict = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str) check.dict_param(pipeline_run_dict, 'pipeline_run_dict') check.dict_param(handle_kwargs, 'handle_kwargs') check.opt_list_param(solid_subset, 'solid_subset', of_type=str) check.dict_param(solid_handle_kwargs, 'solid_handle_kwargs') check.dict_param(instance_ref_dict, 'instance_ref_dict') try: handle = load_handle.handle_for_pipeline_cli_args( handle_kwargs, use_default_repository_yaml=False ) except (check.CheckError, load_handle.UsageError) as err: six.raise_from( DagstermillError( 'Cannot invoke a dagstermill solid from an in-memory pipeline that was not loaded ' 'from an ExecutionTargetHandle. Run this pipeline using dagit, the dagster CLI, ' 'through dagster-graphql, or in-memory after loading it through an ' 'ExecutionTargetHandle.' ), err, ) try: instance_ref = unpack_value(instance_ref_dict) instance = DagsterInstance.from_ref(instance_ref) except Exception as err: # pylint: disable=broad-except six.raise_from( DagstermillError( 'Error when attempting to resolve DagsterInstance from serialized InstanceRef' ), err, ) pipeline_def = check.inst_param( handle.build_pipeline_definition(), 'pipeline_def (from handle {handle_dict})'.format(handle_dict=handle.data._asdict()), PipelineDefinition, ).build_sub_pipeline(solid_subset) solid_handle = SolidHandle.from_dict(solid_handle_kwargs) solid_def = pipeline_def.get_solid(solid_handle).definition pipeline_run = unpack_value(pipeline_run_dict) self.marshal_dir = marshal_dir self.in_pipeline = True self.solid_def = solid_def self.pipeline_def = pipeline_def execution_plan = create_execution_plan(self.pipeline_def, environment_dict, pipeline_run) with scoped_pipeline_context( self.pipeline_def, environment_dict, pipeline_run, instance, execution_plan, scoped_resources_builder_cm=self._setup_resources, ) as pipeline_context: self.context = DagstermillRuntimeExecutionContext( pipeline_context=pipeline_context, solid_config=None, resource_keys_to_init=get_required_resource_keys_to_init( execution_plan, pipeline_context.system_storage_def ), ) return self.context
@solid
def nonce_solid(_):
    return


@pipeline
def nonce_pipeline():
    return nonce_solid()


nonce_pipeline_snapshot = nonce_pipeline.get_pipeline_snapshot()

nonce_execution_plan_snapshot = snapshot_from_execution_plan(
    create_execution_plan(nonce_pipeline), nonce_pipeline.get_pipeline_snapshot_id()
)


def test_init_modified_docker_operator(dagster_docker_image):
    with instance_for_test() as instance:
        dagster_operator_parameters = DagsterOperatorParameters(
            task_id="nonce",
            run_config={"storage": {"filesystem": {}}},
            pipeline_name="",
            mode="default",
            op_kwargs={
                "image": dagster_docker_image,
                "api_version": "auto",
def test_retries_active_execution():
    pipeline_def = define_diamond_pipeline()
    plan = create_execution_plan(pipeline_def)

    with plan.start(retry_mode=RetryMode.ENABLED) as active_execution:

        steps = active_execution.get_steps_to_execute()
        assert len(steps) == 1
        step_1 = steps[0]
        assert step_1.key == "return_two"

        steps = active_execution.get_steps_to_execute()
        assert len(steps) == 0  # can't progress

        active_execution.mark_up_for_retry(step_1.key)

        steps = active_execution.get_steps_to_execute()
        assert len(steps) == 1
        assert steps[0].key == "return_two"

        active_execution.mark_up_for_retry(step_1.key)

        steps = active_execution.get_steps_to_execute()
        assert len(steps) == 1
        assert steps[0].key == "return_two"

        active_execution.mark_success(step_1.key)
        active_execution.mark_step_produced_output(StepOutputHandle(step_1.key, "result"))

        steps = active_execution.get_steps_to_execute()
        assert len(steps) == 2
        step_2 = steps[0]
        step_3 = steps[1]
        assert step_2.key == "add_three"
        assert step_3.key == "mult_three"

        steps = active_execution.get_steps_to_execute()
        assert len(steps) == 0  # can't progress

        active_execution.mark_success(step_2.key)
        active_execution.mark_step_produced_output(StepOutputHandle(step_2.key, "result"))

        steps = active_execution.get_steps_to_execute()
        assert len(steps) == 0  # can't progress

        # uh oh, failure
        active_execution.mark_failed(step_3.key)

        # can't progress to the 4th step
        steps = active_execution.get_steps_to_execute()
        assert len(steps) == 0
        assert not active_execution.is_complete

        steps = active_execution.get_steps_to_abandon()
        assert len(steps) == 1
        step_4 = steps[0]

        assert step_4.key == "adder"
        active_execution.mark_abandoned(step_4.key)

        assert active_execution.is_complete
def test_fan_in_should_skip_step():
    @lambda_solid
    def one():
        return 1

    @solid(output_defs=[OutputDefinition(is_required=False)])
    def skip(_):
        return
        yield  # pylint: disable=unreachable

    @solid
    def fan_in(_context, items):
        return items

    @composite_solid(output_defs=[OutputDefinition(is_required=False)])
    def composite_all_upstream_skip():
        return fan_in([skip(), skip()])

    @composite_solid(output_defs=[OutputDefinition(is_required=False)])
    def composite_one_upstream_skip():
        return fan_in([one(), skip()])

    @pipeline
    def optional_outputs_composite():
        composite_all_upstream_skip()
        composite_one_upstream_skip()

    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(
        pipeline_name="optional_outputs_composite", run_id=make_new_run_id()
    )
    execute_plan(
        create_execution_plan(
            optional_outputs_composite,
            step_keys_to_execute=[
                "composite_all_upstream_skip.skip",
                "composite_all_upstream_skip.skip_2",
            ],
        ),
        InMemoryPipeline(optional_outputs_composite),
        instance,
        pipeline_run,
    )
    # skip when none of the step's sources yielded an output
    assert should_skip_step(
        create_execution_plan(
            optional_outputs_composite,
            step_keys_to_execute=["composite_all_upstream_skip.fan_in"],
        ),
        instance,
        pipeline_run.run_id,
    )

    execute_plan(
        create_execution_plan(
            optional_outputs_composite,
            step_keys_to_execute=[
                "composite_one_upstream_skip.one",
                "composite_one_upstream_skip.skip",
            ],
        ),
        InMemoryPipeline(optional_outputs_composite),
        instance,
        pipeline_run,
    )
    # do not skip when some of the sources exist
    assert not should_skip_step(
        create_execution_plan(
            optional_outputs_composite,
            step_keys_to_execute=["composite_one_upstream_skip.fan_in"],
        ),
        instance,
        pipeline_run.run_id,
    )
def test_s3_asset_store_execution(mock_s3_bucket):
    pipeline_def = define_inty_pipeline()

    run_config = {"resources": {"asset_store": {"config": {"s3_bucket": mock_s3_bucket.name}}}}

    run_id = make_new_run_id()

    execution_plan = create_execution_plan(pipeline_def, run_config=run_config)

    assert execution_plan.get_step_by_key("return_one.compute")

    step_keys = ["return_one.compute"]
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(
        pipeline_name=pipeline_def.name, run_id=run_id, run_config=run_config
    )

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    assert get_step_output(return_one_step_events, "return_one.compute")

    asset_store = PickledObjectS3AssetStore(mock_s3_bucket.name, s3_prefix="dagster")
    step_output_handle = StepOutputHandle("return_one.compute")
    context = AssetStoreContext(
        step_output_handle.step_key,
        step_output_handle.output_name,
        {},
        pipeline_def.name,
        pipeline_def.solid_def_named("return_one"),
        run_id,
    )
    assert asset_store.get_asset(context) == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one.compute"]),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    step_output_handle = StepOutputHandle("add_one.compute")
    context = AssetStoreContext(
        step_output_handle.step_key,
        step_output_handle.output_name,
        {},
        pipeline_def.name,
        pipeline_def.solid_def_named("add_one"),
        run_id,
    )

    assert get_step_output(add_one_step_events, "add_one.compute")
    assert asset_store.get_asset(context) == 2
def test_adls2_object_manager_execution(storage_account, file_system, credential): pipeline_def = define_inty_pipeline() run_config = { "resources": { "object_manager": { "config": { "adls2_file_system": file_system } }, "adls2": { "config": { "storage_account": storage_account, "credential": { "key": credential } } }, } } run_id = make_new_run_id() execution_plan = create_execution_plan(pipeline_def, run_config=run_config) assert execution_plan.get_step_by_key("return_one") step_keys = ["return_one"] instance = DagsterInstance.ephemeral() pipeline_run = PipelineRun(pipeline_name=pipeline_def.name, run_id=run_id, run_config=run_config) return_one_step_events = list( execute_plan( execution_plan.build_subset_plan(step_keys), run_config=run_config, pipeline_run=pipeline_run, instance=instance, )) assert get_step_output(return_one_step_events, "return_one") step_output_handle = StepOutputHandle("return_one") context = InputContext( pipeline_name=pipeline_def.name, solid_def=pipeline_def.solid_def_named("return_one"), upstream_output=OutputContext( step_key=step_output_handle.step_key, name=step_output_handle.output_name, pipeline_name=pipeline_def.name, run_id=run_id, solid_def=pipeline_def.solid_def_named("return_one"), ), ) object_manager = PickledObjectADLS2ObjectManager( file_system=file_system, adls2_client=create_adls2_client(storage_account, credential), blob_client=create_blob_client(storage_account, credential), ) assert object_manager.load_input(context) == 1 add_one_step_events = list( execute_plan( execution_plan.build_subset_plan(["add_one"]), pipeline_run=pipeline_run, run_config=run_config, instance=instance, )) step_output_handle = StepOutputHandle("add_one") context = InputContext( pipeline_name=pipeline_def.name, solid_def=pipeline_def.solid_def_named("add_one"), upstream_output=OutputContext( step_key=step_output_handle.step_key, name=step_output_handle.output_name, pipeline_name=pipeline_def.name, run_id=run_id, solid_def=pipeline_def.solid_def_named("add_one"), ), ) assert get_step_output(add_one_step_events, "add_one") assert object_manager.load_input(context) == 2
def _launch_scheduled_execution(instance, schedule_def, pipeline, tick, stream):
    pipeline_def = pipeline.get_definition()

    # Run should_execute and halt if it returns False
    schedule_context = ScheduleExecutionContext(instance)
    with user_code_error_boundary(
        ScheduleExecutionError,
        lambda: 'Error occurred during the execution of should_execute for schedule '
        '{schedule_name}'.format(schedule_name=schedule_def.name),
    ):
        should_execute = schedule_def.should_execute(schedule_context)

    if not should_execute:
        # Update tick to skipped state and return
        tick.update_with_status(ScheduleTickStatus.SKIPPED)
        stream.send(ScheduledExecutionSkipped())
        return

    errors = []

    run_config = {}
    schedule_tags = {}
    try:
        with user_code_error_boundary(
            ScheduleExecutionError,
            lambda: 'Error occurred during the execution of run_config_fn for schedule '
            '{schedule_name}'.format(schedule_name=schedule_def.name),
        ):
            run_config = schedule_def.get_run_config(schedule_context)
    except DagsterUserCodeExecutionError:
        error_data = serializable_error_info_from_exc_info(sys.exc_info())
        errors.append(error_data)

    try:
        with user_code_error_boundary(
            ScheduleExecutionError,
            lambda: 'Error occurred during the execution of tags_fn for schedule '
            '{schedule_name}'.format(schedule_name=schedule_def.name),
        ):
            schedule_tags = schedule_def.get_tags(schedule_context)
    except DagsterUserCodeExecutionError:
        error_data = serializable_error_info_from_exc_info(sys.exc_info())
        errors.append(error_data)

    pipeline_tags = pipeline_def.tags or {}
    check_tags(pipeline_tags, 'pipeline_tags')
    tags = merge_dicts(pipeline_tags, schedule_tags)

    mode = schedule_def.mode

    execution_plan_snapshot = None
    try:
        execution_plan = create_execution_plan(
            pipeline_def,
            run_config=run_config,
            mode=mode,
        )
        execution_plan_snapshot = snapshot_from_execution_plan(
            execution_plan, pipeline_def.get_pipeline_snapshot_id()
        )
    except DagsterInvalidConfigError:
        error_data = serializable_error_info_from_exc_info(sys.exc_info())
        errors.append(error_data)

    # Enter the run in the DB with the information we have
    possibly_invalid_pipeline_run = instance.create_run(
        pipeline_name=schedule_def.pipeline_name,
        run_id=None,
        run_config=run_config,
        mode=mode,
        solids_to_execute=pipeline.solids_to_execute,
        step_keys_to_execute=None,
        solid_selection=pipeline.solid_selection,
        status=None,
        root_run_id=None,
        parent_run_id=None,
        tags=tags,
        pipeline_snapshot=pipeline_def.get_pipeline_snapshot(),
        execution_plan_snapshot=execution_plan_snapshot,
        parent_pipeline_snapshot=pipeline_def.get_parent_pipeline_snapshot(),
    )

    tick.update_with_status(
        ScheduleTickStatus.SUCCESS, run_id=possibly_invalid_pipeline_run.run_id
    )

    # If there were errors, inject them into the event log and fail the run
    if len(errors) > 0:
        for error in errors:
            instance.report_engine_event(
                error.message,
                possibly_invalid_pipeline_run,
                EngineEventData.engine_error(error),
            )
        instance.report_run_failed(possibly_invalid_pipeline_run)
        stream.send(
            ScheduledExecutionFailed(run_id=possibly_invalid_pipeline_run.run_id, errors=errors)
        )
        return

    # Otherwise the run should be valid, so launch it

    # Need an ExternalPipeline to launch, so make one here
    recon_repo = pipeline.get_reconstructable_repository()
    repo_location = InProcessRepositoryLocation(recon_repo)
    external_pipeline = repo_location.get_repository(
        recon_repo.get_definition().name
    ).get_full_external_pipeline(pipeline_def.name)

    try:
        launched_run = instance.launch_run(
            possibly_invalid_pipeline_run.run_id, external_pipeline
        )
    except DagsterLaunchFailedError:
        error = serializable_error_info_from_exc_info(sys.exc_info())
        instance.report_engine_event(
            error.message,
            possibly_invalid_pipeline_run,
            EngineEventData.engine_error(error),
        )
        instance.report_run_failed(possibly_invalid_pipeline_run)
        stream.send(
            ScheduledExecutionFailed(run_id=possibly_invalid_pipeline_run.run_id, errors=[error])
        )
        return

    stream.send(ScheduledExecutionSuccess(run_id=launched_run.run_id))
    return
def _execute_plan(self, execute_step_args_packed, executable_dict):
    execute_step_args = unpack_value(
        check.dict_param(execute_step_args_packed, "execute_step_args_packed")
    )
    check.inst_param(execute_step_args, "execute_step_args", ExecuteStepArgs)

    check.dict_param(executable_dict, "executable_dict")

    instance = DagsterInstance.from_ref(execute_step_args.instance_ref)
    pipeline = ReconstructablePipeline.from_dict(executable_dict)
    retry_mode = execute_step_args.retry_mode

    pipeline_run = instance.get_run_by_id(execute_step_args.pipeline_run_id)
    check.invariant(
        pipeline_run, "Could not load run {}".format(execute_step_args.pipeline_run_id)
    )

    step_keys_str = ", ".join(execute_step_args.step_keys_to_execute)

    execution_plan = create_execution_plan(
        pipeline,
        pipeline_run.run_config,
        mode=pipeline_run.mode,
        step_keys_to_execute=execute_step_args.step_keys_to_execute,
        known_state=execute_step_args.known_state,
    )

    engine_event = instance.report_engine_event(
        "Executing steps {} in celery worker".format(step_keys_str),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_keys_str, "step_keys"),
                EventMetadataEntry.text(self.request.hostname, "Celery worker"),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryExecutor,
        step_key=execution_plan.step_handle_for_single_step_plans().to_key(),
    )

    events = [engine_event]
    for step_event in execute_plan_iterator(
        execution_plan=execution_plan,
        pipeline=pipeline,
        pipeline_run=pipeline_run,
        instance=instance,
        retry_mode=retry_mode,
        run_config=pipeline_run.run_config,
    ):
        events.append(step_event)

    serialized_events = [serialize_dagster_namedtuple(event) for event in events]
    return serialized_events
def test_using_adls2_for_subplan(storage_account, file_system): pipeline_def = define_inty_pipeline() run_config = { "resources": { "adls2": { "config": {"storage_account": storage_account, "credential": get_azure_credential()} } }, "storage": {"adls2": {"config": {"adls2_file_system": file_system}}}, } run_id = make_new_run_id() execution_plan = create_execution_plan(pipeline_def, run_config=run_config) assert execution_plan.get_step_by_key("return_one.compute") step_keys = ["return_one.compute"] instance = DagsterInstance.ephemeral() pipeline_run = PipelineRun( pipeline_name=pipeline_def.name, run_id=run_id, run_config=run_config ) return_one_step_events = list( execute_plan( execution_plan.build_subset_plan(step_keys), run_config=run_config, pipeline_run=pipeline_run, instance=instance, ) ) assert get_step_output(return_one_step_events, "return_one.compute") with scoped_pipeline_context( execution_plan.build_subset_plan(["return_one.compute"]), run_config, pipeline_run, instance, ) as context: resource = context.scoped_resources_builder.build(required_resource_keys={"adls2"}).adls2 intermediate_storage = ADLS2IntermediateStorage( file_system=file_system, run_id=run_id, adls2_client=resource.adls2_client, blob_client=resource.blob_client, ) step_output_handle = StepOutputHandle("return_one.compute") assert intermediate_storage.has_intermediate(context, step_output_handle) assert intermediate_storage.get_intermediate(context, Int, step_output_handle).obj == 1 add_one_step_events = list( execute_plan( execution_plan.build_subset_plan(["add_one.compute"]), run_config=run_config, pipeline_run=pipeline_run, instance=instance, ) ) assert get_step_output(add_one_step_events, "add_one.compute") with scoped_pipeline_context( execution_plan.build_subset_plan(["add_one.compute"]), run_config, pipeline_run, instance, ) as context: step_output_handle = StepOutputHandle("add_one.compute") assert intermediate_storage.has_intermediate(context, step_output_handle) assert intermediate_storage.get_intermediate(context, Int, step_output_handle).obj == 2
def test_using_s3_for_subplan(s3_bucket): pipeline_def = define_inty_pipeline() environment_dict = { 'storage': { 's3': { 'config': { 's3_bucket': s3_bucket } } } } run_id = make_new_run_id() execution_plan = create_execution_plan(pipeline_def, environment_dict=environment_dict, run_config=RunConfig(run_id=run_id)) assert execution_plan.get_step_by_key('return_one.compute') step_keys = ['return_one.compute'] instance = DagsterInstance.ephemeral() pipeline_run = PipelineRun.create_empty_run( pipeline_def.name, run_id=run_id, environment_dict=environment_dict) return_one_step_events = list( execute_plan( execution_plan.build_subset_plan(step_keys), environment_dict=environment_dict, pipeline_run=pipeline_run, instance=instance, )) assert get_step_output(return_one_step_events, 'return_one.compute') with scoped_pipeline_context( pipeline_def, environment_dict, pipeline_run, instance, execution_plan.build_subset_plan(['return_one.compute']), ) as context: store = S3IntermediateStore( s3_bucket, run_id, s3_session=context.scoped_resources_builder.build( required_resource_keys={'s3'}, ).s3.session, ) assert store.has_intermediate(context, 'return_one.compute') assert store.get_intermediate(context, 'return_one.compute', Int).obj == 1 add_one_step_events = list( execute_plan( execution_plan.build_subset_plan(['add_one.compute']), environment_dict=environment_dict, pipeline_run=pipeline_run, instance=instance, )) assert get_step_output(add_one_step_events, 'add_one.compute') with scoped_pipeline_context( pipeline_def, environment_dict, pipeline_run, instance, execution_plan.build_subset_plan(['add_one.compute']), ) as context: assert store.has_intermediate(context, 'add_one.compute') assert store.get_intermediate(context, 'add_one.compute', Int).obj == 2
def test_create_execution_plan_with_bad_inputs():
    with pytest.raises(DagsterInvalidConfigError):
        create_execution_plan(
            define_diamond_pipeline(),
            run_config={"solids": {"add_three": {"inputs": {"num": 3}}}},
        )
def get_context(self, solid_config=None, mode_def=None, environment_dict=None): '''Get a dagstermill execution context for interactive exploration and development. Args: solid_config (Optional[Any]): If specified, this value will be made available on the context as its ``solid_config`` property. mode_def (Optional[:class:`dagster.ModeDefinition`]): If specified, defines the mode to use to construct the context. Specify this if you would like a context constructed with specific ``resource_defs`` or ``logger_defs``. By default, an ephemeral mode with a console logger will be constructed. environment_dict(Optional[dict]): The environment config dict with which to construct the context. Returns: :py:class:`~dagstermill.DagstermillExecutionContext` ''' check.opt_inst_param(mode_def, 'mode_def', ModeDefinition) environment_dict = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str) # If we are running non-interactively, and there is already a context reconstituted, return # that context rather than overwriting it. if self.context is not None and isinstance( self.context, DagstermillRuntimeExecutionContext ): return self.context if not mode_def: mode_def = ModeDefinition(logger_defs={'dagstermill': colored_console_logger}) environment_dict['loggers'] = {'dagstermill': {}} solid_def = SolidDefinition( name='this_solid', input_defs=[], compute_fn=lambda *args, **kwargs: None, output_defs=[], description='Ephemeral solid constructed by dagstermill.get_context()', required_resource_keys=mode_def.resource_key_set, ) pipeline_def = PipelineDefinition( [solid_def], mode_defs=[mode_def], name='ephemeral_dagstermill_pipeline' ) run_id = make_new_run_id() # construct stubbed PipelineRun for notebook exploration... # The actual pipeline run during pipeline execution will be serialized and reconstituted # in the `reconstitute_pipeline_context` call pipeline_run = PipelineRun( pipeline_name=pipeline_def.name, run_id=run_id, environment_dict=environment_dict, mode=mode_def.name, selector=None, step_keys_to_execute=None, status=PipelineRunStatus.NOT_STARTED, tags=None, ) self.in_pipeline = False self.solid_def = solid_def self.pipeline_def = pipeline_def execution_plan = create_execution_plan(self.pipeline_def, environment_dict, pipeline_run) with scoped_pipeline_context( self.pipeline_def, environment_dict, pipeline_run, DagsterInstance.ephemeral(), execution_plan, scoped_resources_builder_cm=self._setup_resources, ) as pipeline_context: self.context = DagstermillExecutionContext( pipeline_context=pipeline_context, solid_config=solid_config, resource_keys_to_init=get_required_resource_keys_to_init( execution_plan, pipeline_context.system_storage_def ), ) return self.context
def test_emr_pyspark_execution_plan():
    os.environ['EMR_CLUSTER_ID'] = 'some_cluster_id'
    create_execution_plan(my_pipeline, mode='emr', run_config=emr_preset.run_config)
def test_can_reexecute():
    pipeline_def = define_pipeline(fs_io_manager, {})
    plan = create_execution_plan(pipeline_def)
    assert plan.artifacts_persisted
def _start_pipeline_execution(graphene_info, execution_params, is_reexecuted=False): check.inst_param(graphene_info, 'graphene_info', ResolveInfo) check.inst_param(execution_params, 'execution_params', ExecutionParams) if is_reexecuted: # required fields for re-execution execution_metadata = check.inst_param( execution_params.execution_metadata, 'execution_metadata', ExecutionMetadata) check.str_param(execution_metadata.root_run_id, 'root_run_id') check.str_param(execution_metadata.parent_run_id, 'parent_run_id') error_type = ('StartPipelineExecutionDisabledError' if not is_reexecuted else 'DauphinStartPipelineReexecutionDisabledError') success_type = ('StartPipelineExecutionSuccess' if not is_reexecuted else 'StartPipelineReexecutionSuccess') instance = graphene_info.context.instance execution_manager_settings = instance.dagit_settings.get( 'execution_manager') if execution_manager_settings and execution_manager_settings.get( 'disabled'): return graphene_info.schema.type_named(error_type)() pipeline_def = get_pipeline_def_from_selector(graphene_info, execution_params.selector) get_validated_config( graphene_info, pipeline_def, environment_dict=execution_params.environment_dict, mode=execution_params.mode, ) execution_plan = create_execution_plan( pipeline_def, execution_params.environment_dict, run_config=RunConfig( mode=execution_params.mode, previous_run_id=execution_params.previous_run_id, tags=execution_params.execution_metadata.tags, ), ) _check_start_pipeline_execution_errors(graphene_info, execution_params, execution_plan) if execution_params.execution_metadata.run_id: # If a run_id is provided, use the old get_or_create_run machinery run = instance.get_or_create_run( _create_pipeline_run(instance, pipeline_def, execution_params)) else: # Otherwise we know we are creating a new run, and we can # use the new machinery that persists a pipeline snapshot # with the run. run = instance.create_run_with_snapshot( InstanceCreateRunArgs( pipeline_snapshot=pipeline_def.get_pipeline_snapshot(), run_id=execution_params.execution_metadata.run_id if execution_params.execution_metadata.run_id else make_new_run_id(), selector=execution_params.selector, environment_dict=execution_params.environment_dict, mode=execution_params.mode, step_keys_to_execute=get_step_keys_to_execute( instance, pipeline_def, execution_params) or execution_params.step_keys, tags=execution_params.execution_metadata.tags, status=PipelineRunStatus.NOT_STARTED, root_run_id=(execution_params.execution_metadata.root_run_id or execution_params.previous_run_id), parent_run_id=( execution_params.execution_metadata.parent_run_id or execution_params.previous_run_id), )) graphene_info.context.execution_manager.execute_pipeline( graphene_info.context.get_handle(), pipeline_def, run, instance=instance, ) return graphene_info.schema.type_named(success_type)( run=graphene_info.schema.type_named('PipelineRun')(run))
def test_all_step_events(): # pylint: disable=too-many-locals handle = ExecutionTargetHandle.for_pipeline_fn(define_test_events_pipeline) pipeline = handle.build_pipeline_definition() mode = pipeline.get_default_mode_name() instance = DagsterInstance.ephemeral() execution_plan = create_execution_plan(pipeline, mode=mode) pipeline_run = instance.create_run_for_pipeline( pipeline=pipeline, execution_plan=execution_plan, mode=mode) step_levels = execution_plan.topological_step_levels() unhandled_events = STEP_EVENTS.copy() # Exclude types that are not step events ignored_events = { 'LogMessageEvent', 'PipelineStartEvent', 'PipelineSuccessEvent', 'PipelineInitFailureEvent', 'PipelineFailureEvent', } event_counts = defaultdict(int) for step_level in step_levels: for step in step_level: variables = { 'executionParams': { 'selector': { 'name': pipeline.name }, 'environmentConfigData': { 'storage': { 'filesystem': {} } }, 'mode': mode, 'executionMetadata': { 'runId': pipeline_run.run_id }, 'stepKeys': [step.key], } } res = execute_query(handle, EXECUTE_PLAN_MUTATION, variables, instance=instance) # go through the same dict, decrement all the event records we've seen from the GraphQL # response if not res.get('errors'): assert 'data' in res, res assert 'executePlan' in res['data'], res assert 'stepEvents' in res['data']['executePlan'], res step_events = res['data']['executePlan']['stepEvents'] events = [ dagster_event_from_dict(e, pipeline.name) for e in step_events if e['__typename'] not in ignored_events ] for event in events: if event.step_key: key = event.step_key + '.' + event.event_type_value else: key = event.event_type_value event_counts[key] -= 1 unhandled_events -= { DagsterEventType(e.event_type_value) for e in events } else: raise Exception(res['errors']) # build up a dict, incrementing all the event records we've produced in the run storage logs = instance.all_logs(pipeline_run.run_id) for log in logs: if not log.dagster_event or (DagsterEventType( log.dagster_event.event_type_value) not in STEP_EVENTS.union( set([DagsterEventType.ENGINE_EVENT]))): continue if log.dagster_event.step_key: key = log.dagster_event.step_key + '.' + log.dagster_event.event_type_value else: key = log.dagster_event.event_type_value event_counts[key] += 1 # Ensure we've processed all the events that were generated in the run storage assert sum(event_counts.values()) == 0 # Ensure we've handled the universe of event types # Why are these retry events not handled? Because right now there is no way to configure retries # on executePlan -- this needs to change, and we should separate the ExecutionParams that get # sent to executePlan fromm those that get sent to startPipelineExecution and friends assert unhandled_events == { DagsterEventType.STEP_UP_FOR_RETRY, DagsterEventType.STEP_RESTARTED }
def test_using_gcs_for_subplan(gcs_bucket): pipeline_def = define_inty_pipeline() run_config = { "intermediate_storage": { "gcs": { "config": { "gcs_bucket": gcs_bucket } } } } run_id = make_new_run_id() execution_plan = create_execution_plan(pipeline_def, run_config=run_config) assert execution_plan.get_step_by_key("return_one") step_keys = ["return_one"] instance = DagsterInstance.ephemeral() pipeline_run = PipelineRun(pipeline_name=pipeline_def.name, run_id=run_id, run_config=run_config) return_one_step_events = list( execute_plan( execution_plan.build_subset_plan(step_keys), run_config=run_config, pipeline_run=pipeline_run, instance=instance, )) assert get_step_output(return_one_step_events, "return_one") with scoped_pipeline_context( execution_plan.build_subset_plan(["return_one"]), run_config, pipeline_run, instance, ) as context: intermediate_storage = GCSIntermediateStorage( gcs_bucket, run_id, client=context.scoped_resources_builder.build( required_resource_keys={"gcs"}, ).gcs, ) assert intermediate_storage.has_intermediate( context, StepOutputHandle("return_one")) assert (intermediate_storage.get_intermediate( context, Int, StepOutputHandle("return_one")).obj == 1) add_one_step_events = list( execute_plan( execution_plan.build_subset_plan(["add_one"]), run_config=run_config, pipeline_run=pipeline_run, instance=instance, )) assert get_step_output(add_one_step_events, "add_one") with scoped_pipeline_context( execution_plan.build_subset_plan(["return_one"]), run_config, pipeline_run, instance, ) as context: assert intermediate_storage.has_intermediate( context, StepOutputHandle("add_one")) assert (intermediate_storage.get_intermediate( context, Int, StepOutputHandle("add_one")).obj == 2)
def create_run_for_pipeline( self, pipeline_def, execution_plan=None, run_id=None, run_config=None, mode=None, solids_to_execute=None, step_keys_to_execute=None, status=None, tags=None, root_run_id=None, parent_run_id=None, solid_selection=None, ): from dagster.core.execution.api import create_execution_plan from dagster.core.execution.plan.plan import ExecutionPlan from dagster.core.snap import snapshot_from_execution_plan check.inst_param(pipeline_def, 'pipeline_def', PipelineDefinition) check.opt_inst_param(execution_plan, 'execution_plan', ExecutionPlan) # note that solids_to_execute is required to execute the solid subset, which is the # frozenset version of the previous solid_subset. # solid_selection is not required and will not be converted to solids_to_execute here. # i.e. this function doesn't handle solid queries. # solid_selection is only used to pass the user queries further down. check.opt_set_param(solids_to_execute, 'solids_to_execute', of_type=str) check.opt_list_param(solid_selection, 'solid_selection', of_type=str) if solids_to_execute: if isinstance(pipeline_def, PipelineSubsetDefinition): # for the case when pipeline_def is created by ExecutablePipeline or ExternalPipeline check.invariant( solids_to_execute == pipeline_def.solids_to_execute, 'Cannot create a PipelineRun from pipeline subset {pipeline_solids_to_execute} ' 'that conflicts with solids_to_execute arg {solids_to_execute}'.format( pipeline_solids_to_execute=str_format_list(pipeline_def.solids_to_execute), solids_to_execute=str_format_list(solids_to_execute), ), ) else: # for cases when `create_run_for_pipeline` is directly called pipeline_def = pipeline_def.get_pipeline_subset_def( solids_to_execute=solids_to_execute ) if execution_plan is None: execution_plan = create_execution_plan( pipeline_def, run_config=run_config, mode=mode, step_keys_to_execute=step_keys_to_execute, ) return self.create_run( pipeline_name=pipeline_def.name, run_id=run_id, run_config=run_config, mode=check.opt_str_param(mode, 'mode', default=pipeline_def.get_default_mode_name()), solid_selection=solid_selection, solids_to_execute=solids_to_execute, step_keys_to_execute=step_keys_to_execute, status=status, tags=tags, root_run_id=root_run_id, parent_run_id=parent_run_id, pipeline_snapshot=pipeline_def.get_pipeline_snapshot(), execution_plan_snapshot=snapshot_from_execution_plan( execution_plan, pipeline_def.get_pipeline_snapshot_id() ), parent_pipeline_snapshot=pipeline_def.get_parent_pipeline_snapshot(), )
def create_run_for_pipeline( self, pipeline_def, execution_plan=None, run_id=None, environment_dict=None, mode=None, solid_subset=None, step_keys_to_execute=None, status=None, tags=None, root_run_id=None, parent_run_id=None, ): from dagster.core.execution.api import create_execution_plan from dagster.core.execution.plan.plan import ExecutionPlan from dagster.core.snap import snapshot_from_execution_plan check.inst_param(pipeline_def, 'pipeline_def', PipelineDefinition) check.opt_inst_param(execution_plan, 'execution_plan', ExecutionPlan) if solid_subset: if isinstance(pipeline_def, PipelineSubsetForExecution): check.invariant( len(solid_subset) == len(pipeline_def.solid_subset) and set(solid_subset) == set(pipeline_def.solid_subset), 'Cannot create a PipelineRun from pipeline subset {pipeline_solid_subset} that ' 'conflicts with solid_subset arg {solid_subset}'.format( pipeline_solid_subset=str_format_list( pipeline_def.solid_subset), solid_subset=str_format_list(solid_subset), ), ) else: pipeline_def = pipeline_def.subset_for_execution( solid_subset=solid_subset) if execution_plan is None: execution_plan = create_execution_plan( pipeline_def, environment_dict=environment_dict, mode=mode, step_keys_to_execute=step_keys_to_execute, ) return self.create_run( pipeline_name=pipeline_def.name, run_id=run_id, environment_dict=environment_dict, mode=check.opt_str_param( mode, 'mode', default=pipeline_def.get_default_mode_name()), solid_subset=solid_subset, step_keys_to_execute=step_keys_to_execute, status=status, tags=tags, root_run_id=root_run_id, parent_run_id=parent_run_id, pipeline_snapshot=pipeline_def.get_pipeline_snapshot(), execution_plan_snapshot=snapshot_from_execution_plan( execution_plan, pipeline_def.get_pipeline_snapshot_id()), parent_pipeline_snapshot=pipeline_def.get_parent_pipeline_snapshot( ), )
def test_tags_to_dynamic_plan():
    @solid(
        tags={
            USER_DEFINED_K8S_CONFIG_KEY: {
                "container_config": {
                    "resources": {
                        "requests": {"cpu": "500m", "memory": "128Mi"},
                        "limits": {"cpu": "1000m", "memory": "1Gi"},
                    }
                }
            }
        }
    )
    def multiply_inputs(_, x):
        return 2 * x

    @solid(
        tags={
            USER_DEFINED_K8S_CONFIG_KEY: {
                "container_config": {
                    "resources": {
                        "requests": {"cpu": "250m", "memory": "64Mi"},
                        "limits": {"cpu": "500m", "memory": "2560Mi"},
                    }
                }
            }
        },
        output_defs=[DynamicOutputDefinition()],
    )
    def emit(_):
        for i in range(3):
            yield DynamicOutput(value=i, mapping_key=str(i))

    @pipeline
    def k8s_ready():
        return emit().map(multiply_inputs)

    known_state = KnownExecutionState(
        {},
        {emit.name: {"result": ["0", "1", "2"]}},
    )
    plan = create_execution_plan(k8s_ready, known_state=known_state)

    emit_step = plan.get_step_by_key(emit.name)
    user_defined_k8s_config = get_user_defined_k8s_config(emit_step.tags)
    assert user_defined_k8s_config.container_config
    assert user_defined_k8s_config.container_config["resources"]

    resources = user_defined_k8s_config.container_config["resources"]
    assert resources["requests"]["cpu"] == "250m"
    assert resources["requests"]["memory"] == "64Mi"
    assert resources["limits"]["cpu"] == "500m"
    assert resources["limits"]["memory"] == "2560Mi"

    for mapping_key in range(3):
        multiply_inputs_step = plan.get_step_by_key(f"{multiply_inputs.name}[{mapping_key}]")
        dynamic_step_user_defined_k8s_config = get_user_defined_k8s_config(
            multiply_inputs_step.tags
        )

        assert dynamic_step_user_defined_k8s_config.container_config
        assert dynamic_step_user_defined_k8s_config.container_config["resources"]

        resources = dynamic_step_user_defined_k8s_config.container_config["resources"]
        assert resources["requests"]["cpu"] == "500m"
        assert resources["requests"]["memory"] == "128Mi"
        assert resources["limits"]["cpu"] == "1000m"
        assert resources["limits"]["memory"] == "1Gi"
def _do_execute_plan(graphene_info, execution_params, pipeline_def): check.inst_param(graphene_info, 'graphene_info', ResolveInfo) check.inst_param(execution_params, 'execution_params', ExecutionParams) run_id = execution_params.execution_metadata.run_id pipeline_run = graphene_info.context.instance.get_run_by_id(run_id) if not pipeline_run: # TODO switch to raising a UserFacingError if the run_id cannot be found # https://github.com/dagster-io/dagster/issues/1876 pipeline_run = PipelineRun( pipeline_name=pipeline_def.name, run_id=run_id, environment_dict=execution_params.environment_dict, mode=execution_params.mode or pipeline_def.get_default_mode_name(), tags=execution_params.execution_metadata.tags or {}, ) execution_plan = create_execution_plan( pipeline=pipeline_def, environment_dict=execution_params.environment_dict, run_config=pipeline_run, ) if execution_params.step_keys: for step_key in execution_params.step_keys: if not execution_plan.has_step(step_key): raise UserFacingGraphQLError( graphene_info.schema.type_named('InvalidStepError')(invalid_step_key=step_key) ) execution_plan = execution_plan.build_subset_plan(execution_params.step_keys) event_logs = [] def _on_event_record(record): if record.is_dagster_event: event_logs.append(record) graphene_info.context.instance.add_event_listener(run_id, _on_event_record) execute_plan( execution_plan=execution_plan, environment_dict=execution_params.environment_dict, pipeline_run=pipeline_run, instance=graphene_info.context.instance, ) dauphin_pipeline = DauphinPipeline.from_pipeline_def(pipeline_def) def to_graphql_event(event_record): return from_dagster_event_record( graphene_info, event_record, dauphin_pipeline, execution_plan ) return graphene_info.schema.type_named('ExecutePlanSuccess')( pipeline=dauphin_pipeline, has_failures=any( er for er in event_logs if er.is_dagster_event and er.dagster_event.event_type == DagsterEventType.STEP_FAILURE ), step_events=list(map(to_graphql_event, event_logs)), raw_event_records=list(map(serialize_dagster_namedtuple, event_logs)), )
def test_all_step_events(): # pylint: disable=too-many-locals handle = ExecutionTargetHandle.for_pipeline_fn(define_test_events_pipeline) pipeline = handle.build_pipeline_definition() mode = pipeline.get_default_mode_name() run_config = RunConfig(mode=mode) execution_plan = create_execution_plan(pipeline, {}, run_config=run_config) step_levels = execution_plan.topological_step_levels() unhandled_events = STEP_EVENTS.copy() # Exclude types that are not step events ignored_events = { 'LogMessageEvent', 'PipelineStartEvent', 'PipelineSuccessEvent', 'PipelineInitFailureEvent', 'PipelineFailureEvent', } event_counts = defaultdict(int) for step_level in step_levels: for step in step_level: variables = { 'executionParams': { 'selector': { 'name': pipeline.name }, 'environmentConfigData': { 'storage': { 'filesystem': {} } }, 'mode': mode, 'executionMetadata': { 'runId': run_config.run_id }, 'stepKeys': [step.key], } } instance = DagsterInstance.ephemeral() res = execute_query(handle, START_PIPELINE_EXECUTION_MUTATION, variables, instance=instance) # go through the same dict, decrement all the event records we've seen from the GraphQL # response if not res.get('errors'): run_logs = res['data']['startPipelineExecution']['run'][ 'logs']['nodes'] events = [ dagster_event_from_dict(e, pipeline.name) for e in run_logs if e['__typename'] not in ignored_events ] for event in events: if event.step_key: key = event.step_key + '.' + event.event_type_value else: key = event.event_type_value event_counts[key] -= 1 unhandled_events -= { DagsterEventType(e.event_type_value) for e in events } else: raise Exception(res['errors']) # build up a dict, incrementing all the event records we've produced in the run storage logs = instance.all_logs(run_config.run_id) for log in logs: if not log.dagster_event or ( DagsterEventType(log.dagster_event.event_type_value) not in STEP_EVENTS.union( set([DagsterEventType.ENGINE_EVENT]))): continue if log.dagster_event.step_key: key = log.dagster_event.step_key + '.' + log.dagster_event.event_type_value else: key = log.dagster_event.event_type_value event_counts[key] += 1 # Ensure we've processed all the events that were generated in the run storage assert sum(event_counts.values()) == 0 # Ensure we've handled the universe of event types assert not unhandled_events
def reconstitute_pipeline_context( self, output_log_path=None, marshal_dir=None, run_config=None, executable_dict=None, pipeline_run_dict=None, solid_handle_kwargs=None, instance_ref_dict=None, ): """Reconstitutes a context for dagstermill-managed execution. You'll see this function called to reconstruct a pipeline context within the ``injected parameters`` cell of a dagstermill output notebook. Users should not call this function interactively except when debugging output notebooks. Use :func:`dagstermill.get_context` in the ``parameters`` cell of your notebook to define a context for interactive exploration and development. This call will be replaced by one to :func:`dagstermill.reconstitute_pipeline_context` when the notebook is executed by dagstermill. """ check.opt_str_param(output_log_path, "output_log_path") check.opt_str_param(marshal_dir, "marshal_dir") run_config = check.opt_dict_param(run_config, "run_config", key_type=str) check.dict_param(pipeline_run_dict, "pipeline_run_dict") check.dict_param(executable_dict, "executable_dict") check.dict_param(solid_handle_kwargs, "solid_handle_kwargs") check.dict_param(instance_ref_dict, "instance_ref_dict") pipeline = ReconstructablePipeline.from_dict(executable_dict) pipeline_def = pipeline.get_definition() try: instance_ref = unpack_value(instance_ref_dict) instance = DagsterInstance.from_ref(instance_ref) except Exception as err: # pylint: disable=broad-except raise DagstermillError( "Error when attempting to resolve DagsterInstance from serialized InstanceRef" ) from err pipeline_run = unpack_value(pipeline_run_dict) solid_handle = SolidHandle.from_dict(solid_handle_kwargs) solid_def = pipeline_def.get_solid(solid_handle).definition self.marshal_dir = marshal_dir self.in_pipeline = True self.solid_def = solid_def self.pipeline = pipeline execution_plan = create_execution_plan( self.pipeline, run_config, mode=pipeline_run.mode, step_keys_to_execute=pipeline_run.step_keys_to_execute, ) with scoped_pipeline_context( execution_plan, run_config, pipeline_run, instance, scoped_resources_builder_cm=self._setup_resources, # Set this flag even though we're not in test for clearer error reporting raise_on_error=True, ) as pipeline_context: self.context = DagstermillRuntimeExecutionContext( pipeline_context=pipeline_context, solid_config=run_config.get("solids", {}).get(solid_def.name, {}).get("config"), resource_keys_to_init=get_required_resource_keys_to_init( execution_plan, pipeline_context.intermediate_storage_def, ), solid_name=solid_def.name, ) return self.context
def test_emr_pyspark_execution_plan():
    os.environ["EMR_CLUSTER_ID"] = "some_cluster_id"
    create_execution_plan(my_pipeline, mode="emr", run_config=emr_preset.run_config)
def test_s3_io_manager_execution(mock_s3_bucket): pipeline_def = define_inty_pipeline() run_config = { "resources": { "io_manager": { "config": { "s3_bucket": mock_s3_bucket.name } } } } run_id = make_new_run_id() execution_plan = create_execution_plan(pipeline_def, run_config=run_config) assert execution_plan.get_step_by_key("return_one") step_keys = ["return_one"] instance = DagsterInstance.ephemeral() pipeline_run = PipelineRun(pipeline_name=pipeline_def.name, run_id=run_id, run_config=run_config) return_one_step_events = list( execute_plan( execution_plan.build_subset_plan(step_keys), run_config=run_config, pipeline_run=pipeline_run, instance=instance, )) assert get_step_output(return_one_step_events, "return_one") io_manager = PickledObjectS3IOManager(mock_s3_bucket.name, construct_s3_client(max_attempts=5), s3_prefix="dagster") step_output_handle = StepOutputHandle("return_one") context = InputContext( pipeline_name=pipeline_def.name, solid_def=pipeline_def.solid_def_named("return_one"), config={}, metadata={}, upstream_output=OutputContext( step_key=step_output_handle.step_key, name=step_output_handle.output_name, pipeline_name=pipeline_def.name, run_id=run_id, metadata={}, mapping_key=None, config=None, solid_def=pipeline_def.solid_def_named("return_one"), ), ) assert io_manager.load_input(context) == 1 add_one_step_events = list( execute_plan( execution_plan.build_subset_plan(["add_one"]), run_config=run_config, pipeline_run=pipeline_run, instance=instance, )) step_output_handle = StepOutputHandle("add_one") context = InputContext( pipeline_name=pipeline_def.name, solid_def=pipeline_def.solid_def_named("add_one"), config={}, metadata={}, upstream_output=OutputContext( step_key=step_output_handle.step_key, name=step_output_handle.output_name, pipeline_name=pipeline_def.name, run_id=run_id, metadata={}, mapping_key=None, config=None, solid_def=pipeline_def.solid_def_named("add_one"), ), ) assert get_step_output(add_one_step_events, "add_one") assert io_manager.load_input(context) == 2