def test_cli_execute_failure():
    # currently, paths in env files have to be relative to where the
    # script was launched, so we have to simulate that
    # with pytest.raises(DagsterExecutionStepExecutionError) as e_info:
    cwd = os.getcwd()
    try:
        os.chdir(file_relative_path(__file__, "../.."))
        with instance_for_test() as instance:
            result = do_execute_command(
                pipeline=ReconstructablePipeline.for_module(
                    "dagster_pandas.examples",
                    "pandas_hello_world_fails_test",
                ),
                instance=instance,
                config=[
                    file_relative_path(
                        __file__, "../../dagster_pandas/examples/pandas_hello_world/*.yaml"
                    )
                ],
            )
            failures = [event for event in result.step_event_list if event.is_failure]
    finally:
        # restore cwd
        os.chdir(cwd)

    assert len(failures) == 1
    assert "I am a programmer and I make error" in failures[0].step_failure_data.error.cause.message
def test_pipelines_success(file_path, run_config_path):
    with pushd(file_relative_path(__file__, "../../../docs_snippets/legacy/data_science/")):
        with instance_for_test() as instance:
            run_config = load_yaml_from_path(run_config_path) if run_config_path else {}
            recon_pipeline = ReconstructablePipeline.for_file(file_path, "iris_classify")

            with tempfile.TemporaryDirectory() as temp_dir:
                run_config["resources"] = {"io_manager": {"config": {"base_dir": temp_dir}}}
                pipeline_result = execute_pipeline(
                    recon_pipeline,
                    run_config=run_config,
                    instance=instance,
                    solid_selection=["k_means_iris"],  # skip download_file in tests
                )
                assert pipeline_result.success
def execute_pipeline_on_celery(
    pipeline_name, instance=None, run_config=None, tempdir=None, tags=None, subset=None
):
    with tempdir_wrapper(tempdir) as tempdir:
        pipeline_def = ReconstructablePipeline.for_file(
            REPO_FILE, pipeline_name
        ).subset_for_execution(subset)
        with _instance_wrapper(instance) as wrapped_instance:
            run_config = run_config or {
                "resources": {"io_manager": {"config": {"base_dir": tempdir}}},
                "execution": {"celery": {}},
            }
            result = execute_pipeline(
                pipeline_def,
                run_config=run_config,
                instance=wrapped_instance,
                tags=tags,
            )
            yield result
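
# Hedged usage sketch (not from the source): the yield at the end of
# execute_pipeline_on_celery suggests it is meant to be used as a context
# manager (e.g. wrapped with contextlib.contextmanager elsewhere in the file).
# "demo_pipeline" is a hypothetical pipeline name assumed to exist in REPO_FILE.
def test_demo_pipeline_on_celery_sketch():
    with execute_pipeline_on_celery("demo_pipeline") as result:
        assert result.success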
def execute_run_host_mode(
    pipeline: ReconstructablePipeline,
    pipeline_run: PipelineRun,
    instance: DagsterInstance,
    executor_defs: Optional[List[ExecutorDefinition]] = None,
    raise_on_error: bool = False,
):
    check.inst_param(pipeline, "pipeline", ReconstructablePipeline)
    check.inst_param(pipeline_run, "pipeline_run", PipelineRun)
    check.inst_param(instance, "instance", DagsterInstance)
    check.opt_list_param(executor_defs, "executor_defs", of_type=ExecutorDefinition)
    executor_defs = executor_defs if executor_defs is not None else default_executors

    if pipeline_run.status == PipelineRunStatus.CANCELED:
        message = "Not starting execution since the run was canceled before execution could start"
        instance.report_engine_event(
            message,
            pipeline_run,
        )
        raise DagsterInvariantViolationError(message)

    check.invariant(
        pipeline_run.status == PipelineRunStatus.NOT_STARTED
        or pipeline_run.status == PipelineRunStatus.STARTING,
        desc="Pipeline run {} ({}) in state {}, expected NOT_STARTED or STARTING".format(
            pipeline_run.pipeline_name, pipeline_run.run_id, pipeline_run.status
        ),
    )

    if pipeline_run.solids_to_execute:
        pipeline = pipeline.subset_for_execution_from_existing_pipeline(
            frozenset(pipeline_run.solids_to_execute)
        )

    execution_plan_snapshot = instance.get_execution_plan_snapshot(
        pipeline_run.execution_plan_snapshot_id
    )
    execution_plan = ExecutionPlan.rebuild_from_snapshot(
        pipeline_run.pipeline_name,
        execution_plan_snapshot,
    )

    _execute_run_iterable = ExecuteRunWithPlanIterable(
        execution_plan=execution_plan,
        iterator=pipeline_execution_iterator,
        execution_context_manager=PlanOrchestrationContextManager(
            context_event_generator=host_mode_execution_context_event_generator,
            pipeline=pipeline,
            execution_plan=execution_plan,
            run_config=pipeline_run.run_config,
            pipeline_run=pipeline_run,
            instance=instance,
            raise_on_error=raise_on_error,
            executor_defs=executor_defs,
            output_capture=None,
        ),
    )
    event_list = list(_execute_run_iterable)
    return event_list
def get_external_pipeline_subset_result(
    recon_pipeline: ReconstructablePipeline, solid_selection: Optional[List[str]]
):
    check.inst_param(recon_pipeline, "recon_pipeline", ReconstructablePipeline)
    check.opt_list_param(solid_selection, "solid_selection", str)
    if solid_selection:
        try:
            sub_pipeline = recon_pipeline.subset_for_execution(solid_selection)
            definition = sub_pipeline.get_definition()
        except Exception:
            return ExternalPipelineSubsetResult(
                success=False, error=serializable_error_info_from_exc_info(sys.exc_info())
            )
    else:
        definition = recon_pipeline.get_definition()
    external_pipeline_data = external_pipeline_data_from_def(definition)
    return ExternalPipelineSubsetResult(
        success=True, external_pipeline_data=external_pipeline_data
    )
def test_yield_unserializable_result():
    manager = Manager()
    assert manager.yield_result(threading.Lock())

    with in_pipeline_manager(
        pipeline_name="hello_world_output_pipeline",
        solid_handle=NodeHandle("hello_world_output", None),
        executable_dict=ReconstructablePipeline.for_module(
            "dagstermill.examples.repository",
            "hello_world_output_pipeline",
        ).to_dict(),
        step_key="hello_world_output",
    ) as manager:
        with pytest.raises(TypeError):
            manager.yield_result(threading.Lock())
def in_pipeline_manager(
    pipeline_name="hello_world_pipeline",
    solid_handle=NodeHandle("hello_world", None),
    step_key="hello_world",
    executable_dict=None,
    mode=None,
    **kwargs,
):
    manager = Manager()

    run_id = make_new_run_id()
    with instance_for_test() as instance:
        marshal_dir = tempfile.mkdtemp()

        if not executable_dict:
            executable_dict = ReconstructablePipeline.for_module(
                "dagstermill.examples.repository", "hello_world_pipeline"
            ).to_dict()

        pipeline_run_dict = pack_value(
            PipelineRun(
                pipeline_name=pipeline_name,
                run_id=run_id,
                mode=mode or "default",
                run_config=None,
                step_keys_to_execute=None,
                status=PipelineRunStatus.NOT_STARTED,
            )
        )

        try:
            with safe_tempfile_path() as output_log_file_path:
                context_dict = {
                    "pipeline_run_dict": pipeline_run_dict,
                    "solid_handle_kwargs": solid_handle._asdict(),
                    "executable_dict": executable_dict,
                    "marshal_dir": marshal_dir,
                    "run_config": {},
                    "output_log_path": output_log_file_path,
                    "instance_ref_dict": pack_value(instance.get_ref()),
                    "step_key": step_key,
                }

                manager.reconstitute_pipeline_context(**dict(context_dict, **kwargs))
                yield manager
        finally:
            shutil.rmtree(marshal_dir)
def exec_for_test(fn_name, env=None, raise_on_error=True, **kwargs):
    result = None
    recon_pipeline = ReconstructablePipeline.for_module("dagstermill.examples.repository", fn_name)

    with instance_for_test() as instance:
        try:
            result = execute_pipeline(
                recon_pipeline,
                env,
                instance=instance,
                raise_on_error=raise_on_error,
                **kwargs,
            )
            yield result
        finally:
            if result:
                cleanup_result_notebook(result)
def test_engine_error(instance, tempdir):
    with mock.patch(
        "dagster.core.execution.context.system.PlanData.raise_on_error",
        return_value=True,
    ):
        with pytest.raises(DagsterSubprocessError):
            storage = os.path.join(tempdir, "flakey_storage")
            execute_pipeline(
                ReconstructablePipeline.for_file(REPO_FILE, "engine_error"),
                run_config={
                    "resources": {"io_manager": {"config": {"base_dir": storage}}},
                    "execution": {
                        "celery": {"config": {"config_source": {"task_always_eager": True}}}
                    },
                    "solids": {"destroy": {"config": storage}},
                },
                instance=instance,
            )
def test_dask_terminate():
    run_config = {
        "solids": {
            "sleepy_dask_solid": {
                "inputs": {
                    "df": {"read": {"csv": {"path": file_relative_path(__file__, "ex*.csv")}}}
                }
            }
        }
    }

    interrupt_thread = None
    result_types = []

    with instance_for_test() as instance:
        for result in execute_pipeline_iterator(
            pipeline=ReconstructablePipeline.for_file(__file__, sleepy_dask_pipeline.name),
            run_config=run_config,
            instance=instance,
        ):
            # Interrupt once the first step starts
            if result.event_type == DagsterEventType.STEP_START and not interrupt_thread:
                interrupt_thread = Thread(target=send_interrupt, args=())
                interrupt_thread.start()

            if result.event_type == DagsterEventType.STEP_FAILURE:
                assert (
                    "DagsterExecutionInterruptedError" in result.event_specific_data.error.message
                )

            result_types.append(result.event_type)

        interrupt_thread.join()

        assert DagsterEventType.STEP_FAILURE in result_types
        assert DagsterEventType.PIPELINE_FAILURE in result_types
def test_in_pipeline_manager_with_resources():
    with tempfile.NamedTemporaryFile() as fd:
        path = fd.name

    try:
        with in_pipeline_manager(
            pipeline_name="resource_pipeline",
            executable_dict=ReconstructablePipeline.for_module(
                "dagstermill.examples.repository",
                "resource_pipeline",
            ).to_dict(),
            solid_handle=NodeHandle("hello_world_resource", None),
            run_config={"resources": {"list": {"config": path}}},
            mode="prod",
            step_key="hello_world_resource",
        ) as manager:
            assert "list" in manager.context.resources._asdict()

            with open(path, "rb") as fd:
                messages = pickle.load(fd)

            messages = [message.split(": ") for message in messages]

            assert len(messages) == 1
            assert messages[0][1] == "Opened"

            manager.teardown_resources()

            with open(path, "rb") as fd:
                messages = pickle.load(fd)

            messages = [message.split(": ") for message in messages]

            assert len(messages) == 2
            assert messages[1][1] == "Closed"
    finally:
        if os.path.exists(path):
            os.unlink(path)
def test_terminate_pipeline_on_celery(dagster_celery_worker, instance, tempdir):
    pipeline_def = ReconstructablePipeline.for_file(REPO_FILE, "interrupt_pipeline")

    run_config = {
        "resources": {"io_manager": {"config": {"base_dir": tempdir}}},
        "execution": {"celery": {}},
    }

    results = []
    result_types = []
    interrupt_thread = None

    for result in execute_pipeline_iterator(
        pipeline=pipeline_def,
        run_config=run_config,
        instance=instance,
    ):
        # Interrupt once the first step starts
        if result.event_type == DagsterEventType.STEP_START and not interrupt_thread:
            interrupt_thread = Thread(target=send_interrupt, args=())
            interrupt_thread.start()

        results.append(result)
        result_types.append(result.event_type)

    interrupt_thread.join()

    # At least one step succeeded (the one that was running when the interrupt fired)
    assert DagsterEventType.STEP_SUCCESS in result_types

    # At least one step was revoked (and there were no step failure events)
    revoke_steps = [
        result
        for result in results
        if result.event_type == DagsterEventType.ENGINE_EVENT and "was revoked." in result.message
    ]

    assert len(revoke_steps) > 0

    # The overall pipeline failed
    assert DagsterEventType.PIPELINE_FAILURE in result_types
def test_dask():
    run_config = {
        "solids": {
            "dask_solid": {
                "inputs": {
                    "df": {"read": {"csv": {"path": file_relative_path(__file__, "ex*.csv")}}}
                }
            }
        }
    }

    with instance_for_test() as instance:
        result = execute_pipeline(
            ReconstructablePipeline.for_file(__file__, dask_pipeline.name),
            run_config={
                "execution": {"dask": {"config": {"cluster": {"local": {"timeout": 30}}}}},
                **run_config,
            },
            instance=instance,
        )

        assert result.success
def test_event_callback_logging():
    events = defaultdict(list)

    def _event_callback(record):
        assert isinstance(record, EventLogEntry)
        if record.is_dagster_event:
            events[record.dagster_event.event_type].append(record)

    pipeline = ReconstructablePipeline.for_module(
        "dagstermill.examples.repository",
        "hello_logging_pipeline",
    )
    pipeline_def = pipeline.get_definition()
    with instance_for_test() as instance:
        pipeline_run = instance.create_run_for_pipeline(pipeline_def)

        instance.watch_event_logs(pipeline_run.run_id, -1, _event_callback)

        res = execute_run(
            pipeline,
            pipeline_run,
            instance,
        )

        assert res.success

        passed_before_timeout = False
        retries = 5
        while retries > 0:
            time.sleep(0.333)
            if DagsterEventType.PIPELINE_FAILURE in events.keys():
                break
            if DagsterEventType.PIPELINE_SUCCESS in events.keys():
                passed_before_timeout = True
                break
            retries -= 1

        assert passed_before_timeout
def test_cli_execute():
    # currently, paths in env files have to be relative to where the
    # script was launched, so we have to simulate that
    cwd = os.getcwd()
    try:
        os.chdir(file_relative_path(__file__, "../.."))
        with instance_for_test() as instance:
            do_execute_command(
                pipeline=ReconstructablePipeline.for_module(
                    "dagster_pandas.examples", "pandas_hello_world_test"
                ),
                instance=instance,
                config=[
                    file_relative_path(
                        __file__, "../../dagster_pandas/examples/pandas_hello_world/*.yaml"
                    )
                ],
            )
    finally:
        # restore cwd
        os.chdir(cwd)
def test_execute_pipeline():
    environment = {
        "ops": {
            "sum_op": {
                "inputs": {"num": {"csv": {"path": file_relative_path(__file__, "num.csv")}}}
            }
        }
    }

    result = execute_pipeline(
        ReconstructablePipeline.for_module("dagster_pandas.examples", "pandas_hello_world_test"),
        run_config=environment,
    )

    assert result.success

    assert result.result_for_solid("sum_op").output_value().to_dict("list") == {
        "num1": [1, 3],
        "num2": [2, 4],
        "sum": [3, 7],
    }

    assert result.result_for_solid("sum_sq_op").output_value().to_dict("list") == {
        "num1": [1, 3],
        "num2": [2, 4],
        "sum": [3, 7],
        "sum_sq": [9, 49],
    }
def test_hello_world_reexecution():
    with exec_for_test("hello_world_pipeline") as result:
        assert result.success

        output_notebook_path = get_path(
            [x for x in result.step_event_list if x.event_type_value == "ASSET_MATERIALIZATION"][0]
        )

        with tempfile.NamedTemporaryFile("w+", suffix=".py") as reexecution_notebook_file:
            reexecution_notebook_file.write(
                (
                    "from dagster import pipeline\n"
                    "from dagstermill import define_dagstermill_solid\n\n\n"
                    "reexecution_solid = define_dagstermill_solid(\n"
                    "    'hello_world_reexecution', '{output_notebook_path}'\n"
                    ")\n\n"
                    "@pipeline\n"
                    "def reexecution_pipeline():\n"
                    "    reexecution_solid()\n"
                ).format(output_notebook_path=output_notebook_path)
            )
            reexecution_notebook_file.flush()

            result = None
            reexecution_pipeline = ReconstructablePipeline.for_file(
                reexecution_notebook_file.name, "reexecution_pipeline"
            )
            reexecution_result = None

            with instance_for_test() as instance:
                try:
                    reexecution_result = execute_pipeline(reexecution_pipeline, instance=instance)
                    assert reexecution_result.success
                finally:
                    if reexecution_result:
                        cleanup_result_notebook(reexecution_result)
def _execute_plan(self, execute_step_args_packed, executable_dict):
    execute_step_args = unpack_value(
        check.dict_param(
            execute_step_args_packed,
            "execute_step_args_packed",
        )
    )
    check.inst_param(execute_step_args, "execute_step_args", ExecuteStepArgs)

    check.dict_param(executable_dict, "executable_dict")

    instance = DagsterInstance.from_ref(execute_step_args.instance_ref)
    pipeline = ReconstructablePipeline.from_dict(executable_dict)
    retry_mode = execute_step_args.retry_mode

    pipeline_run = instance.get_run_by_id(execute_step_args.pipeline_run_id)
    check.invariant(
        pipeline_run, "Could not load run {}".format(execute_step_args.pipeline_run_id)
    )

    step_keys_str = ", ".join(execute_step_args.step_keys_to_execute)

    execution_plan = create_execution_plan(
        pipeline,
        pipeline_run.run_config,
        mode=pipeline_run.mode,
        step_keys_to_execute=execute_step_args.step_keys_to_execute,
        known_state=execute_step_args.known_state,
    )

    engine_event = instance.report_engine_event(
        "Executing steps {} in celery worker".format(step_keys_str),
        pipeline_run,
        EngineEventData(
            [
                MetadataEntry("step_keys", value=step_keys_str),
                MetadataEntry("Celery worker", value=self.request.hostname),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryExecutor,
        step_key=execution_plan.step_handle_for_single_step_plans().to_key(),
    )

    events = [engine_event]

    for step_event in execute_plan_iterator(
        execution_plan=execution_plan,
        pipeline=pipeline,
        pipeline_run=pipeline_run,
        instance=instance,
        retry_mode=retry_mode,
        run_config=pipeline_run.run_config,
    ):
        events.append(step_event)

    serialized_events = [serialize_dagster_namedtuple(event) for event in events]
    return serialized_events
def reconstitute_pipeline_context(
    self,
    output_log_path=None,
    marshal_dir=None,
    run_config=None,
    executable_dict=None,
    pipeline_run_dict=None,
    solid_handle_kwargs=None,
    instance_ref_dict=None,
    step_key=None,
):
    """Reconstitutes a context for dagstermill-managed execution.

    You'll see this function called to reconstruct a pipeline context within the ``injected
    parameters`` cell of a dagstermill output notebook. Users should not call this function
    interactively except when debugging output notebooks.

    Use :func:`dagstermill.get_context` in the ``parameters`` cell of your notebook to define a
    context for interactive exploration and development. This call will be replaced by one to
    :func:`dagstermill.reconstitute_pipeline_context` when the notebook is executed by
    dagstermill.
    """
    check.opt_str_param(output_log_path, "output_log_path")
    check.opt_str_param(marshal_dir, "marshal_dir")
    run_config = check.opt_dict_param(run_config, "run_config", key_type=str)
    check.dict_param(pipeline_run_dict, "pipeline_run_dict")
    check.dict_param(executable_dict, "executable_dict")
    check.dict_param(solid_handle_kwargs, "solid_handle_kwargs")
    check.dict_param(instance_ref_dict, "instance_ref_dict")
    check.str_param(step_key, "step_key")

    pipeline = ReconstructablePipeline.from_dict(executable_dict)
    pipeline_def = pipeline.get_definition()

    try:
        instance_ref = unpack_value(instance_ref_dict)
        instance = DagsterInstance.from_ref(instance_ref)
    except Exception as err:
        raise DagstermillError(
            "Error when attempting to resolve DagsterInstance from serialized InstanceRef"
        ) from err

    pipeline_run = unpack_value(pipeline_run_dict)

    solid_handle = NodeHandle.from_dict(solid_handle_kwargs)
    solid = pipeline_def.get_solid(solid_handle)
    solid_def = solid.definition

    self.marshal_dir = marshal_dir
    self.in_pipeline = True
    self.solid_def = solid_def
    self.pipeline = pipeline

    resolved_run_config = ResolvedRunConfig.build(pipeline_def, run_config, mode=pipeline_run.mode)

    execution_plan = ExecutionPlan.build(
        self.pipeline,
        resolved_run_config,
        step_keys_to_execute=pipeline_run.step_keys_to_execute,
    )

    with scoped_pipeline_context(
        execution_plan,
        pipeline,
        run_config,
        pipeline_run,
        instance,
        scoped_resources_builder_cm=self._setup_resources,
        # Set this flag even though we're not in test for clearer error reporting
        raise_on_error=True,
    ) as pipeline_context:
        self.context = DagstermillRuntimeExecutionContext(
            pipeline_context=pipeline_context,
            pipeline_def=pipeline_def,
            solid_config=run_config.get("solids", {}).get(solid.name, {}).get("config"),
            resource_keys_to_init=get_required_resource_keys_to_init(
                execution_plan,
                pipeline_def,
                resolved_run_config,
            ),
            solid_name=solid.name,
            solid_handle=solid_handle,
            step_context=pipeline_context.for_step(execution_plan.get_step_by_key(step_key)),
        )

    return self.context
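
# Hedged illustration (assumed, not from the source): per the docstring above, the
# "injected parameters" cell of a dagstermill output notebook ends up invoking
# reconstitute_pipeline_context with serialized payloads shaped like the
# context_dict built by in_pipeline_manager above. All literal values below are
# placeholders for illustration only.
def _injected_parameters_cell_sketch(manager, instance, pipeline, pipeline_run, solid_handle):
    return manager.reconstitute_pipeline_context(
        pipeline_run_dict=pack_value(pipeline_run),        # serialized PipelineRun
        solid_handle_kwargs=solid_handle._asdict(),        # serialized NodeHandle
        executable_dict=pipeline.to_dict(),                # ReconstructablePipeline payload
        marshal_dir="/tmp/marshal",                        # placeholder scratch dir
        run_config={},
        output_log_path="/tmp/output.log",                 # placeholder log path
        instance_ref_dict=pack_value(instance.get_ref()),  # serialized InstanceRef
        step_key="hello_world",
    )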
def test_in_pipeline_manager_solid_config():
    with in_pipeline_manager() as manager:
        assert manager.context.solid_config is None

    with in_pipeline_manager(
        pipeline_name="hello_world_config_pipeline",
        solid_handle=NodeHandle("hello_world_config", None),
        executable_dict=ReconstructablePipeline.for_module(
            "dagstermill.examples.repository",
            "hello_world_config_pipeline",
        ).to_dict(),
        step_key="hello_world_config",
    ) as manager:
        assert manager.context.solid_config == {"greeting": "hello"}

    with in_pipeline_manager(
        pipeline_name="hello_world_config_pipeline",
        solid_handle=NodeHandle("hello_world_config", None),
        run_config={
            "solids": {
                "hello_world_config": {"config": {"greeting": "bonjour"}},
                "goodbye_config": {"config": {"farewell": "goodbye"}},
            }
        },
        executable_dict=ReconstructablePipeline.for_module(
            "dagstermill.examples.repository",
            "hello_world_config_pipeline",
        ).to_dict(),
        step_key="hello_world_config",
    ) as manager:
        assert manager.context.solid_config == {"greeting": "bonjour"}

    with in_pipeline_manager(
        pipeline_name="hello_world_config_pipeline",
        solid_handle=NodeHandle("goodbye_config", None),
        run_config={
            "solids": {
                "hello_world_config": {"config": {"greeting": "bonjour"}},
                "goodbye_config": {"config": {"farewell": "goodbye"}},
            }
        },
        executable_dict=ReconstructablePipeline.for_module(
            "dagstermill.examples.repository",
            "hello_world_config_pipeline",
        ).to_dict(),
        step_key="goodbye_config",
    ) as manager:
        assert manager.context.solid_config == {"farewell": "goodbye"}
def step_context_to_step_run_ref(
    step_context: StepExecutionContext,
    prior_attempts_count: int,
    package_dir: Optional[str] = None,
) -> StepRunRef:
    """
    Args:
        step_context (StepExecutionContext): The step context.
        prior_attempts_count (int): The number of times this step has been tried before in the
            same pipeline run.
        package_dir (Optional[str]): If set, the pipeline's file code pointer will be converted
            to a module code pointer relative to the package root. This enables executing steps
            in remote setups where the package containing the pipeline resides at a different
            location on the filesystem in the remote environment than in the environment
            executing the plan process.

    Returns:
        StepRunRef: A reference to the step.
    """
    check.inst_param(step_context, "step_context", StepExecutionContext)
    check.int_param(prior_attempts_count, "prior_attempts_count")

    retry_mode = step_context.retry_mode

    recon_pipeline = step_context.pipeline
    if package_dir:
        if isinstance(recon_pipeline, ReconstructablePipeline) and isinstance(
            recon_pipeline.repository.pointer, FileCodePointer
        ):
            recon_pipeline = ReconstructablePipeline(
                repository=ReconstructableRepository(
                    pointer=ModuleCodePointer(
                        _module_in_package_dir(
                            recon_pipeline.repository.pointer.python_file, package_dir
                        ),
                        recon_pipeline.repository.pointer.fn_name,
                        working_directory=os.getcwd(),
                    ),
                    container_image=recon_pipeline.repository.container_image,
                    executable_path=recon_pipeline.repository.executable_path,
                    entry_point=recon_pipeline.repository.entry_point,
                    container_context=recon_pipeline.repository.container_context,
                ),
                pipeline_name=recon_pipeline.pipeline_name,
                solids_to_execute=recon_pipeline.solids_to_execute,
            )

    upstream_output_events, run_group = _upstream_events_and_runs(step_context)
    return StepRunRef(
        run_config=step_context.run_config,
        pipeline_run=step_context.pipeline_run,
        run_id=step_context.pipeline_run.run_id,
        step_key=step_context.step.key,
        retry_mode=retry_mode,
        recon_pipeline=recon_pipeline,  # type: ignore
        prior_attempts_count=prior_attempts_count,
        known_state=step_context.execution_plan.known_state,
        run_group=run_group,
        upstream_output_events=upstream_output_events,
    )
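
# _module_in_package_dir is referenced above but not defined in this snippet.
# A minimal sketch of what such a helper needs to do, assuming python_file
# lives under package_dir: take the path relative to the package root, strip
# the .py extension, and join the remaining path parts with dots.
def _module_in_package_dir(python_file: str, package_dir: str) -> str:
    relative_path = os.path.relpath(os.path.abspath(python_file), os.path.abspath(package_dir))
    without_extension, _ = os.path.splitext(relative_path)
    return without_extension.replace(os.sep, ".")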
import os
import tempfile

# pylint: disable=unused-argument

import pytest
from dagster import execute_pipeline, file_relative_path
from dagster.core.definitions.reconstruct import ReconstructablePipeline
from dagster.core.test_utils import instance_for_test
from dagster.utils import load_yaml_from_globs

ingest_pipeline = ReconstructablePipeline.for_module(
    "airline_demo.pipelines",
    "define_airline_demo_ingest_pipeline",
)

warehouse_pipeline = ReconstructablePipeline.for_module(
    "airline_demo.pipelines",
    "define_airline_demo_warehouse_pipeline",
)


def config_path(relative_path):
    return file_relative_path(
        __file__, os.path.join("../airline_demo/environments/", relative_path)
    )


@pytest.mark.db
@pytest.mark.nettest
@pytest.mark.py3
@pytest.mark.spark