def test_ecs_run_launcher_inits():
    DagsterInstance.local_temp(
        overrides={
            "run_launcher": {
                "module": "dagster_aws.ecs.launcher",
                "class": "ECSRunLauncher",
            }
        }
    )

def test_workspace_yamls():
    with load_workspace_process_context_from_yaml_paths(
        DagsterInstance.ephemeral(),
        [
            file_relative_path(
                __file__,
                "../../../docs_snippets/concepts/repositories_workspaces/workspace.yaml",
            )
        ],
    ) as workspace_process_context:
        assert workspace_process_context.repository_locations_count == 1

    with load_workspace_process_context_from_yaml_paths(
        DagsterInstance.ephemeral(),
        [
            file_relative_path(
                __file__,
                "../../../docs_snippets/concepts/repositories_workspaces/workspace_working_directory.yaml",
            )
        ],
    ) as workspace_process_context:
        assert workspace_process_context.repository_locations_count == 2

    with load_workspace_process_context_from_yaml_paths(
        DagsterInstance.ephemeral(),
        [
            file_relative_path(
                __file__,
                "../../../docs_snippets/concepts/repositories_workspaces/workspace_one_repository.yaml",
            )
        ],
    ) as workspace_process_context:
        assert workspace_process_context.repository_locations_count == 1

    with load_workspace_process_context_from_yaml_paths(
        DagsterInstance.ephemeral(),
        [
            file_relative_path(
                __file__,
                "../../../docs_snippets/concepts/repositories_workspaces/workspace_python_package.yaml",
            )
        ],
    ) as workspace_process_context:
        assert workspace_process_context.repository_locations_count == 1

    with load_workspace_process_context_from_yaml_paths(
        DagsterInstance.ephemeral(),
        [
            file_relative_path(
                __file__,
                "../../../docs_snippets/concepts/repositories_workspaces/workspace_grpc.yaml",
            )
        ],
    ) as workspace_process_context:
        assert workspace_process_context.repository_locations_count == 1

def monitor_starting_run(instance: DagsterInstance, run, logger):
    check.invariant(run.status == PipelineRunStatus.STARTING)

    run_stats = instance.get_run_stats(run.run_id)
    check.invariant(
        run_stats.launch_time is not None,
        "Run in status STARTING doesn't have a launch time.",
    )

    if time.time() - run_stats.launch_time >= instance.run_monitoring_start_timeout_seconds:
        msg = (
            f"Run {run.run_id} has been running for {time.time() - run_stats.launch_time} seconds, "
            f"which is longer than the timeout of {instance.run_monitoring_start_timeout_seconds} seconds to start. "
            "Marking run failed"
        )
        logger.info(msg)
        instance.report_run_failed(run, msg)

def test_run_status_sensor():
    @run_status_sensor(pipeline_run_status=DagsterRunStatus.SUCCESS)
    def status_sensor(context):
        assert context.dagster_event.event_type_value == "PIPELINE_SUCCESS"

    @op
    def succeeds():
        return 1

    @job
    def my_job_2():
        succeeds()

    instance = DagsterInstance.ephemeral()
    result = my_job_2.execute_in_process(instance=instance, raise_on_error=False)

    dagster_run = result.dagster_run
    dagster_event = result.get_job_success_event()

    context = build_run_status_sensor_context(
        sensor_name="status_sensor",
        dagster_instance=instance,
        dagster_run=dagster_run,
        dagster_event=dagster_event,
    )

    status_sensor(context)

def dagit_debug_command(input_files, port, asgi):
    debug_payloads = []
    for input_file in input_files:
        click.echo("Loading {} ...".format(input_file))
        with GzipFile(input_file, "rb") as file:
            blob = file.read().decode("utf-8")
            debug_payload = deserialize_json_to_dagster_namedtuple(blob)
            check.invariant(isinstance(debug_payload, DebugRunPayload))
            click.echo(
                "\trun_id: {} \n\tdagster version: {}".format(
                    debug_payload.pipeline_run.run_id, debug_payload.version
                )
            )
            debug_payloads.append(debug_payload)

    instance = DagsterInstance.ephemeral(preload=debug_payloads)

    if asgi:
        uvicorn.run(
            DagitWebserver(
                WorkspaceProcessContext(instance, None, version=__version__)
            ).create_asgi_app(debug=True),
            port=port,
        )
    else:
        host_dagit_ui_with_workspace_process_context(
            workspace_process_context=WorkspaceProcessContext(
                instance, None, version=__version__
            ),
            port=port,
            port_lookup=True,
            host=DEFAULT_DAGIT_HOST,
            path_prefix="",
        )

def test_yielded_results_config_pandas(snapshot):
    run_config = {
        "resources": {
            "ge_data_context": {
                "config": {
                    "ge_root_dir": file_relative_path(__file__, "./great_expectations")
                }
            }
        }
    }
    result = execute_pipeline(
        reconstructable(hello_world_pandas_pipeline),
        run_config=run_config,
        mode="basic",
        instance=DagsterInstance.local_temp(),
    )
    assert result.result_for_solid("reyielder").output_value()[0]["success_percent"] == 100
    expectations = result.result_for_solid("ge_validation_solid").expectation_results_during_compute
    assert len(expectations) == 1
    mainexpect = expectations[0]
    assert mainexpect.success
    # purge system specific metadata for testing
    metadata = mainexpect.metadata_entries[0].entry_data.md_str.split("### Info")[0]
    snapshot.assert_match(metadata)

def test_yielded_results_config():
    run_config = {
        'resources': {
            'ge_data_context': {
                'config': {
                    'ge_root_dir': file_relative_path(__file__, "./great_expectations")
                }
            }
        }
    }
    result = execute_pipeline(
        reconstructable(hello_world_pipeline),
        run_config=run_config,
        mode='basic',
        instance=DagsterInstance.local_temp(),
    )
    assert result.result_for_solid("reyielder").output_value()[0]["success_percent"] == 100
    expectations = result.result_for_solid("ge_validation_solid").expectation_results_during_compute
    assert len(expectations) == 1
    mainexpect = expectations[0]
    assert mainexpect.success
    metadata = mainexpect.metadata_entries[0].entry_data.data
    assert metadata['overall'] == {
        'evaluated_expectations': 11,
        'success_percent': 100.0,
        'successful_expectations': 11,
        'unsuccessful_expectations': 0,
    }

def import_command(input_files: Tuple[str, ...]):
    debug_payloads = []
    for input_file in input_files:
        with GzipFile(input_file, "rb") as file:
            blob = file.read().decode("utf-8")
            debug_payload = deserialize_as(blob, DebugRunPayload)
            debug_payloads.append(debug_payload)

    with DagsterInstance.get() as instance:
        for debug_payload in debug_payloads:
            run = debug_payload.pipeline_run
            click.echo(f"Importing run {run.run_id} (Dagster: {debug_payload.version})")
            if not instance.has_snapshot(run.execution_plan_snapshot_id):
                instance.add_snapshot(
                    debug_payload.execution_plan_snapshot,
                    run.execution_plan_snapshot_id,
                )
            if not instance.has_snapshot(run.pipeline_snapshot_id):
                instance.add_snapshot(
                    debug_payload.pipeline_snapshot,
                    run.pipeline_snapshot_id,
                )

            if not instance.has_run(run.run_id):
                instance.add_run(run)

                for event in tqdm(debug_payload.event_list):
                    instance.store_event(event)

def define_test_snapshot_context():
    return DagsterSnapshotGraphQLContext(
        instance=DagsterInstance.ephemeral(),
        execution_manager=SynchronousExecutionManager(),
        repository_snapshot=RepositorySnapshot.from_repository_definition(define_repository()),
    )

def test_execute_pipeline_iterator():
    records = []

    def event_callback(record):
        assert isinstance(record, EventRecord)
        records.append(record)

    pipeline = PipelineDefinition(
        name='basic_resource_pipeline',
        solid_defs=[resource_solid],
        mode_defs=[
            ModeDefinition(
                resource_defs={'a': resource_a, 'b': resource_b},
                logger_defs={'callback': construct_event_logger(event_callback)},
            )
        ],
    )
    iterator = execute_pipeline_iterator(
        pipeline,
        environment_dict={'loggers': {'callback': {}}},
        instance=DagsterInstance.local_temp(),
    )

    event_type = None
    while event_type != 'STEP_START':
        event = next(iterator)
        event_type = event.event_type_value

    iterator.close()

    events = [record.dagster_event for record in records if record.is_dagster_event]
    messages = [record.user_message for record in records if not record.is_dagster_event]

    assert len([event for event in events if event.is_pipeline_failure]) > 0
    assert len([message for message in messages if message == 'CLEANING A']) > 0
    assert len([message for message in messages if message == 'CLEANING B']) > 0

def test_multiple_local_cluster():
    cluster_configs = [
        {
            "n_workers": 1,
            "threads_per_worker": 2,
            "dashboard_address": None,
        },
        {
            "n_workers": 2,
            "threads_per_worker": 1,
            "dashboard_address": None,
        },
    ]

    for cluster_config in cluster_configs:
        run_config = {
            "resources": {
                "dask": {
                    "config": {
                        "cluster": {
                            "local": cluster_config
                        }
                    }
                }
            }
        }
        result = execute_pipeline(
            scheduler_info_pipeline,
            run_config=run_config,
            instance=DagsterInstance.local_temp(),
        )
        _assert_scheduler_info_result(result, cluster_config)

def test_multiple_outputs_only_emit_one_multiproc():
    pipe = ExecutionTargetHandle.for_pipeline_python_file(
        __file__, 'define_multi_out'
    ).build_pipeline_definition()
    result = execute_pipeline(
        pipe,
        environment_dict={'storage': {'filesystem': {}}, 'execution': {'multiprocess': {}}},
        instance=DagsterInstance.local_temp(),
    )
    assert result.success

    solid_result = result.result_for_solid('multiple_outputs')
    assert set(solid_result.output_values.keys()) == set(['output_one'])

    with pytest.raises(
        DagsterInvariantViolationError,
        match="Output 'not_defined' not defined in solid 'multiple_outputs'",
    ):
        solid_result.output_value('not_defined')

    with pytest.raises(DagsterInvariantViolationError, match='Did not find result output_two'):
        solid_result.output_value('output_two')

    with pytest.raises(
        DagsterInvariantViolationError,
        match=(
            'Tried to get result for solid not_present in multiple_outputs_only_emit_one_pipeline. '
            'No such top level solid.'
        ),
    ):
        result.result_for_solid('not_present')

    assert result.result_for_solid('downstream_two').skipped

def test_input_manager_with_failure():
    @root_input_manager
    def should_fail(_):
        raise Failure(
            description="Foolure",
            metadata_entries=[
                EventMetadataEntry.text(label="label", text="text", description="description")
            ],
        )

    @solid(input_defs=[InputDefinition("_fail_input", root_manager_key="should_fail")])
    def fail_on_input(_, _fail_input):
        assert False, "should not be called"

    @pipeline(mode_defs=[ModeDefinition(resource_defs={"should_fail": should_fail})])
    def simple():
        fail_on_input()

    with tempfile.TemporaryDirectory() as tmpdir_path:
        instance = DagsterInstance.from_ref(InstanceRef.from_dir(tmpdir_path))

        result = execute_pipeline(simple, instance=instance, raise_on_error=False)
        assert not result.success

        failure_data = result.result_for_solid("fail_on_input").failure_data
        assert failure_data.error.cls_name == "Failure"
        assert failure_data.user_failure_data.description == "Foolure"
        assert failure_data.user_failure_data.metadata_entries[0].label == "label"
        assert failure_data.user_failure_data.metadata_entries[0].entry_data.text == "text"
        assert failure_data.user_failure_data.metadata_entries[0].description == "description"

def test_run_failure_sensor():
    @run_failure_sensor
    def failure_sensor(context):
        assert context.dagster_event.event_type_value == "PIPELINE_FAILURE"

    @op
    def will_fail():
        raise Exception("failure")

    @job
    def my_job():
        will_fail()

    instance = DagsterInstance.ephemeral()
    result = my_job.execute_in_process(instance=instance, raise_on_error=False)

    dagster_run = result.dagster_run
    dagster_event = result.get_job_failure_event()

    context = build_run_status_sensor_context(
        sensor_name="failure_sensor",
        dagster_instance=instance,
        dagster_run=dagster_run,
        dagster_event=dagster_event,
    ).for_run_failure()

    failure_sensor(context)

def dagit_debug_command(input_files, port):
    debug_payloads = []
    for input_file in input_files:
        click.echo("Loading {} ...".format(input_file))
        with GzipFile(input_file, "rb") as file:
            blob = file.read().decode()
            debug_payload = deserialize_json_to_dagster_namedtuple(blob)
            check.invariant(isinstance(debug_payload, DebugRunPayload))
            click.echo(
                "\trun_id: {} \n\tdagster version: {}".format(
                    debug_payload.pipeline_run.run_id, debug_payload.version
                )
            )
            debug_payloads.append(debug_payload)

    instance = DagsterInstance.ephemeral(preload=debug_payloads)

    host_dagit_ui_with_workspace(
        workspace=Workspace([]),
        instance=instance,
        port=port,
        port_lookup=True,
        host=DEFAULT_DAGIT_HOST,
        path_prefix="",
    )

def test_multiprocess_executor():
    result = execute_pipeline(
        run_config={
            # This section controls how the run will be executed.
            # The multiprocess executor runs each step in its own sub process.
            "execution": {"multiprocess": {}},
            # This section controls how values will be passed from one solid to the next.
            # The default is in memory, so here we set it to filesystem to allow the
            # separate subprocess to get the values
            "intermediate_storage": {"filesystem": {}},
        },
        # The default instance for this API is an in memory ephemeral one.
        # To allow the multiple processes to coordinate we use one here
        # backed by a temporary directory.
        instance=DagsterInstance.local_temp(),
        # A ReconstructablePipeline is necessary to load the pipeline in child processes.
        # reconstructable() is a utility function that captures where the
        # PipelineDefinition came from.
        pipeline=reconstructable(predict_color),
    )
    assert result.success

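# For context, the test above relies on a `predict_color` pipeline defined elsewhere in
# the snippets package. The block below is a minimal, hypothetical sketch of what such a
# pipeline could look like with the legacy @solid/@pipeline APIs; the solid names and
# logic are assumptions for illustration, not the actual definition.
import random

from dagster import pipeline, solid


@solid
def pick_color(_):
    # stand-in logic; the real solid presumably does something more interesting
    return random.choice(["red", "green", "blue"])


@solid
def report_color(_, color):
    return "the predicted color is {}".format(color)


@pipeline
def predict_color():
    report_color(pick_color())
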
def test_resolve_memoized_execution_plan_yes_stored_results():
    speculative_execution_plan = create_execution_plan(versioned_pipeline)
    step_output_handle = StepOutputHandle("versioned_solid_no_input.compute", "result")

    instance = DagsterInstance.ephemeral()
    instance.get_addresses_for_step_output_versions = mock.MagicMock(
        return_value={(versioned_pipeline.name, step_output_handle): "some_address"}
    )

    memoized_execution_plan = instance.resolve_memoized_execution_plan(
        speculative_execution_plan, run_config={}, mode="default"
    )

    assert memoized_execution_plan.step_keys_to_execute == ["versioned_solid_takes_input.compute"]

    expected_handle = StepOutputHandle(
        step_key="versioned_solid_no_input.compute", output_name="result"
    )

    assert (
        memoized_execution_plan.step_dict["versioned_solid_takes_input.compute"]
        .step_input_dict["intput"]
        .source.step_output_handle
        == expected_handle
    )

def run_daemon_loop(
    self,
    instance_ref,
    daemon_uuid,
    daemon_shutdown_event,
    gen_workspace,
    heartbeat_interval_seconds,
    error_interval_seconds,
    until=None,
):
    from dagster.core.telemetry_upload import uploading_logging_thread

    # Each loop runs in its own thread with its own instance and IWorkspace
    with DagsterInstance.from_ref(instance_ref) as instance:
        with uploading_logging_thread():
            with gen_workspace(instance) as workspace:
                check.inst_param(workspace, "workspace", IWorkspace)

                daemon_generator = self.core_loop(instance, workspace)

                try:
                    while (not daemon_shutdown_event.is_set()) and (
                        not until or pendulum.now("UTC") < until
                    ):
                        try:
                            result = check.opt_inst(next(daemon_generator), SerializableErrorInfo)
                            if result:
                                self._errors.appendleft((result, pendulum.now("UTC")))
                        except StopIteration:
                            self._logger.error(
                                "Daemon loop finished without raising an error - daemon loops should run forever until they are interrupted."
                            )
                            break
                        except Exception:
                            error_info = serializable_error_info_from_exc_info(sys.exc_info())
                            self._logger.error(
                                "Caught error, daemon loop will restart:\n{}".format(error_info)
                            )
                            self._errors.appendleft((error_info, pendulum.now("UTC")))
                            daemon_generator.close()
                            daemon_generator = self.core_loop(instance, workspace)
                        finally:
                            try:
                                self._check_add_heartbeat(
                                    instance,
                                    daemon_uuid,
                                    heartbeat_interval_seconds,
                                    error_interval_seconds,
                                )
                            except Exception:
                                self._logger.error(
                                    "Failed to add heartbeat: \n{}".format(
                                        serializable_error_info_from_exc_info(sys.exc_info())
                                    )
                                )
                finally:
                    # cleanup the generator if it was stopped part-way through
                    daemon_generator.close()

def temp_instance():
    with seven.TemporaryDirectory() as temp_dir:
        instance = DagsterInstance.local_temp(temp_dir)
        try:
            yield instance
        finally:
            instance.run_launcher.join()

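# The generator above is presumably exposed as a pytest fixture or context manager
# elsewhere; a hedged usage sketch (the contextmanager wrapping here is an assumption):
from contextlib import contextmanager

with contextmanager(temp_instance)() as instance:
    # the instance lives in a temporary directory, and the run launcher is joined
    # when the block exits
    assert instance.run_launcher is not None
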
def test_custom_path_asset_store():
    with seven.TemporaryDirectory() as tmpdir_path:
        instance = DagsterInstance.ephemeral()

        run_config = {
            "resources": {"fs_asset_store": {"config": {"base_dir": tmpdir_path}}},
        }

        result = execute_pipeline(
            custom_path_pipeline, run_config=run_config, mode="test", instance=instance
        )
        assert result.success

        filepath_call_api = os.path.join(tmpdir_path, "call_api_output")
        assert os.path.isfile(filepath_call_api)
        with open(filepath_call_api, "rb") as read_obj:
            assert pickle.load(read_obj) == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

        filepath_parse_df = os.path.join(tmpdir_path, "parse_df_output")
        assert os.path.isfile(filepath_parse_df)
        with open(filepath_parse_df, "rb") as read_obj:
            assert pickle.load(read_obj) == [1, 2, 3, 4, 5]

        assert reexecute_pipeline(
            custom_path_pipeline,
            result.run_id,
            run_config=run_config,
            mode="test",
            instance=instance,
            step_selection=["parse_df.compute*"],
        ).success

def define_test_snapshot_context():
    return DagsterGraphQLOutOfProcessRepositoryContext(
        instance=DagsterInstance.ephemeral(),
        execution_manager=SynchronousExecutionManager(),
        external_repository=ExternalRepository.from_repository_def(define_repository()),
    )

def execute_pipeline_with_steps(pipeline_def, step_keys_to_execute=None):
    plan = create_execution_plan(pipeline_def, step_keys_to_execute=step_keys_to_execute)
    with DagsterInstance.ephemeral() as instance:
        pipeline_run = instance.create_run_for_pipeline(
            pipeline_def=pipeline_def,
            step_keys_to_execute=step_keys_to_execute,
        )
        return execute_plan(plan, instance, pipeline_run)

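# A hedged usage sketch for the helper above, using the legacy @solid/@pipeline APIs.
# `emit_one` and `tiny_pipeline` are hypothetical names for illustration, not fixtures
# from this repository.
from dagster import pipeline, solid


@solid
def emit_one(_):
    return 1


@pipeline
def tiny_pipeline():
    emit_one()


# execute only the single step produced by emit_one and inspect the emitted events
step_events = execute_pipeline_with_steps(tiny_pipeline, step_keys_to_execute=["emit_one.compute"])
print([event.event_type_value for event in step_events])
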
def test_builtin_pipeline():
    with TemporaryDirectory() as tmpdir_path:
        instance = DagsterInstance.ephemeral()
        run_config = {
            "resources": {"object_manager": {"config": {"base_dir": tmpdir_path}}},
        }

        result = execute_pipeline(
            asset_store_pipeline, run_config=run_config, mode="test", instance=instance
        )
        assert result.success

        filepath_call_api = os.path.join(tmpdir_path, result.run_id, "call_api", "result")
        assert os.path.isfile(filepath_call_api)
        with open(filepath_call_api, "rb") as read_obj:
            assert pickle.load(read_obj) == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

        filepath_parse_df = os.path.join(tmpdir_path, result.run_id, "parse_df", "result")
        assert os.path.isfile(filepath_parse_df)
        with open(filepath_parse_df, "rb") as read_obj:
            assert pickle.load(read_obj) == [1, 2, 3, 4, 5]

def run_loop(
    self,
    daemon_uuid,
    daemon_shutdown_event,
    gen_workspace,
    heartbeat_interval_seconds,
    error_interval_seconds,
    until=None,
):
    # Each loop runs in its own thread with its own instance and IWorkspace
    with DagsterInstance.get() as instance:
        with gen_workspace(instance) as workspace:
            check.inst_param(workspace, "workspace", IWorkspace)
            while not daemon_shutdown_event.is_set() and (
                not until or pendulum.now("UTC") < until
            ):
                curr_time = pendulum.now("UTC")
                if (
                    not self._last_iteration_time
                    or (curr_time - self._last_iteration_time).total_seconds()
                    >= self.interval_seconds
                ):
                    self._last_iteration_time = curr_time
                    self._run_iteration(
                        instance,
                        daemon_uuid,
                        daemon_shutdown_event,
                        workspace,
                        heartbeat_interval_seconds,
                        error_interval_seconds,
                        until,
                    )

                self._check_add_heartbeat(
                    instance, daemon_uuid, heartbeat_interval_seconds, error_interval_seconds
                )
                daemon_shutdown_event.wait(0.5)

def test_default_object_manager_reexecution():
    with seven.TemporaryDirectory() as tmpdir_path:
        default_asset_store = fs_object_manager.configured({"base_dir": tmpdir_path})
        pipeline_def = define_pipeline(default_asset_store, {})
        instance = DagsterInstance.ephemeral()

        result = execute_pipeline(pipeline_def, instance=instance)
        assert result.success

        re_result = reexecute_pipeline(
            pipeline_def,
            result.run_id,
            instance=instance,
            step_selection=["solid_b"],
        )

        # re-execution should yield asset_store_operation events instead of intermediate events
        get_asset_events = list(
            filter(
                lambda evt: evt.is_asset_store_operation
                and AssetStoreOperationType(evt.event_specific_data.op)
                == AssetStoreOperationType.GET_ASSET,
                re_result.event_list,
            )
        )
        assert len(get_asset_events) == 1
        assert get_asset_events[0].event_specific_data.step_key == "solid_a"

def test_shameful_workaround():
    graphql_context = define_test_out_of_process_context(DagsterInstance.ephemeral())
    pipeline_run = graphql_context.instance.create_run_for_pipeline(
        pipeline_def=csv_hello_world, run_config=csv_hello_world_solids_config()
    )

    result = execute_dagster_graphql(
        graphql_context,
        EXECUTE_RUN_IN_PROCESS_MUTATION,
        variables={
            'runId': pipeline_run.run_id,
            # the in-process location name represents launching from the user process
            'repositoryLocationName': IN_PROCESS_NAME,
            'repositoryName': main_repo_name(),
        },
    )

    assert result.data
    assert result.data['executeRunInProcess']['__typename'] == 'ExecuteRunInProcessSuccess'

    result = execute_dagster_graphql(
        graphql_context,
        EXECUTE_RUN_IN_PROCESS_MUTATION,
        variables={
            'runId': pipeline_run.run_id,
            # but we don't apply the workaround to other names
            'repositoryLocationName': 'some_other_name',
            'repositoryName': main_repo_name(),
        },
    )

    assert result.data
    assert result.data['executeRunInProcess']['__typename'] == 'PipelineNotFoundError'

def test_pipeline_run_creation_race():
    with seven.TemporaryDirectory() as tempdir:
        instance = DagsterInstance.local_temp(tempdir)
        run_id = 'run_id'

        # Spy on the result of add_run
        add_run_spy = Spy(instance._run_storage.add_run)  # pylint: disable=protected-access
        add_run_mock = mock.MagicMock(side_effect=add_run_spy)
        instance._run_storage.add_run = add_run_mock  # pylint: disable=protected-access

        # This invocation should successfully add the run to run storage
        pipeline_run = register_managed_run_for_test(instance, run_id=run_id)
        assert len(add_run_mock.call_args_list) == 1
        assert instance.has_run(run_id)

        # Check that add_run did not receive DagsterRunAlreadyExists exception and that
        # it successfully returned
        assert add_run_spy.exceptions == []
        assert len(add_run_spy.return_values) == 1

        # (*) Simulate a race where second invocation receives has_run() is False
        fetched_pipeline_run = ''
        with mock.patch.object(instance, 'has_run', mock.MagicMock(return_value=False)):
            fetched_pipeline_run = register_managed_run_for_test(instance, run_id=run_id)

        # Check that add_run received DagsterRunAlreadyExists exception and did not return value
        assert len(add_run_mock.call_args_list) == 2
        assert add_run_spy.exceptions == [DagsterRunAlreadyExists]
        assert len(add_run_spy.return_values) == 1

        assert pipeline_run == fetched_pipeline_run
        assert instance.has_run(run_id)
        assert len(instance.get_runs()) == 1

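# The test above relies on a `Spy` helper that wraps a callable and records its return
# values and raised exception classes. The repository presumably defines it in a test
# utility; the minimal, hypothetical version below is consistent with the assertions above.
class Spy:
    def __init__(self, func):
        self._func = func
        self.return_values = []
        self.exceptions = []

    def __call__(self, *args, **kwargs):
        try:
            result = self._func(*args, **kwargs)
        except Exception as exc:
            # record the exception class, matching assertions such as
            # `add_run_spy.exceptions == [DagsterRunAlreadyExists]`
            self.exceptions.append(type(exc))
            raise
        self.return_values.append(result)
        return result
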
def test_multiproc_markers():
    pipe = ExecutionTargetHandle.for_pipeline_python_file(
        __file__, 'define_pipeline'
    ).build_pipeline_definition()
    instance = DagsterInstance.local_temp()
    result = execute_pipeline(
        pipe,
        instance=instance,
        environment_dict={'execution': {'multiprocess': {}}, 'storage': {'filesystem': {}}},
    )
    assert result.success

    events = instance.all_logs(result.run_id)
    start_markers = {}
    end_markers = {}
    for event in events:
        dagster_event = event.dagster_event
        if dagster_event.is_engine_event:
            if dagster_event.engine_event_data.marker_start:
                key = '{step}.{marker}'.format(
                    step=event.step_key, marker=dagster_event.engine_event_data.marker_start
                )
                start_markers[key] = event.timestamp
            if dagster_event.engine_event_data.marker_end:
                key = '{step}.{marker}'.format(
                    step=event.step_key, marker=dagster_event.engine_event_data.marker_end
                )
                end_markers[key] = event.timestamp

    seen = set()
    assert set(start_markers.keys()) == set(end_markers.keys())
    for key in end_markers:
        assert end_markers[key] - start_markers[key] > 0
        seen.add(key)

    assert 'ping.compute.multiprocess_subprocess_init' in end_markers

def test_sync_run_launcher_run():
    with seven.TemporaryDirectory() as temp_dir:
        instance = DagsterInstance.local_temp(
            temp_dir,
            overrides={
                "run_launcher": {
                    "module": "dagster.core.launcher.sync_in_memory_run_launcher",
                    "class": "SyncInMemoryRunLauncher",
                }
            },
        )

        external_repo = get_main_external_repo(instance)
        external_pipeline = external_repo.get_full_external_pipeline("noop_pipeline")

        run = create_run_for_test(instance=instance, pipeline_name=external_pipeline.name)
        run = instance.run_launcher.launch_run(
            instance=instance, run=run, external_pipeline=external_pipeline
        )

        completed_run = instance.get_run_by_id(run.run_id)
        assert completed_run.is_success

def test_materialized_assets():
    instance = DagsterInstance.ephemeral()
    res = execute_pipeline(materialization_pipeline, instance=instance)
    assert res.success
    asset_keys = instance.all_asset_keys()
    assert len(asset_keys) == 1
    assert asset_keys[0] == AssetKey(["dashboards", "analytics_dashboard"])