def test_execute_on_celery_k8s_with_hard_failure(  # pylint: disable=redefined-outer-name
        dagster_docker_image, dagster_instance,
        set_dagster_k8s_pipeline_run_namespace_env, dagit_url):
    run_config = merge_dicts(
        merge_dicts(
            merge_yamls([
                os.path.join(get_test_project_environments_path(), "env_s3.yaml"),
            ]),
            get_celery_engine_config(
                dagster_docker_image=dagster_docker_image,
                job_namespace={"env": "DAGSTER_K8S_PIPELINE_RUN_NAMESPACE"},
            ),
        ),
        {"solids": {"hard_fail_or_0": {"config": {"fail": True}}}},
    )

    run_id = launch_run_over_graphql(dagit_url,
                                     run_config=run_config,
                                     pipeline_name="hard_failer")

    # Check that the pipeline run is marked as failed
    pipeline_run_status_failure = False
    start_time = datetime.datetime.now()
    timeout = datetime.timedelta(seconds=120)

    while datetime.datetime.now() < start_time + timeout:
        pipeline_run = dagster_instance.get_run_by_id(run_id)
        if pipeline_run.status == PipelineRunStatus.FAILURE:
            pipeline_run_status_failure = True
            break
        time.sleep(5)
    assert pipeline_run_status_failure

    # Check for a step failure for hard_fail_or_0
    start_time = datetime.datetime.now()
    step_failure_found = False
    while datetime.datetime.now() < start_time + timeout:
        event_records = dagster_instance.all_logs(run_id)
        for event_record in event_records:
            if event_record.dagster_event:
                if (event_record.dagster_event.event_type == DagsterEventType.STEP_FAILURE
                        and event_record.dagster_event.step_key == "hard_fail_or_0"):
                    step_failure_found = True
                    break
        if step_failure_found:
            # Stop polling as soon as the step failure is observed rather than
            # sleeping through the remainder of the timeout.
            break
        time.sleep(5)
    assert step_failure_found
def test_execute_on_celery_k8s_default( # pylint: disable=redefined-outer-name dagster_docker_image, dagster_instance, helm_namespace, dagit_url, ): run_config = merge_dicts( merge_yamls([ os.path.join(get_test_project_environments_path(), "env.yaml"), os.path.join(get_test_project_environments_path(), "env_s3.yaml"), ]), get_celery_engine_config(dagster_docker_image=dagster_docker_image, job_namespace=helm_namespace), ) run_id = launch_run_over_graphql(dagit_url, run_config=run_config, pipeline_name="demo_pipeline_celery") result = wait_for_job_and_get_raw_logs(job_name="dagster-run-%s" % run_id, namespace=helm_namespace) assert "PIPELINE_SUCCESS" in result, "no match, result: {}".format(result) updated_run = dagster_instance.get_run_by_id(run_id) assert updated_run.tags[DOCKER_IMAGE_TAG] == dagster_docker_image
def _root_manager(input_context: InputContext) -> Any:
    """Load a source asset by routing through the IO manager that owns it.

    Builds a synthetic upstream output context for the source asset so that its
    IO manager can treat the load exactly as if it were reading a normal
    upstream output.
    """
    source_asset_key = cast(AssetKey, input_context.asset_key)
    source_asset = source_assets_by_key[source_asset_key]

    # Placeholder op whose (never-executed) output stands in for the source asset.
    @op(out={source_asset_key.path[-1]: Out(asset_key=source_asset_key)})
    def _op():
        pass

    output_context = build_output_context(
        name=source_asset_key.path[-1],
        step_key="none",
        solid_def=_op,
        metadata=merge_dicts(source_asset.metadata or {},
                             {"logical_asset_key": source_asset_key}),
    )
    input_context_with_upstream = build_input_context(
        name=input_context.name,
        metadata=input_context.metadata,
        config=input_context.config,
        dagster_type=input_context.dagster_type,
        upstream_output=output_context,
        op_def=input_context.op_def,
    )

    io_manager = getattr(cast(Any, input_context.resources),
                         source_asset.io_manager_key)
    return io_manager.load_input(input_context_with_upstream)
def make_run_config(scratch_dir, mode): if mode in ["external", "request_retry"]: step_launcher_resource_keys = [ "first_step_launcher", "second_step_launcher" ] else: step_launcher_resource_keys = ["second_step_launcher"] return deep_merge_dicts( RUN_CONFIG_BASE, { "resources": merge_dicts( {"io_manager": { "config": { "base_dir": scratch_dir } }}, { step_launcher_resource_key: { "config": { "scratch_dir": scratch_dir } } for step_launcher_resource_key in step_launcher_resource_keys }, ), }, )
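# Usage sketch (illustrative): "external" mode configures both step launchers,
# while other modes configure only the second. The scratch path below and the
# contents of RUN_CONFIG_BASE are assumptions from the surrounding test module,
# not part of this helper.
config = make_run_config("/tmp/scratch", "external")
assert config["resources"]["io_manager"]["config"]["base_dir"] == "/tmp/scratch"
assert "first_step_launcher" in config["resources"]
assert "second_step_launcher" in config["resources"]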
def test_execute_on_celery_k8s_job_api_with_legacy_configmap_set( # pylint: disable=redefined-outer-name dagster_docker_image, dagster_instance, helm_namespace, dagit_url): # Originally, jobs needed to include "dagster-pipeline-env" to pick up needed config when # using the helm chart - it's no longer needed, but verify that nothing breaks if it's included run_config = merge_dicts( merge_yamls([ os.path.join(get_test_project_environments_path(), "env.yaml"), os.path.join(get_test_project_environments_path(), "env_s3.yaml"), ]), get_celery_job_engine_config( dagster_docker_image=dagster_docker_image, job_namespace=helm_namespace, include_dagster_pipeline_env=True, ), ) run_id = launch_run_over_graphql(dagit_url, run_config=run_config, pipeline_name="demo_job_celery") result = wait_for_job_and_get_raw_logs(job_name="dagster-run-%s" % run_id, namespace=helm_namespace) assert "PIPELINE_SUCCESS" in result, "no match, result: {}".format(result) updated_run = dagster_instance.get_run_by_id(run_id) assert updated_run.tags[DOCKER_IMAGE_TAG] == dagster_docker_image
def _composite_descent(parent_stack, solids_config_dict, resource_defs):
    """The core implementation of composite_descent.

    This yields a stream of SolidConfigEntry, which composite_descent uses to
    construct a dictionary. It descends over the entire solid hierarchy,
    constructing an entry for every handle. If it encounters a composite solid
    instance with a config mapping, it invokes that config mapping fn to
    produce the config necessary to configure the child solids. This process
    unrolls recursively as you descend down the tree.
    """
    for solid in parent_stack.current_container.solids:

        current_stack = parent_stack.descend(solid)
        current_handle = current_stack.handle

        current_solid_config = solids_config_dict.get(solid.name, {})

        # the base case
        if isinstance(solid.definition, SolidDefinition):
            config_mapped_solid_config = solid.definition.apply_config_mapping(
                {"config": current_solid_config.get("config")})
            if not config_mapped_solid_config.success:
                raise DagsterInvalidConfigError(
                    "Error in config for solid {}".format(solid.name),
                    config_mapped_solid_config.errors,
                    config_mapped_solid_config,
                )

            complete_config_object = merge_dicts(
                current_solid_config, config_mapped_solid_config.value)
            yield SolidConfigEntry(
                current_handle, SolidConfig.from_dict(complete_config_object))
            continue

        graph_def = check.inst(solid.definition, GraphDefinition)

        yield SolidConfigEntry(
            current_handle,
            SolidConfig.from_dict({
                "inputs": current_solid_config.get("inputs"),
                "outputs": current_solid_config.get("outputs"),
            }),
        )

        # If there is a config mapping, invoke it and derive the descendant
        # solids' config from its result. Otherwise, just grab the "solids"
        # entry of the current config.
        solids_dict = (_get_mapped_solids_dict(
            solid, graph_def, current_stack, current_solid_config, resource_defs)
                       if graph_def.config_mapping else
                       current_solid_config.get("solids", {}))

        yield from _composite_descent(current_stack, solids_dict, resource_defs)
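# A sketch of what the descent above yields, using hypothetical solid names.
# For a composite "outer" (no config mapping) wrapping a leaf solid "inner":
#
#     solids_config_dict = {
#         "outer": {
#             "solids": {
#                 "inner": {"config": {"factor": 2}},
#             },
#         },
#     }
#
# the generator produces, conceptually:
#
#     SolidConfigEntry(handle="outer", SolidConfig(inputs=None, outputs=None))
#     SolidConfigEntry(handle="outer.inner", SolidConfig(config={"factor": 2}))
#
# If "outer" had a config mapping, the inner "solids" dict would instead be
# computed by invoking the mapping fn (_get_mapped_solids_dict) before recursing.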
def __new__( cls, name: Optional[str] = None, resource_defs: Optional[Dict[str, ResourceDefinition]] = None, logger_defs: Optional[Dict[str, LoggerDefinition]] = None, executor_defs: Optional[List[ExecutorDefinition]] = None, description: Optional[str] = None, _config_mapping: Optional[ConfigMapping] = None, _partitioned_config: Optional["PartitionedConfig"] = None, ): from .partition import PartitionedConfig resource_defs = check.opt_dict_param(resource_defs, "resource_defs", key_type=str, value_type=ResourceDefinition) for key in resource_defs: if not key.isidentifier(): check.failed( f"Resource key '{key}' must be a valid Python identifier.") if resource_defs and "io_manager" in resource_defs: resource_defs_with_defaults = resource_defs else: from dagster.core.storage.mem_io_manager import mem_io_manager resource_defs_with_defaults = merge_dicts( {"io_manager": mem_io_manager}, resource_defs or {}) return super(ModeDefinition, cls).__new__( cls, name=check_valid_name(name) if name else DEFAULT_MODE_NAME, resource_defs=resource_defs_with_defaults, loggers=(check.opt_dict_param(logger_defs, "logger_defs", key_type=str, value_type=LoggerDefinition) or default_loggers()), executor_defs=check.list_param( executor_defs if executor_defs else default_executors, "executor_defs", of_type=ExecutorDefinition, ), description=check.opt_str_param(description, "description"), config_mapping=check.opt_inst_param(_config_mapping, "_config_mapping", ConfigMapping), partitioned_config=check.opt_inst_param(_partitioned_config, "_partitioned_config", PartitionedConfig), )
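# A minimal sketch of the default-injection behavior above: when no
# "io_manager" key is supplied, mem_io_manager is merged in as the default.
# The resource name "my_resource" is illustrative.
from dagster import ModeDefinition, ResourceDefinition

mode = ModeDefinition(
    name="example",
    resource_defs={"my_resource": ResourceDefinition.hardcoded_resource(1)},
)
assert "io_manager" in mode.resource_defs  # injected default
assert "my_resource" in mode.resource_defs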
def test_merge(): # two element merge assert merge_dicts({}, {}) == {} assert merge_dicts({1: 2}, {}) == {1: 2} assert merge_dicts({}, {1: 2}) == {1: 2} assert merge_dicts({1: 1}, {1: 2}) == {1: 2} # three element merge assert merge_dicts({}, {}, {}) == {} assert merge_dicts({1: 2}, {2: 3}, {3: 4}) == {1: 2, 2: 3, 3: 4} assert merge_dicts({1: 2}, {1: 3}, {1: 4}) == {1: 4}
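# For reference, the semantics these asserts pin down: a left-to-right shallow
# merge where later dicts win on key conflicts. A minimal sketch (not the
# library's actual implementation):
def _merge_dicts_sketch(*dicts):
    result = {}
    for d in dicts:
        result.update(d)
    return result

assert _merge_dicts_sketch({1: 2}, {1: 3}, {1: 4}) == {1: 4}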
def test_execute_on_celery_k8s_with_env_var_and_termination( # pylint: disable=redefined-outer-name dagster_docker_image, dagster_instance, set_dagster_k8s_pipeline_run_namespace_env, dagit_url): run_config = merge_dicts( merge_yamls([ os.path.join(get_test_project_environments_path(), "env_s3.yaml"), ]), get_celery_engine_config( dagster_docker_image=dagster_docker_image, job_namespace={"env": "DAGSTER_K8S_PIPELINE_RUN_NAMESPACE"}, ), ) _test_termination(dagit_url, dagster_instance, run_config)
def test_execute_on_celery_k8s_with_termination( # pylint: disable=redefined-outer-name dagster_docker_image, dagster_instance, helm_namespace, dagit_url, ): run_config = merge_dicts( merge_yamls([ os.path.join(get_test_project_environments_path(), "env_s3.yaml"), ]), get_celery_engine_config(dagster_docker_image=dagster_docker_image, job_namespace=helm_namespace), ) _test_termination(dagit_url, dagster_instance, run_config)
def __new__( cls, name=None, resource_defs=None, logger_defs=None, executor_defs=None, description=None, intermediate_storage_defs=None, ): from dagster.core.storage.system_storage import default_intermediate_storage_defs from .intermediate_storage import IntermediateStorageDefinition check.opt_dict_param(resource_defs, "resource_defs", key_type=str, value_type=ResourceDefinition) if resource_defs and "io_manager" in resource_defs: resource_defs_with_defaults = resource_defs else: from dagster.core.storage.mem_io_manager import mem_io_manager resource_defs_with_defaults = merge_dicts( {"io_manager": mem_io_manager}, resource_defs or {}) return super(ModeDefinition, cls).__new__( cls, name=check_valid_name(name) if name else DEFAULT_MODE_NAME, resource_defs=resource_defs_with_defaults, loggers=(check.opt_dict_param(logger_defs, "logger_defs", key_type=str, value_type=LoggerDefinition) or default_loggers()), intermediate_storage_defs=check.list_param( intermediate_storage_defs if intermediate_storage_defs else default_intermediate_storage_defs, "intermediate_storage_defs", of_type=IntermediateStorageDefinition, ), executor_defs=check.list_param( executor_defs if executor_defs else default_executors, "executor_defs", of_type=ExecutorDefinition, ), description=check.opt_str_param(description, "description"), )
def get_celery_engine_config(dagster_docker_image, job_namespace): return { "execution": { "celery-k8s": { "config": merge_dicts( ({ "job_image": dagster_docker_image, } if dagster_docker_image else {}), { "job_namespace": job_namespace, "image_pull_policy": image_pull_policy(), }, ) } }, }
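# Usage sketch with hypothetical values: the "job_image" key is included only
# when an image is provided.
config = get_celery_engine_config(
    dagster_docker_image="dagster/image:tag",  # hypothetical image
    job_namespace="test-namespace",            # hypothetical namespace
)
celery_config = config["execution"]["celery-k8s"]["config"]
assert celery_config["job_image"] == "dagster/image:tag"
assert celery_config["job_namespace"] == "test-namespace"
assert "job_image" not in get_celery_engine_config(
    None, "test-namespace")["execution"]["celery-k8s"]["config"]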
def test_execute_on_celery_k8s_retry_pipeline(  # pylint: disable=redefined-outer-name
        dagster_docker_image, dagster_instance, helm_namespace, dagit_url):
    run_config = merge_dicts(
        merge_yamls([
            os.path.join(get_test_project_environments_path(), "env_s3.yaml")
        ]),
        get_celery_engine_config(dagster_docker_image=dagster_docker_image,
                                 job_namespace=helm_namespace),
    )

    run_id = launch_run_over_graphql(dagit_url,
                                     run_config=run_config,
                                     pipeline_name="retry_pipeline")

    result = wait_for_job_and_get_raw_logs(job_name="dagster-run-%s" % run_id,
                                           namespace=helm_namespace)

    assert "PIPELINE_SUCCESS" in result, "no match, result: {}".format(result)

    stats = dagster_instance.get_run_stats(run_id)
    assert stats.steps_succeeded == 1

    # Collect the run's event types once instead of re-reading the event log
    # for each assertion.
    observed_event_types = [
        event.dagster_event.event_type
        for event in dagster_instance.all_logs(run_id) if event.is_dagster_event
    ]
    assert DagsterEventType.STEP_START in observed_event_types
    assert DagsterEventType.STEP_UP_FOR_RETRY in observed_event_types
    assert DagsterEventType.STEP_RESTARTED in observed_event_types
    assert DagsterEventType.STEP_SUCCESS in observed_event_types
def test_execute_on_celery_k8s_with_resource_requirements( # pylint: disable=redefined-outer-name dagster_docker_image, dagster_instance, helm_namespace, dagit_url): run_config = merge_dicts( merge_yamls([ os.path.join(get_test_project_environments_path(), "env_s3.yaml"), ]), get_celery_engine_config(dagster_docker_image=dagster_docker_image, job_namespace=helm_namespace), ) run_id = launch_run_over_graphql(dagit_url, run_config=run_config, pipeline_name="resources_limit_pipeline") result = wait_for_job_and_get_raw_logs(job_name="dagster-run-%s" % run_id, namespace=helm_namespace) assert "PIPELINE_SUCCESS" in result, "no match, result: {}".format(result)
def test_docker_executor(): """ Note that this test relies on having AWS credentials in the environment. """ executor_config = { "execution": { "docker": { "config": { "networks": ["container:test-postgres-db-docker"], "env_vars": [ "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", ], } } } } docker_image = get_test_project_docker_image() if IS_BUILDKITE: executor_config["execution"]["docker"]["config"][ "registry" ] = get_buildkite_registry_config() else: find_local_test_image(docker_image) run_config = merge_dicts( merge_yamls( [ os.path.join(get_test_project_environments_path(), "env.yaml"), os.path.join(get_test_project_environments_path(), "env_s3.yaml"), ] ), executor_config, ) with environ({"DOCKER_LAUNCHER_NETWORK": "container:test-postgres-db-docker"}): with docker_postgres_instance() as instance: recon_pipeline = get_test_project_recon_pipeline("demo_pipeline_docker", docker_image) assert execute_pipeline( recon_pipeline, run_config=run_config, instance=instance ).success
def get_celery_job_engine_config(dagster_docker_image, job_namespace, include_dagster_pipeline_env=False): return { "execution": { "config": merge_dicts( ({ "job_image": dagster_docker_image, } if dagster_docker_image else {}), { "job_namespace": job_namespace, "image_pull_policy": image_pull_policy(), }, ({ "env_config_maps": ["dagster-pipeline-env"] } if include_dagster_pipeline_env else {}), ) }, }
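# Usage sketch with hypothetical values: the legacy configmap is attached only
# when include_dagster_pipeline_env is set.
job_config = get_celery_job_engine_config(
    dagster_docker_image=None,  # omits "job_image" from the merged config
    job_namespace="test-namespace",
    include_dagster_pipeline_env=True,
)
assert job_config["execution"]["config"]["env_config_maps"] == ["dagster-pipeline-env"]
assert "job_image" not in job_config["execution"]["config"]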
def test_execute_subset_on_celery_k8s( # pylint: disable=redefined-outer-name dagster_docker_image, helm_namespace, dagit_url): run_config = merge_dicts( merge_yamls([ os.path.join(get_test_project_environments_path(), "env_subset.yaml"), os.path.join(get_test_project_environments_path(), "env_s3.yaml"), ]), get_celery_engine_config(dagster_docker_image=dagster_docker_image, job_namespace=helm_namespace), ) run_id = launch_run_over_graphql( dagit_url, run_config=run_config, pipeline_name="demo_pipeline_celery", solid_selection=["count_letters"], ) result = wait_for_job_and_get_raw_logs(job_name="dagster-run-%s" % run_id, namespace=helm_namespace) assert "PIPELINE_SUCCESS" in result, "no match, result: {}".format(result)
def cli(self, command: str, **kwargs) -> DbtCliOutput:
    """Executes a dbt CLI command.

    Params passed in as keyword arguments are merged with any default flags
    configured on resource initialization; an explicitly passed param
    overrides the corresponding default value.

    Args:
        command (str): The command you wish to run (e.g. 'run', 'test', 'docs generate', etc.)

    Returns:
        DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing
            parsed log output as well as the contents of run_results.json (if applicable).
    """
    command = check.str_param(command, "command")
    extra_flags = kwargs  # **kwargs is always a dict, so no None check is needed

    # remove default flags that are declared as "strict" and not explicitly passed in
    default_flags = {
        k: v
        for k, v in self.default_flags.items()
        if not (k in self.strict_flags and k not in extra_flags)
    }

    flags = merge_dicts(
        default_flags, self._format_params(extra_flags, replace_underscores=True))

    return execute_cli(
        executable=self._executable,
        command=command,
        flags_dict=flags,
        log=self.logger,
        warn_error=self._warn_error,
        ignore_handled_error=self._ignore_handled_error,
        target_path=self._target_path,
    )
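# Usage sketch, assuming `dbt` is a configured instance of this resource; the
# model name and flags below are illustrative. Keyword args become CLI flags
# (underscores replaced with dashes), overriding any non-strict defaults.
output = dbt.cli("run", models=["my_model"], full_refresh=True)
print(output.return_code)  # parsed results are also available on the output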
def _launch_scheduled_execution(instance, schedule_def, pipeline, tick, stream):
    pipeline_def = pipeline.get_definition()

    # Run should_execute and halt if it returns False
    schedule_context = ScheduleExecutionContext(instance)
    with user_code_error_boundary(
            ScheduleExecutionError,
            lambda: 'Error occurred during the execution of should_execute for schedule '
            '{schedule_name}'.format(schedule_name=schedule_def.name),
    ):
        should_execute = schedule_def.should_execute(schedule_context)

    if not should_execute:
        # Update tick to skipped state and return
        tick.update_with_status(ScheduleTickStatus.SKIPPED)
        stream.send(ScheduledExecutionSkipped())
        return

    errors = []

    run_config = {}
    schedule_tags = {}
    try:
        with user_code_error_boundary(
                ScheduleExecutionError,
                lambda: 'Error occurred during the execution of run_config_fn for schedule '
                '{schedule_name}'.format(schedule_name=schedule_def.name),
        ):
            run_config = schedule_def.get_run_config(schedule_context)
    except DagsterUserCodeExecutionError:
        error_data = serializable_error_info_from_exc_info(sys.exc_info())
        errors.append(error_data)

    try:
        with user_code_error_boundary(
                ScheduleExecutionError,
                lambda: 'Error occurred during the execution of tags_fn for schedule '
                '{schedule_name}'.format(schedule_name=schedule_def.name),
        ):
            schedule_tags = schedule_def.get_tags(schedule_context)
    except DagsterUserCodeExecutionError:
        error_data = serializable_error_info_from_exc_info(sys.exc_info())
        errors.append(error_data)

    pipeline_tags = pipeline_def.tags or {}
    check_tags(pipeline_tags, 'pipeline_tags')
    tags = merge_dicts(pipeline_tags, schedule_tags)

    mode = schedule_def.mode

    execution_plan_snapshot = None
    try:
        execution_plan = create_execution_plan(
            pipeline_def,
            run_config=run_config,
            mode=mode,
        )
        execution_plan_snapshot = snapshot_from_execution_plan(
            execution_plan, pipeline_def.get_pipeline_snapshot_id())
    except DagsterInvalidConfigError:
        error_data = serializable_error_info_from_exc_info(sys.exc_info())
        errors.append(error_data)

    # Enter the run in the DB with the information we have
    possibly_invalid_pipeline_run = instance.create_run(
        pipeline_name=schedule_def.pipeline_name,
        run_id=None,
        run_config=run_config,
        mode=mode,
        solids_to_execute=pipeline.solids_to_execute,
        step_keys_to_execute=None,
        solid_selection=pipeline.solid_selection,
        status=None,
        root_run_id=None,
        parent_run_id=None,
        tags=tags,
        pipeline_snapshot=pipeline_def.get_pipeline_snapshot(),
        execution_plan_snapshot=execution_plan_snapshot,
        parent_pipeline_snapshot=pipeline_def.get_parent_pipeline_snapshot(),
    )

    tick.update_with_status(ScheduleTickStatus.SUCCESS,
                            run_id=possibly_invalid_pipeline_run.run_id)

    # If there were errors, inject them into the event log and fail the run
    if len(errors) > 0:
        for error in errors:
            instance.report_engine_event(
                error.message,
                possibly_invalid_pipeline_run,
                EngineEventData.engine_error(error),
            )
        instance.report_run_failed(possibly_invalid_pipeline_run)
        stream.send(
            ScheduledExecutionFailed(
                run_id=possibly_invalid_pipeline_run.run_id, errors=errors))
        return

    # Otherwise the run should be valid, so let's launch it

    # Need an ExternalPipeline to launch, so make one here
    recon_repo = pipeline.get_reconstructable_repository()
    repo_location = InProcessRepositoryLocation(recon_repo)
    external_pipeline = repo_location.get_repository(
        recon_repo.get_definition().name).get_full_external_pipeline(
            pipeline_def.name)

    try:
        launched_run = instance.launch_run(
            possibly_invalid_pipeline_run.run_id, external_pipeline)
    except DagsterLaunchFailedError:
        error = serializable_error_info_from_exc_info(sys.exc_info())
        instance.report_engine_event(
            error.message,
            possibly_invalid_pipeline_run,
            EngineEventData.engine_error(error),
        )
        instance.report_run_failed(possibly_invalid_pipeline_run)
        stream.send(
            ScheduledExecutionFailed(
                run_id=possibly_invalid_pipeline_run.run_id, errors=[error]))
        return

    stream.send(ScheduledExecutionSuccess(run_id=launched_run.run_id))
    return
def secretsmanager_secrets_resource(context):
    """Resource that provides a dict which maps selected SecretsManager secrets to
    their string values. Also optionally sets chosen secrets as environment variables.

    Example:

        .. code-block:: python

            import os
            from dagster import build_op_context, job, op
            from dagster_aws.secretsmanager import secretsmanager_secrets_resource

            @op(required_resource_keys={'secrets'})
            def example_secretsmanager_secrets_op(context):
                return context.resources.secrets.get("my-secret-name")

            @op(required_resource_keys={'secrets'})
            def example_secretsmanager_secrets_op_2(context):
                return os.getenv("my-other-secret-name")

            @job(resource_defs={'secrets': secretsmanager_secrets_resource})
            def example_job():
                example_secretsmanager_secrets_op()
                example_secretsmanager_secrets_op_2()

            example_job.execute_in_process(
                run_config={
                    'resources': {
                        'secrets': {
                            'config': {
                                'region_name': 'us-west-1',
                                'secrets_tag': 'dagster',
                                'add_to_environment': True,
                            }
                        }
                    }
                }
            )

    Note that your ops must also declare that they require this resource with
    `required_resource_keys`, or it will not be initialized for the execution of
    their compute functions.

    You may configure this resource as follows:

    .. code-block:: YAML

        resources:
          secretsmanager:
            config:
              region_name: "us-west-1"
              # Optional[str]: Specifies a custom region for the SecretsManager session.
              # Default is chosen through the ordinary boto credential chain.
              profile_name: "dev"
              # Optional[str]: Specifies a custom profile for the SecretsManager session.
              # Default is the default profile as specified in the ~/.aws/credentials file.
              secrets: ["arn:aws:secretsmanager:region:aws_account_id:secret:appauthexample-AbCdEf"]
              # Optional[List[str]]: Specifies a list of secret ARNs to pull from SecretsManager.
              secrets_tag: "dagster"
              # Optional[str]: Specifies a tag; all secrets which have the tag set will be
              # pulled from SecretsManager.
              add_to_environment: true
              # Optional[bool]: Whether to set the selected secrets as environment variables.
              # Defaults to false.

    """
    add_to_environment = check.bool_param(
        context.resource_config["add_to_environment"], "add_to_environment")
    secrets_tag = check.opt_str_param(context.resource_config["secrets_tag"],
                                      "secrets_tag")
    secrets = check.list_param(context.resource_config["secrets"],
                               "secrets",
                               of_type=str)

    secrets_manager = construct_secretsmanager_client(
        max_attempts=context.resource_config["max_attempts"],
        region_name=context.resource_config.get("region_name"),
        profile_name=context.resource_config.get("profile_name"),
    )

    secret_arns = merge_dicts(
        (get_tagged_secrets(secrets_manager, [secrets_tag]) if secrets_tag else {}),
        get_secrets_from_arns(secrets_manager, secrets),
    )

    secrets_map = {
        name: secrets_manager.get_secret_value(SecretId=arn).get("SecretString")
        for name, arn in secret_arns.items()
    }
    with environ(secrets_map if add_to_environment else {}):
        yield secrets_map
@resource(
    merge_dicts(
        SECRETSMANAGER_SESSION_CONFIG,
        {
            "secrets": Field(
                Array(str),
                is_required=False,
                default_value=[],
                description="An array of AWS Secrets Manager secret ARNs to fetch.",
            ),
            "secrets_tag": Field(
                Noneable(str),
                is_required=False,
                default_value=None,
                description=
                "AWS Secrets Manager secrets with this tag will be fetched and made available.",
            ),
            "add_to_environment": Field(
                bool,
                is_required=False,
                default_value=False,
                description="Whether to mount the secrets as environment variables.",
            ),
        },
    ))
@contextmanager
credential: sas: my_sas_token # str: the SAS token for the account. key: env: AZURE_DATA_LAKE_STORAGE_KEY # str: The shared access key for the account. ''' return _adls2_resource_from_config(context.resource_config) @resource( merge_dicts( ADLS2_CLIENT_CONFIG, { 'adls2_file_system': Field(StringSource, description='ADLS Gen2 file system name'), 'adls2_prefix': Field(StringSource, is_required=False, default_value='dagster'), }, )) def adls2_file_manager(context): adls2_client = _adls2_resource_from_config( context.resource_config).adls2_client return ADLS2FileManager( adls2_client=adls2_client, file_system=context.resource_config['adls2_file_system'], prefix=context.resource_config['adls2_prefix'], )
def test_container_context_on_pipeline(): docker_image = get_test_project_docker_image() launcher_config = {} if IS_BUILDKITE: launcher_config["registry"] = get_buildkite_registry_config() else: find_local_test_image(docker_image) executor_config = { "execution": { "docker": { "config": {} } }, } run_config = merge_dicts( merge_yamls([ os.path.join(get_test_project_environments_path(), "env.yaml"), os.path.join(get_test_project_environments_path(), "env_s3.yaml"), ]), executor_config, ) with docker_postgres_instance( overrides={ "run_launcher": { "class": "DockerRunLauncher", "module": "dagster_docker", "config": launcher_config, } }) as instance: recon_pipeline = get_test_project_recon_pipeline( "demo_pipeline_docker", docker_image, container_context={ "docker": { "env_vars": [ "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", ], "networks": ["container:test-postgres-db-docker"], "container_kwargs": { "auto_remove": True, "volumes": ["/var/run/docker.sock:/var/run/docker.sock"], }, } }, ) with get_test_project_workspace_and_external_pipeline( instance, "demo_pipeline_docker", container_image=docker_image) as ( workspace, orig_pipeline, ): external_pipeline = ReOriginatedExternalPipelineForTest( orig_pipeline, container_image=docker_image) run = instance.create_run_for_pipeline( pipeline_def=recon_pipeline.get_definition(), run_config=run_config, external_pipeline_origin=external_pipeline.get_external_origin( ), pipeline_code_origin=recon_pipeline.get_python_origin(), ) instance.launch_run(run.run_id, workspace) poll_for_finished_run(instance, run.run_id, timeout=60) for log in instance.all_logs(run.run_id): print(log) # pylint: disable=print-call assert instance.get_run_by_id( run.run_id).status == PipelineRunStatus.SUCCESS
} @resource( GCS_CLIENT_CONFIG, description="This resource provides a GCS client", ) def gcs_resource(init_context): return _gcs_client_from_config(init_context.resource_config) @resource( merge_dicts( GCS_CLIENT_CONFIG, { "gcs_bucket": Field(StringSource), "gcs_prefix": Field(StringSource, is_required=False, default_value="dagster"), }, ) ) def gcs_file_manager(context): """FileManager that provides abstract access to GCS. Implements the :py:class:`~dagster.core.storage.file_manager.FileManager` API. """ gcs_client = _gcs_client_from_config(context.resource_config) return GCSFileManager( client=gcs_client, gcs_bucket=context.resource_config["gcs_bucket"], gcs_base_key=context.resource_config["gcs_prefix"], )
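# A run-config sketch for the file manager above; the bucket and prefix values
# are placeholders, and "file_manager" is whatever resource key you bind
# gcs_file_manager to.
run_config = {
    "resources": {
        "file_manager": {
            "config": {
                "gcs_bucket": "my-bucket",
                "gcs_prefix": "my-prefix",  # optional; defaults to "dagster"
            }
        }
    }
}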
credential: sas: my_sas_token # str: the SAS token for the account. key: env: AZURE_DATA_LAKE_STORAGE_KEY # str: The shared access key for the account. """ return _adls2_resource_from_config(context.resource_config) @resource( merge_dicts( ADLS2_CLIENT_CONFIG, { "adls2_file_system": Field(StringSource, description="ADLS Gen2 file system name"), "adls2_prefix": Field(StringSource, is_required=False, default_value="dagster"), }, )) def adls2_file_manager(context): adls2_client = _adls2_resource_from_config( context.resource_config).adls2_client return ADLS2FileManager( adls2_client=adls2_client, file_system=context.resource_config["adls2_file_system"], prefix=context.resource_config["adls2_prefix"], )
def test_docker_monitoring(): docker_image = get_test_project_docker_image() launcher_config = { "env_vars": [ "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", ], "networks": ["container:test-postgres-db-docker"], "container_kwargs": { # "auto_remove": True, "volumes": ["/var/run/docker.sock:/var/run/docker.sock"], }, } if IS_BUILDKITE: launcher_config["registry"] = get_buildkite_registry_config() else: find_local_test_image(docker_image) run_config = merge_dicts( load_yaml_from_path( os.path.join(get_test_project_environments_path(), "env_s3.yaml")), { "solids": { "multiply_the_word_slow": { "inputs": { "word": "bar" }, "config": { "factor": 2, "sleep_time": 20 }, } }, "execution": { "docker": { "config": {} } }, }, ) with docker_postgres_instance({ "run_monitoring": { "enabled": True }, "run_launcher": { "class": "DockerRunLauncher", "module": "dagster_docker", "config": launcher_config, }, }) as instance: recon_pipeline = get_test_project_recon_pipeline( "demo_pipeline_docker_slow", docker_image) with get_test_project_workspace_and_external_pipeline( instance, "demo_pipeline_docker_slow", container_image=docker_image) as ( workspace, orig_pipeline, ): with start_daemon(): external_pipeline = ReOriginatedExternalPipelineForTest( orig_pipeline, container_image=docker_image) run = instance.create_run_for_pipeline( pipeline_def=recon_pipeline.get_definition(), run_config=run_config, external_pipeline_origin=external_pipeline. get_external_origin(), pipeline_code_origin=external_pipeline.get_python_origin(), ) with log_run_events(instance, run.run_id): instance.launch_run(run.run_id, workspace) start_time = time.time() while time.time() - start_time < 60: run = instance.get_run_by_id(run.run_id) if run.status == PipelineRunStatus.STARTED: break assert run.status == PipelineRunStatus.STARTING time.sleep(1) time.sleep(3) instance.run_launcher._get_container( # pylint:disable=protected-access instance.get_run_by_id(run.run_id)).stop() # daemon resumes the run poll_for_finished_run(instance, run.run_id, timeout=90) assert instance.get_run_by_id( run.run_id).status == PipelineRunStatus.SUCCESS
def _launch_scheduled_execution(instance, repo_location, external_repo,
                                external_schedule, tick, stream):
    pipeline_selector = PipelineSelector(
        location_name=repo_location.name,
        repository_name=external_repo.name,
        pipeline_name=external_schedule.pipeline_name,
        solid_selection=external_schedule.solid_selection,
    )

    subset_pipeline_result = repo_location.get_subset_external_pipeline_result(
        pipeline_selector)
    external_pipeline = ExternalPipeline(
        subset_pipeline_result.external_pipeline_data,
        external_repo.handle,
    )

    schedule_execution_data = repo_location.get_external_schedule_execution_data(
        instance=instance,
        repository_handle=external_repo.handle,
        schedule_name=external_schedule.name,
        schedule_execution_data_mode=ScheduleExecutionDataMode.
        LAUNCH_SCHEDULED_EXECUTION,
        scheduled_execution_time=None,  # No way to know this in general for this scheduler
    )

    run_config = {}
    schedule_tags = {}
    execution_plan_snapshot = None
    errors = []

    if isinstance(schedule_execution_data, ExternalScheduleExecutionErrorData):
        error = schedule_execution_data.error
        tick.update_with_status(ScheduleTickStatus.FAILURE, error=error)
        stream.send(ScheduledExecutionFailed(run_id=None, errors=[error]))
        return
    elif not schedule_execution_data.should_execute:
        # Update tick to skipped state and return
        tick.update_with_status(ScheduleTickStatus.SKIPPED)
        stream.send(ScheduledExecutionSkipped())
        return
    else:
        run_config = schedule_execution_data.run_config
        schedule_tags = schedule_execution_data.tags
        try:
            external_execution_plan = repo_location.get_external_execution_plan(
                external_pipeline,
                run_config,
                external_schedule.mode,
                step_keys_to_execute=None,
            )
            execution_plan_snapshot = external_execution_plan.execution_plan_snapshot
        except DagsterSubprocessError as e:
            errors.extend(e.subprocess_error_infos)
        except Exception:  # pylint: disable=broad-except
            errors.append(serializable_error_info_from_exc_info(sys.exc_info()))

    pipeline_tags = external_pipeline.tags or {}
    check_tags(pipeline_tags, "pipeline_tags")
    tags = merge_dicts(pipeline_tags, schedule_tags)

    # Enter the run in the DB with the information we have
    possibly_invalid_pipeline_run = instance.create_run(
        pipeline_name=external_schedule.pipeline_name,
        run_id=None,
        run_config=run_config,
        mode=external_schedule.mode,
        solids_to_execute=external_pipeline.solids_to_execute,
        step_keys_to_execute=None,
        solid_selection=external_pipeline.solid_selection,
        status=None,
        root_run_id=None,
        parent_run_id=None,
        tags=tags,
        pipeline_snapshot=external_pipeline.pipeline_snapshot,
        execution_plan_snapshot=execution_plan_snapshot,
        parent_pipeline_snapshot=external_pipeline.parent_pipeline_snapshot,
    )

    tick.update_with_status(ScheduleTickStatus.SUCCESS,
                            run_id=possibly_invalid_pipeline_run.run_id)

    # If there were errors, inject them into the event log and fail the run
    if len(errors) > 0:
        for error in errors:
            instance.report_engine_event(
                error.message,
                possibly_invalid_pipeline_run,
                EngineEventData.engine_error(error),
            )
        instance.report_run_failed(possibly_invalid_pipeline_run)
        stream.send(
            ScheduledExecutionFailed(
                run_id=possibly_invalid_pipeline_run.run_id, errors=errors))
        return

    try:
        launched_run = instance.launch_run(
            possibly_invalid_pipeline_run.run_id, external_pipeline)
    except Exception:  # pylint: disable=broad-except
        # Capture the launch failure so the error sent downstream is defined
        # on this path.
        error = serializable_error_info_from_exc_info(sys.exc_info())
        stream.send(
            ScheduledExecutionFailed(
                run_id=possibly_invalid_pipeline_run.run_id, errors=[error]))
        return

    stream.send(ScheduledExecutionSuccess(run_id=launched_run.run_id))
    return
def _launch_run(instance, repo_location, external_schedule, external_pipeline, tick_context, run_request): run_config = run_request.run_config schedule_tags = run_request.tags execution_plan_snapshot = None errors = [] try: external_execution_plan = repo_location.get_external_execution_plan( external_pipeline, run_config, external_schedule.mode, step_keys_to_execute=None, ) execution_plan_snapshot = external_execution_plan.execution_plan_snapshot except DagsterSubprocessError as e: errors.extend(e.subprocess_error_infos) except Exception as e: # pylint: disable=broad-except errors.append(serializable_error_info_from_exc_info(sys.exc_info())) pipeline_tags = external_pipeline.tags or {} check_tags(pipeline_tags, "pipeline_tags") tags = merge_dicts(pipeline_tags, schedule_tags) # Enter the run in the DB with the information we have possibly_invalid_pipeline_run = instance.create_run( pipeline_name=external_schedule.pipeline_name, run_id=None, run_config=run_config, mode=external_schedule.mode, solids_to_execute=external_pipeline.solids_to_execute, step_keys_to_execute=None, solid_selection=external_pipeline.solid_selection, status=None, root_run_id=None, parent_run_id=None, tags=tags, pipeline_snapshot=external_pipeline.pipeline_snapshot, execution_plan_snapshot=execution_plan_snapshot, parent_pipeline_snapshot=external_pipeline.parent_pipeline_snapshot, external_pipeline_origin=external_pipeline.get_external_origin(), ) tick_context.add_run(run_id=possibly_invalid_pipeline_run.run_id, run_key=run_request.run_key) # If there were errors, inject them into the event log and fail the run if len(errors) > 0: for error in errors: instance.report_engine_event( error.message, possibly_invalid_pipeline_run, EngineEventData.engine_error(error), ) instance.report_run_failed(possibly_invalid_pipeline_run) tick_context.stream.send( ScheduledExecutionFailed( run_id=possibly_invalid_pipeline_run.run_id, errors=errors)) return try: launched_run = instance.submit_run( possibly_invalid_pipeline_run.run_id, external_pipeline) except Exception: # pylint: disable=broad-except tick_context.stream.send( ScheduledExecutionFailed( run_id=possibly_invalid_pipeline_run.run_id, errors=[serializable_error_info_from_exc_info(sys.exc_info())], )) return tick_context.stream.send( ScheduledExecutionSuccess(run_id=launched_run.run_id))
def start_scheduled_execution(graphene_info, schedule_name):
    '''
    When a scheduler ticks and needs to run for a given schedule, it issues a
    START_SCHEDULED_EXECUTION mutation with just the schedule name. The mutation is
    resolved entirely by this method.
    '''

    check.inst_param(graphene_info, 'graphene_info', ResolveInfo)
    check.str_param(schedule_name, 'schedule_name')

    tick = None
    try:
        # We first load the repository and schedule definition to create
        # and store a ScheduleTick.
        # If this fails, this error should be sent to the file based scheduler logs.
        external_repository = graphene_info.context.get_external_repository()
        repository_name = external_repository.name
        schedule_def = get_dagster_schedule_def(graphene_info, schedule_name)
        cron_schedule = "Unknown" if not schedule_def else schedule_def.cron_schedule
        tick = graphene_info.context.instance.create_schedule_tick(
            repository_name,
            ScheduleTickData(
                schedule_name=schedule_name,
                cron_schedule=cron_schedule,
                timestamp=time.time(),
                status=ScheduleTickStatus.STARTED,
            ),
        )

        # Run should_execute and halt if it returns False
        schedule_context = ScheduleExecutionContext(
            graphene_info.context.instance)
        with user_code_error_boundary(
                ScheduleExecutionError,
                lambda: 'Error occurred during the execution of should_execute for schedule '
                '{schedule_name}'.format(schedule_name=schedule_def.name),
        ):
            should_execute = schedule_def.should_execute(schedule_context)

        if not should_execute:
            # Update tick to skipped state and return
            tick = tick.with_status(ScheduleTickStatus.SKIPPED)
            graphene_info.context.instance.update_schedule_tick(
                repository_name, tick)
            # Return skipped specific gql response
            return graphene_info.schema.type_named('ScheduledExecutionBlocked')(
                message='Schedule {schedule_name} did not run because '
                'should_execute did not return True'.format(
                    schedule_name=schedule_name))

        errors = []

        environment_dict = {}
        schedule_tags = {}
        try:
            with user_code_error_boundary(
                    ScheduleExecutionError,
                    lambda: 'Error occurred during the execution of environment_dict_fn for schedule '
                    '{schedule_name}'.format(schedule_name=schedule_def.name),
            ):
                environment_dict = schedule_def.get_environment_dict(
                    schedule_context)
        except DagsterUserCodeExecutionError:
            error_data = serializable_error_info_from_exc_info(sys.exc_info())
            errors.append(error_data)

        try:
            with user_code_error_boundary(
                    ScheduleExecutionError,
                    lambda: 'Error occurred during the execution of tags_fn for schedule '
                    '{schedule_name}'.format(schedule_name=schedule_def.name),
            ):
                schedule_tags = schedule_def.get_tags(schedule_context)
        except DagsterUserCodeExecutionError:
            error_data = serializable_error_info_from_exc_info(sys.exc_info())
            errors.append(error_data)

        external_pipeline = get_external_pipeline_or_raise(
            graphene_info, schedule_def.selector.name,
            schedule_def.selector.solid_subset)
        pipeline_tags = external_pipeline.tags or {}
        check_tags(pipeline_tags, 'pipeline_tags')
        tags = merge_dicts(pipeline_tags, schedule_tags)

        selector = schedule_def.selector
        mode = schedule_def.mode

        execution_params = ExecutionParams(
            selector=selector,
            environment_dict=environment_dict,
            mode=mode,
            execution_metadata=ExecutionMetadata(tags=tags, run_id=None),
            step_keys=None,
        )

        run, result = _execute_schedule(graphene_info, external_pipeline,
                                        execution_params, errors)
        graphene_info.context.instance.update_schedule_tick(
            repository_name,
            tick.with_status(ScheduleTickStatus.SUCCESS, run_id=run.run_id),
        )

        return result

    except Exception as exc:  # pylint: disable=broad-except
        error_data = serializable_error_info_from_exc_info(sys.exc_info())

        if tick:
            graphene_info.context.instance.update_schedule_tick(
                repository_name,
                tick.with_status(ScheduleTickStatus.FAILURE, error=error_data),
            )

        raise exc
def __new__( cls, name=None, resource_defs=None, logger_defs=None, system_storage_defs=None, executor_defs=None, description=None, intermediate_storage_defs=None, ): from dagster.core.storage.system_storage import ( default_system_storage_defs, default_intermediate_storage_defs, ) from .system_storage import SystemStorageDefinition from .intermediate_storage import IntermediateStorageDefinition if system_storage_defs is not None and intermediate_storage_defs is None: warnings.warn( "system_storage_defs are deprecated and will be removed in 0.10.0 " "and should be replaced with " "intermediate_storage_defs for intermediates and resource_defs for files" ) check.opt_dict_param(resource_defs, "resource_defs", key_type=str, value_type=ResourceDefinition) if resource_defs and "asset_store" in resource_defs: resource_defs_with_defaults = resource_defs else: from dagster.core.storage.asset_store import mem_asset_store resource_defs_with_defaults = merge_dicts( {"asset_store": mem_asset_store}, resource_defs or {}) return super(ModeDefinition, cls).__new__( cls, name=check_valid_name(name) if name else DEFAULT_MODE_NAME, resource_defs=resource_defs_with_defaults, loggers=(check.opt_dict_param(logger_defs, "logger_defs", key_type=str, value_type=LoggerDefinition) or default_loggers()), system_storage_defs=check.list_param( system_storage_defs if system_storage_defs else default_system_storage_defs, "system_storage_defs", of_type=SystemStorageDefinition, ), intermediate_storage_defs=check.list_param( intermediate_storage_defs if intermediate_storage_defs else default_intermediate_storage_defs, "intermediate_storage_defs", of_type=IntermediateStorageDefinition, ), executor_defs=check.list_param( executor_defs if executor_defs else default_executors, "executor_defs", of_type=ExecutorDefinition, ), description=check.opt_str_param(description, "description"), )