def recon_repo_for_cli_args(kwargs: Dict[str, str]):
    """Builds a ReconstructableRepository for CLI arguments, which can be any of the combinations
    for repo loading above.
    """
    check.dict_param(kwargs, "kwargs")
    _cli_load_invariant(kwargs.get("pipeline_name") is None)

    if kwargs.get("workspace"):
        check.not_implemented("Workspace not supported yet in this cli command")
    elif kwargs.get("module_name") and kwargs.get("fn_name"):
        _cli_load_invariant(kwargs.get("repository_yaml") is None)
        _cli_load_invariant(kwargs.get("python_file") is None)
        return ReconstructableRepository.for_module(
            kwargs["module_name"],
            kwargs["fn_name"],
            get_working_directory_from_kwargs(kwargs),
        )
    elif kwargs.get("python_file") and kwargs.get("fn_name"):
        _cli_load_invariant(kwargs.get("repository_yaml") is None)
        _cli_load_invariant(kwargs.get("module_name") is None)
        return ReconstructableRepository.for_file(
            os.path.abspath(cast(str, kwargs["python_file"])),
            kwargs["fn_name"],
            get_working_directory_from_kwargs(kwargs),
        )
    else:
        _cli_load_invariant(False)
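# Illustrative usage sketch (not from the source): invoking the helper above with parsed CLI
# options that point at a module-level repository function. The "module_name"/"fn_name" values
# and the "working_directory" key are assumptions made for this example only.
example_recon_repo = recon_repo_for_cli_args(
    {
        "module_name": "my_company.repos",  # hypothetical importable module
        "fn_name": "define_repo",           # hypothetical repository function
        "working_directory": None,
    }
)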
def test_my_custom_operator(
    dagster_airflow_custom_operator_pipeline,
    caplog,
):  # pylint: disable=redefined-outer-name
    caplog.set_level(logging.INFO, logger="CustomOperatorLogger")
    pipeline_name = "demo_pipeline_s3"
    operator = CustomOperator

    environments_path = get_test_project_environments_path()

    results = dagster_airflow_custom_operator_pipeline(
        pipeline_name=pipeline_name,
        recon_repo=ReconstructableRepository.for_module(
            "dagster_test.test_project.test_pipelines.repo", pipeline_name
        ),
        operator=operator,
        environment_yaml=[
            os.path.join(environments_path, "env.yaml"),
            os.path.join(environments_path, "env_s3.yaml"),
        ],
    )
    validate_pipeline_execution(results)

    log_lines = 0
    for record in caplog.records:
        if record.name == "CustomOperatorLogger":
            log_lines += 1
            assert record.message == "CustomOperator is called"

    assert log_lines == 2
def test_error_dag_containerized(dagster_docker_image):  # pylint: disable=redefined-outer-name
    pipeline_name = "demo_error_pipeline_s3"
    recon_repo = ReconstructableRepository.for_module(
        "dagster_test.test_project.test_pipelines.repo", "define_demo_execution_repo"
    )
    environments_path = get_test_project_environments_path()
    environment_yaml = [
        os.path.join(environments_path, "env_s3.yaml"),
    ]
    run_config = load_yaml_from_glob_list(environment_yaml)

    run_id = make_new_run_id()
    execution_date = timezone.utcnow()

    with postgres_instance() as instance:
        dag, tasks = make_airflow_dag_containerized_for_recon_repo(
            recon_repo,
            pipeline_name,
            dagster_docker_image,
            run_config,
            instance=instance,
            op_kwargs={"network_mode": "container:test-postgres-db-airflow"},
        )

        with pytest.raises(AirflowException) as exc_info:
            execute_tasks_in_dag(dag, tasks, run_id, execution_date)

        assert "Exception: Unusual error" in str(exc_info.value)
def test_airflow_execution_date_tags_job():
    job_name = "demo_airflow_execution_date_job"
    recon_repo = ReconstructableRepository.for_module(
        "dagster_test.test_project.test_pipelines.repo", job_name
    )
    environments_path = get_test_project_environments_path()
    environment_yaml = [
        os.path.join(environments_path, "env_filesystem.yaml"),
    ]
    run_config = load_yaml_from_glob_list(environment_yaml)
    execution_date = timezone.utcnow()

    dag, tasks = make_airflow_dag_for_recon_repo(recon_repo, job_name, run_config)

    results = execute_tasks_in_dag(
        dag, tasks, run_id=make_new_run_id(), execution_date=execution_date
    )

    materialized_airflow_execution_date = None
    for result in results.values():
        for event in result:
            if event.event_type_value == "ASSET_MATERIALIZATION":
                materialization = event.event_specific_data.materialization
                materialization_entry = materialization.metadata_entries[0]
                materialized_airflow_execution_date = materialization_entry.entry_data.text

    assert execution_date.isoformat() == materialized_airflow_execution_date
def recon_repository_from_origin(origin):
    check.inst_param(origin, "origin", RepositoryPythonOrigin)
    return ReconstructableRepository(
        origin.code_pointer,
        origin.container_image,
        origin.executable_path,
        origin.entry_point,
        origin.container_context,
    )
def test_skip_operator(
    dagster_airflow_python_operator_pipeline,
):  # pylint: disable=redefined-outer-name
    pipeline_name = "optional_outputs"
    environments_path = get_test_project_environments_path()
    results = dagster_airflow_python_operator_pipeline(
        pipeline_name=pipeline_name,
        recon_repo=ReconstructableRepository.for_module(
            "dagster_test.test_project.test_pipelines.repo", pipeline_name
        ),
        environment_yaml=[os.path.join(environments_path, "env_filesystem.yaml")],
    )
    validate_skip_pipeline_execution(results)
def __init__(self, loadable_target_origin, entry_point):
    self._loadable_target_origin = loadable_target_origin

    self._code_pointers_by_repo_name = {}
    self._recon_repos_by_name = {}
    self._loadable_repository_symbols = []

    if not loadable_target_origin:
        return

    loadable_targets = get_loadable_targets(
        loadable_target_origin.python_file,
        loadable_target_origin.module_name,
        loadable_target_origin.package_name,
        loadable_target_origin.working_directory,
        loadable_target_origin.attribute,
    )
    for loadable_target in loadable_targets:
        pointer = _get_code_pointer(loadable_target_origin, loadable_target)
        recon_repo = ReconstructableRepository(
            pointer,
            _get_current_image(),
            sys.executable,
            entry_point=entry_point,
        )
        repo_def = recon_repo.get_definition()
        # force load of all lazy constructed jobs/pipelines
        repo_def.get_all_pipelines()

        self._code_pointers_by_repo_name[repo_def.name] = pointer
        self._recon_repos_by_name[repo_def.name] = recon_repo
        self._loadable_repository_symbols.append(
            LoadableRepositorySymbol(
                attribute=loadable_target.attribute,
                repository_name=repo_def.name,
            )
        )
def test_fs_storage_no_explicit_base_dir(
    dagster_airflow_python_operator_pipeline,
):  # pylint: disable=redefined-outer-name
    pipeline_name = "demo_pipeline"
    environments_path = get_test_project_environments_path()
    results = dagster_airflow_python_operator_pipeline(
        pipeline_name=pipeline_name,
        recon_repo=ReconstructableRepository.for_module(
            "dagster_test.test_project.test_pipelines.repo", pipeline_name
        ),
        environment_yaml=[
            os.path.join(environments_path, "env.yaml"),
        ],
    )
    validate_pipeline_execution(results)
def test_skip_operator(
    dagster_airflow_docker_operator_pipeline, dagster_docker_image
):  # pylint: disable=redefined-outer-name
    pipeline_name = "optional_outputs"
    environments_path = get_test_project_environments_path()
    results = dagster_airflow_docker_operator_pipeline(
        pipeline_name=pipeline_name,
        recon_repo=ReconstructableRepository.for_module(
            "dagster_test.test_project.test_pipelines.repo",
            "define_demo_execution_repo",
        ),
        environment_yaml=[os.path.join(environments_path, "env_filesystem.yaml")],
        op_kwargs={"host_tmp_dir": "/tmp"},
        image=dagster_docker_image,
    )
    validate_skip_pipeline_execution(results)
def test_s3_storage(
    dagster_airflow_docker_operator_pipeline, dagster_docker_image
):  # pylint: disable=redefined-outer-name
    pipeline_name = "demo_pipeline_s3"
    environments_path = get_test_project_environments_path()
    results = dagster_airflow_docker_operator_pipeline(
        pipeline_name=pipeline_name,
        recon_repo=ReconstructableRepository.for_module(
            "dagster_test.test_project.test_pipelines.repo",
            "define_demo_execution_repo",
        ),
        environment_yaml=[
            os.path.join(environments_path, "env.yaml"),
            os.path.join(environments_path, "env_s3.yaml"),
        ],
        image=dagster_docker_image,
    )
    validate_pipeline_execution(results)
def test_error_dag_python_job():
    job_name = "demo_error_job"
    recon_repo = ReconstructableRepository.for_module(
        "dagster_test.test_project.test_pipelines.repo", job_name
    )
    environments_path = get_test_project_environments_path()
    environment_yaml = [
        os.path.join(environments_path, "env_filesystem.yaml"),
    ]
    run_config = load_yaml_from_glob_list(environment_yaml)
    execution_date = timezone.utcnow()

    dag, tasks = make_airflow_dag_for_recon_repo(recon_repo, job_name, run_config)

    with pytest.raises(AirflowException) as exc_info:
        execute_tasks_in_dag(
            dag, tasks, run_id=make_new_run_id(), execution_date=execution_date
        )

    assert "Exception: Unusual error" in str(exc_info.value)
def test_airflow_execution_date_tags_containerized(
    dagster_docker_image,
):  # pylint: disable=redefined-outer-name, unused-argument
    pipeline_name = "demo_airflow_execution_date_pipeline_s3"
    recon_repo = ReconstructableRepository.for_module(
        "dagster_test.test_project.test_pipelines.repo", "define_demo_execution_repo"
    )
    environments_path = get_test_project_environments_path()
    environment_yaml = [
        os.path.join(environments_path, "env_s3.yaml"),
    ]
    run_config = load_yaml_from_glob_list(environment_yaml)
    execution_date = timezone.utcnow()

    with postgres_instance() as instance:
        dag, tasks = make_airflow_dag_containerized_for_recon_repo(
            recon_repo,
            pipeline_name,
            dagster_docker_image,
            run_config,
            instance=instance,
            op_kwargs={"network_mode": "container:test-postgres-db-airflow"},
        )

        results = execute_tasks_in_dag(
            dag, tasks, run_id=make_new_run_id(), execution_date=execution_date
        )

        materialized_airflow_execution_date = None
        for result in results.values():
            for event in result:
                if event.event_type_value == "ASSET_MATERIALIZATION":
                    materialization = event.event_specific_data.materialization
                    materialization_entry = materialization.metadata_entries[0]
                    materialized_airflow_execution_date = materialization_entry.entry_data.text

        assert execution_date.isoformat() == materialized_airflow_execution_date
def test_run_finished(self, graphql_context):
    instance = graphql_context.instance
    pipeline = ReconstructableRepository.for_file(
        file_relative_path(__file__, "setup.py"),
        "test_repo",
    ).get_reconstructable_pipeline("noop_pipeline")

    pipeline_result = execute_pipeline(pipeline, instance=instance)
    assert pipeline_result.success
    assert pipeline_result.run_id

    time.sleep(0.05)  # guarantee execution finish

    result = execute_dagster_graphql(
        graphql_context, RUN_CANCELLATION_QUERY, variables={"runId": pipeline_result.run_id}
    )

    assert result.data["terminatePipelineExecution"]["__typename"] == "TerminateRunFailure"
    assert (
        "could not be terminated due to having status SUCCESS."
        in result.data["terminatePipelineExecution"]["message"]
    )

    # Still fails even if you change the terminate policy to fail immediately
    result = execute_dagster_graphql(
        graphql_context,
        RUN_CANCELLATION_QUERY,
        variables={
            "runId": pipeline_result.run_id,
            "terminatePolicy": "MARK_AS_CANCELED_IMMEDIATELY",
        },
    )

    assert result.data["terminatePipelineExecution"]["__typename"] == "TerminateRunFailure"
    assert (
        "could not be terminated due to having status SUCCESS."
        in result.data["terminatePipelineExecution"]["message"]
    )
def step_context_to_step_run_ref(
    step_context: StepExecutionContext,
    prior_attempts_count: int,
    package_dir: Optional[str] = None,
) -> StepRunRef:
    """
    Args:
        step_context (StepExecutionContext): The step context.
        prior_attempts_count (int): The number of times this step has been tried before in the
            same pipeline run.
        package_dir (Optional[str]): If set, the reconstruction file code pointer will be
            converted to a module code pointer relative to the package root. This enables
            executing steps in remote setups where the package containing the pipeline resides at
            a different location on the filesystem in the remote environment than in the
            environment executing the plan process.

    Returns (StepRunRef):
        A reference to the step.
    """
    check.inst_param(step_context, "step_context", StepExecutionContext)
    check.int_param(prior_attempts_count, "prior_attempts_count")

    retry_mode = step_context.retry_mode

    recon_pipeline = step_context.pipeline
    if package_dir:
        if isinstance(recon_pipeline, ReconstructablePipeline) and isinstance(
            recon_pipeline.repository.pointer, FileCodePointer
        ):
            recon_pipeline = ReconstructablePipeline(
                repository=ReconstructableRepository(
                    pointer=ModuleCodePointer(
                        _module_in_package_dir(
                            recon_pipeline.repository.pointer.python_file, package_dir
                        ),
                        recon_pipeline.repository.pointer.fn_name,
                        working_directory=os.getcwd(),
                    ),
                    container_image=recon_pipeline.repository.container_image,
                    executable_path=recon_pipeline.repository.executable_path,
                    entry_point=recon_pipeline.repository.entry_point,
                    container_context=recon_pipeline.repository.container_context,
                ),
                pipeline_name=recon_pipeline.pipeline_name,
                solids_to_execute=recon_pipeline.solids_to_execute,
            )

    upstream_output_events, run_group = _upstream_events_and_runs(step_context)
    return StepRunRef(
        run_config=step_context.run_config,
        pipeline_run=step_context.pipeline_run,
        run_id=step_context.pipeline_run.run_id,
        step_key=step_context.step.key,
        retry_mode=retry_mode,
        recon_pipeline=recon_pipeline,  # type: ignore
        prior_attempts_count=prior_attempts_count,
        known_state=step_context.execution_plan.known_state,
        run_group=run_group,
        upstream_output_events=upstream_output_events,
    )
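# Illustrative sketch only: the package_dir conversion described in the docstring above maps a
# file path beneath the package root onto a dotted module name, e.g. "/src/my_pkg/jobs/etl.py"
# with package_dir "/src" -> "my_pkg.jobs.etl". This is NOT the actual _module_in_package_dir
# implementation, just a hypothetical equivalent to show the idea.
def _module_in_package_dir_sketch(python_file: str, package_dir: str) -> str:
    rel_path = os.path.relpath(os.path.abspath(python_file), os.path.abspath(package_dir))
    module_path, _ = os.path.splitext(rel_path)
    return module_path.replace(os.sep, ".")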
def create_main_recon_repo():
    return ReconstructableRepository.for_file(__file__, main_repo_name())
def make_airflow_dag(
    module_name,
    job_name,
    run_config=None,
    mode=None,
    instance=None,
    dag_id=None,
    dag_description=None,
    dag_kwargs=None,
    op_kwargs=None,
    pipeline_name=None,
):
    """Construct an Airflow DAG corresponding to a given Dagster job/pipeline.

    Tasks in the resulting DAG will execute the Dagster logic they encapsulate as a Python
    callable, run by an underlying :py:class:`PythonOperator <airflow:PythonOperator>`. As a
    consequence, dagster, any Python dependencies required by your solid logic, and the module
    containing your pipeline definition must all be available in the Python environment within
    which your Airflow tasks execute. If you cannot install requirements into this environment,
    or you are looking for a containerized solution to provide better isolation, see instead
    :py:func:`make_airflow_dag_containerized`.

    This function should be invoked in an Airflow DAG definition file, such as that created by an
    invocation of the dagster-airflow scaffold CLI tool.

    Args:
        module_name (str): The name of the importable module in which the pipeline/job definition
            can be found.
        job_name (str): The name of the job definition.
        run_config (Optional[dict]): The config, if any, with which to compile the pipeline/job to
            an execution plan, as a Python dict.
        mode (Optional[str]): The mode in which to execute the pipeline.
        instance (Optional[DagsterInstance]): The Dagster instance to use to execute the
            pipeline/job.
        dag_id (Optional[str]): The id to use for the compiled Airflow DAG (passed through to
            :py:class:`DAG <airflow:airflow.models.DAG>`).
        dag_description (Optional[str]): The description to use for the compiled Airflow DAG
            (passed through to :py:class:`DAG <airflow:airflow.models.DAG>`)
        dag_kwargs (Optional[dict]): Any additional kwargs to pass to the Airflow
            :py:class:`DAG <airflow:airflow.models.DAG>` constructor, including ``default_args``.
        op_kwargs (Optional[dict]): Any additional kwargs to pass to the underlying Airflow
            operator (a subclass of
            :py:class:`PythonOperator <airflow:airflow.operators.python_operator.PythonOperator>`).
        pipeline_name (str): (legacy) The name of the pipeline definition.

    Returns:
        (airflow.models.DAG, List[airflow.models.BaseOperator]): The generated Airflow DAG, and a
        list of its constituent tasks.
    """
    check.str_param(module_name, "module_name")

    job_name = canonicalize_backcompat_args(
        new_val=job_name,
        new_arg="job_name",
        old_val=pipeline_name,
        old_arg="pipeline_name",
        breaking_version="future versions",
        coerce_old_to_new=lambda val: val,
    )

    recon_repo = ReconstructableRepository.for_module(module_name, job_name, os.getcwd())

    return _make_airflow_dag(
        recon_repo=recon_repo,
        job_name=job_name,
        run_config=run_config,
        mode=mode,
        instance=instance,
        dag_id=dag_id,
        dag_description=dag_description,
        dag_kwargs=dag_kwargs,
        op_kwargs=op_kwargs,
    )
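# Illustrative usage sketch (not from the source): calling make_airflow_dag from an Airflow DAG
# definition file, as the docstring above describes. The module and job names are hypothetical;
# the import assumes make_airflow_dag is exposed by the dagster_airflow package.
import datetime

from dagster_airflow import make_airflow_dag

dag, tasks = make_airflow_dag(
    module_name="my_project.repo",  # hypothetical importable module containing the job definition
    job_name="my_job",              # hypothetical job name
    dag_kwargs={
        "default_args": {"owner": "airflow", "start_date": datetime.datetime(2022, 1, 1)},
    },  # forwarded to the airflow.models.DAG constructor
)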
def make_airflow_dag_containerized(
    module_name,
    job_name,
    image,
    run_config=None,
    mode=None,
    dag_id=None,
    dag_description=None,
    dag_kwargs=None,
    op_kwargs=None,
    pipeline_name=None,
):
    """Construct a containerized Airflow DAG corresponding to a given Dagster job/pipeline.

    Tasks in the resulting DAG will execute the Dagster logic they encapsulate using a subclass of
    :py:class:`DockerOperator <airflow:airflow.operators.docker_operator.DockerOperator>`. As a
    consequence, dagster, any Python dependencies required by your solid logic, and the module
    containing your pipeline definition must all be available in the container spun up by this
    operator. Typically you'll want to install these requirements onto the image you're using.

    This function should be invoked in an Airflow DAG definition file, such as that created by an
    invocation of the dagster-airflow scaffold CLI tool.

    Args:
        module_name (str): The name of the importable module in which the pipeline/job definition
            can be found.
        job_name (str): The name of the job definition.
        image (str): The name of the Docker image to use for execution (passed through to
            :py:class:`DockerOperator <airflow:airflow.operators.docker_operator.DockerOperator>`).
        run_config (Optional[dict]): The config, if any, with which to compile the pipeline/job to
            an execution plan, as a Python dict.
        mode (Optional[str]): The mode in which to execute the pipeline.
        dag_id (Optional[str]): The id to use for the compiled Airflow DAG (passed through to
            :py:class:`DAG <airflow:airflow.models.DAG>`).
        dag_description (Optional[str]): The description to use for the compiled Airflow DAG
            (passed through to :py:class:`DAG <airflow:airflow.models.DAG>`)
        dag_kwargs (Optional[dict]): Any additional kwargs to pass to the Airflow
            :py:class:`DAG <airflow:airflow.models.DAG>` constructor, including ``default_args``.
        op_kwargs (Optional[dict]): Any additional kwargs to pass to the underlying Airflow
            operator (a subclass of
            :py:class:`DockerOperator <airflow:airflow.operators.docker_operator.DockerOperator>`).
        pipeline_name (str): (legacy) The name of the pipeline definition.

    Returns:
        (airflow.models.DAG, List[airflow.models.BaseOperator]): The generated Airflow DAG, and a
        list of its constituent tasks.
    """
    check.str_param(module_name, "module_name")
    check.str_param(job_name, "job_name")
    check.str_param(image, "image")
    check.opt_dict_param(run_config, "run_config")
    check.opt_str_param(mode, "mode")
    check.opt_str_param(dag_id, "dag_id")
    check.opt_str_param(dag_description, "dag_description")
    check.opt_dict_param(dag_kwargs, "dag_kwargs")
    check.opt_dict_param(op_kwargs, "op_kwargs")

    job_name = canonicalize_backcompat_args(
        new_val=job_name,
        new_arg="job_name",
        old_val=pipeline_name,
        old_arg="pipeline_name",
        breaking_version="future versions",
        coerce_old_to_new=lambda val: val,
    )

    recon_repo = ReconstructableRepository.for_module(module_name, job_name, os.getcwd())

    op_kwargs = check.opt_dict_param(op_kwargs, "op_kwargs", key_type=str)
    op_kwargs["image"] = image

    return _make_airflow_dag(
        recon_repo=recon_repo,
        job_name=job_name,
        run_config=run_config,
        mode=mode,
        dag_id=dag_id,
        dag_description=dag_description,
        dag_kwargs=dag_kwargs,
        op_kwargs=op_kwargs,
        operator=DagsterDockerOperator,
    )
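# Illustrative usage sketch (not from the source): the containerized variant additionally takes
# the Docker image to run in. Module, job, and image names here are hypothetical; the import
# assumes make_airflow_dag_containerized is exposed by the dagster_airflow package.
from dagster_airflow import make_airflow_dag_containerized

containerized_dag, containerized_tasks = make_airflow_dag_containerized(
    module_name="my_project.repo",          # hypothetical module containing the job definition
    job_name="my_job",                      # hypothetical job name
    image="my-registry/my-image:latest",    # hypothetical image with dagster and deps installed
    op_kwargs={"network_mode": "bridge"},   # forwarded to the underlying DockerOperator subclass
)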
    return nonce_solid()


@repository
def my_repository():
    return [nonce_pipeline]


nonce_pipeline_snapshot = nonce_pipeline.get_pipeline_snapshot()

nonce_execution_plan_snapshot = snapshot_from_execution_plan(
    create_execution_plan(nonce_pipeline), nonce_pipeline.get_pipeline_snapshot_id()
)

recon_repo_for_tests = ReconstructableRepository.for_file(
    file_relative_path(__file__, "test_dagster_docker_operator.py"),
    "my_repository",
)


def test_init_modified_docker_operator(dagster_docker_image):
    with instance_for_test() as instance:
        dagster_operator_parameters = DagsterOperatorParameters(
            task_id="nonce",
            pipeline_name="nonce_pipeline",
            mode="default",
            op_kwargs={
                "image": dagster_docker_image,
                "api_version": "auto",
            },
            pipeline_snapshot=nonce_pipeline_snapshot,
            execution_plan_snapshot=nonce_execution_plan_snapshot,