예제 #1
0
파일: impl.py 프로젝트: naralogics/dagster
def get_external_executable_params(recon_repo, external_executable_args):
    """Compute the run config and tags for a triggered execution.

    Args:
        recon_repo (ReconstructableRepository): Repository from which the
            executable definition is looked up.
        external_executable_args (ExternalExecutableArgs): Supplies the name of
            the executable and the instance ref to load.

    Returns:
        ``ExternalExecutionParamsData`` holding the run config and tags, or
        ``ExternalExecutionParamsErrorData`` holding the serialized error info
        when an ``ExecutableError`` is raised while computing either of them.
    """
    check.inst_param(recon_repo, "recon_repo", ReconstructableRepository)
    check.inst_param(
        external_executable_args, "external_executable_args", ExternalExecutableArgs
    )

    repository_def = recon_repo.get_definition()
    executable_def = repository_def.get_executable_def(external_executable_args.name)

    # The messages are built lazily: the boundary only calls these on failure.
    def _run_config_error_message():
        return (
            "Error occured during the execution of run_config_fn for triggered "
            "execution {name}".format(name=executable_def.name)
        )

    def _tags_error_message():
        return (
            "Error occured during the execution of tags_fn for triggered "
            "execution {name}".format(name=executable_def.name)
        )

    instance_ref = external_executable_args.instance_ref
    with DagsterInstance.from_ref(instance_ref) as instance:
        context = ExecutableContext(instance)

        try:
            with user_code_error_boundary(ExecutableError, _run_config_error_message):
                run_config = executable_def.get_run_config(context)

            with user_code_error_boundary(ExecutableError, _tags_error_message):
                tags = executable_def.get_tags(context)

            return ExternalExecutionParamsData(run_config=run_config, tags=tags)
        except ExecutableError:
            error_info = serializable_error_info_from_exc_info(sys.exc_info())
            return ExternalExecutionParamsErrorData(error_info)
예제 #2
0
    def start_run(self, execute_run_args):
        """Issue a ``StartRun`` query and return the deserialized result.

        Args:
            execute_run_args (ExecuteRunArgs): Identifies the instance ref and
                the pipeline run id; it is also serialized into the request.

        Returns:
            The namedtuple deserialized from ``serialized_start_run_result`` on
            the reply.

        Raises:
            Exception: Any error raised while querying or deserializing is
                reported to the instance as an engine event and then re-raised.
                Errors raised while loading the instance or the run propagate
                directly, since there is nothing to report them against.
        """
        check.inst_param(execute_run_args, 'execute_run_args', ExecuteRunArgs)

        # Resolve these before the guarded section. The handler below needs both
        # names, so binding them inside the ``try`` meant a failure here surfaced
        # as an UnboundLocalError from the handler, masking the real error.
        instance = DagsterInstance.from_ref(execute_run_args.instance_ref)
        pipeline_run = instance.get_run_by_id(execute_run_args.pipeline_run_id)

        try:
            res = self._query(
                'StartRun',
                api_pb2.StartRunRequest,
                serialized_execute_run_args=serialize_dagster_namedtuple(execute_run_args),
            )
            return deserialize_json_to_dagster_namedtuple(res.serialized_start_run_result)

        except Exception:  # pylint: disable=broad-except
            instance.report_engine_event(
                message='Unexpected error in IPC client',
                pipeline_run=pipeline_run,
                engine_event_data=EngineEventData.engine_error(
                    serializable_error_info_from_exc_info(sys.exc_info())
                ),
            )
            raise
예제 #3
0
def test_event_log_step_key_migration():
    """Check that the event log backfill migration populates step keys.

    Works on a copy of the 0.7.6 sqlite snapshot: after the schema upgrade no
    event log row has a step key, and after ``migrate_event_log_data`` at least
    one does.
    """
    run_id = "6405c4a0-3ccc-4600-af81-b5ee197f8528"
    src_dir = file_relative_path(
        __file__, "snapshot_0_7_6_pre_event_log_migration/sqlite"
    )

    with copy_directory(src_dir) as test_dir:
        instance = DagsterInstance.from_ref(InstanceRef.from_dir(test_dir))

        # Make sure the schema is migrated
        instance.upgrade()

        assert len(instance.get_runs()) == 1
        assert instance._event_storage.get_all_run_ids() == [run_id]
        assert isinstance(instance._event_storage, SqlEventLogStorage)
        events_by_id = instance._event_storage.get_logs_for_run_by_log_id(run_id)
        assert len(events_by_id) == 40

        def _rows_with_step_key():
            # Re-read every row from storage and keep the ones carrying a step key.
            rows = [
                instance._event_storage.get_event_log_table_data(run_id, record_id)
                for record_id, _event in events_by_id.items()
            ]
            return [row for row in rows if row.step_key is not None]

        assert len(_rows_with_step_key()) == 0

        # run the event_log backfill migration
        migrate_event_log_data(instance=instance)

        assert len(_rows_with_step_key()) > 0
예제 #4
0
def test_0_6_6_sqlite_exc():
    """Check that reading logs from the 0.6.6 sqlite snapshot demands a migration.

    Listing runs and run ids succeeds on the old snapshot, while fetching the
    logs for the run raises ``DagsterInstanceMigrationRequired``.
    """
    run_id = '89296095-892d-4a15-aa0d-9018d1580945'
    expected_message = re.escape(
        'Instance is out of date and must be migrated (SqliteEventLogStorage for run '
        '89296095-892d-4a15-aa0d-9018d1580945). Database is at revision None, head is '
        '3b1e175a2be3. Please run `dagster instance migrate`.'
    )
    snapshot_dir = file_relative_path(__file__, 'snapshot_0_6_6/sqlite')

    with restore_directory(snapshot_dir):
        instance = DagsterInstance.from_ref(InstanceRef.from_dir(snapshot_dir))

        # Note that this is a deliberate choice -- old runs are simply invisible, and their
        # presence won't raise DagsterInstanceMigrationRequired. This is a reasonable choice since
        # the runs.db has moved and otherwise we would have to do a check for the existence of an
        # old runs.db every time we accessed the runs. Instead, we'll do this only in the upgrade
        # method.
        assert len(instance.get_runs()) == 0

        assert instance._event_storage.get_all_run_ids() == [run_id]

        with pytest.raises(DagsterInstanceMigrationRequired, match=expected_message):
            instance._event_storage.get_logs_for_run(run_id)
0
파일: impl.py 프로젝트: varokas/dagster-1
def get_external_schedule_execution(external_schedule_execution_args):
    """Compute the run config for a schedule.

    Args:
        external_schedule_execution_args (ExternalScheduleExecutionArgs):
            Supplies the repository origin, the schedule name and the instance
            ref.

    Returns:
        ``ExternalScheduleExecutionData`` holding the run config, or
        ``ExternalScheduleExecutionErrorData`` holding the serialized error info
        when a ``ScheduleExecutionError`` is raised while computing it.
    """
    check.inst_param(
        external_schedule_execution_args,
        'external_schedule_execution_args',
        ExternalScheduleExecutionArgs,
    )
    args = external_schedule_execution_args

    repository_def = recon_repository_from_origin(args.repository_origin).get_definition()
    schedule_def = repository_def.get_schedule_def(args.schedule_name)
    schedule_context = ScheduleExecutionContext(DagsterInstance.from_ref(args.instance_ref))

    # Built lazily: the boundary only calls this on failure.
    def _error_message():
        return (
            'Error occurred during the execution of run_config_fn for schedule '
            '{schedule_name}'.format(schedule_name=schedule_def.name)
        )

    try:
        with user_code_error_boundary(ScheduleExecutionError, _error_message):
            computed_run_config = schedule_def.get_run_config(schedule_context)
            return ExternalScheduleExecutionData(run_config=computed_run_config)
    except ScheduleExecutionError:
        error_info = serializable_error_info_from_exc_info(sys.exc_info())
        return ExternalScheduleExecutionErrorData(error_info)
예제 #6
0
    def execute(self):
        """Yield the events produced by executing ``self.step_key``.

        The first event yielded is the engine event announcing the step (with
        the current pid and the step key as metadata). Every event from
        ``execute_plan_iterator`` over the single-step subset plan follows.
        """
        check.inst(self.executor_config, MultiprocessExecutorConfig)
        pipeline = self.executor_config.pipeline
        instance = DagsterInstance.from_ref(self.instance_ref)

        start_termination_thread(self.term_event)

        full_plan = create_execution_plan(
            pipeline=pipeline,
            environment_dict=self.environment_dict,
            mode=self.pipeline_run.mode,
            step_keys_to_execute=self.pipeline_run.step_keys_to_execute,
        )
        # Narrow the plan down to just the step this command is responsible for.
        execution_plan = full_plan.build_subset_plan([self.step_key])

        start_message = 'Executing step {} in subprocess'.format(self.step_key)
        start_event_data = EngineEventData(
            [
                EventMetadataEntry.text(str(os.getpid()), 'pid'),
                EventMetadataEntry.text(self.step_key, 'step_key'),
            ],
            marker_end=DELEGATE_MARKER,
        )
        yield instance.report_engine_event(
            start_message,
            self.pipeline_run,
            start_event_data,
            MultiprocessEngine,
            self.step_key,
        )

        step_events = execute_plan_iterator(
            execution_plan,
            self.pipeline_run,
            environment_dict=self.environment_dict,
            retries=self.executor_config.retries.for_inner_plan(),
            instance=instance,
        )
        for step_event in step_events:
            yield step_event
예제 #7
0
    def reconstitute_pipeline_context(
        self,
        output_log_path=None,
        marshal_dir=None,
        run_config=None,
        executable_dict=None,
        pipeline_run_dict=None,
        solid_handle_kwargs=None,
        instance_ref_dict=None,
    ):
        """Rebuild the execution context for a notebook run managed by dagstermill.

        A call to this function appears in the ``injected parameters`` cell of a
        dagstermill output notebook. It is not meant to be called interactively
        other than when debugging such a notebook.

        For interactive exploration and development, call
        :func:`dagstermill.get_context` in the ``parameters`` cell instead. When
        dagstermill executes the notebook, that call is replaced by a call to
        :func:`dagstermill.reconstitute_pipeline_context`.

        Returns:
            The ``DagstermillRuntimeExecutionContext`` that is also stored on
            ``self.context``.

        Raises:
            DagstermillError: If the instance cannot be resolved from
                ``instance_ref_dict``.
        """
        check.opt_str_param(output_log_path, "output_log_path")
        check.opt_str_param(marshal_dir, "marshal_dir")
        run_config = check.opt_dict_param(run_config, "run_config", key_type=str)
        check.dict_param(pipeline_run_dict, "pipeline_run_dict")
        check.dict_param(executable_dict, "executable_dict")
        check.dict_param(solid_handle_kwargs, "solid_handle_kwargs")
        check.dict_param(instance_ref_dict, "instance_ref_dict")

        pipeline = ReconstructablePipeline.from_dict(executable_dict)
        pipeline_def = pipeline.get_definition()

        try:
            instance = DagsterInstance.from_ref(unpack_value(instance_ref_dict))
        except Exception as err:  # pylint: disable=broad-except
            raise DagstermillError(
                "Error when attempting to resolve DagsterInstance from serialized InstanceRef"
            ) from err

        pipeline_run = unpack_value(pipeline_run_dict)

        handle = SolidHandle.from_dict(solid_handle_kwargs)
        solid_def = pipeline_def.get_solid(handle).definition

        self.marshal_dir = marshal_dir
        self.in_pipeline = True
        self.solid_def = solid_def
        self.pipeline = pipeline

        environment_config = EnvironmentConfig.build(
            pipeline_def, run_config, mode=pipeline_run.mode
        )
        execution_plan = ExecutionPlan.build(
            self.pipeline,
            environment_config,
            step_keys_to_execute=pipeline_run.step_keys_to_execute,
        )

        with scoped_pipeline_context(
            execution_plan,
            pipeline,
            run_config,
            pipeline_run,
            instance,
            scoped_resources_builder_cm=self._setup_resources,
            # Set this flag even though we're not in test for clearer error reporting
            raise_on_error=True,
        ) as pipeline_context:
            # Config for this solid, or None when the run config has no entry for it.
            per_solid_config = run_config.get("solids", {}).get(solid_def.name, {})
            solid_config = per_solid_config.get("config")
            resource_keys = get_required_resource_keys_to_init(
                execution_plan,
                pipeline_def,
                environment_config,
                pipeline_context.intermediate_storage_def,
            )
            self.context = DagstermillRuntimeExecutionContext(
                pipeline_context=pipeline_context,
                pipeline_def=pipeline_def,
                solid_config=solid_config,
                resource_keys_to_init=resource_keys,
                solid_name=solid_def.name,
            )

        return self.context
예제 #8
0
    def __init__(
        self,
        task_id,
        environment_dict=None,
        pipeline_name=None,
        mode=None,
        step_keys=None,
        dag=None,
        instance_ref=None,
        *args,
        **kwargs
    ):
        """Validate the Dagster arguments and initialize the parent operator.

        Args:
            task_id: Forwarded to the parent constructor.
            environment_dict (Optional[dict]): Environment config with string
                keys. It must contain a non-empty ``'storage'`` entry. When
                filesystem storage is configured without a ``base_dir``, the
                ``base_dir`` is set to ``tmp_dir`` in place.
            pipeline_name (str): Name of the pipeline.
            mode: Stored on the operator as ``self.mode``.
            step_keys (Optional[List[str]]): Stored on the operator.
            dag: Forwarded to the parent constructor.
            instance_ref (Optional[InstanceRef]): When given, the instance is
                loaded from it; otherwise ``self.instance`` is ``None``.
            *args: Forwarded to the parent constructor.
            **kwargs: Forwarded to the parent constructor. ``tmp_dir`` and
                ``host_tmp_dir`` are popped and passed on explicitly.
                ``xcom_all`` is always set to ``True``. ``environment`` defaults
                to ``get_aws_environment()`` when absent.

        Raises:
            AirflowException: If ``environment_dict`` has no storage config.
        """
        check.str_param(pipeline_name, 'pipeline_name')
        step_keys = check.opt_list_param(step_keys, 'step_keys', of_type=str)
        environment_dict = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str)
        check.opt_inst_param(instance_ref, 'instance_ref', InstanceRef)

        tmp_dir = kwargs.pop('tmp_dir', DOCKER_TEMPDIR)
        host_tmp_dir = kwargs.pop('host_tmp_dir', seven.get_system_temp_directory())

        storage = environment_dict.get('storage')
        if not storage:
            # The newline after 'config:' keeps the filesystem example valid YAML;
            # without it 'config:' and 'base_dir:' were rendered on a single line.
            raise AirflowException(
                'No storage config found -- must configure storage for '
                'the DagsterDockerOperator. Ex.: \n'
                'storage:\n'
                '  filesystem:\n'
                '    config:\n'
                '      base_dir: \'/some/shared/volume/mount/special_place\''
                '\n\n --or--\n\n'
                'storage:\n'
                '  s3:\n'
                '    s3_bucket: \'my-s3-bucket\'\n'
                '\n\n --or--\n\n'
                'storage:\n'
                '  gcs:\n'
                '    gcs_bucket: \'my-gcs-bucket\'\n'
            )

        if 'filesystem' in storage:
            # Either level may be present but set to None, hence the `or {}`.
            filesystem = storage['filesystem'] or {}
            filesystem_config = filesystem.get('config') or {}

            if 'base_dir' in filesystem_config and filesystem_config['base_dir'] != tmp_dir:
                warnings.warn(
                    'Found base_dir \'{base_dir}\' set in filesystem storage config, which was not '
                    'the tmp_dir we expected (\'{tmp_dir}\', mounting host_tmp_dir '
                    '\'{host_tmp_dir}\' from the host). We assume you know what you are doing, but '
                    'if you are having trouble executing containerized workloads, this may be the '
                    'issue'.format(
                        base_dir=filesystem_config['base_dir'],
                        tmp_dir=tmp_dir,
                        host_tmp_dir=host_tmp_dir,
                    )
                )
            else:
                # No base_dir given (or it already equals tmp_dir): point the
                # filesystem storage at tmp_dir, keeping any other settings.
                storage['filesystem'] = dict(
                    filesystem, config=dict(filesystem_config, base_dir=tmp_dir)
                )

        self.docker_conn_id_set = kwargs.get('docker_conn_id') is not None
        self.environment_dict = environment_dict
        self.pipeline_name = pipeline_name
        self.mode = mode
        self.step_keys = step_keys
        self._run_id = None
        # self.instance might be None in, for instance, a unit test setting where the operator
        # was being directly instantiated without passing through make_airflow_dag
        self.instance = DagsterInstance.from_ref(instance_ref) if instance_ref else None

        # These shenanigans are so we can override DockerOperator.get_hook in order to configure
        # a docker client using docker.from_env, rather than messing with the logic of
        # DockerOperator.execute
        if not self.docker_conn_id_set:
            try:
                from_env().version()
            except Exception:  # pylint: disable=broad-except
                # Deliberate best-effort probe: on failure docker_conn_id is
                # simply left unset.
                pass
            else:
                kwargs['docker_conn_id'] = True

        # We do this because log lines won't necessarily be emitted in order (!) -- so we can't
        # just check the last log line to see if it's JSON.
        kwargs['xcom_all'] = True

        # Store Airflow DAG run timestamp so that we can pass along via execution metadata
        self.airflow_ts = kwargs.get('ts')

        if 'environment' not in kwargs:
            kwargs['environment'] = get_aws_environment()

        super(DagsterDockerOperator, self).__init__(
            task_id=task_id, dag=dag, tmp_dir=tmp_dir, host_tmp_dir=host_tmp_dir, *args, **kwargs
        )
예제 #9
0
def execute_inner(step_key, pipeline_run, instance_ref):
    """Load the instance from ``instance_ref`` and hand off to ``inner_step``."""
    inner_step(DagsterInstance.from_ref(instance_ref), pipeline_run, step_key)
예제 #10
0
파일: impl.py 프로젝트: coderanger/dagster
def _run_in_subprocess(
    serialized_execute_run_args,
    recon_pipeline,
    termination_event,
    subprocess_status_handler,
    run_event_handler,
):
    """Execute a pipeline run, reporting status and events through the handlers.

    Args:
        serialized_execute_run_args: JSON that must deserialize to an
            ``ExecuteExternalPipelineArgs``.
        recon_pipeline: Passed through to ``_core_execute_run``.
        termination_event: Passed to ``start_termination_thread``.
        subprocess_status_handler: Callable that receives the status messages
            (``IPCErrorMessage``, ``StartRunInSubprocessSuccessful``,
            ``RunInSubprocessComplete``).
        run_event_handler: Callable that receives every engine/run event.

    If setup fails (bad args, instance cannot be loaded, run not found), an
    ``IPCErrorMessage`` followed by ``RunInSubprocessComplete`` is sent to
    ``subprocess_status_handler`` and the function returns without executing.
    """

    start_termination_thread(termination_event)

    # Bind up front so the setup error handler can safely test it. Previously a
    # failure before ``from_ref`` returned made ``if instance:`` raise
    # UnboundLocalError from inside the handler.
    instance = None
    try:
        execute_run_args = deserialize_json_to_dagster_namedtuple(
            serialized_execute_run_args)
        check.inst_param(execute_run_args, "execute_run_args",
                         ExecuteExternalPipelineArgs)

        instance = DagsterInstance.from_ref(execute_run_args.instance_ref)
        pipeline_run = instance.get_run_by_id(execute_run_args.pipeline_run_id)

        if not pipeline_run:
            raise DagsterRunNotFoundError(
                "gRPC server could not load run {run_id} in order to execute it. Make sure that the gRPC server has access to your run storage."
                .format(run_id=execute_run_args.pipeline_run_id),
                invalid_run_id=execute_run_args.pipeline_run_id,
            )

        pid = os.getpid()

    except:  # pylint: disable=bare-except
        serializable_error_info = serializable_error_info_from_exc_info(
            sys.exc_info())
        event = IPCErrorMessage(
            serializable_error_info=serializable_error_info,
            message="Error during RPC setup for executing run: {message}".
            format(message=serializable_error_info.message),
        )
        subprocess_status_handler(event)
        subprocess_status_handler(RunInSubprocessComplete())
        if instance is not None:
            instance.dispose()
        return

    subprocess_status_handler(StartRunInSubprocessSuccessful())

    run_event_handler(
        instance.report_engine_event(
            "Started process for pipeline (pid: {pid}).".format(pid=pid),
            pipeline_run,
            EngineEventData.in_process(pid,
                                       marker_end="cli_api_subprocess_init"),
        ))

    # This is so nasty but seemingly unavoidable
    # https://amir.rachum.com/blog/2017/03/03/generator-cleanup/
    closed = False
    try:
        for event in _core_execute_run(recon_pipeline, pipeline_run, instance):
            run_event_handler(event)
    except GeneratorExit:
        closed = True
        raise
    finally:
        # Skip the exit event when unwinding from GeneratorExit; the completion
        # message and instance disposal always happen.
        if not closed:
            run_event_handler(
                instance.report_engine_event(
                    "Process for pipeline exited (pid: {pid}).".format(
                        pid=pid),
                    pipeline_run,
                ))
        subprocess_status_handler(RunInSubprocessComplete())
        instance.dispose()
예제 #11
0
파일: client.py 프로젝트: sd2k/dagster
    def execute_run(self, execute_run_args):
        """Stream the events of an ``ExecuteRun`` query.

        Args:
            execute_run_args (ExecuteRunArgs): Identifies the instance ref and
                the pipeline run id; it is also serialized into the request.

        Yields:
            The namedtuples deserialized from each streamed event. On failure,
            the engine event reporting the error (and, when the server
            connection dropped, the result of ``report_run_failed``) is yielded
            before the error is re-raised.

        Raises:
            KeyboardInterrupt: Re-raised after requesting cancellation.
            grpc.RpcError: Re-raised after being reported.
            Exception: Any other error, re-raised after being reported.
        """
        check.inst_param(execute_run_args, "execute_run_args", ExecuteRunArgs)

        with DagsterInstance.from_ref(
                execute_run_args.instance_ref) as instance:
            # Look the run up before the guarded section. The handlers below
            # need ``pipeline_run``, so binding it inside the ``try`` meant a
            # failing lookup surfaced as UnboundLocalError from the handler.
            pipeline_run = instance.get_run_by_id(
                execute_run_args.pipeline_run_id)

            try:
                event_iterator = self._streaming_query(
                    "ExecuteRun",
                    api_pb2.ExecuteRunRequest,
                    serialized_execute_run_args=serialize_dagster_namedtuple(
                        execute_run_args),
                )
            except Exception as exc:  # pylint: disable=broad-except
                yield instance.report_engine_event(
                    message="Unexpected error in IPC client",
                    pipeline_run=pipeline_run,
                    engine_event_data=EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info())),
                )
                raise exc

            try:
                for event in event_iterator:
                    yield deserialize_json_to_dagster_namedtuple(
                        event.serialized_dagster_event_or_ipc_error_message)
            except KeyboardInterrupt:
                self.cancel_execution(
                    CancelExecutionRequest(
                        run_id=execute_run_args.pipeline_run_id))
                raise
            except grpc.RpcError as rpc_error:
                debug_string = rpc_error.debug_error_string()  # pylint: disable=no-member
                # "Socket closed" is what posix reports, "Stream removed" is windows.
                if "Socket closed" in debug_string or "Stream removed" in debug_string:
                    yield instance.report_engine_event(
                        message=
                        "User process: GRPC server for {run_id} terminated unexpectedly"
                        .format(run_id=pipeline_run.run_id),
                        pipeline_run=pipeline_run,
                        engine_event_data=EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(
                                sys.exc_info())),
                    )
                    yield instance.report_run_failed(pipeline_run)
                else:
                    yield instance.report_engine_event(
                        message="Unexpected error in IPC client",
                        pipeline_run=pipeline_run,
                        engine_event_data=EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(
                                sys.exc_info())),
                    )
                raise rpc_error
            except Exception as exc:  # pylint: disable=broad-except
                yield instance.report_engine_event(
                    message="Unexpected error in IPC client",
                    pipeline_run=pipeline_run,
                    engine_event_data=EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info())),
                )
                raise exc
예제 #12
0
def test_init_compute_log_with_bad_config_override():
    """Check that an unknown ``compute_logs`` override field is rejected."""
    bad_overrides = {'compute_logs': {'garbage': 'flargh'}}

    with seven.TemporaryDirectory() as tmpdir_path:
        with pytest.raises(DagsterInvalidConfigError, match='Undefined field "garbage"'):
            # Both steps stay inside the raises block, as either may be the one that fails.
            instance_ref = InstanceRef.from_dir(tmpdir_path, overrides=bad_overrides)
            DagsterInstance.from_ref(instance_ref)
예제 #13
0
def test_backcompat_asset_materializations():
    """Check latest-materialization lookups against a 0.11.0 snapshot.

    Lookups are validated both per key and batched, first on the snapshot as
    copied and then again after a job rematerializes asset ``c`` with tags.
    """
    src_dir = file_relative_path(
        __file__, "compat_tests/snapshot_0_11_0_asset_materialization")
    # should contain materialization events for asset keys a, b, c, d, e, f
    # events a and b have been wiped, but b has been rematerialized

    @op
    def materialize():
        yield AssetMaterialization(AssetKey("c"), tags={"foo": "bar"})
        yield Output(None)

    @job
    def my_job():
        materialize()

    def _validate_materialization(asset_key, event, expected_tags):
        assert isinstance(event, EventLogEntry)
        assert event.dagster_event
        assert event.dagster_event.is_step_materialization
        assert event.dagster_event.step_materialization_data.materialization.asset_key == asset_key
        assert event.dagster_event.step_materialization_data.materialization.tags == expected_tags

    a = AssetKey("a")
    b = AssetKey("b")
    c = AssetKey("c")

    with copy_directory(src_dir) as test_dir:
        with DagsterInstance.from_ref(
                InstanceRef.from_dir(test_dir)) as instance:
            storage = instance.event_log_storage

            a_mat = storage.get_latest_materialization_events([a]).get(a)
            assert a_mat is None

            b_mat = storage.get_latest_materialization_events([b]).get(b)
            _validate_materialization(b, b_mat, expected_tags={})

            c_mat = storage.get_latest_materialization_events([c]).get(c)
            _validate_materialization(c, c_mat, expected_tags={})

            mat_by_key = storage.get_latest_materialization_events([a, b, c])
            assert mat_by_key.get(a) is None
            _validate_materialization(b, mat_by_key.get(b), expected_tags={})
            _validate_materialization(c, mat_by_key.get(c), expected_tags={})

            # materialize c with tags
            my_job.execute_in_process(instance=instance)

            a_mat = storage.get_latest_materialization_events([a]).get(a)
            assert a_mat is None

            b_mat = storage.get_latest_materialization_events([b]).get(b)
            _validate_materialization(b, b_mat, expected_tags={})

            c_mat = storage.get_latest_materialization_events([c]).get(c)
            _validate_materialization(c, c_mat, expected_tags={"foo": "bar"})

            mat_by_key = storage.get_latest_materialization_events([a, b, c])
            assert mat_by_key.get(a) is None
            _validate_materialization(b, mat_by_key.get(b), expected_tags={})
            # Validate the batched result for c. This previously re-checked
            # c_mat, leaving the batched lookup after rematerialization untested.
            _validate_materialization(c, mat_by_key.get(c), expected_tags={"foo": "bar"})
예제 #14
0
    def reconstitute_pipeline_context(
        self,
        output_log_path=None,
        marshal_dir=None,
        environment_dict=None,
        executable_dict=None,
        pipeline_run_dict=None,
        solid_handle_kwargs=None,
        instance_ref_dict=None,
    ):
        '''Rebuild the execution context for a notebook run managed by dagstermill.

        A call to this function appears in the ``injected parameters`` cell of a
        dagstermill output notebook. It is not meant to be called interactively
        other than when debugging such a notebook.

        For interactive exploration and development, call
        :func:`dagstermill.get_context` in the ``parameters`` cell instead. When
        dagstermill executes the notebook, that call is replaced by a call to
        :func:`dagstermill.reconstitute_pipeline_context`.

        Returns:
            The ``DagstermillRuntimeExecutionContext`` that is also stored on
            ``self.context``.

        Raises:
            DagstermillError: If the instance cannot be resolved from
                ``instance_ref_dict``.
        '''
        check.opt_str_param(output_log_path, 'output_log_path')
        check.opt_str_param(marshal_dir, 'marshal_dir')
        environment_dict = check.opt_dict_param(
            environment_dict, 'environment_dict', key_type=str
        )
        check.dict_param(pipeline_run_dict, 'pipeline_run_dict')
        check.dict_param(executable_dict, 'executable_dict')
        check.dict_param(solid_handle_kwargs, 'solid_handle_kwargs')
        check.dict_param(instance_ref_dict, 'instance_ref_dict')

        pipeline = InterProcessExecutablePipeline.from_dict(executable_dict)
        pipeline_def = pipeline.get_definition()

        try:
            instance = DagsterInstance.from_ref(unpack_value(instance_ref_dict))
        except Exception as err:  # pylint: disable=broad-except
            wrapped_error = DagstermillError(
                'Error when attempting to resolve DagsterInstance from serialized InstanceRef'
            )
            six.raise_from(wrapped_error, err)

        pipeline_run = unpack_value(pipeline_run_dict)

        handle = SolidHandle.from_dict(solid_handle_kwargs)
        solid_def = pipeline_def.get_solid(handle).definition

        self.marshal_dir = marshal_dir
        self.in_pipeline = True
        self.solid_def = solid_def
        self.pipeline = pipeline

        execution_plan = create_execution_plan(
            self.pipeline,
            environment_dict,
            mode=pipeline_run.mode,
            step_keys_to_execute=pipeline_run.step_keys_to_execute,
        )

        with scoped_pipeline_context(
            execution_plan,
            environment_dict,
            pipeline_run,
            instance,
            scoped_resources_builder_cm=self._setup_resources,
            # Set this flag even though we're not in test for clearer error reporting
            raise_on_error=True,
        ) as pipeline_context:
            resource_keys = get_required_resource_keys_to_init(
                execution_plan, pipeline_context.system_storage_def
            )
            self.context = DagstermillRuntimeExecutionContext(
                pipeline_context=pipeline_context,
                solid_config=None,
                resource_keys_to_init=resource_keys,
            )

        return self.context
예제 #15
0
    def in_mp_process(cls, handle, pipeline_run, instance_ref, term_event):
        """
        Execute a pipeline run in the current process, reporting engine events.

        Returns a ``PipelineExecutionResult`` when execution completes. Returns
        ``None`` when the pipeline cannot be loaded or when execution raises; in
        both cases the error is reported to the instance as an engine event
        (subprocess errors made up solely of KeyboardInterrupts are not
        reported).
        """
        run_id = pipeline_run.run_id
        pipeline_name = pipeline_run.pipeline_name

        instance = DagsterInstance.from_ref(instance_ref)
        pid = os.getpid()

        def _report_current_error(message):
            # Must be called from inside an ``except`` block: it reports the
            # exception currently being handled.
            instance.report_engine_event(
                message,
                pipeline_run,
                EngineEventData.engine_error(serializable_error_info_from_exc_info(sys.exc_info())),
                cls,
            )

        instance.report_engine_event(
            'Started process for pipeline (pid: {pid}).'.format(pid=pid),
            pipeline_run,
            EngineEventData.in_process(pid, marker_end='dagit_subprocess_init'),
            cls,
        )

        start_termination_thread(term_event)

        try:
            handle.build_repository_definition()
            pipeline_def = handle.with_pipeline_name(pipeline_name).build_pipeline_definition()
        except Exception:  # pylint: disable=broad-except
            _report_current_error('Failed attempting to load pipeline "{}"'.format(pipeline_name))
            return

        framework_error_message = (
            'An exception was thrown during execution that is likely a framework error, '
            'rather than an error in user code.'
        )

        try:
            sub_pipeline = pipeline_def.build_sub_pipeline(pipeline_run.selector.solid_subset)
            event_list = list(execute_run_iterator(sub_pipeline, pipeline_run, instance))
            return PipelineExecutionResult(pipeline_def, run_id, event_list, lambda: None)

        # Add a DagsterEvent for unexpected exceptions
        # Explicitly ignore KeyboardInterrupts since they are used for termination
        except DagsterSubprocessError as err:
            only_keyboard_interrupts = all(
                err_info.cls_name == 'KeyboardInterrupt'
                for err_info in err.subprocess_error_infos
            )
            if not only_keyboard_interrupts:
                _report_current_error(framework_error_message)
        except Exception:  # pylint: disable=broad-except
            _report_current_error(framework_error_message)
        finally:
            instance.report_engine_event(
                'Process for pipeline exited (pid: {pid}).'.format(pid=pid), pipeline_run, cls=cls,
            )
예제 #16
0
파일: server.py 프로젝트: cadet702/dagster
    def StartRun(self, request, _context):
        """Launch a run in a subprocess and report whether it started successfully.

        The ``ExecuteRunArgs`` carried by ``request`` are deserialized, a subprocess targeting
        ``start_run_in_subprocess`` is started and registered under the run id, and the event
        queue is polled until the subprocess reports success, reports an IPC error, or is found
        to be dead.

        Args:
            request: Request object exposing ``serialized_execute_run_args``.
            _context: Unused.

        Returns:
            An ``api_pb2.StartRunReply`` wrapping a serialized ``StartRunResult``.
        """
        # Argument parsing lives only inside this ``try``: a malformed request must come back to
        # the caller as a failed StartRunResult rather than escape the handler as an exception.
        try:
            execute_run_args = check.inst(
                deserialize_json_to_dagster_namedtuple(
                    request.serialized_execute_run_args),
                ExecuteRunArgs,
            )

            run_id = execute_run_args.pipeline_run_id

            recon_pipeline = self._recon_pipeline_from_origin(
                execute_run_args.pipeline_origin)

        except:  # pylint: disable=bare-except
            return api_pb2.StartRunReply(
                serialized_start_run_result=serialize_dagster_namedtuple(
                    StartRunResult(
                        success=False,
                        message=None,
                        serializable_error_info=
                        serializable_error_info_from_exc_info(sys.exc_info()),
                    )))

        event_queue = multiprocessing.Queue()
        termination_event = multiprocessing.Event()
        execution_process = multiprocessing.Process(
            target=start_run_in_subprocess,
            args=[
                request.serialized_execute_run_args,
                recon_pipeline,
                event_queue,
                termination_event,
            ],
        )

        # Start the process and register it under the same lock so the bookkeeping maps never
        # miss a process that is already running.
        with self._execution_lock:
            execution_process.start()
            self._executions[run_id] = (
                execution_process,
                DagsterInstance.from_ref(execute_run_args.instance_ref),
            )
            self._termination_events[run_id] = termination_event

        success = None
        message = None
        serializable_error_info = None

        while success is None:
            time.sleep(EVENT_QUEUE_POLL_INTERVAL)
            # We use `get_nowait()` instead of `get()` so that we can handle the case where the
            # execution process has died unexpectedly -- `get()` would hang forever in that case
            try:
                dagster_event_or_ipc_error_message_or_done = event_queue.get_nowait(
                )
            except queue.Empty:
                if not execution_process.is_alive():
                    # subprocess died unexpectedly
                    success = False
                    message = (
                        "GRPC server: Subprocess for {run_id} terminated unexpectedly with "
                        "exit code {exit_code}".format(
                            run_id=run_id,
                            exit_code=execution_process.exitcode,
                        ))
                    # The exception active here is the queue.Empty raised by the poll above.
                    serializable_error_info = serializable_error_info_from_exc_info(
                        sys.exc_info())
            else:
                if isinstance(dagster_event_or_ipc_error_message_or_done,
                              StartRunInSubprocessSuccessful):
                    success = True
                elif isinstance(dagster_event_or_ipc_error_message_or_done,
                                IPCErrorMessage):
                    success = False
                    message = dagster_event_or_ipc_error_message_or_done.message
                    serializable_error_info = (
                        dagster_event_or_ipc_error_message_or_done.
                        serializable_error_info)
                # Anything else (including RunInSubprocessComplete) carries no startup verdict;
                # keep polling.

        # Ensure that if the run failed, we remove it from the executions map before
        # returning so that CanCancel will never return True
        if not success:
            self._check_for_orphaned_runs()

        return api_pb2.StartRunReply(
            serialized_start_run_result=serialize_dagster_namedtuple(
                StartRunResult(
                    success=success,
                    message=message,
                    serializable_error_info=serializable_error_info,
                )))
예제 #17
0
def test_init_compute_log_with_bad_config():
    """An unknown field under ``compute_logs`` in dagster.yaml must fail instance creation."""
    bad_config = {'compute_logs': {'garbage': 'flargh'}}

    with seven.TemporaryDirectory() as tmpdir_path:
        config_path = os.path.join(tmpdir_path, 'dagster.yaml')
        with open(config_path, 'w') as fd:
            yaml.dump(bad_config, fd, default_flow_style=False)

        expected_failure = pytest.raises(
            DagsterInvalidConfigError, match='Undefined field "garbage"'
        )
        with expected_failure:
            instance_ref = InstanceRef.from_dir(tmpdir_path)
            DagsterInstance.from_ref(instance_ref)
예제 #18
0
 def _start_pipeline_execution(self, job_args):
     """Rebuild the pipeline and instance named in ``job_args`` and pass them to the delegate.

     ``job_args`` is read for the keys 'handle', 'pipeline_run' and 'instance_ref'.
     """
     handle, pipeline_run = job_args['handle'], job_args['pipeline_run']
     repository = handle.build_repository_definition()
     pipeline = repository.get_pipeline(pipeline_run.pipeline_name)
     instance = DagsterInstance.from_ref(job_args['instance_ref'])
     self._delegate.execute_pipeline(handle, pipeline, pipeline_run, instance)
예제 #19
0
    def __init__(self, dagster_operator_parameters, *args):
        """Configure the operator from ``dagster_operator_parameters``.

        Args:
            dagster_operator_parameters: Object read for ``op_kwargs``, ``run_config``,
                ``pipeline_name``, ``pipeline_snapshot``, ``execution_plan_snapshot``,
                ``parent_pipeline_snapshot``, ``mode``, ``step_keys``, ``recon_repo``,
                ``instance_ref``, ``task_id`` and ``dag``.
            *args: Extra positional arguments forwarded to the parent ``__init__``.
        """
        # NOTE: kwargs is the op_kwargs object itself, so the pops below mutate it in place.
        kwargs = dagster_operator_parameters.op_kwargs
        tmp_dir = kwargs.pop("tmp_dir", DOCKER_TEMPDIR)
        host_tmp_dir = kwargs.pop("host_tmp_dir",
                                  seven.get_system_temp_directory())
        self.host_tmp_dir = host_tmp_dir

        run_config = dagster_operator_parameters.run_config
        if "filesystem" in run_config["storage"]:
            # A filesystem base_dir that is set and differs from tmp_dir is left untouched (with
            # a warning); in every other case base_dir is set to tmp_dir. The `or {}` guards
            # tolerate "filesystem" / "config" being present but None.
            if ("config" in (run_config["storage"].get("filesystem", {}) or {})
                    and "base_dir" in ((run_config["storage"].get(
                        "filesystem", {}) or {}).get("config", {}) or {}) and
                    run_config["storage"]["filesystem"]["config"]["base_dir"]
                    != tmp_dir):
                warnings.warn(
                    "Found base_dir '{base_dir}' set in filesystem storage config, which was not "
                    "the tmp_dir we expected ('{tmp_dir}', mounting host_tmp_dir "
                    "'{host_tmp_dir}' from the host). We assume you know what you are doing, but "
                    "if you are having trouble executing containerized workloads, this may be the "
                    "issue".format(
                        base_dir=run_config["storage"]["filesystem"]["config"]
                        ["base_dir"],
                        tmp_dir=tmp_dir,
                        host_tmp_dir=host_tmp_dir,
                    ))
            else:
                # Rebuild the filesystem entry, keeping its existing keys and existing config
                # keys, with config.base_dir overridden to tmp_dir.
                run_config["storage"]["filesystem"] = dict(
                    run_config["storage"]["filesystem"] or {}, **{
                        "config":
                        dict(((run_config["storage"].get("filesystem", {})
                               or {}).get("config", {}) or {}),
                             **{"base_dir": tmp_dir})
                    })

        # Remember whether the caller supplied a docker_conn_id before it may be filled in below.
        self.docker_conn_id_set = kwargs.get("docker_conn_id") is not None
        self.run_config = run_config
        self.pipeline_name = dagster_operator_parameters.pipeline_name
        self.pipeline_snapshot = dagster_operator_parameters.pipeline_snapshot
        self.execution_plan_snapshot = dagster_operator_parameters.execution_plan_snapshot
        self.parent_pipeline_snapshot = dagster_operator_parameters.parent_pipeline_snapshot
        self.mode = dagster_operator_parameters.mode
        self.step_keys = dagster_operator_parameters.step_keys
        self.recon_repo = dagster_operator_parameters.recon_repo
        self._run_id = None

        self.instance_ref = dagster_operator_parameters.instance_ref
        check.invariant(self.instance_ref)
        self.instance = DagsterInstance.from_ref(self.instance_ref)

        # These shenanigans are so we can override DockerOperator.get_hook in order to configure
        # a docker client using docker.from_env, rather than messing with the logic of
        # DockerOperator.execute
        if not self.docker_conn_id_set:
            # Best-effort probe: only when from_env().version() succeeds is a placeholder
            # docker_conn_id injected; any failure is deliberately ignored.
            try:
                from_env().version()
            except Exception:  # pylint: disable=broad-except
                pass
            else:
                kwargs["docker_conn_id"] = True

        # Default the container environment from get_aws_environment() unless the caller set one.
        if "environment" not in kwargs:
            kwargs["environment"] = get_aws_environment()

        super(DagsterDockerOperator, self).__init__(
            task_id=dagster_operator_parameters.task_id,
            dag=dagster_operator_parameters.dag,
            tmp_dir=tmp_dir,
            host_tmp_dir=host_tmp_dir,
            xcom_push=True,
            # We do this because log lines won't necessarily be emitted in order (!) -- so we can't
            # just check the last log line to see if it's JSON.
            xcom_all=True,
            *args,
            **kwargs)
예제 #20
0
'''This file is used by the EphemeralGrpcRunLauncher to execute runs in a subprocess.'''
import sys

from dagster import check
from dagster.api.execute_run import cli_api_execute_run_grpc
from dagster.core.instance import DagsterInstance, InstanceRef
from dagster.core.origin import PipelinePythonOrigin
from dagster.serdes import deserialize_json_to_dagster_namedtuple
from dagster.serdes.ipc import setup_interrupt_support
from dagster.seven import json

if __name__ == '__main__':
    # sys.argv[1] is a JSON object holding serialized 'instance_ref' and 'pipeline_origin'
    # namedtuples plus the 'pipeline_run_id' of the run to execute.
    setup_interrupt_support()
    kwargs = json.loads(sys.argv[1])

    instance_ref = check.inst(
        deserialize_json_to_dagster_namedtuple(kwargs['instance_ref']), InstanceRef
    )
    pipeline_origin = check.inst(
        deserialize_json_to_dagster_namedtuple(kwargs['pipeline_origin']), PipelinePythonOrigin
    )
    pipeline_run_id = kwargs['pipeline_run_id']

    instance = DagsterInstance.from_ref(instance_ref)
    pipeline_run = instance.get_run_by_id(pipeline_run_id)

    # Drain the whole event stream, then report how many events were produced.
    events = list(
        cli_api_execute_run_grpc(
            instance_ref=instance_ref,
            pipeline_origin=pipeline_origin,
            pipeline_run=pipeline_run,
        )
    )
    print(len(events))  # pylint: disable=print-call
예제 #21
0
파일: impl.py 프로젝트: M-EZZ/dagster
def get_external_schedule_execution(
    recon_repo,
    instance_ref,
    schedule_name,
    schedule_execution_data_mode,
    scheduled_execution_timestamp,
    scheduled_execution_timezone,
):
    """Evaluate a schedule's user-defined functions and package the outcome.

    ``should_execute`` is only evaluated in LAUNCH_SCHEDULED_EXECUTION mode; when it returns a
    falsy value the run config and tags are not computed.

    Returns:
        ExternalScheduleExecutionData on success, or ExternalScheduleExecutionErrorData when
        a ScheduleExecutionError is raised while evaluating the schedule.
    """
    check.inst_param(recon_repo, "recon_repo", ReconstructableRepository)
    schedule_def = recon_repo.get_definition().get_schedule_def(schedule_name)

    with DagsterInstance.from_ref(instance_ref) as instance:
        # A falsy timestamp means there is no scheduled execution time to hand to the context.
        if scheduled_execution_timestamp:
            scheduled_execution_time = pendulum.from_timestamp(
                scheduled_execution_timestamp,
                tz=scheduled_execution_timezone,
            )
        else:
            scheduled_execution_time = None

        schedule_context = ScheduleExecutionContext(instance, scheduled_execution_time)

        try:
            with user_code_error_boundary(
                ScheduleExecutionError,
                lambda: "Error occurred during the execution of should_execute for schedule "
                "{schedule_name}".format(schedule_name=schedule_def.name),
            ):
                launching = (
                    schedule_execution_data_mode
                    == ScheduleExecutionDataMode.LAUNCH_SCHEDULED_EXECUTION
                )
                should_execute = (
                    schedule_def.should_execute(schedule_context) if launching else None
                )
                if launching and not should_execute:
                    return ExternalScheduleExecutionData(
                        should_execute=False, run_config=None, tags=None
                    )

            with user_code_error_boundary(
                ScheduleExecutionError,
                lambda: "Error occurred during the execution of run_config_fn for schedule "
                "{schedule_name}".format(schedule_name=schedule_def.name),
            ):
                run_config = schedule_def.get_run_config(schedule_context)

            with user_code_error_boundary(
                ScheduleExecutionError,
                lambda: "Error occurred during the execution of tags_fn for schedule "
                "{schedule_name}".format(schedule_name=schedule_def.name),
            ):
                tags = schedule_def.get_tags(schedule_context)

            return ExternalScheduleExecutionData(
                run_config=run_config, tags=tags, should_execute=should_execute
            )
        except ScheduleExecutionError:
            error_info = serializable_error_info_from_exc_info(sys.exc_info())
            return ExternalScheduleExecutionErrorData(error_info)
예제 #22
0
 def instance(self):
     """Return the DagsterInstance, building it from ``self._instance_ref`` on first use.

     A newly built instance is entered on ``self._exit_stack`` before being stored.
     """
     if self._instance:
         return self._instance

     self._instance = self._exit_stack.enter_context(
         DagsterInstance.from_ref(self._instance_ref)
     )
     return self._instance
예제 #23
0
파일: manager.py 프로젝트: xhochy/dagster
    def reconstitute_pipeline_context(
        self,
        output_log_path=None,
        marshal_dir=None,
        environment_dict=None,
        handle_kwargs=None,
        pipeline_run_dict=None,
        solid_subset=None,
        solid_handle_kwargs=None,
        instance_ref_dict=None,
    ):
        '''Rebuild the pipeline context for dagstermill-managed execution.

        A call to this function appears in the ``injected parameters`` cell of a dagstermill
        output notebook. It is not meant to be called interactively, except when debugging an
        output notebook.

        For interactive exploration and development, call :func:`dagstermill.get_context` in the
        ``parameters`` cell of the notebook instead; dagstermill swaps that call for one to
        :func:`dagstermill.reconstitute_pipeline_context` when it executes the notebook.

        Raises:
            DagstermillError: If the pipeline handle cannot be loaded from ``handle_kwargs`` or
                the DagsterInstance cannot be resolved from ``instance_ref_dict``.

        Returns:
            The DagstermillExecutionContext stored on ``self.context``.
        '''
        check.opt_str_param(output_log_path, 'output_log_path')
        check.opt_str_param(marshal_dir, 'marshal_dir')
        environment_dict = check.opt_dict_param(
            environment_dict, 'environment_dict', key_type=str
        )
        check.dict_param(pipeline_run_dict, 'pipeline_run_dict')
        check.dict_param(handle_kwargs, 'handle_kwargs')
        check.opt_list_param(solid_subset, 'solid_subset', of_type=str)
        check.dict_param(solid_handle_kwargs, 'solid_handle_kwargs')
        check.dict_param(instance_ref_dict, 'instance_ref_dict')

        try:
            handle = load_handle.handle_for_pipeline_cli_args(
                handle_kwargs, use_default_repository_yaml=False
            )
        except (check.CheckError, load_handle.UsageError) as handle_err:
            unloadable_handle = DagstermillError(
                'Cannot invoke a dagstermill solid from an in-memory pipeline that was not loaded '
                'from an ExecutionTargetHandle. Run this pipeline using dagit, the dagster CLI, '
                'through dagster-graphql, or in-memory after loading it through an '
                'ExecutionTargetHandle.'
            )
            six.raise_from(unloadable_handle, handle_err)

        try:
            instance = DagsterInstance.from_ref(unpack_value(instance_ref_dict))
        except Exception as instance_err:  # pylint: disable=broad-except
            unresolved_instance = DagstermillError(
                'Error when attempting to resolve DagsterInstance from serialized InstanceRef'
            )
            six.raise_from(unresolved_instance, instance_err)

        # Build the full definition first, then narrow it to the requested solid subset.
        full_pipeline_def = handle.build_pipeline_definition()
        param_label = 'pipeline_def (from handle {handle_dict})'.format(
            handle_dict=handle.data._asdict()
        )
        pipeline_def = check.inst_param(
            full_pipeline_def, param_label, PipelineDefinition
        ).build_sub_pipeline(solid_subset)

        solid_def = pipeline_def.get_solid(SolidHandle.from_dict(solid_handle_kwargs))
        pipeline_run = unpack_value(pipeline_run_dict)

        self.marshal_dir = marshal_dir
        self.in_pipeline = True
        self.solid_def = solid_def
        self.pipeline_def = pipeline_def

        scoped_context = scoped_pipeline_context(
            self.pipeline_def,
            environment_dict,
            pipeline_run,
            instance=instance,
            scoped_resources_builder_cm=self._setup_resources,
        )
        with scoped_context as pipeline_context:
            self.context = DagstermillExecutionContext(pipeline_context)

        return self.context
예제 #24
0
def test_0_12_0_extract_asset_index_cols():
    """Upgrade a 0.12.0 sqlite snapshot lacking the asset index columns and check asset state.

    Asset presence and wipes are exercised before and after ``instance.upgrade()``; the set of
    asset keys must be unchanged by the upgrade.
    """
    src_dir = file_relative_path(__file__, "snapshot_0_12_0_pre_asset_index_cols/sqlite")
    index_columns = ("last_materialization_timestamp", "wipe_timestamp", "tags")

    @solid
    def asset_solid(_):
        yield AssetMaterialization(
            asset_key=AssetKey(["a"]), partition="partition_1", tags={"foo": "FOO"}
        )
        yield AssetMaterialization(asset_key=AssetKey(["b"]), tags={"bar": "BAR"})
        yield Output(1)

    @pipeline
    def asset_pipeline():
        asset_solid()

    with copy_directory(src_dir) as test_dir:
        db_path = os.path.join(test_dir, "history", "runs", "index.db")
        assert get_current_alembic_version(db_path) == "3b529ad30626"

        # None of the index columns exist before the upgrade.
        for column in index_columns:
            assert column not in set(get_sqlite3_columns(db_path, "asset_keys"))

        with DagsterInstance.from_ref(InstanceRef.from_dir(test_dir)) as instance:
            storage = instance._event_storage
            key_a = AssetKey(["a"])
            key_b = AssetKey(["b"])

            # make sure that executing the pipeline works
            execute_pipeline(asset_pipeline, instance=instance)
            assert storage.has_asset_key(key_a)
            assert storage.has_asset_key(key_b)

            # make sure that wiping works
            storage.wipe_asset(key_a)
            assert not storage.has_asset_key(key_a)
            assert storage.has_asset_key(key_b)

            execute_pipeline(asset_pipeline, instance=instance)
            assert storage.has_asset_key(key_a)

            # wipe and leave asset wiped
            storage.wipe_asset(key_b)
            assert not storage.has_asset_key(key_b)

            keys_before_upgrade = storage.all_asset_keys()

            instance.upgrade()

            # Every index column exists after the upgrade.
            for column in index_columns:
                assert column in set(get_sqlite3_columns(db_path, "asset_keys"))

            assert storage.has_asset_key(key_a)
            assert not storage.has_asset_key(key_b)

            keys_after_upgrade = storage.all_asset_keys()
            assert set(keys_before_upgrade) == set(keys_after_upgrade)

            # make sure that storing assets still works
            execute_pipeline(asset_pipeline, instance=instance)

            # make sure that wiping still works
            storage.wipe_asset(key_a)
            assert not storage.has_asset_key(key_a)
예제 #25
0
def _run_in_subprocess(
    serialized_execute_run_args,
    recon_pipeline,
    termination_event,
    subprocess_status_handler,
    run_event_handler,
):
    """Execute a run in the current process, reporting progress through the two handlers.

    Status messages (IPCErrorMessage, StartRunInSubprocessSuccessful, RunInSubprocessComplete)
    go to ``subprocess_status_handler``; engine events and the events yielded by
    ``_core_execute_run`` go to ``run_event_handler``.
    """
    start_termination_thread(termination_event)

    try:
        execute_run_args = deserialize_json_to_dagster_namedtuple(serialized_execute_run_args)
        check.inst_param(execute_run_args, 'execute_run_args', ExecuteRunArgs)

        instance = DagsterInstance.from_ref(execute_run_args.instance_ref)
        pipeline_run = instance.get_run_by_id(execute_run_args.pipeline_run_id)

        pid = os.getpid()

    except:  # pylint: disable=bare-except
        # Setup failed: report the error, signal completion, and stop before executing anything.
        setup_error_info = serializable_error_info_from_exc_info(sys.exc_info())
        subprocess_status_handler(
            IPCErrorMessage(
                serializable_error_info=setup_error_info,
                message='Error during RPC setup for ExecuteRun',
            )
        )
        subprocess_status_handler(RunInSubprocessComplete())
        return

    subprocess_status_handler(StartRunInSubprocessSuccessful())

    started_event = instance.report_engine_event(
        'Started process for pipeline (pid: {pid}).'.format(pid=pid),
        pipeline_run,
        EngineEventData.in_process(pid, marker_end='cli_api_subprocess_init'),
    )
    run_event_handler(started_event)

    # This is so nasty but seemingly unavoidable
    # https://amir.rachum.com/blog/2017/03/03/generator-cleanup/
    generator_closed = False
    try:
        for run_event in _core_execute_run(recon_pipeline, pipeline_run, instance):
            run_event_handler(run_event)
    except KeyboardInterrupt:
        interrupt_event = instance.report_engine_event(
            message='Pipeline execution terminated by interrupt',
            pipeline_run=pipeline_run,
        )
        run_event_handler(interrupt_event)
        raise
    except GeneratorExit:
        generator_closed = True
        raise
    finally:
        # The exit event is skipped only when a GeneratorExit was seen; completion is always
        # signalled last.
        if not generator_closed:
            exit_event = instance.report_engine_event(
                'Process for pipeline exited (pid: {pid}).'.format(pid=pid),
                pipeline_run,
            )
            run_event_handler(exit_event)
        subprocess_status_handler(RunInSubprocessComplete())
예제 #26
0
    def __init__(self,
                 task_id,
                 environment_dict=None,
                 pipeline_name=None,
                 mode=None,
                 step_keys=None,
                 dag=None,
                 instance_ref=None,
                 *args,
                 **kwargs):
        """Validate the Dagster arguments and set up pod name, labels and env vars.

        Raises:
            AirflowException: If ``environment_dict`` has no 'storage' key.
        """
        check.str_param(pipeline_name, 'pipeline_name')
        step_keys = check.opt_list_param(step_keys, 'step_keys', of_type=str)
        environment_dict = check.opt_dict_param(
            environment_dict, 'environment_dict', key_type=str
        )
        check.opt_inst_param(instance_ref, 'instance_ref', InstanceRef)

        pod_name = 'dagster.{pipeline_name}.{task_id}'.format(
            pipeline_name=pipeline_name, task_id=task_id
        )
        # underscores are not permissible DNS names
        kwargs['name'] = pod_name.replace('_', '-')

        if 'storage' not in environment_dict:
            raise AirflowException(
                'No storage config found -- must configure either filesystem or s3 storage for '
                'the DagsterKubernetesPodOperator. Ex.: \n'
                'storage:\n'
                '  filesystem:\n'
                '    base_dir: \'/some/shared/volume/mount/special_place\''
                '\n\n --or--\n\n'
                'storage:\n'
                '  s3:\n'
                '    s3_bucket: \'my-s3-bucket\'\n')

        check.invariant(
            'in_memory' not in environment_dict.get('storage', {}),
            'Cannot use in-memory storage with Airflow, must use S3',
        )

        self.environment_dict = environment_dict
        self.pipeline_name = pipeline_name
        self.mode = mode
        self.step_keys = step_keys
        self._run_id = None
        # self.instance might be None in, for instance, a unit test setting where the operator
        # was being directly instantiated without passing through make_airflow_dag
        if instance_ref:
            self.instance = DagsterInstance.from_ref(instance_ref)
        else:
            self.instance = None

        # Store Airflow DAG run timestamp so that we can pass along via execution metadata
        self.airflow_ts = kwargs.get('ts')

        # Add AWS creds
        self.env_vars = kwargs.get('env_vars', {})
        for env_name, env_value in get_aws_environment().items():
            self.env_vars.setdefault(env_name, env_value)

        # Fill in standard labels without overriding any the caller already supplied.
        labels = kwargs.setdefault('labels', {})
        for label_key, label_default in (
            ('dagster_pipeline', self.pipeline_name),
            ('app.kubernetes.io/name', 'dagster'),
            ('app.kubernetes.io/instance', self.pipeline_name),
            ('app.kubernetes.io/version', dagster_version),
            ('app.kubernetes.io/component', 'pipeline-execution'),
            ('app.kubernetes.io/part-of', 'dagster-airflow'),
            ('app.kubernetes.io/managed-by', 'dagster-airflow'),
        ):
            labels.setdefault(label_key, label_default)

        # The xcom mechanism for the pod operator is very unlike that of the Docker operator, so
        # we disable it
        if 'xcom_push' in kwargs:
            self.log.warning(
                'xcom_push cannot be enabled with the DagsterKubernetesPodOperator, disabling'
            )
        kwargs['xcom_push'] = False

        super(DagsterKubernetesPodOperator, self).__init__(
            task_id=task_id, dag=dag, *args, **kwargs
        )
예제 #27
0
def _framework_error_event(instance, pipeline_run, exc_info):
    """Call ``instance.report_engine_event`` for a likely framework error and return its result.

    Args:
        instance: Object whose ``report_engine_event`` method is invoked.
        pipeline_run: The run passed through to ``report_engine_event``.
        exc_info: The ``sys.exc_info()`` triple of the exception being handled.
    """
    return instance.report_engine_event(
        'An exception was thrown during execution that is likely a framework error, '
        'rather than an error in user code.',
        pipeline_run,
        EngineEventData.engine_error(
            serializable_error_info_from_exc_info(exc_info)),
    )


def _execute_run(request):
    """Generator that executes the run described by ``request`` and yields its events.

    Behavior, in order:

    * ``request.serialized_execute_run_args`` is deserialized and validated, and the pipeline,
      instance and run are loaded. If anything in that setup raises, a single
      ``IPCErrorMessage`` is yielded and the generator stops.
    * An engine event announcing the process start (with the current pid) is yielded.
    * Every event produced by ``execute_run_iterator`` is yielded.
    * On a ``DagsterSubprocessError`` whose ``subprocess_error_infos`` are not all
      ``KeyboardInterrupt``, or on any other ``Exception``, a framework-error engine event is
      yielded and ``instance.report_run_failed`` is called.
    * Unless the consumer closed the generator, a final engine event announcing the process
      exit is yielded.
    """
    try:
        execute_run_args = deserialize_json_to_dagster_namedtuple(
            request.serialized_execute_run_args)
        check.inst_param(execute_run_args, 'execute_run_args', ExecuteRunArgs)

        recon_pipeline = recon_pipeline_from_origin(
            execute_run_args.pipeline_origin)

        instance = DagsterInstance.from_ref(execute_run_args.instance_ref)
        pipeline_run = instance.get_run_by_id(execute_run_args.pipeline_run_id)

        pid = os.getpid()

    except:  # pylint: disable=bare-except
        # Deliberately broad: any setup failure is reported to the caller as an IPC error
        # message instead of propagating out of the generator.
        yield IPCErrorMessage(
            serializable_error_info=serializable_error_info_from_exc_info(
                sys.exc_info()),
            message='Error during RPC setup for ExecuteRun',
        )
        return

    yield instance.report_engine_event(
        'Started process for pipeline (pid: {pid}).'.format(pid=pid),
        pipeline_run,
        EngineEventData.in_process(pid, marker_end='cli_api_subprocess_init'),
    )

    # This is so nasty but seemingly unavoidable
    # https://amir.rachum.com/blog/2017/03/03/generator-cleanup/
    #
    # The error handlers live in an inner ``try`` so that the outer ``except GeneratorExit``
    # also sees a close() that arrives while the generator is suspended at a ``yield`` inside
    # one of those handlers. Otherwise ``closed`` would stay False, the ``finally`` would yield
    # again, and Python would raise "generator ignored GeneratorExit".
    closed = False
    try:
        try:
            for event in execute_run_iterator(recon_pipeline, pipeline_run,
                                              instance):
                yield event
        except DagsterSubprocessError as err:
            only_keyboard_interrupts = all(
                err_info.cls_name == 'KeyboardInterrupt'
                for err_info in err.subprocess_error_infos)
            # Subprocess errors consisting solely of KeyboardInterrupts are not reported as
            # framework errors.
            if not only_keyboard_interrupts:
                yield _framework_error_event(instance, pipeline_run,
                                             sys.exc_info())
                instance.report_run_failed(pipeline_run)
        except Exception:  # pylint: disable=broad-except
            yield _framework_error_event(instance, pipeline_run,
                                         sys.exc_info())
            instance.report_run_failed(pipeline_run)
    except GeneratorExit:
        # The consumer closed the generator: yielding again is not allowed, so remember that
        # and skip the exit event below.
        closed = True
        raise
    finally:
        if not closed:
            yield instance.report_engine_event(
                'Process for pipeline exited (pid: {pid}).'.format(pid=pid),
                pipeline_run,
            )
예제 #28
0
    def __init__(self, dagster_operator_parameters, *args):
        """Configure the Docker operator from ``dagster_operator_parameters``.

        Args:
            dagster_operator_parameters: Parameter bundle. Read here for ``op_kwargs``,
                ``environment_dict``, ``pipeline_name``, ``pipeline_snapshot``,
                ``execution_plan_snapshot``, ``parent_pipeline_snapshot``, ``mode``,
                ``step_keys``, ``instance_ref``, ``task_id`` and ``dag``.
            *args: Extra positional arguments forwarded to the parent ``__init__``.

        Side effects:
            When ``environment_dict['storage']`` contains a ``'filesystem'`` entry that does
            not already name a different ``base_dir``, that entry is rewritten in place so its
            ``config['base_dir']`` equals ``tmp_dir``. A conflicting ``base_dir`` only triggers
            a warning.

        Raises:
            KeyError: If ``environment_dict`` has no ``'storage'`` key.
        """
        # Work on a copy. The pops and assignments below would otherwise modify the dict held
        # by dagster_operator_parameters, so another operator built from the same op_kwargs
        # would lose tmp_dir/host_tmp_dir and inherit the injected docker_conn_id, xcom_all and
        # environment values.
        kwargs = dict(dagster_operator_parameters.op_kwargs or {})
        tmp_dir = kwargs.pop('tmp_dir', DOCKER_TEMPDIR)
        host_tmp_dir = kwargs.pop('host_tmp_dir',
                                  seven.get_system_temp_directory())

        environment_dict = dagster_operator_parameters.environment_dict
        storage = environment_dict['storage']
        if 'filesystem' in storage:
            # Either level may be present but set to None, hence the ``or {}`` fallbacks.
            filesystem = storage['filesystem'] or {}
            filesystem_config = filesystem.get('config') or {}
            if ('base_dir' in filesystem_config
                    and filesystem_config['base_dir'] != tmp_dir):
                warnings.warn(
                    'Found base_dir \'{base_dir}\' set in filesystem storage config, which was not '
                    'the tmp_dir we expected (\'{tmp_dir}\', mounting host_tmp_dir '
                    '\'{host_tmp_dir}\' from the host). We assume you know what you are doing, but '
                    'if you are having trouble executing containerized workloads, this may be the '
                    'issue'.format(
                        base_dir=filesystem_config['base_dir'],
                        tmp_dir=tmp_dir,
                        host_tmp_dir=host_tmp_dir,
                    ))
            else:
                # Keep every existing filesystem/config key and set base_dir to tmp_dir.
                storage['filesystem'] = dict(
                    filesystem,
                    config=dict(filesystem_config, base_dir=tmp_dir))

        self.docker_conn_id_set = kwargs.get('docker_conn_id') is not None
        self.environment_dict = environment_dict
        self.pipeline_name = dagster_operator_parameters.pipeline_name
        self.pipeline_snapshot = dagster_operator_parameters.pipeline_snapshot
        self.execution_plan_snapshot = dagster_operator_parameters.execution_plan_snapshot
        self.parent_pipeline_snapshot = dagster_operator_parameters.parent_pipeline_snapshot
        self.mode = dagster_operator_parameters.mode
        self.step_keys = dagster_operator_parameters.step_keys
        self._run_id = None
        # self.instance might be None in, for instance, a unit test setting where the operator
        # was being directly instantiated without passing through make_airflow_dag
        instance_ref = dagster_operator_parameters.instance_ref
        self.instance = (DagsterInstance.from_ref(instance_ref)
                         if instance_ref else None)

        # These shenanigans are so we can override DockerOperator.get_hook in order to configure
        # a docker client using docker.from_env, rather than messing with the logic of
        # DockerOperator.execute
        if not self.docker_conn_id_set:
            try:
                from_env().version()
            except Exception:  # pylint: disable=broad-except
                # Best-effort probe: if no client can be built from the environment, leave
                # docker_conn_id unset.
                pass
            else:
                kwargs['docker_conn_id'] = True

        # We do this because log lines won't necessarily be emitted in order (!) -- so we can't
        # just check the last log line to see if it's JSON.
        kwargs['xcom_all'] = True

        # Store Airflow DAG run timestamp so that we can pass along via execution metadata
        self.airflow_ts = kwargs.get('ts')

        if 'environment' not in kwargs:
            kwargs['environment'] = get_aws_environment()

        super(DagsterDockerOperator,
              self).__init__(task_id=dagster_operator_parameters.task_id,
                             dag=dagster_operator_parameters.dag,
                             tmp_dir=tmp_dir,
                             host_tmp_dir=host_tmp_dir,
                             *args,
                             **kwargs)
예제 #29
0
 def get_instance(self):
     """Yield the ``DagsterInstance`` built from ``self._instance_ref``.

     The instance is entered as a context manager around the ``yield``, so it is exited
     when the generator is resumed past the ``yield`` or closed.
     """
     # NOTE(review): presumably wrapped by a contextmanager decorator outside this view -- confirm.
     instance_cm = DagsterInstance.from_ref(self._instance_ref)
     with instance_cm as dagster_instance:
         yield dagster_instance
예제 #30
0
파일: server.py 프로젝트: cadet702/dagster
    def ExecuteRun(self, request, _context):
        """Execute a run in a subprocess and stream its events back as ``ExecuteRunEvent``s.

        Args:
            request: Message carrying ``serialized_execute_run_args``.
            _context: Unused.

        Yields:
            ``api_pb2.ExecuteRunEvent`` messages. Each one wraps either an item the subprocess
            put on its event queue or an ``IPCErrorMessage``. An ``IPCErrorMessage`` is produced
            when request setup fails or when the subprocess is found dead with nothing left
            to read.

        While the run is in flight, the subprocess (paired with a ``DagsterInstance``) and its
        termination event are registered in ``self._executions`` and
        ``self._termination_events`` under the run id. Both entries are removed once the
        event loop finishes.
        """
        try:
            execute_run_args = deserialize_json_to_dagster_namedtuple(
                request.serialized_execute_run_args)
            check.inst_param(execute_run_args, "execute_run_args",
                             ExecuteRunArgs)

            run_id = execute_run_args.pipeline_run_id

            recon_pipeline = self._recon_pipeline_from_origin(
                execute_run_args.pipeline_origin)

        except:  # pylint: disable=bare-except
            # Deliberately broad: any setup failure is reported to the caller as an IPC error
            # message rather than raised out of the handler.
            yield api_pb2.ExecuteRunEvent(
                serialized_dagster_event_or_ipc_error_message=
                serialize_dagster_namedtuple(
                    IPCErrorMessage(
                        serializable_error_info=
                        serializable_error_info_from_exc_info(sys.exc_info()),
                        message="Error during RPC setup for ExecuteRun",
                    )))
            return

        event_queue = multiprocessing.Queue()
        termination_event = multiprocessing.Event()
        execution_process = multiprocessing.Process(
            target=execute_run_in_subprocess,
            args=[
                request.serialized_execute_run_args,
                recon_pipeline,
                event_queue,
                termination_event,
            ],
        )
        # Start and register under the lock so the bookkeeping dicts are updated together
        # with the process start.
        with self._execution_lock:
            execution_process.start()
            self._executions[run_id] = (
                execution_process,
                DagsterInstance.from_ref(execute_run_args.instance_ref),
            )
            self._termination_events[run_id] = termination_event

        done = False
        while not done:
            try:
                # We use `get_nowait()` instead of `get()` so that we can handle the case where the
                # execution process has died unexpectedly -- `get()` would hang forever in that case
                dagster_event_or_ipc_error_message_or_done = event_queue.get_nowait(
                )
            except queue.Empty:
                if execution_process.is_alive():
                    # Nothing to read yet: back off before polling again. The sleep is only
                    # needed when the loop is going to poll again.
                    time.sleep(EVENT_QUEUE_POLL_INTERVAL)
                    continue

                # subprocess died unexpectedly
                # The error info attached below is taken from sys.exc_info() inside this
                # handler, i.e. it describes the queue.Empty being handled.
                # NOTE(review): an item enqueued between get_nowait() and is_alive() would be
                # missed here -- confirm whether a final drain is needed.
                yield api_pb2.ExecuteRunEvent(
                    serialized_dagster_event_or_ipc_error_message=
                    serialize_dagster_namedtuple(
                        IPCErrorMessage(
                            serializable_error_info=
                            serializable_error_info_from_exc_info(
                                sys.exc_info()),
                            message=
                            ("GRPC server: Subprocess for {run_id} terminated unexpectedly"
                             ).format(run_id=run_id),
                        )))
                done = True
            else:
                if isinstance(dagster_event_or_ipc_error_message_or_done,
                              RunInSubprocessComplete):
                    # Sentinel: the subprocess has finished sending events.
                    done = True
                elif isinstance(dagster_event_or_ipc_error_message_or_done,
                                StartRunInSubprocessSuccessful):
                    # Start-up acknowledgement: not forwarded to the caller.
                    continue
                else:
                    yield api_pb2.ExecuteRunEvent(
                        serialized_dagster_event_or_ipc_error_message=
                        serialize_dagster_namedtuple(
                            dagster_event_or_ipc_error_message_or_done))

        # Drop the bookkeeping entries for this run. The membership checks tolerate entries
        # that are already gone.
        with self._execution_lock:
            if run_id in self._executions:
                del self._executions[run_id]
            if run_id in self._termination_events:
                del self._termination_events[run_id]