Example #1
File: __init__.py  Project: cy56/dagster
    def create_run_for_pipeline(
        self,
        pipeline_def,
        execution_plan=None,
        run_id=None,
        environment_dict=None,
        mode=None,
        solids_to_execute=None,
        step_keys_to_execute=None,
        status=None,
        tags=None,
        root_run_id=None,
        parent_run_id=None,
        solid_selection=None,
    ):
        from dagster.core.execution.api import create_execution_plan
        from dagster.core.execution.plan.plan import ExecutionPlan
        from dagster.core.snap import snapshot_from_execution_plan

        check.inst_param(pipeline_def, 'pipeline_def', PipelineDefinition)
        check.opt_inst_param(execution_plan, 'execution_plan', ExecutionPlan)

        # note that solids_to_execute is required to execute a solid subset; it is the
        # frozenset version of the former solid_subset argument.
        # solid_selection is not required and will not be converted to solids_to_execute here,
        # i.e. this function doesn't handle solid selection queries.
        # solid_selection is only used to pass the user's queries further down.
        check.opt_set_param(solids_to_execute,
                            'solids_to_execute',
                            of_type=str)
        check.opt_list_param(solid_selection, 'solid_selection', of_type=str)

        if solids_to_execute:
            if isinstance(pipeline_def, PipelineSubsetDefinition):
                # for the case when pipeline_def is created by ExecutablePipeline or ExternalPipeline
                check.invariant(
                    solids_to_execute == pipeline_def.solids_to_execute,
                    'Cannot create a PipelineRun from pipeline subset {pipeline_solids_to_execute} '
                    'that conflicts with solids_to_execute arg {solids_to_execute}'
                    .format(
                        pipeline_solids_to_execute=str_format_list(
                            pipeline_def.solids_to_execute),
                        solids_to_execute=str_format_list(solids_to_execute),
                    ),
                )
            else:
                # for cases when `create_run_for_pipeline` is directly called
                pipeline_def = pipeline_def.get_pipeline_subset_def(
                    solids_to_execute=solids_to_execute)

        if execution_plan is None:
            execution_plan = create_execution_plan(
                pipeline_def,
                environment_dict=environment_dict,
                mode=mode,
                step_keys_to_execute=step_keys_to_execute,
            )

        return self.create_run(
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            environment_dict=environment_dict,
            mode=check.opt_str_param(
                mode, 'mode', default=pipeline_def.get_default_mode_name()),
            solid_selection=solid_selection,
            solids_to_execute=solids_to_execute,
            step_keys_to_execute=step_keys_to_execute,
            status=status,
            tags=tags,
            root_run_id=root_run_id,
            parent_run_id=parent_run_id,
            pipeline_snapshot=pipeline_def.get_pipeline_snapshot(),
            execution_plan_snapshot=snapshot_from_execution_plan(
                execution_plan, pipeline_def.get_pipeline_snapshot_id()),
            parent_pipeline_snapshot=pipeline_def.get_parent_pipeline_snapshot(
            ),
        )
Example #2
 def has_enum_value(self, value):
     check.invariant(self.kind == ConfigTypeKind.ENUM)
     for enum_value in self.enum_values:
         if enum_value.value == value:
             return True
     return False
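The examples on this page all lean on the same behavior: check.invariant(condition, message) is a no-op when the condition is truthy and raises otherwise. A minimal sketch of that pattern, assuming a 0.x-era Dagster where dagster.check is importable (this snippet is illustrative, not taken from the project):

# Minimal sketch: check.invariant raises CheckError when its first argument is
# falsy and otherwise returns None.
from dagster import check
from dagster.check import CheckError

def pick_positive(x):
    check.invariant(x > 0, "x must be positive, got {x}".format(x=x))
    return x

try:
    pick_positive(-1)
except CheckError as err:
    print(err)  # the invariant message is carried on the exception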
Example #3
 def __init__(self, event):
     super().__init__()
     self._event = check.inst_param(event, "event", EventLogEntry)
     check.invariant(
         isinstance(event.dagster_event.step_materialization_data, StepMaterializationData)
     )
Example #4
 def on_modified(self, event):
     check.invariant(event.src_path == self._log_path)
     self._process_log()
Example #5
 def cp_object(self, src, dst):
     check.invariant(not dst in self.values, "key {} already in use".format(dst))
     check.invariant(src in self.values, "key {} not present".format(src))
     self.values[dst] = self.values[src]
     return src, dst
Example #6
def reexecute_pipeline_iterator(
    pipeline: Union[IPipeline, PipelineDefinition],
    parent_run_id: str,
    run_config: Optional[dict] = None,
    step_selection: Optional[List[str]] = None,
    mode: Optional[str] = None,
    preset: Optional[str] = None,
    tags: Optional[Dict[str, Any]] = None,
    instance: Optional[DagsterInstance] = None,
) -> Iterator[DagsterEvent]:
    """Reexecute a pipeline iteratively.

    Rather than package up the result of running a pipeline into a single object, like
    :py:func:`reexecute_pipeline`, this function yields the stream of events resulting from pipeline
    reexecution.

    This is intended to allow the caller to handle these events on a streaming basis in whatever
    way is appropriate.

    Parameters:
        pipeline (Union[IPipeline, PipelineDefinition]): The pipeline to execute.
        parent_run_id (str): The id of the previous run to reexecute. The run must exist in the
            instance.
        run_config (Optional[dict]): The environment configuration that parametrizes this run,
            as a dict.
        step_selection (Optional[List[str]]): A list of step selection queries (including single
            step keys) to execute. For example:

            - ``['some_solid']``: selects the step ``some_solid`` itself.
            - ``['*some_solid']``: selects ``some_solid`` and all its ancestors (upstream dependencies).
            - ``['*some_solid+++']``: selects ``some_solid``, all its ancestors, and its descendants
              (downstream dependencies) within 3 levels down.
            - ``['*some_solid', 'other_solid_a', 'other_solid_b+']``: selects ``some_solid`` and all its
              ancestors, ``other_solid_a`` itself, and ``other_solid_b`` and its direct child steps.

        mode (Optional[str]): The name of the pipeline mode to use. You may not set both ``mode``
            and ``preset``.
        preset (Optional[str]): The name of the pipeline preset to use. You may not set both
            ``mode`` and ``preset``.
        tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to pipeline
            logs.
        instance (Optional[DagsterInstance]): The instance to execute against. If this is ``None``,
            an ephemeral instance will be used, and no artifacts will be persisted from the run.

    Returns:
      Iterator[DagsterEvent]: The stream of events resulting from pipeline reexecution.
    """

    check.opt_list_param(step_selection, "step_selection", of_type=str)

    check.str_param(parent_run_id, "parent_run_id")

    with ephemeral_instance_if_missing(instance) as execute_instance:
        (pipeline, run_config, mode, tags, _,
         _) = _check_execute_pipeline_args(
             pipeline=pipeline,
             run_config=run_config,
             mode=mode,
             preset=preset,
             tags=tags,
             solid_selection=None,
         )
        parent_pipeline_run = execute_instance.get_run_by_id(parent_run_id)
        check.invariant(
            parent_pipeline_run,
            "No parent run with id {parent_run_id} found in instance.".format(
                parent_run_id=parent_run_id),
        )

        step_keys_to_execute: Optional[List[str]] = None
        execution_plan: Optional[ExecutionPlan] = None
        # resolve step selection DSL queries using parent execution information
        if step_selection:
            step_keys_to_execute, execution_plan = _resolve_reexecute_step_selection(
                execute_instance,
                pipeline,
                mode,
                run_config,
                parent_pipeline_run,
                step_selection,
            )

        pipeline_run = execute_instance.create_run_for_pipeline(
            pipeline_def=pipeline.get_definition(),
            run_config=run_config,
            execution_plan=execution_plan,
            mode=mode,
            tags=tags,
            solid_selection=parent_pipeline_run.solid_selection,
            solids_to_execute=parent_pipeline_run.solids_to_execute,
            # convert to frozenset https://github.com/dagster-io/dagster/issues/2914
            step_keys_to_execute=list(step_keys_to_execute)
            if step_keys_to_execute else None,
            root_run_id=parent_pipeline_run.root_run_id
            or parent_pipeline_run.run_id,
            parent_run_id=parent_pipeline_run.run_id,
        )

        return execute_run_iterator(pipeline, pipeline_run, execute_instance)
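A hypothetical usage sketch for the function above; my_pipeline and parent_run_id are placeholders (a reconstructable pipeline and the id of a run stored in the instance), and the event attributes follow the 0.x DagsterEvent API:

# Hypothetical sketch: stream the events of a re-execution as they happen.
# `my_pipeline` and `parent_run_id` are assumed to exist; they are not defined here.
from dagster import DagsterInstance

instance = DagsterInstance.get()  # the persistent instance configured via DAGSTER_HOME
for event in reexecute_pipeline_iterator(
    my_pipeline,
    parent_run_id=parent_run_id,
    run_config={},
    instance=instance,
):
    print(event.event_type_value, event.message)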
Example #7
        def _wrapped_fn(context: SensorEvaluationContext):
            # initialize the cursor to (most recent event id, current timestamp) when:
            # * it's the first time the sensor is started
            # * or, the cursor isn't in a valid format (backcompat)
            if context.cursor is None or not RunStatusSensorCursor.is_valid(
                    context.cursor):
                most_recent_event_records = list(
                    context.instance.get_event_records(ascending=False,
                                                       limit=1))
                most_recent_event_id = (most_recent_event_records[0].storage_id
                                        if len(most_recent_event_records) == 1
                                        else -1)

                new_cursor = RunStatusSensorCursor(
                    update_timestamp=pendulum.now("UTC").isoformat(),
                    record_id=most_recent_event_id,
                )
                context.update_cursor(new_cursor.to_json())
                yield SkipReason(
                    f"Initiating {name}. Set cursor to {new_cursor}")
                return

            record_id, update_timestamp = RunStatusSensorCursor.from_json(
                context.cursor)

            # Fetch events after the cursor id
            # * we move the cursor forward to the latest visited event's id to avoid revisits
            # * when the daemon is down, because we persist the cursor info, we can pick up where we
            #   left off and backfill alerts for the qualifying events (up to 5 at a time) that
            #   occurred during the downtime
            # Note: this is a cross-run query which requires extra handling in sqlite; see details
            # in SqliteEventLogStorage.
            event_records = context.instance.get_event_records(
                EventRecordsFilter(
                    after_cursor=RunShardedEventsCursor(
                        id=record_id,
                        run_updated_after=pendulum.parse(update_timestamp)),
                    event_type=PIPELINE_RUN_STATUS_TO_EVENT_TYPE[
                        pipeline_run_status],
                ),
                ascending=True,
                limit=5,
            )

            for event_record in event_records:
                event_log_entry = event_record.event_log_entry
                storage_id = event_record.storage_id

                # get run info
                run_records = context.instance.get_run_records(
                    filters=PipelineRunsFilter(
                        run_ids=[event_log_entry.run_id]))
                check.invariant(len(run_records) == 1)
                pipeline_run = run_records[0].pipeline_run
                update_timestamp = run_records[0].update_timestamp

                # skip if any of the following happens:
                if (
                        # the pipeline does not have a repository (manually executed)
                        not pipeline_run.external_pipeline_origin or
                        # the pipeline does not belong to the current repository
                        pipeline_run.external_pipeline_origin.
                        external_repository_origin.repository_name !=
                        context.repository_name or
                        # if pipeline is not selected
                    (pipeline_selection
                     and pipeline_run.pipeline_name not in pipeline_selection
                     )):
                    context.update_cursor(
                        RunStatusSensorCursor(
                            record_id=storage_id,
                            update_timestamp=update_timestamp.isoformat()).
                        to_json())
                    continue

                serializable_error = None

                try:
                    with user_code_error_boundary(
                            RunStatusSensorExecutionError,
                            lambda:
                            f'Error occurred during the execution of run status sensor "{name}".',
                    ):
                        # one user code invocation maps to one failure event
                        run_status_sensor_fn(
                            RunStatusSensorContext(
                                sensor_name=name,
                                pipeline_run=pipeline_run,
                                dagster_event=event_log_entry.dagster_event,
                            ))
                except RunStatusSensorExecutionError as run_status_sensor_execution_error:
                    # When the user code errors, we report error to the sensor tick not the original run.
                    serializable_error = serializable_error_info_from_exc_info(
                        run_status_sensor_execution_error.original_exc_info)

                context.update_cursor(
                    RunStatusSensorCursor(record_id=storage_id,
                                          update_timestamp=update_timestamp.
                                          isoformat()).to_json())

                # Yield PipelineRunReaction to indicate execution success/failure.
                # The sensor machinery will:
                # * report back to the original run on success
                # * update the cursor and job state
                yield PipelineRunReaction(
                    pipeline_run=pipeline_run,
                    error=serializable_error,
                )
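The wrapper above backs the run status sensor decorator. A hypothetical sketch of how such a sensor is declared, assuming the 0.12-era run_status_sensor API; send_alert is a placeholder, not a real helper:

# Hypothetical sketch of declaring a run status sensor (0.12-era API assumed).
from dagster import run_status_sensor, PipelineRunStatus

@run_status_sensor(pipeline_run_status=PipelineRunStatus.SUCCESS)
def on_run_success(context):
    # context is a RunStatusSensorContext; context.pipeline_run is the finished run
    send_alert("Run {} succeeded".format(context.pipeline_run.run_id))  # send_alert is a placeholder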
Example #8
 def __init__(self, bucket, client=None, prefix="dagster"):
     self.bucket = check.str_param(bucket, "bucket")
     self.client = client or storage.Client()
     self.bucket_obj = self.client.get_bucket(bucket)
     check.invariant(self.bucket_obj.exists())
     self.prefix = check.str_param(prefix, "prefix")
Example #9
    def __init__(
        self,
        name: str,
        pipeline_name: Optional[str] = None,
        partition_fn: Optional[Callable[..., Union[List[Partition[T]],
                                                   List[str]]]] = None,
        solid_selection: Optional[List[str]] = None,
        mode: Optional[str] = None,
        run_config_fn_for_partition: Callable[[Partition[T]],
                                              Any] = lambda _partition: {},
        tags_fn_for_partition: Callable[[Partition[T]], Optional[Dict[
            str, str]]] = lambda _partition: {},
        partitions_def: Optional[PartitionsDefinition[T]  # pylint: disable=unsubscriptable-object
                                 ] = None,
        job_name: Optional[str] = None,
    ):
        check.invariant(
            partition_fn is not None or partitions_def is not None,
            "One of `partition_fn` or `partitions_def` must be supplied.",
        )
        check.invariant(
            not (partition_fn and partitions_def),
            "Only one of `partition_fn` or `partitions_def` must be supplied.",
        )
        check.invariant(
            not (pipeline_name and job_name),
            "Only one of `job_name` and `pipeline_name` must be supplied.",
        )

        _wrap_partition_fn = None

        if partition_fn is not None:
            partition_fn_param_count = len(
                inspect.signature(partition_fn).parameters)

            def _wrap_partition(x: Union[str, Partition]) -> Partition:
                if isinstance(x, Partition):
                    return x
                if isinstance(x, str):
                    return Partition(x)
                raise DagsterInvalidDefinitionError(
                    "Expected <Partition> | <str>, received {type}".format(
                        type=type(x)))

            def _wrap_partition_fn(current_time=None) -> List[Partition]:
                if not current_time:
                    current_time = pendulum.now("UTC")

                check.callable_param(partition_fn, "partition_fn")

                if partition_fn_param_count == 1:
                    obj_list = cast(
                        Callable[..., List[Union[Partition[T], str]]],
                        partition_fn,
                    )(current_time)
                else:
                    obj_list = partition_fn()  # type: ignore

                return [_wrap_partition(obj) for obj in obj_list]

        self._name = check_valid_name(name)
        self._pipeline_name = check.opt_str_param(pipeline_name,
                                                  "pipeline_name")
        self._job_name = check.opt_str_param(job_name, "job_name")
        self._partition_fn = _wrap_partition_fn
        self._solid_selection = check.opt_nullable_list_param(
            solid_selection, "solid_selection", of_type=str)
        self._mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)
        self._user_defined_run_config_fn_for_partition = check.callable_param(
            run_config_fn_for_partition, "run_config_fn_for_partition")
        self._user_defined_tags_fn_for_partition = check.callable_param(
            tags_fn_for_partition, "tags_fn_for_partition")
        check.opt_inst_param(partitions_def, "partitions_def",
                             PartitionsDefinition)
        if partitions_def is not None:
            self._partitions_def = partitions_def
        else:
            if partition_fn is None:
                check.failed(
                    "One of `partition_fn` or `partitions_def` must be supplied."
                )
            self._partitions_def = DynamicPartitionsDefinition(
                partition_fn=_wrap_partition_fn)
Example #10
File: launcher.py  Project: keyz/dagster
    def __init__(
        self,
        service_account_name,
        instance_config_map,
        postgres_password_secret=None,
        dagster_home=None,
        job_image=None,
        image_pull_policy=None,
        image_pull_secrets=None,
        load_incluster_config=True,
        kubeconfig_file=None,
        inst_data=None,
        job_namespace="default",
        env_config_maps=None,
        env_secrets=None,
        env_vars=None,
        k8s_client_batch_api=None,
        volume_mounts=None,
        volumes=None,
        labels=None,
        fail_pod_on_run_failure=None,
    ):
        self._inst_data = check.opt_inst_param(inst_data, "inst_data",
                                               ConfigurableClassData)
        self.job_namespace = check.str_param(job_namespace, "job_namespace")

        self.load_incluster_config = load_incluster_config
        self.kubeconfig_file = kubeconfig_file
        if load_incluster_config:
            check.invariant(
                kubeconfig_file is None,
                "`kubeconfig_file` is set but `load_incluster_config` is True.",
            )
            kubernetes.config.load_incluster_config()
        else:
            check.opt_str_param(kubeconfig_file, "kubeconfig_file")
            kubernetes.config.load_kube_config(kubeconfig_file)

        self._fixed_batch_api = k8s_client_batch_api

        self._job_config = None
        self._job_image = check.opt_str_param(job_image, "job_image")
        self.dagster_home = check.str_param(dagster_home, "dagster_home")
        self._image_pull_policy = check.opt_str_param(image_pull_policy,
                                                      "image_pull_policy",
                                                      "IfNotPresent")
        self._image_pull_secrets = check.opt_list_param(image_pull_secrets,
                                                        "image_pull_secrets",
                                                        of_type=dict)
        self._service_account_name = check.str_param(service_account_name,
                                                     "service_account_name")
        self.instance_config_map = check.str_param(instance_config_map,
                                                   "instance_config_map")
        self.postgres_password_secret = check.opt_str_param(
            postgres_password_secret, "postgres_password_secret")
        self._env_config_maps = check.opt_list_param(env_config_maps,
                                                     "env_config_maps",
                                                     of_type=str)
        self._env_secrets = check.opt_list_param(env_secrets,
                                                 "env_secrets",
                                                 of_type=str)
        self._env_vars = check.opt_list_param(env_vars,
                                              "env_vars",
                                              of_type=str)
        self._volume_mounts = check.opt_list_param(volume_mounts,
                                                   "volume_mounts")
        self._volumes = check.opt_list_param(volumes, "volumes")
        self._labels = check.opt_dict_param(labels,
                                            "labels",
                                            key_type=str,
                                            value_type=str)
        self._fail_pod_on_run_failure = check.opt_bool_param(
            fail_pod_on_run_failure, "fail_pod_on_run_failure")

        super().__init__()
Example #11
def create_and_launch_partition_backfill(graphene_info, backfill_params):
    from ...schema.backfill import GraphenePartitionBackfillSuccess
    from ...schema.errors import GraphenePartitionSetNotFoundError, GraphenePythonError

    partition_set_selector = backfill_params.get("selector")
    partition_set_name = partition_set_selector.get("partitionSetName")
    repository_selector = RepositorySelector.from_graphql_input(
        partition_set_selector.get("repositorySelector")
    )
    location = graphene_info.context.get_repository_location(repository_selector.location_name)
    repository = location.get_repository(repository_selector.repository_name)
    matches = [
        partition_set
        for partition_set in repository.get_external_partition_sets()
        if partition_set.name == partition_set_selector.get("partitionSetName")
    ]
    if not matches:
        return GraphenePartitionSetNotFoundError(partition_set_name)

    check.invariant(
        len(matches) == 1,
        "Partition set names must be unique: found {num} matches for {partition_set_name}".format(
            num=len(matches), partition_set_name=partition_set_name
        ),
    )

    external_partition_set = next(iter(matches))
    external_pipeline = repository.get_full_external_pipeline(external_partition_set.pipeline_name)
    pipeline_selector = PipelineSelector(
        location_name=location.name,
        repository_name=repository.name,
        pipeline_name=external_pipeline.name,
        solid_selection=external_partition_set.solid_selection,
    )

    partition_names = backfill_params.get("partitionNames")

    backfill_id = make_new_backfill_id()
    result = graphene_info.context.get_external_partition_set_execution_param_data(
        repository.handle, partition_set_name, partition_names
    )

    if isinstance(result, ExternalPartitionExecutionErrorData):
        return GraphenePythonError(result.error)

    assert isinstance(result, ExternalPartitionSetExecutionParamData)

    launched_run_ids = []
    execution_param_list = _build_execution_param_list_for_backfill(
        graphene_info.context.instance,
        result.partition_data,
        backfill_id,
        backfill_params,
        pipeline_selector,
        external_partition_set,
    )

    for execution_params in execution_param_list:
        pipeline_run = create_valid_pipeline_run(graphene_info, external_pipeline, execution_params)
        graphene_info.context.instance.submit_run(pipeline_run.run_id, external_pipeline)
        launched_run_ids.append(pipeline_run.run_id)

    return GraphenePartitionBackfillSuccess(
        backfill_id=backfill_id, launched_run_ids=launched_run_ids
    )
Example #12
def do_composition(
    decorator_name,
    graph_name,
    fn,
    provided_input_defs,
    provided_output_defs,
    config_schema,
    config_fn,
    ignore_output_from_composition_fn,
):
    """
    This is a function used by both @pipeline and @composite_solid to implement their composition
    function, which is our DSL for constructing a dependency graph.

    Args:
        decorator_name (str): Name of the calling decorator. e.g. "@pipeline",
            "@composite_solid", "@graph"
        graph_name (str): User-defined name of the definition being constructed
        fn (Callable): The composition function to be called.
        provided_input_defs(List[InputDefinition]): List of input definitions
            explicitly provided to the decorator by the user.
        provided_output_defs(List[OutputDefinition]): List of output definitions
            explicitly provided to the decorator by the user.
        config_schema(Any): Config schema provided to decorator by user.
        config_fn(Callable): Config fn provided to decorator by user.
        ignore_output_from_composition_fn(Bool): Because of backwards compatibility
            issues, pipelines ignore the return value of the composition function if
            the user has not explicitly provided output definitions.
            This should be removed in 0.10.0.
    """

    actual_input_defs = (provided_input_defs if provided_input_defs is not None
                         else infer_input_definitions_for_graph(
                             decorator_name, graph_name, fn))

    actual_output_defs, outputs_are_explicit = ((
        provided_output_defs, True) if provided_output_defs is not None else (
            infer_output_definitions(decorator_name, graph_name, fn),
            has_explicit_return_type(fn),
        ))

    positional_inputs = validate_solid_fn(decorator_name,
                                          graph_name,
                                          fn,
                                          actual_input_defs,
                                          exclude_nothing=False)

    kwargs = {
        input_def.name: InputMappingNode(input_def)
        for input_def in actual_input_defs
    }

    output = None
    returned_mapping = None
    enter_composition(graph_name, decorator_name)
    try:
        output = fn(**kwargs)
        if ignore_output_from_composition_fn:
            if output is not None:
                warnings.warn(
                    "You have returned a value out of a @pipeline-decorated function. "
                    "This currently has no effect on behavior, but will after 0.10.0 is "
                    "released. In order to preserve existing behavior to do not return "
                    "anything out of this function. Pipelines (and its successor, graphs) "
                    "will have meaningful outputs just like composite solids do today, "
                    "and the return value will be meaningful.",
                    stacklevel=3,
                )
            output = None

        returned_mapping = composite_mapping_from_output(
            output, actual_output_defs, graph_name)
    finally:
        context = exit_composition(returned_mapping)

    check.invariant(
        context.name == graph_name,
        "Composition context stack desync: received context for "
        '"{context.name}" expected "{graph_name}"'.format(
            context=context, graph_name=graph_name),
    )

    # line up mappings in definition order
    input_mappings = []
    for defn in actual_input_defs:
        mappings = [
            mapping for mapping in context.input_mappings
            if mapping.definition.name == defn.name
        ]

        if len(mappings) == 0:
            raise DagsterInvalidDefinitionError(
                "{decorator_name} '{graph_name}' has unmapped input '{input_name}'. "
                "Remove it or pass it to the appropriate solid invocation.".
                format(decorator_name=decorator_name,
                       graph_name=graph_name,
                       input_name=defn.name))

        input_mappings += mappings

    output_mappings = []
    for defn in actual_output_defs:
        mapping = context.output_mapping_dict.get(defn.name)
        if mapping is None:
            # if we inferred output_defs we will be flexible and either take a mapping or not
            if not outputs_are_explicit:
                continue

            # if we are ignoring the output, disregard this unsatisfied mapping
            if ignore_output_from_composition_fn:
                continue

            raise DagsterInvalidDefinitionError(
                "{decorator_name} '{graph_name}' has unmapped output '{output_name}'. "
                "Remove it or return a value from the appropriate solid invocation."
                .format(decorator_name=decorator_name,
                        graph_name=graph_name,
                        output_name=defn.name))
        output_mappings.append(mapping)

    config_mapping = _get_validated_config_mapping(graph_name, config_schema,
                                                   config_fn)

    return (
        input_mappings,
        output_mappings,
        context.dependencies,
        context.solid_defs,
        config_mapping,
        positional_inputs,
    )
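For context on what do_composition processes, here is a minimal sketch of the composition DSL it implements (a trivial illustrative pipeline, not taken from the project):

# Minimal sketch of the composition DSL that do_composition processes:
# the body of a @pipeline (or @composite_solid) function only wires invocations together.
from dagster import pipeline, solid

@solid
def emit_one(_):
    return 1

@solid
def add_one(_, x):
    return x + 1

@pipeline
def my_pipeline():
    add_one(emit_one())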
Example #13
    def solid_def_named(self, name):
        check.str_param(name, 'name')

        check.invariant(name in self._all_solid_defs,
                        '{} not found'.format(name))
        return self._all_solid_defs[name]
Example #14
File: __init__.py  Project: cy56/dagster
    def _construct_run_with_snapshots(
        self,
        pipeline_name,
        run_id,
        environment_dict,
        mode,
        solids_to_execute,
        step_keys_to_execute,
        status,
        tags,
        root_run_id,
        parent_run_id,
        pipeline_snapshot,
        execution_plan_snapshot,
        parent_pipeline_snapshot,
        solid_selection=None,
    ):

        # https://github.com/dagster-io/dagster/issues/2403
        if tags and IS_AIRFLOW_INGEST_PIPELINE_STR in tags:
            if AIRFLOW_EXECUTION_DATE_STR not in tags:
                tags[AIRFLOW_EXECUTION_DATE_STR] = get_current_datetime_in_utc(
                ).isoformat()

        pipeline_run = PipelineRun(
            pipeline_name=pipeline_name,
            run_id=run_id,
            environment_dict=environment_dict,
            mode=mode,
            solid_selection=solid_selection,
            solids_to_execute=solids_to_execute,
            step_keys_to_execute=step_keys_to_execute,
            status=status,
            tags=tags,
            root_run_id=root_run_id,
            parent_run_id=parent_run_id,
        )

        if pipeline_snapshot is not None:
            from dagster.core.snap import create_pipeline_snapshot_id

            if pipeline_snapshot.lineage_snapshot:
                if not self._run_storage.has_pipeline_snapshot(
                        pipeline_snapshot.lineage_snapshot.parent_snapshot_id):
                    check.invariant(
                        create_pipeline_snapshot_id(
                            parent_pipeline_snapshot) ==
                        pipeline_snapshot.lineage_snapshot.parent_snapshot_id,
                        'Parent pipeline snapshot id out of sync with passed parent pipeline snapshot',
                    )

                    returned_pipeline_snapshot_id = self._run_storage.add_pipeline_snapshot(
                        parent_pipeline_snapshot)
                    check.invariant(
                        pipeline_snapshot.lineage_snapshot.parent_snapshot_id
                        == returned_pipeline_snapshot_id)

            pipeline_snapshot_id = create_pipeline_snapshot_id(
                pipeline_snapshot)
            if not self._run_storage.has_pipeline_snapshot(
                    pipeline_snapshot_id):
                returned_pipeline_snapshot_id = self._run_storage.add_pipeline_snapshot(
                    pipeline_snapshot)
                check.invariant(
                    pipeline_snapshot_id == returned_pipeline_snapshot_id)

            pipeline_run = pipeline_run.with_pipeline_snapshot_id(
                pipeline_snapshot_id)

        if execution_plan_snapshot is not None:
            from dagster.core.snap import create_execution_plan_snapshot_id

            check.invariant(execution_plan_snapshot.pipeline_snapshot_id ==
                            pipeline_snapshot_id)

            check.invariant(
                set(step_keys_to_execute) == set(
                    execution_plan_snapshot.step_keys_to_execute)
                if step_keys_to_execute else set(
                    execution_plan_snapshot.step_keys_to_execute) == set(
                        [step.key for step in execution_plan_snapshot.steps]),
                'We encode step_keys_to_execute twice in our stack, unfortunately. This check '
                'ensures that they are consistent. We check that step_keys_to_execute in the plan '
                'matches the step_keys_to_execute params if it is set. If it is not, this indicates '
                'a full execution plan, and so we verify that.',
            )

            execution_plan_snapshot_id = create_execution_plan_snapshot_id(
                execution_plan_snapshot)

            if not self._run_storage.has_execution_plan_snapshot(
                    execution_plan_snapshot_id):
                returned_execution_plan_snapshot_id = self._run_storage.add_execution_plan_snapshot(
                    execution_plan_snapshot)

                check.invariant(execution_plan_snapshot_id ==
                                returned_execution_plan_snapshot_id)

            pipeline_run = pipeline_run.with_execution_plan_snapshot_id(
                execution_plan_snapshot_id)

        return pipeline_run
Example #15
def execute_run(
    pipeline: IPipeline,
    pipeline_run: PipelineRun,
    instance: DagsterInstance,
    raise_on_error: bool = False,
) -> PipelineExecutionResult:
    """Executes an existing pipeline run synchronously.

    Synchronous version of execute_run_iterator.

    Args:
        pipeline (IPipeline): The pipeline to execute.
        pipeline_run (PipelineRun): The run to execute
        instance (DagsterInstance): The instance in which the run has been created.
        raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.
            Defaults to ``False``.

    Returns:
        PipelineExecutionResult: The result of the execution.
    """
    if isinstance(pipeline, PipelineDefinition):
        raise DagsterInvariantViolationError(
            "execute_run requires an IPipeline but received a PipelineDefinition "
            "directly instead. To support hand-off to other processes provide a "
            "ReconstructablePipeline which can be done using reconstructable(). For in "
            "process only execution you can use InMemoryPipeline.")

    check.inst_param(pipeline, "pipeline", IPipeline)
    check.inst_param(pipeline_run, "pipeline_run", PipelineRun)
    check.inst_param(instance, "instance", DagsterInstance)

    if pipeline_run.status == PipelineRunStatus.CANCELED:
        message = "Not starting execution since the run was canceled before execution could start"
        instance.report_engine_event(
            message,
            pipeline_run,
        )
        raise DagsterInvariantViolationError(message)

    check.invariant(
        pipeline_run.status == PipelineRunStatus.NOT_STARTED
        or pipeline_run.status == PipelineRunStatus.STARTING,
        desc="Pipeline run {} ({}) in state {}, expected NOT_STARTED or STARTING"
        .format(pipeline_run.pipeline_name, pipeline_run.run_id,
                pipeline_run.status),
    )
    pipeline_def = pipeline.get_definition()
    if pipeline_run.solids_to_execute:
        if isinstance(pipeline_def, PipelineSubsetDefinition):
            check.invariant(
                pipeline_run.solids_to_execute == pipeline.solids_to_execute,
                "Cannot execute PipelineRun with solids_to_execute {solids_to_execute} that "
                "conflicts with pipeline subset {pipeline_solids_to_execute}.".
                format(
                    pipeline_solids_to_execute=str_format_set(
                        pipeline.solids_to_execute),
                    solids_to_execute=str_format_set(
                        pipeline_run.solids_to_execute),
                ),
            )
        else:
            # when `execute_run` is directly called, the sub pipeline hasn't been created
            # note that when we receive the solids to execute via PipelineRun, it won't support
            # solid selection query syntax
            pipeline = pipeline.subset_for_execution_from_existing_pipeline(
                pipeline_run.solids_to_execute)

    execution_plan = _get_execution_plan_from_run(pipeline, pipeline_run,
                                                  instance)

    if is_memoized_run(pipeline_run.tags):
        environment_config = EnvironmentConfig.build(pipeline.get_definition(),
                                                     pipeline_run.run_config,
                                                     pipeline_run.mode)

        execution_plan = resolve_memoized_execution_plan(
            execution_plan,
            pipeline.get_definition(),
            pipeline_run.run_config,
            instance,
            environment_config,
        )

    output_capture: Optional[Dict[StepOutputHandle, Any]] = {}

    _execute_run_iterable = ExecuteRunWithPlanIterable(
        execution_plan=execution_plan,
        iterator=pipeline_execution_iterator,
        execution_context_manager=PipelineExecutionContextManager(
            pipeline=pipeline,
            execution_plan=execution_plan,
            pipeline_run=pipeline_run,
            instance=instance,
            run_config=pipeline_run.run_config,
            raise_on_error=raise_on_error,
            output_capture=output_capture,
        ),
    )
    event_list = list(_execute_run_iterable)
    pipeline_context = _execute_run_iterable.pipeline_context

    return PipelineExecutionResult(
        pipeline.get_definition(),
        pipeline_run.run_id,
        event_list,
        lambda: scoped_pipeline_context(
            execution_plan,
            pipeline,
            pipeline_run.run_config,
            pipeline_run,
            instance,
            intermediate_storage=pipeline_context.intermediate_storage,
        ),
        output_capture=output_capture,
    )
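A hypothetical calling sketch for execute_run: reconstructable_pipeline is a placeholder for a ReconstructablePipeline, and the run is first registered on the instance (mirroring Example #1's create_run_for_pipeline):

# Hypothetical sketch: register a run on the instance, then execute it in process.
# `reconstructable_pipeline` is assumed to be a ReconstructablePipeline; it is not defined here.
from dagster import DagsterInstance

instance = DagsterInstance.get()
run = instance.create_run_for_pipeline(pipeline_def=reconstructable_pipeline.get_definition())
result = execute_run(reconstructable_pipeline, run, instance, raise_on_error=True)
assert result.success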
Example #16
def get_optional_inner_type(ttype):
    check.invariant(is_closed_python_optional_type(ttype),
                    'type must pass is_closed_python_optional_type check')

    return ttype.__args__[0]
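A standalone illustration of the same idea using only the typing module (this is not the dagster helper, just the underlying mechanics): Optional[X] is Union[X, None], so the inner type is the first entry of its type arguments.

# Standalone illustration of the mechanics behind get_optional_inner_type
# (requires Python 3.8+ for typing.get_origin / typing.get_args).
import typing

def inner_of_optional(ttype):
    args = typing.get_args(ttype)
    assert typing.get_origin(ttype) is typing.Union and type(None) in args
    return args[0]

print(inner_of_optional(typing.Optional[int]))  # <class 'int'>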
Example #17
def reexecute_pipeline(
    pipeline: Union[IPipeline, PipelineDefinition],
    parent_run_id: str,
    run_config: Optional[dict] = None,
    step_selection: Optional[List[str]] = None,
    mode: Optional[str] = None,
    preset: Optional[str] = None,
    tags: Optional[Dict[str, Any]] = None,
    instance: Optional[DagsterInstance] = None,
    raise_on_error: bool = True,
) -> PipelineExecutionResult:
    """Reexecute an existing pipeline run.

    Users will typically call this API when testing pipeline reexecution, or running standalone
    scripts.

    Parameters:
        pipeline (Union[IPipeline, PipelineDefinition]): The pipeline to execute.
        parent_run_id (str): The id of the previous run to reexecute. The run must exist in the
            instance.
        run_config (Optional[dict]): The environment configuration that parametrizes this run,
            as a dict.
        step_selection (Optional[List[str]]): A list of step selection queries (including single
            step keys) to execute. For example:

            - ``['some_solid']``: selects the step ``some_solid`` itself.
            - ``['*some_solid']``: selects ``some_solid`` and all its ancestors (upstream dependencies).
            - ``['*some_solid+++']``: selects ``some_solid``, all its ancestors, and its descendants
              (downstream dependencies) within 3 levels down.
            - ``['*some_solid', 'other_solid_a', 'other_solid_b+']``: selects ``some_solid`` and all its
              ancestors, ``other_solid_a`` itself, and ``other_solid_b`` and its direct child steps.

        mode (Optional[str]): The name of the pipeline mode to use. You may not set both ``mode``
            and ``preset``.
        preset (Optional[str]): The name of the pipeline preset to use. You may not set both
            ``mode`` and ``preset``.
        tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to pipeline
            logs.
        instance (Optional[DagsterInstance]): The instance to execute against. If this is ``None``,
            an ephemeral instance will be used, and no artifacts will be persisted from the run.
        raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.
            Defaults to ``True``, since this is the most useful behavior in tests.

    Returns:
      :py:class:`PipelineExecutionResult`: The result of pipeline execution.

    For the asynchronous version, see :py:func:`reexecute_pipeline_iterator`.
    """

    check.opt_list_param(step_selection, "step_selection", of_type=str)

    check.str_param(parent_run_id, "parent_run_id")

    with ephemeral_instance_if_missing(instance) as execute_instance:
        (pipeline, run_config, mode, tags, _,
         _) = _check_execute_pipeline_args(
             pipeline=pipeline,
             run_config=run_config,
             mode=mode,
             preset=preset,
             tags=tags,
         )

        parent_pipeline_run = execute_instance.get_run_by_id(parent_run_id)
        check.invariant(
            parent_pipeline_run,
            "No parent run with id {parent_run_id} found in instance.".format(
                parent_run_id=parent_run_id),
        )

        step_keys_to_execute: Optional[List[str]] = None
        execution_plan: Optional[ExecutionPlan] = None
        # resolve step selection DSL queries using parent execution information
        if step_selection:
            step_keys_to_execute, execution_plan = _resolve_reexecute_step_selection(
                execute_instance,
                pipeline,
                mode,
                run_config,
                parent_pipeline_run,
                step_selection,
            )

        pipeline_run = execute_instance.create_run_for_pipeline(
            pipeline_def=pipeline.get_definition(),
            execution_plan=execution_plan,
            run_config=run_config,
            mode=mode,
            tags=tags,
            solid_selection=parent_pipeline_run.solid_selection,
            solids_to_execute=parent_pipeline_run.solids_to_execute,
            # convert to frozenset https://github.com/dagster-io/dagster/issues/2914
            step_keys_to_execute=list(step_keys_to_execute)
            if step_keys_to_execute else None,
            root_run_id=parent_pipeline_run.root_run_id
            or parent_pipeline_run.run_id,
            parent_run_id=parent_pipeline_run.run_id,
        )

        return execute_run(
            pipeline,
            pipeline_run,
            execute_instance,
            raise_on_error=raise_on_error,
        )
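A hypothetical usage sketch for the synchronous variant, with the same placeholders as the iterator example above (my_pipeline and parent_run_id are assumed to exist, not defined here):

# Hypothetical sketch: synchronous re-execution of an existing run.
from dagster import DagsterInstance

result = reexecute_pipeline(
    my_pipeline,                   # placeholder IPipeline / PipelineDefinition
    parent_run_id=parent_run_id,   # placeholder id of a run stored in the instance
    instance=DagsterInstance.get(),
)
assert result.success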
Example #18
File: field.py  Project: punneng/dagster
 def default_value(self):
     check.invariant(self.default_provided, 'Asking for default value when none was provided')
     return self._default_value
Example #19
def _check_execute_pipeline_args(
    pipeline: Union[PipelineDefinition, IPipeline],
    run_config: Optional[dict],
    mode: Optional[str],
    preset: Optional[str],
    tags: Optional[Dict[str, Any]],
    solid_selection: Optional[List[str]] = None,
) -> Tuple[IPipeline, Optional[dict], Optional[str], Dict[str, Any],
           FrozenSet[str], Optional[List[str]], ]:
    pipeline = _check_pipeline(pipeline)
    pipeline_def = pipeline.get_definition()
    check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)

    run_config = check.opt_dict_param(run_config, "run_config")
    check.opt_str_param(mode, "mode")
    check.opt_str_param(preset, "preset")
    check.invariant(
        not (mode is not None and preset is not None),
        "You may set only one of `mode` (got {mode}) or `preset` (got {preset})."
        .format(mode=mode, preset=preset),
    )

    tags = check.opt_dict_param(tags, "tags", key_type=str)
    check.opt_list_param(solid_selection, "solid_selection", of_type=str)

    if preset is not None:
        pipeline_preset = pipeline_def.get_preset(preset)

        if pipeline_preset.run_config is not None:
            check.invariant(
                (not run_config) or (pipeline_preset.run_config == run_config),
                "The environment set in preset '{preset}' does not agree with the environment "
                "passed in the `run_config` argument.".format(preset=preset),
            )

            run_config = pipeline_preset.run_config

        # load solid_selection from preset
        if pipeline_preset.solid_selection is not None:
            check.invariant(
                solid_selection is None
                or solid_selection == pipeline_preset.solid_selection,
                "The solid_selection set in preset '{preset}', {preset_subset}, does not agree with "
                "the `solid_selection` argument: {solid_selection}".format(
                    preset=preset,
                    preset_subset=pipeline_preset.solid_selection,
                    solid_selection=solid_selection,
                ),
            )
            solid_selection = pipeline_preset.solid_selection

        check.invariant(
            mode is None or mode == pipeline_preset.mode,
            "Mode {mode} does not agree with the mode set in preset '{preset}': "
            "('{preset_mode}')".format(preset=preset,
                                       preset_mode=pipeline_preset.mode,
                                       mode=mode),
        )

        mode = pipeline_preset.mode

        tags = merge_dicts(pipeline_preset.tags, tags)

    if mode is not None:
        if not pipeline_def.has_mode_definition(mode):
            raise DagsterInvariantViolationError((
                "You have attempted to execute pipeline {name} with mode {mode}. "
                "Available modes: {modes}").format(
                    name=pipeline_def.name,
                    mode=mode,
                    modes=pipeline_def.available_modes,
                ))
    else:
        if pipeline_def.is_multi_mode:
            raise DagsterInvariantViolationError((
                "Pipeline {name} has multiple modes (Available modes: {modes}) and you have "
                "attempted to execute it without specifying a mode. Set "
                "mode property on the PipelineRun object.").format(
                    name=pipeline_def.name,
                    modes=pipeline_def.available_modes))
        mode = pipeline_def.get_default_mode_name()

    tags = merge_dicts(pipeline_def.tags, tags)

    # generate pipeline subset from the given solid_selection
    if solid_selection:
        pipeline = pipeline.subset_for_execution(solid_selection)

    return (
        pipeline,
        run_config,
        mode,
        tags,
        pipeline.solids_to_execute,
        solid_selection,
    )
Example #20
File: field.py  Project: punneng/dagster
 def default_value_as_json_str(self):
     check.invariant(self.default_provided, 'Asking for default value when none was provided')
     return serialize_value(self.default_value)
Example #21
    def _execute_step_k8s_job(
        _self,
        instance_ref_dict,
        step_keys,
        environment_dict,
        mode,
        repo_name,
        repo_location_name,
        run_id,
        job_config_dict,
        job_namespace,
        load_incluster_config,
        resources=None,
        kubeconfig_file=None,
    ):
        '''Run step execution in a K8s job pod.
        '''
        from dagster_k8s import DagsterK8sJobConfig, construct_dagster_graphql_k8s_job
        from dagster_k8s.utils import get_pod_names_in_job, retrieve_pod_logs, wait_for_job_success

        import kubernetes

        check.dict_param(instance_ref_dict, 'instance_ref_dict')
        check.list_param(step_keys, 'step_keys', of_type=str)
        check.invariant(
            len(step_keys) == 1, 'Celery K8s task executor can only execute 1 step at a time'
        )
        check.dict_param(environment_dict, 'environment_dict')
        check.str_param(mode, 'mode')
        check.str_param(repo_name, 'repo_name')
        check.str_param(repo_location_name, 'repo_location_name')
        check.str_param(run_id, 'run_id')

        # Celery will serialize this as a list
        job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
        check.inst_param(job_config, 'job_config', DagsterK8sJobConfig)
        check.str_param(job_namespace, 'job_namespace')
        check.bool_param(load_incluster_config, 'load_incluster_config')
        resources = check.opt_inst_param(
            resources, 'resources', kubernetes.client.V1ResourceRequirements
        )
        check.opt_str_param(kubeconfig_file, 'kubeconfig_file')

        # For when launched via DinD or running the cluster
        if load_incluster_config:
            kubernetes.config.load_incluster_config()
        else:
            kubernetes.config.load_kube_config(kubeconfig_file)

        instance_ref = InstanceRef.from_dict(instance_ref_dict)
        instance = DagsterInstance.from_ref(instance_ref)
        pipeline_run = instance.get_run_by_id(run_id)
        check.invariant(pipeline_run, 'Could not load run {}'.format(run_id))

        step_keys_str = ", ".join(step_keys)

        # Ensure we stay below k8s name length limits
        k8s_name_key = _get_k8s_name_key(run_id, step_keys)
        job_name = 'dagster-stepjob-%s' % k8s_name_key
        pod_name = 'dagster-stepjob-%s' % k8s_name_key

        variables = {
            'executionParams': {
                'runConfigData': environment_dict,
                'mode': mode,
                'selector': {
                    'repositoryLocationName': repo_location_name,
                    'repositoryName': repo_name,
                    'pipelineName': pipeline_run.pipeline_name,
                },
                'executionMetadata': {'runId': run_id},
                'stepKeys': step_keys,
            }
        }

        args = ['-p', 'executePlan', '-v', seven.json.dumps(variables)]

        job = construct_dagster_graphql_k8s_job(job_config, args, job_name, resources, pod_name)

        # Running list of events generated from this task execution
        events = []

        # Post event for starting execution
        engine_event = instance.report_engine_event(
            'Executing steps {} in Kubernetes job {}'.format(step_keys_str, job.metadata.name),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_keys_str, 'Step keys'),
                    EventMetadataEntry.text(job.metadata.name, 'Kubernetes Job name'),
                    EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
                    EventMetadataEntry.text(job_config.job_image, 'Job image'),
                    EventMetadataEntry.text(job_config.image_pull_policy, 'Image pull policy'),
                    EventMetadataEntry.text(
                        str(job_config.image_pull_secrets), 'Image pull secrets'
                    ),
                    EventMetadataEntry.text(
                        str(job_config.service_account_name), 'Service account name'
                    ),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryK8sJobEngine,
            # validated above that step_keys has length 1, and it is not possible to use ETH or an
            # execution plan in this function (Celery K8s workers should not have access to user code)
            step_key=step_keys[0],
        )
        events.append(engine_event)

        kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)

        wait_for_job_success(job.metadata.name, namespace=job_namespace)
        pod_names = get_pod_names_in_job(job.metadata.name, namespace=job_namespace)

        # Post engine event for log retrieval
        engine_event = instance.report_engine_event(
            'Retrieving logs from Kubernetes Job pods',
            pipeline_run,
            EngineEventData([EventMetadataEntry.text('\n'.join(pod_names), 'Pod names')]),
            CeleryK8sJobEngine,
            step_key=step_keys[0],
        )
        events.append(engine_event)

        logs = []
        for pod_name in pod_names:
            raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
            logs += raw_logs.split('\n')

        res = parse_raw_log_lines(logs)

        handle_execution_errors(res, 'executePlan')
        step_events = handle_execute_plan_result(res)

        events += step_events

        serialized_events = [serialize_dagster_namedtuple(event) for event in events]
        return serialized_events
Example #22
def read_unary_response(output_file):
    messages = list(ipc_read_event_stream(output_file))
    check.invariant(len(messages) == 1)
    return messages[0]
Example #23
def get_image(name):
    """Retrieve the image information from the list defined above.
    """
    image = next((img for img in list_images() if img.image == name), None)
    check.invariant(image is not None, "could not find image {}".format(name))
    return image
Example #24
File: utils.py  Project: prezi/dagster
def get_currently_upgrading_instance():
    global _UPGRADING_INSTANCE  # pylint: disable=global-statement
    check.invariant(_UPGRADING_INSTANCE is not None,
                    "currently upgrading instance not set")
    return _UPGRADING_INSTANCE
Example #25
 def non_scalar_type_key(self):
     check.invariant(self.kind == ConfigTypeKind.SCALAR_UNION)
     return self.type_param_keys[1]
Example #26
    def __init__(
        self,
        server_termination_event,
        loadable_target_origin=None,
        heartbeat=False,
        heartbeat_timeout=30,
        lazy_load_user_code=False,
        fixed_server_id=None,
    ):
        super(DagsterApiServer, self).__init__()

        check.bool_param(heartbeat, "heartbeat")
        check.int_param(heartbeat_timeout, "heartbeat_timeout")
        check.invariant(heartbeat_timeout > 0,
                        "heartbeat_timeout must be greater than 0")

        self._server_termination_event = check.inst_param(
            server_termination_event, "server_termination_event",
            ThreadingEventType)
        self._loadable_target_origin = check.opt_inst_param(
            loadable_target_origin, "loadable_target_origin",
            LoadableTargetOrigin)

        # Each server is initialized with a unique UUID. This UUID is used by clients to track when
        # servers are replaced and is used for cache invalidation and reloading.
        self._server_id = check.opt_str_param(fixed_server_id,
                                              "fixed_server_id",
                                              str(uuid.uuid4()))

        # The client tells the server to shut down by calling ShutdownServer (or by failing to send
        # a heartbeat), at which point this event is set. The cleanup thread will then set the
        # server termination event once all current executions have finished, which will stop the
        # server.
        self._shutdown_once_executions_finish_event = threading.Event()

        # Dict[str, (multiprocessing.Process, DagsterInstance)]
        self._executions = {}
        # Dict[str, multiprocessing.Event]
        self._termination_events = {}
        self._termination_times = {}
        self._execution_lock = threading.Lock()

        self._repository_symbols_and_code_pointers = LazyRepositorySymbolsAndCodePointers(
            loadable_target_origin)
        if not lazy_load_user_code:
            self._repository_symbols_and_code_pointers.load()

        self.__last_heartbeat_time = time.time()
        if heartbeat:
            self.__heartbeat_thread = threading.Thread(
                target=self._heartbeat_thread,
                args=(heartbeat_timeout, ),
                name="grpc-server-heartbeat",
            )
            self.__heartbeat_thread.daemon = True
            self.__heartbeat_thread.start()
        else:
            self.__heartbeat_thread = None

        self.__cleanup_thread = threading.Thread(target=self._cleanup_thread,
                                                 args=(),
                                                 name="grpc-server-cleanup")
        self.__cleanup_thread.daemon = True

        self.__cleanup_thread.start()
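
The comments above describe the shutdown protocol: a missed heartbeat flags _shutdown_once_executions_finish_event, and the cleanup thread later sets the termination event once in-flight executions drain. A self-contained sketch of that watchdog idea (illustrative only; not the project's _heartbeat_thread implementation) could be:

import time

class HeartbeatWatchdog:
    """Illustrative only: request shutdown when no heartbeat arrives in time."""

    def __init__(self, heartbeat_timeout, shutdown_event):
        self._heartbeat_timeout = heartbeat_timeout
        self._shutdown_event = shutdown_event
        self._last_heartbeat_time = time.time()

    def ping(self):
        # Called whenever the client sends a heartbeat.
        self._last_heartbeat_time = time.time()

    def run(self):
        # Poll until the client goes quiet, then flag shutdown.
        while not self._shutdown_event.is_set():
            time.sleep(self._heartbeat_timeout)
            if time.time() - self._last_heartbeat_time > self._heartbeat_timeout:
                self._shutdown_event.set()

# Usage sketch: run the watchdog on a daemon thread, as the example does.
# import threading
# shutdown_event = threading.Event()
# watchdog = HeartbeatWatchdog(heartbeat_timeout=30, shutdown_event=shutdown_event)
# threading.Thread(target=watchdog.run, daemon=True).start()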
Example #27
0
 def inner_type_key(self):
     # valid for Noneable and Array
     check.invariant(self.kind == ConfigTypeKind.NONEABLE or self.kind == ConfigTypeKind.ARRAY)
     check.invariant(len(self.type_param_keys) == 1)
     return self.type_param_keys[0]
Example #28
0
    def __init__(
        self,
        host="localhost",
        port=None,
        socket=None,
        max_workers=None,
        loadable_target_origin=None,
        heartbeat=False,
        heartbeat_timeout=30,
        lazy_load_user_code=False,
        ipc_output_file=None,
        fixed_server_id=None,
    ):
        check.opt_str_param(host, "host")
        check.opt_int_param(port, "port")
        check.opt_str_param(socket, "socket")
        check.opt_int_param(max_workers, "max_workers")
        check.opt_inst_param(loadable_target_origin, "loadable_target_origin",
                             LoadableTargetOrigin)
        check.invariant(
            port is not None if seven.IS_WINDOWS else True,
            "You must pass a valid `port` on Windows: `socket` not supported.",
        )
        check.invariant(
            (port or socket) and not (port and socket),
            "You must pass one and only one of `port` or `socket`.",
        )
        check.invariant(
            host is not None if port else True,
            "Must provide a host when serving on a port",
        )
        check.bool_param(heartbeat, "heartbeat")
        check.int_param(heartbeat_timeout, "heartbeat_timeout")
        self._ipc_output_file = check.opt_str_param(ipc_output_file,
                                                    "ipc_output_file")
        check.opt_str_param(fixed_server_id, "fixed_server_id")

        check.invariant(heartbeat_timeout > 0,
                        "heartbeat_timeout must be greater than 0")
        check.invariant(
            max_workers is None or max_workers > 1 if heartbeat else True,
            "max_workers must be greater than 1 or set to None if heartbeat is True. "
            "If set to None, the server will use the gRPC default.",
        )

        self.server = grpc.server(ThreadPoolExecutor(max_workers=max_workers))
        self._server_termination_event = threading.Event()

        try:
            self._api_servicer = DagsterApiServer(
                server_termination_event=self._server_termination_event,
                loadable_target_origin=loadable_target_origin,
                heartbeat=heartbeat,
                heartbeat_timeout=heartbeat_timeout,
                lazy_load_user_code=lazy_load_user_code,
                fixed_server_id=fixed_server_id,
            )
        except Exception:
            if self._ipc_output_file:
                with ipc_write_stream(self._ipc_output_file) as ipc_stream:
                    ipc_stream.send(
                        GrpcServerLoadErrorEvent(
                            error_info=serializable_error_info_from_exc_info(
                                sys.exc_info())))
            raise

        # Create a health check servicer
        self._health_servicer = health.HealthServicer()
        health_pb2_grpc.add_HealthServicer_to_server(self._health_servicer,
                                                     self.server)

        add_DagsterApiServicer_to_server(self._api_servicer, self.server)

        if port:
            server_address = host + ":" + str(port)
        else:
            server_address = "unix:" + os.path.abspath(socket)

        # grpc.Server.add_insecure_port returns:
        # - 0 on failure
        # - port number when a port is successfully bound
        # - 1 when a UDS is successfully bound
        res = self.server.add_insecure_port(server_address)
        if socket and res != 1:
            if self._ipc_output_file:
                with ipc_write_stream(self._ipc_output_file) as ipc_stream:
                    ipc_stream.send(GrpcServerFailedToBindEvent())
            raise CouldNotBindGrpcServerToAddress(socket)
        if port and res != port:
            if self._ipc_output_file:
                with ipc_write_stream(self._ipc_output_file) as ipc_stream:
                    ipc_stream.send(GrpcServerFailedToBindEvent())
            raise CouldNotBindGrpcServerToAddress(port)
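
The return-value convention noted in the comment (0 on failure, the bound port number for TCP, 1 for a Unix domain socket) is what drives the two failure branches. A minimal standalone restatement of that check, with a hypothetical helper name:

def bind_succeeded(res, port=None, socket=None):
    # Hypothetical helper illustrating the grpc.Server.add_insecure_port
    # semantics used above: 0 means the bind failed; a TCP bind echoes the
    # port number; a Unix domain socket bind returns 1.
    if socket is not None:
        return res == 1
    return res == port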
Example #29
0
    def execute_in_process(
        self,
        run_config: Optional[Dict[str, Any]] = None,
        instance: Optional["DagsterInstance"] = None,
        partition_key: Optional[str] = None,
        raise_on_error: bool = True,
        op_selection: Optional[List[str]] = None,
        run_id: Optional[str] = None,
    ) -> "ExecuteInProcessResult":
        """
        Execute the Job in-process, gathering results in-memory.

        The `executor_def` on the Job will be ignored, and replaced with the in-process executor.
        If using the default `io_manager`, it will switch from filesystem to in-memory.

        Args:
            run_config (Optional[Dict[str, Any]]):
                The configuration for the run.
            instance (Optional[DagsterInstance]):
                The instance to execute against; an ephemeral one will be used if none is provided.
            partition_key (Optional[str]):
                The string partition key that specifies the run config to execute. Can only be used
                to select run config for jobs with partitioned config.
            raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.
                Defaults to ``True``.
            op_selection (Optional[List[str]]): A list of op selection queries (including single op
                names) to execute. For example:
                * ``['some_op']``: selects ``some_op`` itself.
                * ``['*some_op']``: select ``some_op`` and all its ancestors (upstream dependencies).
                * ``['*some_op+++']``: select ``some_op``, all its ancestors, and its descendants
                (downstream dependencies) within 3 levels down.
                * ``['*some_op', 'other_op_a', 'other_op_b+']``: select ``some_op`` and all its
                ancestors, ``other_op_a`` itself, and ``other_op_b`` and its direct child ops.
        Returns:
            :py:class:`~dagster.ExecuteInProcessResult`

        """
        from dagster.core.definitions.executor_definition import execute_in_process_executor
        from dagster.core.execution.execute_in_process import core_execute_in_process

        run_config = check.opt_dict_param(run_config, "run_config")
        op_selection = check.opt_list_param(op_selection, "op_selection", str)
        partition_key = check.opt_str_param(partition_key, "partition_key")

        check.invariant(
            len(self._mode_definitions) == 1,
            "execute_in_process only supported on job / single mode pipeline",
        )

        base_mode = self.get_mode_definition()
        # create an ephemeral in-process mode by replacing the executor_def and
        # switching the default filesystem io_manager to in-memory, if another was not set
        in_proc_mode = ModeDefinition(
            name="in_process",
            executor_defs=[execute_in_process_executor],
            resource_defs=_swap_default_io_man(base_mode.resource_defs, self),
            logger_defs=base_mode.loggers,
            _config_mapping=base_mode.config_mapping,
            _partitioned_config=base_mode.partitioned_config,
        )

        ephemeral_job = JobDefinition(
            name=self._name,
            graph_def=self._graph_def,
            mode_def=in_proc_mode,
            hook_defs=self.hook_defs,
            tags=self.tags,
            op_retry_policy=self._solid_retry_policy,
            version_strategy=self.version_strategy,
        ).get_job_def_for_op_selection(op_selection)

        tags = None
        if partition_key:
            if not base_mode.partitioned_config:
                check.failed(
                    f"Provided partition key `{partition_key}` for job `{self._name}` without a partitioned config"
                )
            check.invariant(
                not run_config,
                "Cannot provide both run_config and partition_key arguments to `execute_in_process`",
            )
            partition_set = self.get_partition_set_def()
            if not partition_set:
                check.failed(
                    f"Provided partition key `{partition_key}` for job `{self._name}` without a partitioned config"
                )

            partition = partition_set.get_partition(partition_key)
            run_config = partition_set.run_config_for_partition(partition)
            tags = partition_set.tags_for_partition(partition)

        return core_execute_in_process(
            node=self._graph_def,
            ephemeral_pipeline=ephemeral_job,
            run_config=run_config,
            instance=instance,
            output_capturing_enabled=True,
            raise_on_error=raise_on_error,
            run_tags=tags,
            run_id=run_id,
        )
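
A hypothetical usage sketch of the op-selection syntax documented above; the job and ops here are invented for illustration and are not part of the source:

from dagster import job, op

@op
def emit_five():
    return 5

@op
def double(number):
    return number * 2

@job
def arithmetic_job():
    double(emit_five())

# Execute only `double` and its upstream dependencies, entirely in memory.
result = arithmetic_job.execute_in_process(op_selection=["*double"])
assert result.success
assert result.output_for_node("double") == 10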
Example #30
0
    def _from_storage(
            cls,
            pipeline_name=None,
            run_id=None,
            run_config=None,
            mode=None,
            solid_selection=None,
            solids_to_execute=None,
            step_keys_to_execute=None,
            status=None,
            tags=None,
            root_run_id=None,
            parent_run_id=None,
            pipeline_snapshot_id=None,
            execution_plan_snapshot_id=None,
            # backcompat
            environment_dict=None,
            previous_run_id=None,
            selector=None,
            solid_subset=None,
            reexecution_config=None,  # pylint: disable=unused-argument
            **kwargs):

        # serdes log
        # * removed reexecution_config - serdes logic expected to strip unknown keys so no need to preserve
        # * added pipeline_snapshot_id
        # * renamed previous_run_id -> parent_run_id, added root_run_id
        # * added execution_plan_snapshot_id
        # * removed selector
        # * added solid_subset
        # * renamed solid_subset -> solid_selection, added solids_to_execute
        # * renamed environment_dict -> run_config

        # back compat for environment dict => run_config
        if environment_dict:
            check.invariant(
                not run_config,
                "Cannot set both run_config and environment_dict. Use run_config parameter.",
            )
            run_config = environment_dict

        # back compat for previous_run_id => parent_run_id, root_run_id
        if previous_run_id and not (parent_run_id and root_run_id):
            parent_run_id = previous_run_id
            root_run_id = previous_run_id

        # back compat for selector => pipeline_name, solids_to_execute
        selector = check.opt_inst_param(selector, "selector",
                                        ExecutionSelector)
        if selector:
            check.invariant(
                pipeline_name is None or selector.name == pipeline_name,
                ("Conflicting pipeline name {pipeline_name} in arguments to PipelineRun: "
                 "selector was passed with pipeline {selector_pipeline}".
                 format(pipeline_name=pipeline_name,
                        selector_pipeline=selector.name)),
            )
            if pipeline_name is None:
                pipeline_name = selector.name

            check.invariant(
                solids_to_execute is None
                or set(selector.solid_subset) == solids_to_execute,
                ("Conflicting solids_to_execute {solids_to_execute} in arguments to PipelineRun: "
                 "selector was passed with subset {selector_subset}".format(
                     solids_to_execute=solids_to_execute,
                     selector_subset=selector.solid_subset)),
            )
            # for old runs that only have selector but no solids_to_execute
            if solids_to_execute is None:
                solids_to_execute = (frozenset(selector.solid_subset)
                                     if selector.solid_subset else None)

        # back compat for solid_subset => solids_to_execute
        check.opt_list_param(solid_subset, "solid_subset", of_type=str)
        if solid_subset:
            solids_to_execute = frozenset(solid_subset)

        # warn about unused arguments
        if len(kwargs):
            warnings.warn(
                "Found unhandled arguments from stored PipelineRun: {args}".
                format(args=kwargs.keys()))

        return cls.__new__(  # pylint: disable=redundant-keyword-arg
            cls,
            pipeline_name=pipeline_name,
            run_id=run_id,
            run_config=run_config,
            mode=mode,
            solid_selection=solid_selection,
            solids_to_execute=solids_to_execute,
            step_keys_to_execute=step_keys_to_execute,
            status=status,
            tags=tags,
            root_run_id=root_run_id,
            parent_run_id=parent_run_id,
            pipeline_snapshot_id=pipeline_snapshot_id,
            execution_plan_snapshot_id=execution_plan_snapshot_id,
        )
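
The back-compat branches above amount to a handful of key renames. A hypothetical, dagster-independent sketch of that mapping (the helper name is invented for illustration and simplifies the original's guard conditions):

def upgrade_stored_run_fields(stored):
    # Approximate the branches above: environment_dict -> run_config,
    # previous_run_id -> parent_run_id / root_run_id, and
    # solid_subset -> frozenset solids_to_execute.
    upgraded = dict(stored)
    if "environment_dict" in upgraded:
        upgraded["run_config"] = upgraded.pop("environment_dict")
    if "previous_run_id" in upgraded:
        previous = upgraded.pop("previous_run_id")
        upgraded.setdefault("parent_run_id", previous)
        upgraded.setdefault("root_run_id", previous)
    if "solid_subset" in upgraded:
        upgraded["solids_to_execute"] = frozenset(upgraded.pop("solid_subset"))
    return upgraded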