Example #1
 def assign_id_and_return(self, id: str):
     if self.id:
         raise _user_exceptions.FlyteAssertion(
             f"Error assigning ID: {id} because {self} is already assigned. Has this node been ssigned to another "
             "workflow already?")
     self._id = _dnsify(id) if id else None
     self._metadata.name = id
     return self
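The snippet above shows the guard that prevents a node's ID from being assigned twice. A minimal, self-contained sketch of the same assign-once pattern (the SimpleNode class and error type here are illustrative stand-ins, not flytekit's actual classes):

class SimpleNode:
    def __init__(self):
        self._id = None

    @property
    def id(self):
        return self._id

    def assign_id_and_return(self, id: str):
        # Refuse to overwrite an ID that has already been assigned.
        if self._id:
            raise ValueError(f"cannot assign {id!r}: node already has id {self._id!r}")
        self._id = id
        return self  # returning self allows call chaining

node = SimpleNode().assign_id_and_return("n0")
print(node.id)  # n0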
Example #2
    def __init__(self,
                 id,
                 upstream_nodes,
                 bindings,
                 metadata,
                 sdk_task=None,
                 sdk_workflow=None,
                 sdk_launch_plan=None,
                 sdk_branch=None):
        """
        :param Text id: A workflow-level unique identifier that identifies this node in the workflow. "inputs" and
            "outputs" are reserved node ids that cannot be used by other nodes.
        :param flytekit.models.core.workflow.NodeMetadata metadata: Extra metadata about the node.
        :param list[flytekit.models.literals.Binding] bindings: Specifies how to bind the underlying
            interface's inputs.  All required inputs specified in the underlying interface must be fulfilled.
        :param list[SdkNode] upstream_nodes: Specifies execution dependencies for this node ensuring it will
            only get scheduled to run after all its upstream nodes have completed. This node will have
            an implicit dependency on any node that appears in inputs field.
        :param flytekit.common.tasks.task.SdkTask sdk_task: The task to execute in this
            node.
        :param flytekit.common.workflow.SdkWorkflow sdk_workflow: The workflow to execute in this node.
        :param flytekit.common.launch_plan.SdkLaunchPlan sdk_launch_plan: The launch plan to execute in this
            node.
        :param TODO sdk_branch: TODO
        """
        non_none_entities = [
            entity for entity in
            [sdk_workflow, sdk_branch, sdk_launch_plan, sdk_task]
            if entity is not None
        ]
        if len(non_none_entities) != 1:
            raise _user_exceptions.FlyteAssertion(
                "An SDK node must have one underlying entity specified at once.  Received the following "
                "entities: {}".format(non_none_entities))

        workflow_node = None
        if sdk_workflow is not None:
            workflow_node = _component_nodes.SdkWorkflowNode(
                sdk_workflow=sdk_workflow)
        elif sdk_launch_plan is not None:
            workflow_node = _component_nodes.SdkWorkflowNode(
                sdk_launch_plan=sdk_launch_plan)

        super(SdkNode, self).__init__(
            id=_dnsify(id) if id else None,
            metadata=metadata,
            inputs=bindings,
            upstream_node_ids=[n.id for n in upstream_nodes],
            output_aliases=[],  # TODO: Are aliases a thing in SDK nodes
            task_node=_component_nodes.SdkTaskNode(sdk_task)
            if sdk_task else None,
            workflow_node=workflow_node,
            branch_node=sdk_branch.target if sdk_branch else None)
        self._upstream = upstream_nodes
        self._executable_sdk_object = sdk_task or sdk_workflow or sdk_branch or sdk_launch_plan
        self._outputs = OutputParameterMapper(
            self._executable_sdk_object.interface.outputs, self)
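The constructor above requires that exactly one of the optional entities (task, workflow, launch plan, or branch) is supplied. The validation itself is a small, reusable pattern; a hedged sketch with illustrative names:

def pick_exactly_one(**candidates):
    """Return the single non-None value among the keyword arguments, or raise."""
    provided = {name: value for name, value in candidates.items() if value is not None}
    if len(provided) != 1:
        raise ValueError(f"Expected exactly one entity, received: {sorted(provided)}")
    return next(iter(provided.values()))

# e.g. entity = pick_exactly_one(sdk_task=my_task, sdk_workflow=None, sdk_launch_plan=None)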
Example #3
 def assign_id_and_return(self, id):
     """
     :param Text id:
     :rtype: None
     """
     if self.id:
         raise _user_exceptions.FlyteAssertion(
             "Error assigning ID: {} because {} is already assigned.  Has this node been assigned to another "
             "workflow already?".format(id, self))
     self._id = _dnsify(id) if id else None
     return self
Example #4
 def __init__(
     self,
     id: str,
     metadata: _workflow_model.NodeMetadata,
     bindings: List[_literal_models.Binding],
     upstream_nodes: List[Node],
     flyte_entity: Any,
 ):
     self._id = _dnsify(id)
     self._metadata = metadata
     self._bindings = bindings
     self._upstream_nodes = upstream_nodes
     self._flyte_entity = flyte_entity
     self._sdk_node = None
     self._aliases: _workflow_model.Alias = None
Example #5
 def __init__(
     self,
     id: str,
     metadata: _workflow_model.NodeMetadata,
     bindings: List[_literal_models.Binding],
     upstream_nodes: List[Node],
     flyte_entity: Any,
 ):
     if id is None:
         raise ValueError("Illegal construction of node, without a Node ID")
     self._id = _dnsify(id)
     self._metadata = metadata
     self._bindings = bindings
     self._upstream_nodes = upstream_nodes
     self._flyte_entity = flyte_entity
     self._aliases: _workflow_model.Alias = None
     self._outputs = None
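Every example in this list routes the node ID through `_dnsify` before storing it. Its implementation is not shown here, but the intent is to turn an arbitrary ID into something resembling a DNS-1123 label; a rough, assumed equivalent for illustration only:

import re

def dnsify_like(value: str) -> str:
    # Lowercase the ID, replace characters outside [a-z0-9-] with dashes,
    # trim leading/trailing dashes, and cap the length at 63 characters
    # (the DNS label limit). This approximates, but is not, flytekit's _dnsify.
    cleaned = re.sub(r"[^a-z0-9-]+", "-", value.lower())
    return cleaned.strip("-")[:63]

print(dnsify_like("My_Task.Node_0"))  # my-task-node-0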
Example #6
    def __init__(
        self,
        id,
        upstream_nodes,
        bindings,
        metadata,
        flyte_task: "flytekit.control_plan.tasks.task.FlyteTask" = None,
        flyte_workflow: "flytekit.control_plane.workflow.FlyteWorkflow" = None,
        flyte_launch_plan=None,
        flyte_branch=None,
        parameter_mapping=True,
    ):
        non_none_entities = list(
            filter(
                None,
                [flyte_task, flyte_workflow, flyte_launch_plan, flyte_branch]))
        if len(non_none_entities) != 1:
            raise _user_exceptions.FlyteAssertion(
                "An Flyte node must have one underlying entity specified at once.  Received the following "
                "entities: {}".format(non_none_entities))

        workflow_node = None
        if flyte_workflow is not None:
            workflow_node = _component_nodes.FlyteWorkflowNode(
                flyte_workflow=flyte_workflow)
        elif flyte_launch_plan is not None:
            workflow_node = _component_nodes.FlyteWorkflowNode(
                flyte_launch_plan=flyte_launch_plan)

        super(FlyteNode, self).__init__(
            id=_dnsify(id) if id else None,
            metadata=metadata,
            inputs=bindings,
            upstream_node_ids=[n.id for n in upstream_nodes],
            output_aliases=[],
            task_node=_component_nodes.FlyteTaskNode(flyte_task)
            if flyte_task else None,
            workflow_node=workflow_node,
            branch_node=flyte_branch,
        )
        self._upstream = upstream_nodes
Example #7
    def _produce_dynamic_job_spec(self, context, inputs):
        """
        Runs user code and produces future task nodes to run sub-tasks.
        :param context:
        :param flytekit.models.literals.LiteralMap inputs:
        :rtype: (_dynamic_job.DynamicJobSpec, dict[Text, flytekit.models.common.FlyteIdlEntity])
        """
        inputs_dict = _type_helpers.unpack_literal_map_to_sdk_python_std(
            inputs, {
                k: _type_helpers.get_sdk_type_from_literal_type(v.type)
                for k, v in _six.iteritems(self.interface.inputs)
            })
        outputs_dict = {
            name: PromiseOutputReference(
                _type_helpers.get_sdk_type_from_literal_type(variable.type))
            for name, variable in _six.iteritems(self.interface.outputs)
        }

        inputs_dict.update(outputs_dict)
        yielded_sub_tasks = [
            sub_task
            for sub_task in super(SdkDynamicTask, self)._execute_user_code(
                context, inputs_dict) or []
        ]

        upstream_nodes = list()
        output_bindings = [
            _literal_models.Binding(
                var=name,
                binding=_interface.BindingData.from_python_std(
                    b.sdk_type.to_flyte_literal_type(),
                    b.raw_value,
                    upstream_nodes=upstream_nodes))
            for name, b in _six.iteritems(outputs_dict)
        ]
        upstream_nodes = set(upstream_nodes)

        generated_files = {}
        # Keeping future-tasks in original order. We don't use upstream_nodes exclusively because the parent task can
        # yield sub-tasks that it never uses to produce final outputs but they need to execute nevertheless.
        array_job_index = {}
        tasks = []
        nodes = []
        visited_nodes = set()
        generated_ids = {}
        effective_failure_ratio = self._allowed_failure_ratio or 0.0
        for sub_task_node in _itertools.chain(yielded_sub_tasks,
                                              upstream_nodes):
            if sub_task_node in visited_nodes:
                continue
            visited_nodes.add(sub_task_node)

            # Generate an id that's unique in the document (if the same task is used multiple times with
            # different resources, executable_sdk_object.id will be the same but generated node_ids should not
            # be).
            safe_task_id = _six.text_type(
                sub_task_node.executable_sdk_object.id)
            if safe_task_id in generated_ids:
                new_count = generated_ids[
                    safe_task_id] = generated_ids[safe_task_id] + 1
            else:
                new_count = generated_ids[safe_task_id] = 0
            unique_node_id = _dnsify("{}-{}".format(safe_task_id, new_count))

            # If the task can run as an array job, group its instances together. Otherwise, keep each invocation as a
            # separate node.
            if SdkDynamicTask._can_run_as_array(
                    sub_task_node.executable_sdk_object.type):
                if sub_task_node.executable_sdk_object in array_job_index:
                    array_job, node = array_job_index[
                        sub_task_node.executable_sdk_object]
                    array_job.size += 1
                    array_job.min_successes = int(
                        math.ceil(
                            (1 - effective_failure_ratio) * array_job.size))
                else:
                    array_job = self._create_array_job(
                        inputs_prefix=unique_node_id)
                    node = sub_task_node.assign_id_and_return(unique_node_id)
                    array_job_index[sub_task_node.executable_sdk_object] = (
                        array_job, node)

                node_index = _six.text_type(array_job.size - 1)
                for k, node_output in _six.iteritems(sub_task_node.outputs):
                    if not node_output.sdk_node.id:
                        node_output.sdk_node.assign_id_and_return(node.id)
                    node_output.var = "[{}].{}".format(node_index,
                                                       node_output.var)

                # Upload inputs to working directory under /array_job.input_ref/<index>/inputs.pb
                input_path = _os.path.join(node.id, node_index,
                                           _constants.INPUT_FILE_NAME)
                generated_files[input_path] = _literal_models.LiteralMap(
                    literals={
                        binding.var: binding.binding.to_literal_model()
                        for binding in sub_task_node.inputs
                    })
            else:
                node = sub_task_node.assign_id_and_return(unique_node_id)

                tasks.append(sub_task_node.executable_sdk_object)
                nodes.append(node)

                for k, node_output in _six.iteritems(sub_task_node.outputs):
                    if not node_output.sdk_node.id:
                        node_output.sdk_node.assign_id_and_return(node.id)

                # Upload inputs to working directory under /array_job.input_ref/inputs.pb
                input_path = _os.path.join(node.id, _constants.INPUT_FILE_NAME)
                generated_files[input_path] = _literal_models.LiteralMap(
                    literals={
                        binding.var: binding.binding.to_literal_model()
                        for binding in sub_task_node.inputs
                    })

        # Assign the computed ArrayJob properties to each task's custom field.
        for task, (array_job, _) in _six.iteritems(array_job_index):
            # TODO: Reconstruct task template object instead of modifying an existing one?
            tasks.append(
                task.assign_custom_and_return(
                    array_job.to_dict()).assign_type_and_return(
                        _constants.SdkTaskType.CONTAINER_ARRAY_TASK))

        # min_successes is absolute: it's the complement of allowed_failure_ratio multiplied by the
        # total number of nodes, giving an absolute count.
        nodes.extend([
            array_job_node for (_, array_job_node) in array_job_index.values()
        ])
        dynamic_job_spec = _dynamic_job.DynamicJobSpec(
            min_successes=len(nodes),
            tasks=tasks,
            nodes=nodes,
            outputs=output_bindings,
            subworkflows=[])

        return dynamic_job_spec, generated_files
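Both dynamic-job examples (this one and Example #8) build a document-unique node ID by counting how many times each task has been seen and DNS-ifying "{task_id}-{count}". In isolation the pattern looks roughly like this (dnsify here is a placeholder standing in for the _dnsify helper used above):

from collections import defaultdict

def make_node_id_factory(dnsify=lambda s: s.lower().replace("_", "-")):
    counters = defaultdict(int)  # occurrences of each task id seen so far

    def next_node_id(task_id: str) -> str:
        count = counters[task_id]
        counters[task_id] += 1
        return dnsify(f"{task_id}-{count}")

    return next_node_id

next_id = make_node_id_factory()
print(next_id("my_task"))  # my-task-0
print(next_id("my_task"))  # my-task-1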
Example #8
    def _produce_dynamic_job_spec(self, context, inputs):
        """
        Runs user code and produces future task nodes to run sub-tasks.
        :param context:
        :param flytekit.models.literals.LiteralMap inputs:
        :rtype: (_dynamic_job.DynamicJobSpec, dict[Text, flytekit.models.common.FlyteIdlEntity])
        """
        inputs_dict = _type_helpers.unpack_literal_map_to_sdk_python_std(
            inputs, {
                k: _type_helpers.get_sdk_type_from_literal_type(v.type)
                for k, v in _six.iteritems(self.interface.inputs)
            })
        outputs_dict = {
            name: PromiseOutputReference(
                _type_helpers.get_sdk_type_from_literal_type(variable.type))
            for name, variable in _six.iteritems(self.interface.outputs)
        }

        # Because users declare both inputs and outputs in their functions signatures, merge them together
        # before calling user code
        inputs_dict.update(outputs_dict)
        yielded_sub_tasks = [
            sub_task
            for sub_task in super(SdkDynamicTask, self)._execute_user_code(
                context, inputs_dict) or []
        ]

        upstream_nodes = list()
        output_bindings = [
            _literal_models.Binding(
                var=name,
                binding=_interface.BindingData.from_python_std(
                    b.sdk_type.to_flyte_literal_type(),
                    b.raw_value,
                    upstream_nodes=upstream_nodes))
            for name, b in _six.iteritems(outputs_dict)
        ]
        upstream_nodes = set(upstream_nodes)

        generated_files = {}
        # Keeping future-tasks in original order. We don't use upstream_nodes exclusively because the parent task can
        # yield sub-tasks that it never uses to produce final outputs but they need to execute nevertheless.
        array_job_index = {}
        tasks = set()
        nodes = []
        sub_workflows = set()
        visited_nodes = set()
        generated_ids = {}
        effective_failure_ratio = self._allowed_failure_ratio or 0.0

        # TODO: This function needs to be cleaned up.
        # The reason we chain these two together is because we allow users to not have to explicitly "yield" the
        # node. As long as the subtask/lp/subwf has an output that's referenced, it'll get picked up.
        for sub_task_node in _itertools.chain(yielded_sub_tasks,
                                              upstream_nodes):
            if sub_task_node in visited_nodes:
                continue
            visited_nodes.add(sub_task_node)
            executable = sub_task_node.executable_sdk_object

            # If the executable object that we're dealing with is registerable (i.e., SdkRunnableLaunchPlan, SdkWorkflow,
            # SdkTask, or SdkRunnableTask), then it should have the ability to give itself a name. After assigning
            # itself the name, also make sure the id is properly set according to current config values.
            if isinstance(executable, _registerable.RegisterableEntity):
                executable.auto_assign_name()
                executable._id = _identifier.Identifier(
                    executable.resource_type,
                    _internal_config.TASK_PROJECT.get()
                    or _internal_config.PROJECT.get(),
                    _internal_config.TASK_DOMAIN.get()
                    or _internal_config.DOMAIN.get(),
                    executable.platform_valid_name,
                    _internal_config.TASK_VERSION.get()
                    or _internal_config.VERSION.get())

            # Generate an id that's unique in the document (if the same task is used multiple times with
            # different resources, executable_sdk_object.id will be the same but generated node_ids should not
            # be).
            safe_task_id = _six.text_type(
                sub_task_node.executable_sdk_object.id)
            if safe_task_id in generated_ids:
                new_count = generated_ids[
                    safe_task_id] = generated_ids[safe_task_id] + 1
            else:
                new_count = generated_ids[safe_task_id] = 0
            unique_node_id = _dnsify("{}-{}".format(safe_task_id, new_count))

            # Handling case where the yielded node is launch plan
            if isinstance(sub_task_node.executable_sdk_object,
                          _launch_plan.SdkLaunchPlan):
                node = sub_task_node.assign_id_and_return(unique_node_id)
                _append_node(generated_files, node, nodes, sub_task_node)
            # Handling case where the yielded node is launching a sub-workflow
            elif isinstance(sub_task_node.executable_sdk_object,
                            _workflow.SdkWorkflow):
                node = sub_task_node.assign_id_and_return(unique_node_id)
                _append_node(generated_files, node, nodes, sub_task_node)
                # Add the workflow itself to the yielded sub-workflows
                sub_workflows.add(sub_task_node.executable_sdk_object)
                # Recursively discover statically defined upstream entities (tasks, wfs)
                SdkDynamicTask._add_upstream_entities(
                    sub_task_node.executable_sdk_object, sub_workflows, tasks)
            # Handling tasks
            else:
                # If the task can run as an array job, group its instances together. Otherwise, keep each
                # invocation as a separate node.
                if SdkDynamicTask._can_run_as_array(
                        sub_task_node.executable_sdk_object.type):
                    if sub_task_node.executable_sdk_object in array_job_index:
                        array_job, node = array_job_index[
                            sub_task_node.executable_sdk_object]
                        array_job.size += 1
                        array_job.min_successes = int(
                            math.ceil((1 - effective_failure_ratio) *
                                      array_job.size))
                    else:
                        array_job = self._create_array_job(
                            inputs_prefix=unique_node_id)
                        node = sub_task_node.assign_id_and_return(
                            unique_node_id)
                        array_job_index[
                            sub_task_node.executable_sdk_object] = (array_job,
                                                                    node)

                    node_index = _six.text_type(array_job.size - 1)
                    for k, node_output in _six.iteritems(
                            sub_task_node.outputs):
                        if not node_output.sdk_node.id:
                            node_output.sdk_node.assign_id_and_return(node.id)
                        node_output.var = "[{}].{}".format(
                            node_index, node_output.var)

                    # Upload inputs to working directory under /array_job.input_ref/<index>/inputs.pb
                    input_path = _os.path.join(node.id, node_index,
                                               _constants.INPUT_FILE_NAME)
                    generated_files[input_path] = _literal_models.LiteralMap(
                        literals={
                            binding.var: binding.binding.to_literal_model()
                            for binding in sub_task_node.inputs
                        })
                else:
                    node = sub_task_node.assign_id_and_return(unique_node_id)
                    tasks.add(sub_task_node.executable_sdk_object)
                    _append_node(generated_files, node, nodes, sub_task_node)

        # Assign the computed ArrayJob properties to each task's custom field.
        for task, (array_job, _) in _six.iteritems(array_job_index):
            # TODO: Reconstruct task template object instead of modifying an existing one?
            tasks.add(
                task.assign_custom_and_return(
                    array_job.to_dict()).assign_type_and_return(
                        _constants.SdkTaskType.CONTAINER_ARRAY_TASK))

        # min_successes is absolute: it's the complement of allowed_failure_ratio multiplied by the
        # total number of nodes, giving an absolute count.
        nodes.extend([
            array_job_node for (_, array_job_node) in array_job_index.values()
        ])
        dynamic_job_spec = _dynamic_job.DynamicJobSpec(
            min_successes=len(nodes),
            tasks=list(tasks),
            nodes=nodes,
            outputs=output_bindings,
            subworkflows=list(sub_workflows))

        return dynamic_job_spec, generated_files
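When instances are grouped into an array job, min_successes for the group is recomputed on every addition as ceil((1 - allowed_failure_ratio) * size). A quick worked example of that arithmetic:

import math

def min_successes(size: int, allowed_failure_ratio: float = 0.0) -> int:
    # The ratio bounds how many entries may fail; the remainder must succeed.
    return int(math.ceil((1 - allowed_failure_ratio) * size))

print(min_successes(10, 0.2))  # 8 -> at most 2 of the 10 entries may fail
print(min_successes(7))        # 7 -> no failures allowed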
Example #9
def get_serializable_node(
    entity_mapping: OrderedDict,
    settings: SerializationSettings,
    entity: Node,
) -> workflow_model.Node:
    if entity.flyte_entity is None:
        raise Exception(f"Node {entity.id} has no flyte entity")

    upstream_sdk_nodes = [
        get_serializable(entity_mapping, settings, n)
        for n in entity.upstream_nodes
        if n.id != _common_constants.GLOBAL_INPUT_NODE_ID
    ]

    # Reference entities also inherit from the classes in the second if statement so address them first.
    if isinstance(entity.flyte_entity, ReferenceEntity):
        # This is a throw away call.
        # See the comment in compile_into_workflow in python_function_task. This is just used to place a None value
        # in the entity_mapping.
        get_serializable(entity_mapping, settings, entity.flyte_entity)
        ref = entity.flyte_entity
        node_model = workflow_model.Node(
            id=_dnsify(entity.id),
            metadata=entity.metadata,
            inputs=entity.bindings,
            upstream_node_ids=[n.id for n in upstream_sdk_nodes],
            output_aliases=[],
        )
        if ref.reference.resource_type == _identifier_model.ResourceType.TASK:
            node_model._task_node = workflow_model.TaskNode(reference_id=ref.id)
        elif ref.reference.resource_type == _identifier_model.ResourceType.WORKFLOW:
            node_model._workflow_node = workflow_model.WorkflowNode(sub_workflow_ref=ref.id)
        elif ref.reference.resource_type == _identifier_model.ResourceType.LAUNCH_PLAN:
            node_model._workflow_node = workflow_model.WorkflowNode(launchplan_ref=ref.id)
        else:
            raise Exception(f"Unexpected reference type {ref}")
        return node_model

    if isinstance(entity.flyte_entity, PythonTask):
        task_spec = get_serializable(entity_mapping, settings, entity.flyte_entity)
        node_model = workflow_model.Node(
            id=_dnsify(entity.id),
            metadata=entity.metadata,
            inputs=entity.bindings,
            upstream_node_ids=[n.id for n in upstream_sdk_nodes],
            output_aliases=[],
            task_node=workflow_model.TaskNode(
                reference_id=task_spec.template.id, overrides=TaskNodeOverrides(resources=entity._resources)
            ),
        )
        if entity._aliases:
            node_model._output_aliases = entity._aliases

    elif isinstance(entity.flyte_entity, WorkflowBase):
        wf_spec = get_serializable(entity_mapping, settings, entity.flyte_entity)
        node_model = workflow_model.Node(
            id=_dnsify(entity.id),
            metadata=entity.metadata,
            inputs=entity.bindings,
            upstream_node_ids=[n.id for n in upstream_sdk_nodes],
            output_aliases=[],
            workflow_node=workflow_model.WorkflowNode(sub_workflow_ref=wf_spec.template.id),
        )

    elif isinstance(entity.flyte_entity, BranchNode):
        node_model = workflow_model.Node(
            id=_dnsify(entity.id),
            metadata=entity.metadata,
            inputs=entity.bindings,
            upstream_node_ids=[n.id for n in upstream_sdk_nodes],
            output_aliases=[],
            branch_node=get_serializable(entity_mapping, settings, entity.flyte_entity),
        )

    elif isinstance(entity.flyte_entity, LaunchPlan):
        lp_spec = get_serializable(entity_mapping, settings, entity.flyte_entity)

        node_model = workflow_model.Node(
            id=_dnsify(entity.id),
            metadata=entity.metadata,
            inputs=entity.bindings,
            upstream_node_ids=[n.id for n in upstream_sdk_nodes],
            output_aliases=[],
            workflow_node=workflow_model.WorkflowNode(launchplan_ref=lp_spec.id),
        )
    else:
        raise Exception(f"Node contained non-serializable entity {entity._flyte_entity}")

    return node_model