def populate_metrics_in_dag_outputs(
    tasks: List[pipeline_task.PipelineTask],
    task_name_to_parent_groups: Mapping[str, List[_GroupOrTask]],
    task_name_to_task_spec: Mapping[str, pipeline_spec_pb2.PipelineTaskSpec],
    task_name_to_component_spec: Mapping[str, pipeline_spec_pb2.ComponentSpec],
    pipeline_spec: pipeline_spec_pb2.PipelineSpec,
) -> None:
    """Populates metrics artifacts in DAG outputs.

    Args:
        tasks: The list of tasks that may produce metrics outputs.
        task_name_to_parent_groups: The dict of task name to parent groups.
            Key is the task's name. Value is a list of ancestor groups
            including the task itself. The list of a given task is sorted so
            that the farthest group is first and the task itself is last.
        task_name_to_task_spec: The dict of task name to PipelineTaskSpec.
        task_name_to_component_spec: The dict of task name to ComponentSpec.
        pipeline_spec: The pipeline_spec to update in place.
    """
    for task in tasks:
        component_spec = task_name_to_component_spec[task.name]

        # Get the tuple of (component_name, task_name) of all its parent
        # groups.
        parent_components_and_tasks = [('_root', '')]
        # Skip the task itself and the root group, which cannot be retrieved
        # via name.
        for group_name in task_name_to_parent_groups[task.name][1:-1]:
            parent_components_and_tasks.append(
                (component_utils.sanitize_component_name(group_name),
                 component_utils.sanitize_task_name(group_name)))
        # Reverse the order so that the innermost group comes first and the
        # root comes last.
        parent_components_and_tasks.reverse()

        for output_name, artifact_spec in \
                component_spec.output_definitions.artifacts.items():

            if artifact_spec.artifact_type.WhichOneof(
                    'kind'
            ) == 'schema_title' and artifact_spec.artifact_type.schema_title in [
                    artifact_types.Metrics.TYPE_NAME,
                    artifact_types.ClassificationMetrics.TYPE_NAME,
            ]:
                # Prefix the output name with the task name so it stays unique
                # when surfaced to ancestor DAGs.
                unique_output_name = '{}-{}'.format(task.name, output_name)

                sub_task_name = task.name
                sub_task_output = output_name
                for component_name, task_name in parent_components_and_tasks:
                    group_component_spec = (
                        pipeline_spec.root if component_name == '_root' else
                        pipeline_spec.components[component_name])
                    group_component_spec.output_definitions.artifacts[
                        unique_output_name].CopyFrom(artifact_spec)
                    group_component_spec.dag.outputs.artifacts[
                        unique_output_name].artifact_selectors.append(
                            pipeline_spec_pb2.DagOutputsSpec
                            .ArtifactSelectorSpec(
                                producer_subtask=sub_task_name,
                                output_artifact_key=sub_task_output,
                            ))
                    sub_task_name = task_name
                    sub_task_output = unique_output_name
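
# NOTE: The following is an illustrative sketch, not part of the compiler. It
# shows the shape of the artifact-selector chain that
# populate_metrics_in_dag_outputs() builds: each ancestor DAG re-exports the
# metrics artifact under the task-prefixed name, and the selector at each
# level points one hop down. The component and task names here are
# hypothetical.
def _sketch_metrics_selector_chain() -> pipeline_spec_pb2.PipelineSpec:
    spec = pipeline_spec_pb2.PipelineSpec()
    # Innermost level: the producing task 'train' exposes output 'metrics',
    # so its enclosing group surfaces it as 'train-metrics'.
    inner = spec.components['comp-condition-1']
    inner.dag.outputs.artifacts['train-metrics'].artifact_selectors.append(
        pipeline_spec_pb2.DagOutputsSpec.ArtifactSelectorSpec(
            producer_subtask='train',
            output_artifact_key='metrics',
        ))
    # One level up (the root): the selector now points at the group's task and
    # at the already-renamed output key.
    spec.root.dag.outputs.artifacts['train-metrics'].artifact_selectors.append(
        pipeline_spec_pb2.DagOutputsSpec.ArtifactSelectorSpec(
            producer_subtask='condition-1',
            output_artifact_key='train-metrics',
        ))
    return spec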
def build_task_spec_for_group(
    group: tasks_group.TasksGroup,
    pipeline_channels: List[pipeline_channel.PipelineChannel],
    tasks_in_current_dag: List[str],
    is_parent_component_root: bool,
) -> pipeline_spec_pb2.PipelineTaskSpec:
    """Builds PipelineTaskSpec for a group.

    Args:
        group: The group to build PipelineTaskSpec for.
        pipeline_channels: The list of pipeline channels referenced by the
            group.
        tasks_in_current_dag: The list of task names for tasks in the same
            dag.
        is_parent_component_root: Whether the parent component is the
            pipeline's root dag.

    Returns:
        A PipelineTaskSpec object representing the group.
    """
    pipeline_task_spec = pipeline_spec_pb2.PipelineTaskSpec()
    pipeline_task_spec.task_info.name = group.display_name or group.name
    pipeline_task_spec.component_ref.name = (
        component_utils.sanitize_component_name(group.name))

    for channel in pipeline_channels:

        channel_full_name = channel.full_name
        subvar_name = None
        if isinstance(channel, for_loop.LoopArgumentVariable):
            channel_full_name = channel.loop_argument.full_name
            subvar_name = channel.subvar_name

        input_name = _additional_input_name_for_pipeline_channel(channel)

        channel_name = channel.name
        if subvar_name:
            # A subvariable of a loop argument is extracted from the parent
            # loop item at runtime via an expression selector.
            pipeline_task_spec.inputs.parameters[
                input_name].parameter_expression_selector = (
                    'parseJson(string_value)["{}"]'.format(subvar_name))
            if not channel.is_with_items_loop_argument:
                channel_name = channel.items_or_pipeline_channel.name

        if isinstance(channel, pipeline_channel.PipelineArtifactChannel):
            if channel.task_name and channel.task_name in tasks_in_current_dag:
                pipeline_task_spec.inputs.artifacts[
                    input_name].task_output_artifact.producer_task = (
                        component_utils.sanitize_task_name(channel.task_name))
                pipeline_task_spec.inputs.artifacts[
                    input_name].task_output_artifact.output_artifact_key = (
                        channel_name)
            else:
                pipeline_task_spec.inputs.artifacts[
                    input_name].component_input_artifact = (
                        channel_full_name
                        if is_parent_component_root else input_name)
        else:
            # channel is one of PipelineParameterChannel, LoopArgument, or
            # LoopArgumentVariable.
            if channel.task_name and channel.task_name in tasks_in_current_dag:
                pipeline_task_spec.inputs.parameters[
                    input_name].task_output_parameter.producer_task = (
                        component_utils.sanitize_task_name(channel.task_name))
                pipeline_task_spec.inputs.parameters[
                    input_name].task_output_parameter.output_parameter_key = (
                        channel_name)
            else:
                pipeline_task_spec.inputs.parameters[
                    input_name].component_input_parameter = (
                        channel_full_name if is_parent_component_root else
                        _additional_input_name_for_pipeline_channel(
                            channel_full_name))

    if isinstance(group, tasks_group.ParallelFor):
        _update_task_spec_for_loop_group(
            group=group,
            pipeline_task_spec=pipeline_task_spec,
        )
    elif isinstance(group, tasks_group.Condition):
        _update_task_spec_for_condition_group(
            group=group,
            pipeline_task_spec=pipeline_task_spec,
        )

    return pipeline_task_spec
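
# NOTE: Illustrative sketch, not part of the compiler. For a loop-argument
# subvariable such as `item.model_name`, the input is wired to the parent
# loop item as a whole, and a parameter_expression_selector extracts the
# field at runtime, mirroring the 'parseJson(string_value)[...]' pattern
# above. The injected input name and field name below are hypothetical.
def _sketch_expression_selector() -> pipeline_spec_pb2.PipelineTaskSpec:
    task_spec = pipeline_spec_pb2.PipelineTaskSpec()
    input_name = 'pipelinechannel--loop-item'  # hypothetical injected name
    # The whole loop item arrives as a JSON string; the selector pulls out
    # one field of it for this input.
    task_spec.inputs.parameters[input_name].parameter_expression_selector = (
        'parseJson(string_value)["model_name"]')
    return task_spec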
def build_task_spec_for_task(
    task: pipeline_task.PipelineTask,
    parent_component_inputs: pipeline_spec_pb2.ComponentInputsSpec,
    tasks_in_current_dag: List[str],
    input_parameters_in_current_dag: List[str],
    input_artifacts_in_current_dag: List[str],
) -> pipeline_spec_pb2.PipelineTaskSpec:
    """Builds PipelineTaskSpec for a pipeline task.

    A task input may reference an output outside its immediate DAG. For
    instance::

        random_num = random_num_op(...)
        with dsl.Condition(random_num.output > 5):
            print_op('%s > 5' % random_num.output)

    In this example, `dsl.Condition` forms a sub-DAG with one task from
    `print_op` inside the sub-DAG. The task of `print_op` references the
    output from the `random_num` task, which is outside the sub-DAG. When
    compiling to IR, such a cross-DAG reference is disallowed. So we need to
    "punch a hole" in the sub-DAG to make the input available in the sub-DAG
    component inputs if it's not already there. Then we can call this method
    to fix the tasks inside the sub-DAG so that they reference the component
    inputs instead of directly referencing the original producer task.

    Args:
        task: The task to build a PipelineTaskSpec for.
        parent_component_inputs: The task's parent component's input specs.
        tasks_in_current_dag: The list of task names for tasks in the same
            dag.
        input_parameters_in_current_dag: The list of input parameters in the
            DAG component.
        input_artifacts_in_current_dag: The list of input artifacts in the
            DAG component.

    Returns:
        A PipelineTaskSpec object representing the task.
    """
    pipeline_task_spec = pipeline_spec_pb2.PipelineTaskSpec()
    pipeline_task_spec.task_info.name = (
        task.task_spec.display_name or task.name)
    # Use task.name for component_ref.name because we may customize the
    # component spec for individual tasks to work around the lack of optional
    # inputs support in IR.
    pipeline_task_spec.component_ref.name = (
        component_utils.sanitize_component_name(task.name))
    pipeline_task_spec.caching_options.enable_cache = (
        task.task_spec.enable_caching)

    for input_name, input_value in task.inputs.items():

        if isinstance(input_value, pipeline_channel.PipelineArtifactChannel):

            if input_value.task_name:
                # Value is produced by an upstream task.
                if input_value.task_name in tasks_in_current_dag:
                    # Dependent task within the same DAG.
                    pipeline_task_spec.inputs.artifacts[
                        input_name].task_output_artifact.producer_task = (
                            component_utils.sanitize_task_name(
                                input_value.task_name))
                    pipeline_task_spec.inputs.artifacts[
                        input_name].task_output_artifact.output_artifact_key = (
                            input_value.name)
                else:
                    # Dependent task not from the same DAG.
                    component_input_artifact = (
                        _additional_input_name_for_pipeline_channel(
                            input_value))
                    assert component_input_artifact in parent_component_inputs.artifacts, \
                        'component_input_artifact: {} not found. All inputs: {}'.format(
                            component_input_artifact, parent_component_inputs)
                    pipeline_task_spec.inputs.artifacts[
                        input_name].component_input_artifact = (
                            component_input_artifact)
            else:
                raise RuntimeError(
                    f'Artifacts must be produced by a task. Got {input_value}.'
                )

        elif isinstance(input_value,
                        pipeline_channel.PipelineParameterChannel):

            if input_value.task_name:
                # Value is produced by an upstream task.
                if input_value.task_name in tasks_in_current_dag:
                    # Dependent task within the same DAG.
                    pipeline_task_spec.inputs.parameters[
                        input_name].task_output_parameter.producer_task = (
                            component_utils.sanitize_task_name(
                                input_value.task_name))
                    pipeline_task_spec.inputs.parameters[
                        input_name].task_output_parameter.output_parameter_key = (
                            input_value.name)
                else:
                    # Dependent task not from the same DAG.
                    component_input_parameter = (
                        _additional_input_name_for_pipeline_channel(
                            input_value))
                    assert component_input_parameter in parent_component_inputs.parameters, \
                        'component_input_parameter: {} not found. All inputs: {}'.format(
                            component_input_parameter, parent_component_inputs)
                    pipeline_task_spec.inputs.parameters[
                        input_name].component_input_parameter = (
                            component_input_parameter)
            else:
                # Value is from pipeline input.
                component_input_parameter = input_value.full_name
                if component_input_parameter not in parent_component_inputs.parameters:
                    component_input_parameter = (
                        _additional_input_name_for_pipeline_channel(
                            input_value))
                pipeline_task_spec.inputs.parameters[
                    input_name].component_input_parameter = (
                        component_input_parameter)

        elif isinstance(input_value, for_loop.LoopArgument):

            component_input_parameter = (
                _additional_input_name_for_pipeline_channel(input_value))
            assert component_input_parameter in parent_component_inputs.parameters, \
                'component_input_parameter: {} not found. All inputs: {}'.format(
                    component_input_parameter, parent_component_inputs)
            pipeline_task_spec.inputs.parameters[
                input_name].component_input_parameter = (
                    component_input_parameter)

        elif isinstance(input_value, for_loop.LoopArgumentVariable):

            component_input_parameter = (
                _additional_input_name_for_pipeline_channel(
                    input_value.loop_argument))
            assert component_input_parameter in parent_component_inputs.parameters, \
                'component_input_parameter: {} not found. All inputs: {}'.format(
                    component_input_parameter, parent_component_inputs)
            pipeline_task_spec.inputs.parameters[
                input_name].component_input_parameter = (
                    component_input_parameter)
            pipeline_task_spec.inputs.parameters[
                input_name].parameter_expression_selector = (
                    'parseJson(string_value)["{}"]'.format(
                        input_value.subvar_name))

        elif isinstance(input_value, str):

            # Handle extra inputs introduced by string concatenation.
            pipeline_channels = (
                pipeline_channel.extract_pipeline_channels_from_any(
                    input_value))
            for channel in pipeline_channels:
                # The value contains PipelineChannel placeholders which need
                # to be replaced, and the input needs to be added to the task
                # spec.

                # Form the name for the compiler-injected input, and make sure
                # it doesn't collide with any existing input names.
                additional_input_name = (
                    _additional_input_name_for_pipeline_channel(channel))

                # We don't expect a collision to happen because we prefix the
                # name of the additional input with 'pipelinechannel--'. But
                # just in case a collision did happen, throw a RuntimeError so
                # that we don't get a surprise at runtime.
                for existing_input_name, _ in task.inputs.items():
                    if existing_input_name == additional_input_name:
                        raise RuntimeError(
                            'Name collision between existing input name '
                            '{} and compiler injected input name {}'.format(
                                existing_input_name, additional_input_name))

                additional_input_placeholder = (
                    placeholders.input_parameter_placeholder(
                        additional_input_name))
                input_value = input_value.replace(channel.pattern,
                                                  additional_input_placeholder)

                if channel.task_name:
                    # Value is produced by an upstream task.
                    if channel.task_name in tasks_in_current_dag:
                        # Dependent task within the same DAG.
                        pipeline_task_spec.inputs.parameters[
                            additional_input_name].task_output_parameter.producer_task = (
                                component_utils.sanitize_task_name(
                                    channel.task_name))
                        pipeline_task_spec.inputs.parameters[
                            additional_input_name].task_output_parameter.output_parameter_key = (
                                channel.name)
                    else:
                        # Dependent task not from the same DAG.
                        component_input_parameter = (
                            _additional_input_name_for_pipeline_channel(
                                channel))
                        assert component_input_parameter in parent_component_inputs.parameters, \
                            'component_input_parameter: {} not found. All inputs: {}'.format(
                                component_input_parameter, parent_component_inputs)
                        pipeline_task_spec.inputs.parameters[
                            additional_input_name].component_input_parameter = (
                                component_input_parameter)
                else:
                    # Value is from a pipeline input (or a loop argument).
                    component_input_parameter = channel.full_name
                    if component_input_parameter not in parent_component_inputs.parameters:
                        component_input_parameter = (
                            _additional_input_name_for_pipeline_channel(
                                channel))
                    pipeline_task_spec.inputs.parameters[
                        additional_input_name].component_input_parameter = (
                            component_input_parameter)

            pipeline_task_spec.inputs.parameters[
                input_name].runtime_value.constant.string_value = input_value

        elif isinstance(input_value, (str, int, float, bool, dict, list)):

            # str is handled above; this branch covers the remaining constant
            # types.
            pipeline_task_spec.inputs.parameters[
                input_name].runtime_value.constant.CopyFrom(
                    _to_protobuf_value(input_value))

        else:
            raise ValueError(
                'Input argument supports only the following types: '
                'str, int, float, bool, dict, and list. '
                f'Got {input_value} of type {type(input_value)}.')

    return pipeline_task_spec
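
# NOTE: Illustrative sketch, not part of the compiler. It mimics the string
# rewrite performed above for concatenated string inputs: every
# PipelineChannel placeholder in the value is replaced by an input-parameter
# placeholder for a compiler-injected 'pipelinechannel--' input. The channel
# pattern and placeholder format here are simplified assumptions, not the
# SDK's actual syntax.
import re


def _sketch_rewrite_string_input(value: str) -> str:
    # Hypothetical channel pattern: {{channel:task=<task>;name=<name>}}
    pattern = re.compile(r'{{channel:[^}]*name=([a-zA-Z0-9_-]+)[^}]*}}')

    def _replace(match) -> str:
        injected_input = 'pipelinechannel--' + match.group(1)
        return "{{$.inputs.parameters['%s']}}" % injected_input

    return pattern.sub(_replace, value)


# e.g. _sketch_rewrite_string_input('run-{{channel:task=t;name=seed}}')
#      -> "run-{{$.inputs.parameters['pipelinechannel--seed']}}"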
def _create_pipeline_spec(
    self,
    pipeline_args: List[dsl.PipelineChannel],
    pipeline: pipeline_context.Pipeline,
) -> pipeline_spec_pb2.PipelineSpec:
    """Creates a pipeline spec object.

    Args:
        pipeline_args: The list of pipeline input parameters.
        pipeline: The instantiated pipeline object.

    Returns:
        A PipelineSpec proto representing the compiled pipeline.

    Raises:
        ValueError: If a pipeline argument is of an unsupported type.
    """
    builder.validate_pipeline_name(pipeline.name)

    deployment_config = pipeline_spec_pb2.PipelineDeploymentConfig()
    pipeline_spec = pipeline_spec_pb2.PipelineSpec()

    pipeline_spec.pipeline_info.name = pipeline.name
    pipeline_spec.sdk_version = f'kfp-{kfp.__version__}'
    # Schema version 2.1.0 is required for kfp-pipeline-spec>0.1.13
    pipeline_spec.schema_version = '2.1.0'

    pipeline_spec.root.CopyFrom(
        builder.build_component_spec_for_group(
            pipeline_channels=pipeline_args,
            is_root_group=True,
        ))

    root_group = pipeline.groups[0]

    all_groups = self._get_all_groups(root_group)
    group_name_to_group = {group.name: group for group in all_groups}
    task_name_to_parent_groups, group_name_to_parent_groups = (
        builder.get_parent_groups(root_group))
    condition_channels = self._get_condition_channels_for_tasks(root_group)
    name_to_for_loop_group = {
        group_name: group
        for group_name, group in group_name_to_group.items()
        if isinstance(group, dsl.ParallelFor)
    }
    inputs = self._get_inputs_for_all_groups(
        pipeline=pipeline,
        pipeline_args=pipeline_args,
        root_group=root_group,
        task_name_to_parent_groups=task_name_to_parent_groups,
        group_name_to_parent_groups=group_name_to_parent_groups,
        condition_channels=condition_channels,
        name_to_for_loop_group=name_to_for_loop_group,
    )
    dependencies = self._get_dependencies(
        pipeline=pipeline,
        root_group=root_group,
        task_name_to_parent_groups=task_name_to_parent_groups,
        group_name_to_parent_groups=group_name_to_parent_groups,
        group_name_to_group=group_name_to_group,
        condition_channels=condition_channels,
    )

    for group in all_groups:
        builder.build_spec_by_group(
            pipeline_spec=pipeline_spec,
            deployment_config=deployment_config,
            group=group,
            inputs=inputs,
            dependencies=dependencies,
            rootgroup_name=root_group.name,
            task_name_to_parent_groups=task_name_to_parent_groups,
            group_name_to_parent_groups=group_name_to_parent_groups,
            name_to_for_loop_group=name_to_for_loop_group,
        )

    # TODO: refactor to support multiple exit handlers per pipeline.
    if pipeline.groups[0].groups:
        first_group = pipeline.groups[0].groups[0]
        if isinstance(first_group, dsl.ExitHandler):
            exit_task = first_group.exit_task
            exit_task_name = component_utils.sanitize_task_name(
                exit_task.name)
            exit_handler_group_task_name = component_utils.sanitize_task_name(
                first_group.name)
            exit_task_task_spec = builder.build_task_spec_for_exit_task(
                task=exit_task,
                dependent_task=exit_handler_group_task_name,
                pipeline_inputs=pipeline_spec.root.input_definitions,
            )
            exit_task_component_spec = builder.build_component_spec_for_exit_task(
                task=exit_task)
            exit_task_container_spec = builder.build_container_spec_for_task(
                task=exit_task)

            # Add the exit task's task spec.
            pipeline_spec.root.dag.tasks[exit_task_name].CopyFrom(
                exit_task_task_spec)

            # Add the exit task's component spec if it does not exist.
            component_name = exit_task_task_spec.component_ref.name
            if component_name not in pipeline_spec.components:
                pipeline_spec.components[component_name].CopyFrom(
                    exit_task_component_spec)

            # Add the exit task's container spec if it does not exist.
            executor_label = exit_task_component_spec.executor_label
            if executor_label not in deployment_config.executors:
                deployment_config.executors[
                    executor_label].container.CopyFrom(
                        exit_task_container_spec)

    pipeline_spec.deployment_spec.update(
        json_format.MessageToDict(deployment_config))

    return pipeline_spec
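
# NOTE: Illustrative sketch, not part of the compiler. PipelineSpec stores the
# deployment config as a google.protobuf.Struct, so the typed
# PipelineDeploymentConfig message is converted to a dict via
# json_format.MessageToDict() and merged in, exactly as done at the end of
# _create_pipeline_spec() above. The executor name and image below are
# hypothetical.
def _sketch_embed_deployment_config() -> pipeline_spec_pb2.PipelineSpec:
    from google.protobuf import json_format

    spec = pipeline_spec_pb2.PipelineSpec()
    deployment_config = pipeline_spec_pb2.PipelineDeploymentConfig()
    deployment_config.executors['exec-hello'].container.image = (
        'python:3.9-slim')
    # Struct.update() merges the dict form of the message into the spec.
    spec.deployment_spec.update(json_format.MessageToDict(deployment_config))
    return spec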
def test_sanitize_task_name(self):
    self.assertEqual('my-component-1',
                     utils.sanitize_task_name('My component 1'))
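
# NOTE: Illustrative sketch of an additional case, assuming the sanitizer
# lowercases, maps illegal characters to '-', and collapses/trims dashes.
# This test name and expectation are hypothetical, not part of the suite.
def test_sanitize_task_name_special_chars_sketch(self):
    self.assertEqual('my-task-2', utils.sanitize_task_name('My_Task--2!'))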
def _build_spec_by_group(
    self,
    pipeline_spec: pipeline_spec_pb2.PipelineSpec,
    deployment_config: pipeline_spec_pb2.PipelineDeploymentConfig,
    group: tasks_group.TasksGroup,
    inputs: Mapping[str, List[Tuple[dsl.PipelineChannel, str]]],
    dependencies: Dict[str, List[_GroupOrTask]],
    rootgroup_name: str,
    task_name_to_parent_groups: Mapping[str, List[_GroupOrTask]],
    group_name_to_parent_groups: Mapping[str, List[tasks_group.TasksGroup]],
    name_to_for_loop_group: Mapping[str, dsl.ParallelFor],
) -> None:
    """Generates IR spec given a TasksGroup.

    Args:
        pipeline_spec: The pipeline_spec to update in place.
        deployment_config: The deployment_config to hold all executors. The
            spec is updated in place.
        group: The TasksGroup to generate spec for.
        inputs: The inputs dictionary. The keys are group/task names and the
            values are lists of tuples (channel, producing_task_name).
        dependencies: The group dependencies dictionary. The keys are group
            or task names, and the values are lists of dependent groups or
            tasks.
        rootgroup_name: The name of the root group. Used to determine whether
            the component spec for the current group should be the root dag.
        task_name_to_parent_groups: The dict of task name to parent groups.
            Key is the task name. Value is a list of ancestor groups
            including the task itself. The list of a given task is sorted so
            that the farthest group is first and the task itself is last.
        group_name_to_parent_groups: The dict of group name to parent groups.
            Key is the group name. Value is a list of ancestor groups
            including the group itself. The list of a given group is sorted
            so that the farthest group is first and the group itself is last.
        name_to_for_loop_group: The dict of for loop group name to loop
            group.
    """
    group_component_name = component_utils.sanitize_component_name(
        group.name)

    if group.name == rootgroup_name:
        group_component_spec = pipeline_spec.root
    else:
        group_component_spec = pipeline_spec.components[
            group_component_name]

    task_name_to_task_spec = {}
    task_name_to_component_spec = {}

    # Generate task specs and component specs for the dag.
    subgroups = group.groups + group.tasks

    # These values are invariant across subgroups, so compute them once
    # before the loop.
    tasks_in_current_dag = [
        component_utils.sanitize_task_name(subgroup.name)
        for subgroup in subgroups
    ]
    input_parameters_in_current_dag = [
        input_name for input_name in
        group_component_spec.input_definitions.parameters
    ]
    input_artifacts_in_current_dag = [
        input_name for input_name in
        group_component_spec.input_definitions.artifacts
    ]
    is_parent_component_root = (group_component_spec == pipeline_spec.root)

    for subgroup in subgroups:

        subgroup_inputs = inputs.get(subgroup.name, [])
        subgroup_channels = [channel for channel, _ in subgroup_inputs]

        subgroup_component_name = (
            component_utils.sanitize_component_name(subgroup.name))

        if isinstance(subgroup, pipeline_task.PipelineTask):

            subgroup_task_spec = builder.build_task_spec_for_task(
                task=subgroup,
                parent_component_inputs=group_component_spec.
                input_definitions,
                tasks_in_current_dag=tasks_in_current_dag,
                input_parameters_in_current_dag=input_parameters_in_current_dag,
                input_artifacts_in_current_dag=input_artifacts_in_current_dag,
            )
            task_name_to_task_spec[subgroup.name] = subgroup_task_spec

            subgroup_component_spec = builder.build_component_spec_for_task(
                task=subgroup)
            task_name_to_component_spec[
                subgroup.name] = subgroup_component_spec

            executor_label = subgroup_component_spec.executor_label
            if executor_label not in deployment_config.executors:
                if subgroup.container_spec is not None:
                    subgroup_container_spec = builder.build_container_spec_for_task(
                        task=subgroup)
                    deployment_config.executors[
                        executor_label].container.CopyFrom(
                            subgroup_container_spec)
                elif subgroup.importer_spec is not None:
                    subgroup_importer_spec = builder.build_importer_spec_for_task(
                        task=subgroup)
                    deployment_config.executors[
                        executor_label].importer.CopyFrom(
                            subgroup_importer_spec)

        elif isinstance(subgroup, dsl.ParallelFor):

            # "Punch the hole", adding additional inputs (other than loop
            # arguments, which will be handled separately) needed by its
            # subgroups or tasks.
            loop_subgroup_channels = []

            for channel in subgroup_channels:
                # Skip a 'withItems' loop argument if it comes from an inner
                # loop.
                if isinstance(
                        channel,
                    (for_loop.LoopArgument, for_loop.LoopArgumentVariable
                    )) and channel.is_with_items_loop_argument:
                    withitems_loop_arg_found_in_self_or_upstream = False
                    for group_name in group_name_to_parent_groups[
                            subgroup.name][::-1]:
                        if group_name in name_to_for_loop_group:
                            loop_group = name_to_for_loop_group[group_name]
                            if channel.name in loop_group.loop_argument.name:
                                withitems_loop_arg_found_in_self_or_upstream = True
                                break
                    if not withitems_loop_arg_found_in_self_or_upstream:
                        continue

                loop_subgroup_channels.append(channel)

            if subgroup.items_is_pipeline_channel:
                # This loop_argument is based on a pipeline channel, i.e.,
                # rather than a static list, it is either the output of
                # another task or an input as a global pipeline parameter.
                loop_subgroup_channels.append(
                    subgroup.loop_argument.items_or_pipeline_channel)

            loop_subgroup_channels.append(subgroup.loop_argument)

            subgroup_component_spec = builder.build_component_spec_for_group(
                pipeline_channels=loop_subgroup_channels,
                is_root_group=False,
            )

            subgroup_task_spec = builder.build_task_spec_for_group(
                group=subgroup,
                pipeline_channels=loop_subgroup_channels,
                tasks_in_current_dag=tasks_in_current_dag,
                is_parent_component_root=is_parent_component_root,
            )

        elif isinstance(subgroup, dsl.Condition):

            # "Punch the hole", adding inputs needed by its subgroups or
            # tasks.
            condition_subgroup_channels = list(subgroup_channels)
            for operand in [
                    subgroup.condition.left_operand,
                    subgroup.condition.right_operand,
            ]:
                if isinstance(operand, dsl.PipelineChannel):
                    condition_subgroup_channels.append(operand)

            subgroup_component_spec = builder.build_component_spec_for_group(
                pipeline_channels=condition_subgroup_channels,
                is_root_group=False,
            )

            subgroup_task_spec = builder.build_task_spec_for_group(
                group=subgroup,
                pipeline_channels=condition_subgroup_channels,
                tasks_in_current_dag=tasks_in_current_dag,
                is_parent_component_root=is_parent_component_root,
            )

        elif isinstance(subgroup, dsl.ExitHandler):

            subgroup_component_spec = builder.build_component_spec_for_group(
                pipeline_channels=subgroup_channels,
                is_root_group=False,
            )

            subgroup_task_spec = builder.build_task_spec_for_group(
                group=subgroup,
                pipeline_channels=subgroup_channels,
                tasks_in_current_dag=tasks_in_current_dag,
                is_parent_component_root=is_parent_component_root,
            )

        else:
            raise RuntimeError(
                f'Unexpected task/group type: Got {subgroup} of type '
                f'{type(subgroup)}.')

        # Generate the dependencies section for this task.
        if dependencies.get(subgroup.name, None):
            group_dependencies = list(dependencies[subgroup.name])
            group_dependencies.sort()
            subgroup_task_spec.dependent_tasks.extend([
                component_utils.sanitize_task_name(dep)
                for dep in group_dependencies
            ])

        # Add the component spec if it does not already exist.
        if subgroup_component_name not in pipeline_spec.components:
            pipeline_spec.components[subgroup_component_name].CopyFrom(
                subgroup_component_spec)

        # Add the task spec.
        group_component_spec.dag.tasks[subgroup.name].CopyFrom(
            subgroup_task_spec)

    pipeline_spec.deployment_spec.update(
        json_format.MessageToDict(deployment_config))

    # Surface metrics outputs to the top.
    builder.populate_metrics_in_dag_outputs(
        tasks=group.tasks,
        task_name_to_parent_groups=task_name_to_parent_groups,
        task_name_to_task_spec=task_name_to_task_spec,
        task_name_to_component_spec=task_name_to_component_spec,
        pipeline_spec=pipeline_spec,
    )
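
# NOTE: Illustrative sketch, not part of the compiler. dependent_tasks must be
# emitted deterministically so that recompiling an unchanged pipeline yields
# an identical spec; sorting the dependency names before sanitizing, as
# _build_spec_by_group() does above, achieves that. The task names here are
# hypothetical.
def _sketch_deterministic_dependencies() -> pipeline_spec_pb2.PipelineTaskSpec:
    task_spec = pipeline_spec_pb2.PipelineTaskSpec()
    deps = ['Preprocess Data', 'train model']  # arbitrary insertion order
    task_spec.dependent_tasks.extend(
        component_utils.sanitize_task_name(dep) for dep in sorted(deps))
    # -> ['preprocess-data', 'train-model']
    return task_spec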