def assign_metrics(nb_graph: nx.DiGraph, pipeline_metrics: dict):
    """Assign pipeline metrics to specific pipeline steps.

    This assignment follows a similar logic to the detection of `out`
    dependencies. Starting from a temporary step - child of all the leaf
    nodes - all the nodes in the pipeline are traversed in reversed
    topological order. When a step shows one of the metrics as part of its
    code, then that metric is assigned to the step.

    Args:
        nb_graph: nx DiGraph with pipeline code blocks
        pipeline_metrics (dict): a dict of pipeline metrics where the key is
            the KFP sanitized name and the value the name of the original
            variable.
    """
    # create a temporary step at the end of the pipeline to simplify the
    # iteration from the leaf steps
    tmp_step = "_tmp"
    leaf_steps = graphutils.get_leaf_nodes(nb_graph)
    if not leaf_steps:
        return
    for node in leaf_steps:
        nb_graph.add_edge(node, tmp_step)

    # pipeline_metrics is a dict having sanitized variable names as keys and
    # the corresponding variable names as values. Here we need to refer to
    # the sanitized names using the python variables.
    # XXX: We could change parse_metrics_print_statements() to return the
    # XXX: reverse dictionary, but that would require changing either
    # XXX: rpc.nb.get_pipeline_metrics() or the JupyterLab Extension's
    # XXX: parsing of the RPC result
    rev_pipeline_metrics = {v: k for k, v in pipeline_metrics.items()}
    metrics_left = set(rev_pipeline_metrics.keys())
    for anc in graphutils.get_ordered_ancestors(nb_graph, tmp_step):
        if not metrics_left:
            break
        anc_data = nb_graph.nodes(data=True)[anc]
        anc_source = '\n'.join(anc_data['source'])
        # get all the marshal candidates from the ancestor's source and
        # intersect them with the metrics that have not been matched yet
        marshal_candidates = kale_ast.get_marshal_candidates(anc_source)
        assigned_metrics = metrics_left.intersection(marshal_candidates)
        # Remove the metrics that have already been assigned.
        metrics_left.difference_update(assigned_metrics)
        # Generate code to produce the metrics artifact in the current step
        if assigned_metrics:
            code = METRICS_TEMPLATE % ("    " + ",\n    ".join(
                ['"%s": %s' % (rev_pipeline_metrics[x], x)
                 for x in sorted(assigned_metrics)]))
            anc_data['source'].append(code)
            # need to have a `metrics` flag set to true in order to set the
            # metrics output artifact in the pipeline template
            nx.set_node_attributes(nb_graph, {anc: {'metrics': True}})
    nb_graph.remove_node(tmp_step)
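
# A minimal, illustrative sketch of the graph shape assign_metrics()
# operates on. Step names, sources and the metric dict below are
# hypothetical, and the sketch assumes this module's imports (nx,
# graphutils, kale_ast, METRICS_TEMPLATE) are available. Note that
# assign_metrics() only inspects the 'source' strings; it never
# executes them.
def _example_assign_metrics():  # pragma: no cover
    graph = nx.DiGraph()
    graph.add_edge("load", "train")
    nx.set_node_attributes(graph, {
        "load": {"source": ["data = read_data()"]},
        "train": {"source": ["accuracy = train_model(data)"]},
    })
    # keys are KFP-sanitized metric names, values the notebook variables
    assign_metrics(graph, {"accuracy": "accuracy"})
    # 'train' references `accuracy`, so it now carries metrics=True and an
    # extra source block, rendered from METRICS_TEMPLATE, that exports
    # {"accuracy": accuracy} as the step's metrics artifact
    assert graph.nodes(data=True)["train"].get("metrics") is True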
def notebook_to_graph(self):
    """Convert an annotated Notebook to a Graph."""
    # convert notebook to nx graph
    (pipeline_graph,
     pipeline_parameters_source,
     pipeline_metrics_source,
     imports_and_functions) = parser.parse_notebook(
        self.notebook, self.pipeline_metadata)

    # get a dict from the 'pipeline parameters' cell source code
    pipeline_parameters_dict = ast.parse_assignments_expressions(
        pipeline_parameters_source)

    # get a list of variables that need to be logged as pipeline metrics
    pipeline_metrics = ast.parse_metrics_print_statements(
        pipeline_metrics_source)

    # run static analysis over the source code
    dependencies.dependencies_detection(
        pipeline_graph,
        pipeline_parameters=pipeline_parameters_dict,
        imports_and_functions=imports_and_functions)
    dependencies.assign_metrics(pipeline_graph, pipeline_metrics)

    # if there are multiple DAG leaves, add an empty step at the end of the
    # pipeline for the final snapshot
    leaf_steps = graphutils.get_leaf_nodes(pipeline_graph)
    if self.pipeline_metadata.get("autosnapshot") and len(leaf_steps) > 1:
        auto_snapshot_name = 'final_auto_snapshot'
        # add a link from all the last steps of the pipeline to
        # the final auto snapshot one.
        for node in leaf_steps:
            pipeline_graph.add_edge(node, auto_snapshot_name)
        step_defaults = parser.parse_steps_defaults(
            self.pipeline_metadata.get("steps_defaults", []))
        data = {
            auto_snapshot_name: {
                "source": "",
                "ins": [],
                "outs": [],
                "annotations": step_defaults.get("annotations"),
                "labels": step_defaults.get("labels"),
                "limits": step_defaults.get("limits")
            }
        }
        nx.set_node_attributes(pipeline_graph, data)

    # TODO: Additional Step required:
    # Run a static analysis over every step to check that pipeline
    # parameters are not assigned with new values.
    return pipeline_graph, pipeline_parameters_dict
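
# Illustrative sketch (hypothetical step names) of the leaf-joining logic
# above, on a bare nx graph: every DAG leaf gets an edge to a single
# trailing snapshot node, which then becomes the pipeline's only leaf.
def _example_final_autosnapshot_edges():  # pragma: no cover
    g = nx.DiGraph()
    g.add_edge("prep", "train")
    g.add_edge("prep", "evaluate")
    for node in graphutils.get_leaf_nodes(g):  # 'train' and 'evaluate'
        g.add_edge(node, "final_auto_snapshot")
    # the snapshot step now runs after every branch of the pipeline
    assert set(graphutils.get_leaf_nodes(g)) == {"final_auto_snapshot"}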
def generate_pipeline(template, nb_graph, step_names, lightweight_components,
                      metadata):
    """Use the pipeline template to generate Python code."""
    # All the Pipeline steps that do not have children
    leaf_steps = graphutils.get_leaf_nodes(nb_graph)
    # create a dict with step names and their parameters
    all_step_parameters = {
        step: sorted(nb_graph.nodes(data=True)[step].get('parameters',
                                                         {}).keys())
        for step in step_names
    }
    pipeline_code = template.render(
        nb_graph=nb_graph,
        lightweight_components=lightweight_components,
        step_names=step_names,
        step_prevs=pipeline_dependencies_tasks(nb_graph),
        leaf_steps=leaf_steps,
        all_step_parameters=all_step_parameters,
        **metadata)
    # fix code style using pep8 guidelines
    return autopep8.fix_code(pipeline_code)
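
# Sketch of the all_step_parameters mapping built above. The node
# attributes here are hypothetical; 'parameters' is assumed to be a dict
# keyed by parameter name, since generate_pipeline() only reads its keys.
def _example_all_step_parameters():  # pragma: no cover
    g = nx.DiGraph()
    g.add_node("load")
    g.add_node("train", parameters={"lr": 0.1, "epochs": 10})
    all_step_parameters = {
        step: sorted(g.nodes(data=True)[step].get("parameters", {}).keys())
        for step in ["load", "train"]
    }
    # steps without parameters map to an empty list; the rest get their
    # parameter names in sorted order
    assert all_step_parameters == {"load": [], "train": ["epochs", "lr"]}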