def assign_metrics(self, pipeline_metrics: dict):
    """Assign pipeline metrics to specific pipeline steps.

    This assignment follows a similar logic to the detection of `out`
    dependencies. Starting from a temporary step - child of all the leaf
    nodes - all the nodes in the pipeline are traversed in reversed
    topological order. When a step shows one of the metrics as part of
    its code, then that metric is assigned to the step.

    Args:
        pipeline_metrics (dict): a dict of pipeline metrics where the
            key is always the KFP sanitized name and the value the name
            of the original variable.
    """
    # create a temporary step at the end of the pipeline to simplify the
    # iteration from the leaf steps
    tmp_step_name = "_tmp"
    leaf_steps = self.pipeline.get_leaf_steps()
    if not leaf_steps:
        return
    for step in leaf_steps:
        self.pipeline.add_edge(step.name, tmp_step_name)

    # pipeline_metrics is a dict having sanitized variable names as keys
    # and the corresponding variable names as values. Here we need to
    # refer to the sanitized names using the python variables.
    # XXX: We could change parse_metrics_print_statements() to return
    # XXX: the reverse dictionary, but that would require changing
    # XXX: either rpc.nb.get_pipeline_metrics() or the JupyterLab
    # XXX: Extension's parsing of the RPC result
    rev_pipeline_metrics = {v: k for k, v in pipeline_metrics.items()}
    metrics_left = set(rev_pipeline_metrics.keys())
    for anc in graphutils.get_ordered_ancestors(self.pipeline,
                                                tmp_step_name):
        if not metrics_left:
            break
        anc_step = self.pipeline.get_step(anc)
        anc_source = '\n'.join(anc_step.source)
        # get all the marshal candidates from the ancestor's source and
        # intersect with the metrics that have not been matched yet
        marshal_candidates = astutils.get_marshal_candidates(anc_source)
        assigned_metrics = metrics_left.intersection(marshal_candidates)
        # Remove the metrics that have already been assigned.
        metrics_left.difference_update(assigned_metrics)
        # Generate code to produce the metrics artifact in the current
        # step
        if assigned_metrics:
            code = METRICS_TEMPLATE % ("    " + ",\n    ".join([
                '"%s": %s' % (rev_pipeline_metrics[x], x)
                for x in sorted(assigned_metrics)]))
            anc_step.source.append(code)
            # need to have a `metrics` flag set to true in order to set
            # the metrics output artifact in the pipeline template
            anc_step.metrics = True
    self.pipeline.remove_node(tmp_step_name)
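# A minimal sketch of how the template expansion in assign_metrics behaves.
# METRICS_TEMPLATE is defined elsewhere in the codebase; the template below
# is a hypothetical stand-in used only for illustration.
_EXAMPLE_METRICS_TEMPLATE = "_kale_pipeline_metrics = {\n%s\n}"  # hypothetical

rev_pipeline_metrics = {"acc": "accuracy-score", "loss": "log-loss"}
assigned_metrics = {"acc", "loss"}
code = _EXAMPLE_METRICS_TEMPLATE % ("    " + ",\n    ".join([
    '"%s": %s' % (rev_pipeline_metrics[x], x)
    for x in sorted(assigned_metrics)]))
print(code)
# _kale_pipeline_metrics = {
#     "accuracy-score": acc,
#     "log-loss": loss
# }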
def get_ordered_ancestors(self, step_name: str) -> Iterable[Step]:
    """Return the ancestors of a step in an ordered manner.

    Wrapper of graphutils.get_ordered_ancestors.

    Returns:
        Iterable[Step]: An iterable of Step objects.
    """
    return self._steps_iterable(
        graphutils.get_ordered_ancestors(self, step_name))
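# For context: `_steps_iterable` is assumed to map step names back to Step
# objects. A hypothetical equivalent, for illustration only:
def _steps_iterable_sketch(pipeline, step_names):
    """Yield the Step object corresponding to each name in `step_names`."""
    for name in step_names:
        yield pipeline.get_step(name)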
def test_get_ordered_ancestors():
    """Test that the ancestors are retrieved in the expected order."""
    g = nx.DiGraph()
    # Layer 1
    g.add_edge("A", "B")
    # Layer 2
    g.add_edge("B", "C")
    g.add_edge("B", "D")
    g.add_edge("B", "E")
    # Layer 3
    g.add_edge("C", "R")
    g.add_edge("D", "R")
    g.add_edge("E", "R")

    ancs = ["B", "A"]
    assert graphutils.get_ordered_ancestors(g, "E") == ancs

    ancs = ["B", "A"]
    assert graphutils.get_ordered_ancestors(g, "C") == ancs

    ancs = ["C", "D", "E", "B", "A"]
    assert graphutils.get_ordered_ancestors(g, "R") == ancs
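# A minimal sketch of an implementation that satisfies the test above: a BFS
# over the reversed graph visits ancestors in order of increasing path
# length. This is an assumption for illustration, not necessarily the actual
# graphutils code.
import networkx as nx

def get_ordered_ancestors_sketch(g: nx.DiGraph, node: str) -> list:
    """Return the ancestors of `node`, closest first."""
    # bfs_tree on the reversed graph yields `node` first, then its
    # ancestors layer by layer; drop `node` itself.
    return list(nx.bfs_tree(g.reverse(), node))[1:]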
def dependencies_detection(self, imports_and_functions: str = ""):
    """Detect the data dependencies between nodes in the graph.

    The data dependencies detection algorithm roughly works as follows:

    1. Traversing the graph in topological order, for every node `step` do
    2. Detect the `ins` of the current `step` by running PyFlakes on the
       source code. During this action the pipeline parameters are taken
       into consideration
    3. Parse `step`'s global function definitions to get free variables
       (i.e. variables that would need to be marshalled in other steps
       that call these functions) - in this action pipeline parameters
       are taken into consideration.
    4. Get all the functions that `step` calls
    5. For every `step`'s ancestor `anc` do
        - Get all the potential names (objects, functions, ...) of `anc`
          that could be marshalled (saved)
        - Intersect this with the `step`'s `ins` (from action 2) and add
          the result to `anc`'s `outs`.
        - for every `step`'s function call (action 4), check if this
          function was defined in `anc` and if it has free variables
          (action 3). If so, add these free variables to `step`'s `ins`
          and to `anc`'s `outs`.

    The pipeline steps are annotated in place.

    Args:
        imports_and_functions: Multiline Python source that is prepended
            to every pipeline step
    """
    # resolve the data dependencies between steps, looping through the
    # graph
    for step in self.pipeline.steps:
        # detect the INS dependencies of the CURRENT node --------------
        step_source = '\n'.join(step.source)
        # get the variables that this step is missing and the pipeline
        # parameters that it actually needs.
        ins, parameters = self._detect_in_dependencies(
            source_code=step_source,
            pipeline_parameters=self.pipeline.pipeline_parameters)
        fns_free_variables = self._detect_fns_free_variables(
            step_source, imports_and_functions,
            self.pipeline.pipeline_parameters)

        # Get all the function calls. This will be used below to check
        # if any of the ancestors declare any of these functions. If
        # that is so, the free variables of those functions will have
        # to be loaded.
        fn_calls = astutils.get_function_calls(step_source)

        # add OUT dependencies annotations in the PARENT nodes ---------
        # Intersect the missing names of this father's child with all
        # the father's names. The intersection is the list of variables
        # that the father needs to serialize.
        # The ancestors are the nodes that have a path to `step`,
        # ordered by path length.
        ins_left = ins.copy()
        for anc in graphutils.get_ordered_ancestors(self.pipeline,
                                                    step.name):
            if not ins_left:
                # if there are no more variables that need to be
                # marshalled, stop the graph traversal
                break
            anc_step = self.pipeline.get_step(anc)
            anc_source = '\n'.join(anc_step.source)
            # get all the marshal candidates from the father's source
            # and intersect with the required names of the current node
            marshal_candidates = astutils.get_marshal_candidates(
                anc_source)
            outs = ins_left.intersection(marshal_candidates)
            # Remove the ins that have already been assigned to an
            # ancestor.
            ins_left.difference_update(outs)
            # Include free variables
            to_remove = set()
            for fn_call in fn_calls:
                anc_fns_free_vars = anc_step.fns_free_variables
                if fn_call in anc_fns_free_vars.keys():
                    # the current step needs to load these variables
                    fn_free_vars, used_params = anc_fns_free_vars[
                        fn_call]
                    # search if this function calls other functions
                    # (i.e. if its free variables are found in the
                    # free variables dict)
                    _left = list(fn_free_vars)
                    while _left:
                        _cur = _left.pop(0)
                        # if the free var is itself a fn with free vars
                        if _cur in anc_fns_free_vars:
                            fn_free_vars.update(
                                anc_fns_free_vars[_cur][0])
                            _left = _left + list(
                                anc_fns_free_vars[_cur][0])
                    ins.update(fn_free_vars)
                    # the current ancestor needs to save these
                    # variables
                    outs.update(fn_free_vars)
                    # add the parameters used by the function to the
                    # list of pipeline parameters used by the step
                    _pps = self.pipeline.pipeline_parameters
                    for param in used_params:
                        parameters[param] = _pps[param]
                    # Remove this function as it has been served. We
                    # don't want other ancestors to save free variables
                    # for this function. Using the helper to_remove
                    # because the set cannot be resized during
                    # iteration.
                    to_remove.add(fn_call)
                    # add the function and its free variables to the
                    # current step as well. This is useful in case
                    # *another* function will call this one (`fn_call`)
                    # in a child step. In this way we can track the
                    # calls up to the last free variable. (refer to
                    # test `test_dependencies_detection_recursive`)
                    fns_free_variables[fn_call] = anc_fns_free_vars[
                        fn_call]
            fn_calls.difference_update(to_remove)
            # Add the new outs annotations to the ancestor, merging
            # them into the outs already present in the anc
            anc_step.outs.update(outs)

        step.ins = sorted(ins)
        step.parameters = parameters
        step.fns_free_variables = fns_free_variables
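# For context, a minimal sketch (an assumption, not the real astutils code)
# of what `get_function_calls` could look like: collect the names of plain
# `foo(...)` calls found in a source string.
import ast

def get_function_calls_sketch(source: str) -> set:
    """Return the set of simple function names called in `source`."""
    calls = set()
    for node in ast.walk(ast.parse(source)):
        # only plain name calls; attribute calls like `obj.method()` are
        # ignored in this sketch
        if isinstance(node, ast.Call) and isinstance(node.func, ast.Name):
            calls.add(node.func.id)
    return calls

# get_function_calls_sketch("x = foo(bar(1))")  ->  {"foo", "bar"}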
def dependencies_detection(nb_graph: nx.DiGraph,
                           pipeline_parameters: dict = None,
                           imports_and_functions: str = ""):
    """Detect the data dependencies between nodes in the graph.

    The data dependencies detection algorithm roughly works as follows:

    1. Traversing the graph in topological order, for every node `step` do
    2. Detect the `ins` of the current `step` by running PyFlakes on the
       source code. During this action the pipeline parameters are taken
       into consideration
    3. Parse `step`'s global function definitions to get free variables
       (i.e. variables that would need to be marshalled in other steps
       that call these functions) - in this action pipeline parameters
       are taken into consideration.
    4. Get all the functions that `step` calls
    5. For every `step`'s ancestor `anc` do
        - Get all the potential names (objects, functions, ...) of `anc`
          that could be marshalled (saved)
        - Intersect this with the `step`'s `ins` (from action 2) and add
          the result to `anc`'s `outs`.
        - for every `step`'s function call (action 4), check if this
          function was defined in `anc` and if it has free variables
          (action 3). If so, add these free variables to `step`'s `ins`
          and to `anc`'s `outs`.

    The nodes of `nb_graph` are annotated in place.

    Args:
        nb_graph: nx DiGraph with pipeline code blocks
        pipeline_parameters: Pipeline parameters dict
        imports_and_functions: Multiline Python source that is prepended
            to every pipeline step
    """
    # resolve the data dependencies between steps, looping through the
    # graph
    for step in nx.topological_sort(nb_graph):
        step_data = nb_graph.nodes(data=True)[step]

        # detect the INS dependencies of the CURRENT node --------------
        step_source_code = '\n'.join(step_data['source'])
        # get the variables that this step is missing and the pipeline
        # parameters that it actually needs.
        ins, parameters = detect_in_dependencies(
            source_code=step_source_code,
            pipeline_parameters=pipeline_parameters)
        fns_free_variables = detect_fns_free_variables(
            step_source_code, imports_and_functions, pipeline_parameters)

        # Get all the function calls. This will be used below to check
        # if any of the ancestors declare any of these functions. If
        # that is so, the free variables of those functions will have
        # to be loaded.
        fn_calls = kale_ast.get_function_calls(step_source_code)

        # add OUT dependencies annotations in the PARENT nodes ---------
        # Intersect the missing names of this father's child with all
        # the father's names. The intersection is the list of variables
        # that the father needs to serialize.
        # The ancestors are the nodes that have a path to `step`,
        # ordered by path length.
        ins_left = ins.copy()
        for anc in graphutils.get_ordered_ancestors(nb_graph, step):
            if not ins_left:
                # if there are no more variables that need to be
                # marshalled, stop the graph traversal
                break
            anc_data = nb_graph.nodes(data=True)[anc]
            anc_source = '\n'.join(anc_data['source'])
            # get all the marshal candidates from the father's source
            # and intersect with the required names of the current node
            marshal_candidates = kale_ast.get_marshal_candidates(
                anc_source)
            outs = ins_left.intersection(marshal_candidates)
            # Remove the ins that have already been assigned to an
            # ancestor.
            ins_left.difference_update(outs)
            # Include free variables
            to_remove = set()
            for fn_call in fn_calls:
                anc_fns_free_vars = anc_data.get("fns_free_variables",
                                                 {})
                if fn_call in anc_fns_free_vars.keys():
                    # the current step needs to load these variables
                    fn_free_vars, used_params = anc_fns_free_vars[
                        fn_call]
                    # search if this function calls other functions
                    # (i.e. if its free variables are found in the
                    # free variables dict)
                    _left = list(fn_free_vars)
                    while _left:
                        _cur = _left.pop(0)
                        # if the free var is itself a fn with free vars
                        if _cur in anc_fns_free_vars:
                            fn_free_vars.update(
                                anc_fns_free_vars[_cur][0])
                            _left = _left + list(
                                anc_fns_free_vars[_cur][0])
                    ins.update(fn_free_vars)
                    # the current ancestor needs to save these
                    # variables
                    outs.update(fn_free_vars)
                    # add the parameters used by the function to the
                    # list of pipeline parameters used by the step
                    for param in used_params:
                        parameters[param] = pipeline_parameters[param]
                    # Remove this function as it has been served. We
                    # don't want other ancestors to save free variables
                    # for this function. Using the helper to_remove
                    # because the set cannot be resized during
                    # iteration.
                    to_remove.add(fn_call)
                    # add the function and its free variables to the
                    # current step as well. This is useful in case
                    # *another* function will call this one (`fn_call`)
                    # in a child step. In this way we can track the
                    # calls up to the last free variable. (refer to
                    # test `test_dependencies_detection_recursive`)
                    fns_free_variables[fn_call] = anc_fns_free_vars[
                        fn_call]
            fn_calls.difference_update(to_remove)
            # Add the new outs annotations to the ancestor. First merge
            # the current outs present in the anc with the new ones
            outs.update(anc_data.get('outs', []))
            nx.set_node_attributes(nb_graph,
                                   {anc: {'outs': sorted(outs)}})

        new_data = {
            'ins': sorted(ins),
            'fns_free_variables': fns_free_variables,
            'parameters': parameters
        }
        nx.set_node_attributes(nb_graph, {step: new_data})
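# A tiny worked example for the function above, assuming the kale_ast and
# graphutils helpers behave as described: `step2` uses `x`, which only
# `step1` can provide, so `x` is annotated as an `out` of step1 and an `in`
# of step2.
import networkx as nx

def _dependencies_detection_example():
    g = nx.DiGraph()
    g.add_node("step1", source=["x = 1"])
    g.add_node("step2", source=["print(x)"])
    g.add_edge("step1", "step2")
    dependencies_detection(g, pipeline_parameters={})
    assert g.nodes["step1"]["outs"] == ["x"]
    assert g.nodes["step2"]["ins"] == ["x"]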