def test_get_calls():
    """Test that just function calls are detected."""
    # The snippet holds an attribute call (`a.obj()`) and a plain call
    # (`foo()`); only the plain call is expected in the result.
    code = (
        "a.obj()\n"
        "foo()\n"
    )
    assert kale_ast.get_function_calls(code) == {'foo'}

    # `foo` is only defined here, never called: the one call expected in the
    # result is the `print` inside its body.
    code = (
        "x = 5\n"
        "def foo():\n"
        "    print(x)\n"
    )
    assert kale_ast.get_function_calls(code) == {'print'}
def dependencies_detection(self, imports_and_functions: str = "") -> None:
    """Detect the data dependencies between nodes in the graph.

    The data dependencies detection algorithm roughly works as follows:

    1. Traversing the graph in topological order, for every node `step` do
    2. Detect the `ins` of current `step` by running PyFlakes on the source
       code. During this action the pipeline parameters are taken into
       consideration
    3. Parse `step`'s global function definitions to get free variables
       (i.e. variables that would need to be marshalled in other steps that
       call these functions) - in this action pipeline parameters are taken
       into consideration.
    4. Get all the functions that `step` calls
    5. For every `step`'s ancestor `anc` do
        - Get all the potential names (objects, functions, ...) of `anc`
          that could be marshalled (saved)
        - Intersect this with the `step`'s `ins` (from action 2) and add
          the result to `anc`'s `outs`.
        - for every `step`'s function call (action 4), check if this
          function was defined in `anc` and if it has free variables
          (action 3). If so, add to `step`'s `ins` and to `anc`'s `outs`
          these free variables.

    Args:
        imports_and_functions: Multiline Python source that is prepended to
            every pipeline step

    Returns:
        None. The steps of `self.pipeline` are annotated in place: `ins`,
        `parameters` and `fns_free_variables` are set on every step and
        `outs` is updated on its ancestors.
    """
    # resolve the data dependencies between steps, looping through the
    # graph
    for step in self.pipeline.steps:
        # detect the INS dependencies of the CURRENT node------------------
        step_source = '\n'.join(step.source)
        # get the variables that this step is missing and the pipeline
        # parameters that it actually needs.
        ins, parameters = self._detect_in_dependencies(
            source_code=step_source,
            pipeline_parameters=self.pipeline.pipeline_parameters)
        fns_free_variables = self._detect_fns_free_variables(
            step_source, imports_and_functions,
            self.pipeline.pipeline_parameters)

        # Get all the function calls. This will be used below to check if
        # any of the ancestors declare any of these functions. If that is
        # so, the free variables of those functions will have to be loaded.
        fn_calls = astutils.get_function_calls(step_source)

        # add OUT dependencies annotations in the PARENT nodes-------------
        # Intersect the missing names of this father's child with all
        # the father's names. The intersection is the list of variables
        # that the father needs to serialize.
        # The ancestors are the nodes that have a path to `step`,
        # ordered by path length.
        ins_left = ins.copy()
        for anc in graphutils.get_ordered_ancestors(self.pipeline,
                                                    step.name):
            if not ins_left:
                # if there are no more variables that need to be
                # marshalled, stop the graph traverse
                break
            anc_step = self.pipeline.get_step(anc)
            anc_source = '\n'.join(anc_step.source)
            # get all the marshal candidates from father's source and
            # intersect with the required names of the current node
            marshal_candidates = astutils.get_marshal_candidates(
                anc_source)
            outs = ins_left.intersection(marshal_candidates)
            # Remove the ins that have already been assigned to an ancestor
            ins_left.difference_update(outs)

            # Include free variables. Served calls are collected in a
            # helper set because `fn_calls` can not be resized while it is
            # being iterated.
            served_calls = set()
            for fn_call in fn_calls:
                anc_fns_free_vars = anc_step.fns_free_variables
                if fn_call not in anc_fns_free_vars:
                    continue
                # the current step needs to load these variables.
                # `fn_free_vars` is the very set stored in the ancestor's
                # mapping, so the expansion below updates it in place.
                fn_free_vars, used_params = anc_fns_free_vars[fn_call]

                # Search if this function calls other functions (i.e. if
                # its free variables are found in the free variables dict)
                # and pull in their free variables too. Every name is
                # expanded at most once: without the `expanded` guard a
                # function that (directly or indirectly) references itself
                # would refill the worklist forever.
                expanded = set()
                pending = list(fn_free_vars)
                while pending:
                    name = pending.pop()
                    if name in expanded:
                        continue
                    expanded.add(name)
                    # if the free var is itself a fn with free vars
                    if name in anc_fns_free_vars:
                        nested_free_vars = anc_fns_free_vars[name][0]
                        pending.extend(nested_free_vars)
                        fn_free_vars.update(nested_free_vars)

                ins.update(fn_free_vars)
                # the current ancestor needs to save these variables
                outs.update(fn_free_vars)
                # add the parameters used by the function to the list
                # of pipeline parameters used by the step
                # NOTE(review): only the parameters recorded for `fn_call`
                # itself are added; parameters of the functions reached
                # through the expansion above are not - confirm intended.
                _pps = self.pipeline.pipeline_parameters
                for param in used_params:
                    parameters[param] = _pps[param]

                # Remove this function as it has been served. We don't
                # want other ancestors to save free variables for this
                # function.
                served_calls.add(fn_call)
                # add the function and its free variables to the
                # current step as well. This is useful in case
                # *another* function will call this one (`fn_call`) in
                # a child step. In this way we can track the calls up
                # to the last free variable. (refer to test
                # `test_dependencies_detection_recursive`)
                fns_free_variables[fn_call] = anc_fns_free_vars[fn_call]
            fn_calls.difference_update(served_calls)

            # Add to ancestor the new outs annotations. First merge the
            # current outs present in the anc with the new ones
            anc_step.outs.update(outs)

        step.ins = sorted(ins)
        step.parameters = parameters
        step.fns_free_variables = fns_free_variables