def generate_lightweight_component(self, step: Step):
    """Generate Python code using the function template.

    Renders the processor-specific template for ``step`` and returns the
    resulting source, normalized to PEP8 style.
    """
    original_source = step.source

    def _escape_lines(text):
        # Escape a multiline string one line at a time, so newlines
        # themselves survive while each line's content is unicode-escaped.
        escaped = (ln.encode("unicode_escape").decode("utf-8")
                   for ln in text.splitlines())
        return "\n".join(escaped)

    if self.pipeline.processor.id == "nb":
        # The code will be wrapped in triple quotes inside the template.
        # encode("unicode_escape") does not escape triple quotes, so they
        # are escaped here explicitly.
        step.source = [re.sub(r"'''", "\\'\\'\\'", _escape_lines(cell))
                       for cell in original_source]

    template_name = PIPELINE_ORIGIN.get(self.pipeline.processor.id)
    jinja_template = self._get_templating_env().get_template(template_name)
    rendered = jinja_template.render(step=step,
                                     **self.pipeline.config.to_dict())
    # fix code style using pep8 guidelines
    return autopep8.fix_code(rendered)
def parse_notebook(self):
    """Creates a NetworkX graph based on the input notebook's tags.

    Cell's source code are embedded into the graph as node attributes.

    Returns:
        tuple: (pipeline_parameters, pipeline_metrics,
        imports_and_functions), each a single newline-joined string of
        the corresponding notebook cells.
    """
    # will be assigned at the end of each for loop
    prev_step_name = None

    # All the code cells that have to be pre-pended to every pipeline step
    # (i.e., imports and functions) are merged here
    imports_block = list()
    functions_block = list()
    # Variables that will become pipeline parameters
    pipeline_parameters = list()
    # Variables that will become pipeline metrics
    pipeline_metrics = list()

    for c in self.notebook.cells:
        # non-code cells (markdown, raw, ...) carry no pipeline content
        if c.cell_type != "code":
            continue

        tags = self.parse_cell_metadata(c.metadata)

        # a cell may only belong to a single pipeline step
        if len(tags['step_names']) > 1:
            raise NotImplementedError("Kale does not yet support multiple"
                                      " step names in a single notebook"
                                      " cell. One notebook cell was found"
                                      " with %s step names"
                                      % tags['step_names'])

        step_name = (tags['step_names'][0]
                     if 0 < len(tags['step_names'])
                     else None)

        if step_name == 'skip':
            # when the cell is skipped, don't store `skip` as the previous
            # active cell
            continue
        # "reserved" tags: each collects the cell source into its own
        # bucket and becomes the previous step, so that following
        # untagged cells are appended to the same bucket.
        if step_name == 'pipeline-parameters':
            pipeline_parameters.append(c.source)
            prev_step_name = step_name
            continue
        if step_name == 'imports':
            imports_block.append(c.source)
            prev_step_name = step_name
            continue
        if step_name == 'functions':
            functions_block.append(c.source)
            prev_step_name = step_name
            continue
        if step_name == 'pipeline-metrics':
            pipeline_metrics.append(c.source)
            prev_step_name = step_name
            continue

        # if none of the above apply, then we are parsing a code cell with
        # a block names and (possibly) some dependencies

        # if the cell was not tagged with a step name,
        # add the code to the previous cell
        if not step_name:
            if prev_step_name == 'imports':
                imports_block.append(c.source)
            elif prev_step_name == 'functions':
                functions_block.append(c.source)
            elif prev_step_name == 'pipeline-parameters':
                pipeline_parameters.append(c.source)
            elif prev_step_name == 'pipeline-metrics':
                pipeline_metrics.append(c.source)
            # current_block might be None in case the first cells of the
            # notebooks have not been tagged.
            elif prev_step_name:
                # this notebook cell will be merged to a previous one that
                # specified a step name
                self.pipeline.get_step(prev_step_name).merge_code(c.source)
        else:
            # in this branch we are sure that we are reading a code cell
            # with a step tag, so we must not allow for pipeline-metrics
            if prev_step_name == 'pipeline-metrics':
                raise ValueError("Tag pipeline-metrics must be placed on a"
                                 " cell at the end of the Notebook."
                                 " Pipeline metrics should be considered"
                                 " as a result of the pipeline execution"
                                 " and not of single steps.")

            # add node to DAG, adding tags and source code of notebook cell
            if step_name not in self.pipeline.nodes:
                step = Step(name=step_name,
                            source=[c.source],
                            ins=set(),
                            outs=set(),
                            limits=tags.get("limits", {}),
                            labels=tags.get("labels", {}),
                            annotations=tags.get("annotations", {}))
                self.pipeline.add_step(step)
                # wire the declared dependencies; every referenced step
                # must already exist in the DAG
                for _prev_step in tags['prev_steps']:
                    if _prev_step not in self.pipeline.nodes:
                        raise ValueError("Step %s does not exist. It was "
                                         "defined as previous step of %s"
                                         % (_prev_step, tags['step_names']))
                    self.pipeline.add_edge(_prev_step, step_name)
            else:
                # the step was seen before: append this cell's code to it
                self.pipeline.get_step(step_name).merge_code(c.source)

            # this tagged step becomes the merge target for any untagged
            # cells that follow it
            prev_step_name = step_name

    # Prepend any `imports` and `functions` cells to every Pipeline step
    for step in self.pipeline.steps:
        step.source = imports_block + functions_block + step.source

    # merge together pipeline parameters
    pipeline_parameters = '\n'.join(pipeline_parameters)
    # merge together pipeline metrics
    pipeline_metrics = '\n'.join(pipeline_metrics)

    imports_and_functions = "\n".join(imports_block + functions_block)
    return pipeline_parameters, pipeline_metrics, imports_and_functions