def test_dependencies_detection_inner_function(dummy_nb_config):
    """Nested function definitions must stay private to their step.

    ``bar`` is defined inside ``foo``, so only ``foo`` (plus the plain
    variable ``x``) should be marshalled across step boundaries.
    """
    pipeline = Pipeline(dummy_nb_config)

    pipeline.add_step(Step(name="step1", source=["x = 5"]))
    pipeline.add_step(Step(name="step2", source=['''
def foo():
    def bar(x):
        print(x)
    bar(5)
''']))
    pipeline.add_step(Step(name="step3", source=['''
foo()
print(x)
''']))
    for parent, child in (("step1", "step2"), ("step2", "step3")):
        pipeline.add_edge(parent, child)

    dependencies.dependencies_detection(pipeline)

    step1, step2, step3 = (pipeline.get_step(s)
                           for s in ("step1", "step2", "step3"))
    assert sorted(step1.ins) == []
    assert sorted(step1.outs) == ['x']
    assert sorted(step2.ins) == []
    assert sorted(step2.outs) == ['foo']
    assert sorted(step3.ins) == ['foo', 'x']
    assert sorted(step3.outs) == []
def test_dependencies_detection_recursive_different_steps(dummy_nb_config):
    """Symbols reached through a chain of function calls are detected.

    ``bar`` calls ``foo`` which reads ``x``; all three must flow down
    to the step that finally invokes ``bar``.
    """
    pipeline = Pipeline(dummy_nb_config)

    pipeline.add_step(Step(name="step1", source=['''
x = 5
def foo():
    print(x)
''']))
    pipeline.add_step(Step(name="step2", source=['''
def bar():
    foo()
''']))
    pipeline.add_step(Step(name="step3", source=["bar()"]))
    for parent, child in (("step1", "step2"), ("step2", "step3")):
        pipeline.add_edge(parent, child)

    dependencies.dependencies_detection(pipeline)

    step1, step2, step3 = (pipeline.get_step(s)
                           for s in ("step1", "step2", "step3"))
    assert sorted(step1.ins) == []
    assert sorted(step1.outs) == ['foo', 'x']
    assert sorted(step2.ins) == ['foo', 'x']
    assert sorted(step2.outs) == ['bar', 'foo', 'x']
    assert sorted(step3.ins) == ['bar', 'foo', 'x']
    assert sorted(step3.outs) == []
def test_dependencies_detection_free_variable(dummy_nb_config):
    """A free variable read inside a function travels along with it.

    ``foo`` closes over ``x`` from a previous step, so any step calling
    ``foo`` needs both names marshalled in.
    """
    pipeline = Pipeline(dummy_nb_config)

    pipeline.add_step(Step(name="step1", source=['''
x = 5
''']))
    pipeline.add_step(Step(name="step2", source=['''
def foo():
    print(x)
''']))
    pipeline.add_step(Step(name="step3", source=['''
foo()
''']))
    for parent, child in (("step1", "step2"), ("step2", "step3")):
        pipeline.add_edge(parent, child)

    dependencies.dependencies_detection(pipeline)

    step1, step2, step3 = (pipeline.get_step(s)
                           for s in ("step1", "step2", "step3"))
    assert sorted(step1.ins) == []
    assert sorted(step1.outs) == ["x"]
    assert sorted(step2.ins) == ["x"]
    assert sorted(step2.outs) == ["foo", "x"]
    assert sorted(step3.ins) == ["foo", "x"]
    assert sorted(step3.outs) == []
def notebook_to_graph(self):
    """Convert the annotated notebook into a step graph.

    Returns:
        tuple: ``(pipeline_graph, pipeline_parameters_dict)`` where
            ``pipeline_graph`` is the nx graph of pipeline steps parsed
            from ``self.notebook`` and ``pipeline_parameters_dict`` maps
            pipeline parameter names to their parsed assignments.
    """
    # convert notebook to nx graph
    pipeline_graph, pipeline_parameters_source = parser.parse_notebook(
        self.notebook)
    # get a dict from the 'pipeline parameters' cell source code
    pipeline_parameters_dict = ast.parse_assignments_expressions(
        pipeline_parameters_source)
    # run static analysis over the source code; symbols defined in the
    # pipeline-parameters cell are excluded from the dependency analysis
    to_ignore = set(pipeline_parameters_dict.keys())
    dependencies.dependencies_detection(pipeline_graph,
                                        ignore_symbols=to_ignore)
    # add an empty step at the end of the pipeline for final snapshot
    if self.auto_snapshot:
        auto_snapshot_name = 'final_auto_snapshot'
        # add a link from all the last steps of the pipeline to
        # the final auto snapshot one.
        leaf_steps = [
            x for x in pipeline_graph.nodes()
            if pipeline_graph.out_degree(x) == 0
        ]
        for node in leaf_steps:
            pipeline_graph.add_edge(node, auto_snapshot_name)
        # the snapshot step has no code and no data dependencies
        data = {auto_snapshot_name: {'source': '', 'ins': [], 'outs': []}}
        nx.set_node_attributes(pipeline_graph, data)
    # TODO: Additional Step required:
    # Run a static analysis over every step to check that pipeline
    # parameters are not assigned with new values.
    return pipeline_graph, pipeline_parameters_dict
def test_dependencies_detection_recursive_different_steps_triple():
    """A three-deep call chain (bar -> foo -> init) is fully resolved.

    Every symbol transitively required by ``bar()`` must flow through
    all intermediate steps down to the final caller.
    """
    imports_and_functions = ""
    params = {}
    graph = nx.DiGraph()

    # NODES: insertion order matters for step identity, keep it stable
    sources = {
        "step0": '''
%s
x = 5
def init():
    print(x)
''',
        "step1": '''
%s
def foo():
    init()
''',
        "step2": '''
%s
def bar():
    foo()
''',
        "step3": '''
%s
bar()
''',
    }
    for name, src in sources.items():
        graph.add_node(name, ins=[], outs=[],
                       source=[src % imports_and_functions])

    # EDGES
    graph.add_edges_from([("step0", "step1"), ("step1", "step2"),
                          ("step2", "step3")])

    dependencies.dependencies_detection(graph, params,
                                        imports_and_functions)

    nodes = graph.nodes(data=True)
    assert nodes['step0']['ins'] == []
    assert nodes['step0']['outs'] == ['init', 'x']
    assert nodes['step1']['ins'] == ['init', 'x']
    assert nodes['step1']['outs'] == ['foo', 'init', 'x']
    assert nodes['step2']['ins'] == ['foo', 'init', 'x']
    assert nodes['step2']['outs'] == ['bar', 'foo', 'init', 'x']
    assert nodes['step3']['ins'] == ['bar', 'foo', 'init', 'x']
    assert nodes['step3']['outs'] == []
def test_dependencies_detection_with_try_except():
    """Functions defined inside a ``try`` block are still detected.

    NOTE: the bare ``except:`` below is deliberate test-fixture source
    fed to the analyzer, not code executed by this test.
    """
    imports_and_functions = ""
    params = {}
    graph = nx.DiGraph()

    # NODES
    sources = {
        "step1": '''
%s
x = 5
y = 6
''',
        "step2": '''
%s
try:
    def foo():
        print(x)

    def bar():
        print(y)
except:
    pass
''',
        "step3": '''
%s
foo()
bar()
''',
    }
    for name, src in sources.items():
        graph.add_node(name, ins=[], outs=[],
                       source=[src % imports_and_functions])

    # EDGES
    graph.add_edges_from([("step1", "step2"), ("step2", "step3")])

    dependencies.dependencies_detection(graph, params,
                                        imports_and_functions)

    nodes = graph.nodes(data=True)
    assert nodes['step1']['ins'] == []
    assert nodes['step1']['outs'] == ['x', 'y']
    assert nodes['step2']['ins'] == ['x', 'y']
    assert nodes['step2']['outs'] == ['bar', 'foo', 'x', 'y']
    assert nodes['step3']['ins'] == ['bar', 'foo', 'x', 'y']
    assert nodes['step3']['outs'] == []
def notebook_to_graph(self): """Convert an annotated Notebook to a Graph.""" # convert notebook to nx graph (pipeline_graph, pipeline_parameters_source, pipeline_metrics_source, imports_and_functions) = parser.parse_notebook( self.notebook, self.pipeline_metadata) # get a dict from the 'pipeline parameters' cell source code pipeline_parameters_dict = ast.parse_assignments_expressions( pipeline_parameters_source) # get a list of variables that need to be logged as pipeline metrics pipeline_metrics = ast.parse_metrics_print_statements( pipeline_metrics_source) # run static analysis over the source code dependencies.dependencies_detection( pipeline_graph, pipeline_parameters=pipeline_parameters_dict, imports_and_functions=imports_and_functions) dependencies.assign_metrics(pipeline_graph, pipeline_metrics) # if there are multiple DAG leaves, add an empty step at the end of the # pipeline for final snapshot leaf_steps = graphutils.get_leaf_nodes(pipeline_graph) if self.pipeline_metadata.get("autosnapshot") and len(leaf_steps) > 1: auto_snapshot_name = 'final_auto_snapshot' # add a link from all the last steps of the pipeline to # the final auto snapshot one. for node in leaf_steps: pipeline_graph.add_edge(node, auto_snapshot_name) step_defaults = parser.parse_steps_defaults( self.pipeline_metadata.get("steps_defaults", [])) data = { auto_snapshot_name: { "source": "", "ins": [], "outs": [], "annotations": step_defaults.get("annotations"), "labels": step_defaults.get("labels"), "limits": step_defaults.get("limits") } } nx.set_node_attributes(pipeline_graph, data) # TODO: Additional Step required: # Run a static analysis over every step to check that pipeline # parameters are not assigned with new values. return pipeline_graph, pipeline_parameters_dict
def test_deps_detection_recursive_different_steps_branch(dummy_nb_config):
    """Functions arriving at a step from different branches are merged.

    ``foo`` and ``bar`` come from the two sides of a diamond; the merge
    step must receive the dependencies of both.
    """
    pipeline = Pipeline(dummy_nb_config)

    pipeline.add_step(Step(name="step0", source=['''
x = 5
y = 6
''']))
    pipeline.add_step(Step(name="step_l", source=['''
def foo():
    print(x)
''']))
    pipeline.add_step(Step(name="step_r", source=['''
def bar():
    print(y)
''']))
    pipeline.add_step(Step(name="step_m", source=['''
def result():
    foo()
    bar()
''']))
    pipeline.add_step(Step(name="step_f", source=["result()"]))

    for parent, child in (("step0", "step_l"), ("step0", "step_r"),
                          ("step_l", "step_m"), ("step_r", "step_m"),
                          ("step_m", "step_f")):
        pipeline.add_edge(parent, child)

    dependencies.dependencies_detection(pipeline)

    step0, step_l, step_r, step_m, step_f = (
        pipeline.get_step(s)
        for s in ("step0", "step_l", "step_r", "step_m", "step_f"))
    assert sorted(step0.ins) == []
    assert sorted(step0.outs) == ['x', 'y']
    assert sorted(step_l.ins) == ['x']
    assert sorted(step_l.outs) == ['foo', 'x']
    assert sorted(step_r.ins) == ['y']
    assert sorted(step_r.outs) == ['bar', 'y']
    assert sorted(step_m.ins) == ['bar', 'foo', 'x', 'y']
    assert sorted(step_m.outs) == ['bar', 'foo', 'result', 'x', 'y']
    assert sorted(step_f.ins) == ['bar', 'foo', 'result', 'x', 'y']
    assert sorted(step_f.outs) == []
def test_dependencies_detection_with_pipeline_parameters():
    """Pipeline parameters and globals are excluded from marshalling.

    ``y`` is a pipeline parameter and ``math`` a global import: neither
    becomes a step in/out; instead every consuming step records the
    parameters it needs.
    """
    imports_and_functions = "import math"
    params = {"y": (5, 'int')}
    graph = nx.DiGraph()

    # NODES
    sources = {
        "step1": '''
%s
x = 5
''',
        "step2": '''
%s
def foo(x):
    def bar():
        math.sqrt(x + y)
    bar()
''',
        "step3": '''
%s
foo(5)
''',
    }
    for name, src in sources.items():
        graph.add_node(name, ins=[], outs=[],
                       source=[src % imports_and_functions])

    # EDGES
    graph.add_edges_from([("step1", "step2"), ("step2", "step3")])

    dependencies.dependencies_detection(graph, params,
                                        imports_and_functions)

    nodes = graph.nodes(data=True)
    assert nodes['step1']['ins'] == []
    assert nodes['step1']['outs'] == []
    assert nodes['step2']['ins'] == []
    assert nodes['step2']['outs'] == ['foo']
    assert nodes['step2']['parameters'] == {"y": (5, 'int')}
    assert nodes['step3']['ins'] == ['foo']
    assert nodes['step3']['outs'] == []
    assert nodes['step3']['parameters'] == {"y": (5, 'int')}
def test_dependencies_detection_inner_function():
    """Nested function definitions must stay private to their step.

    Only the top-level ``foo`` (and the plain variable ``x``) should be
    marshalled; the inner ``bar`` never leaves step2.
    """
    imports_and_functions = ""
    params = {}
    graph = nx.DiGraph()

    # NODES
    sources = {
        "step1": '''
%s
x = 5
''',
        "step2": '''
%s
def foo():
    def bar(x):
        print(x)
    bar(5)
''',
        "step3": '''
%s
foo()
print(x)
''',
    }
    for name, src in sources.items():
        graph.add_node(name, ins=[], outs=[],
                       source=[src % imports_and_functions])

    # EDGES
    graph.add_edges_from([("step1", "step2"), ("step2", "step3")])

    dependencies.dependencies_detection(graph, params,
                                        imports_and_functions)

    nodes = graph.nodes(data=True)
    assert nodes['step1']['ins'] == []
    assert nodes['step1']['outs'] == ['x']
    assert nodes['step2']['ins'] == []
    assert nodes['step2']['outs'] == ['foo']
    assert nodes['step3']['ins'] == ['foo', 'x']
    assert nodes['step3']['outs'] == []
def test_dependencies_detection_with_pipeline_parameters(dummy_nb_config):
    """Pipeline parameters and globals are excluded from marshalling.

    ``y`` is a pipeline parameter and ``math`` a global import: neither
    appears in step ins/outs; consuming steps record the parameter
    instead.
    """
    imports_and_functions = "import math"
    pipeline = Pipeline(dummy_nb_config)
    pipeline.pipeline_parameters = {"y": (5, 'int')}

    step_sources = {
        "step1": ["x = 5"],
        "step2": ['''
def foo(x):
    def bar():
        math.sqrt(x + y)
    bar()
'''],
        "step3": ["foo(5)"],
    }
    for name, src in step_sources.items():
        pipeline.add_step(
            Step(name=name,
                 source=_prepend_to_source(src, imports_and_functions)))
    for parent, child in (("step1", "step2"), ("step2", "step3")):
        pipeline.add_edge(parent, child)

    dependencies.dependencies_detection(pipeline, imports_and_functions)

    step1, step2, step3 = (pipeline.get_step(s)
                           for s in ("step1", "step2", "step3"))
    assert sorted(step1.ins) == []
    assert sorted(step1.outs) == []
    assert sorted(step2.ins) == []
    assert sorted(step2.outs) == ['foo']
    assert step2.parameters == {"y": (5, 'int')}
    assert sorted(step3.ins) == ['foo']
    assert sorted(step3.outs) == []
    assert step3.parameters == {"y": (5, 'int')}
def to_pipeline(self):
    """Convert an annotated Notebook to a Pipeline object.

    Returns:
        Pipeline: ``self.pipeline``, populated with parsed steps, edges,
            detected data dependencies, assigned metrics and (optionally)
            a trailing auto-snapshot step.
    """
    (pipeline_parameters_source, pipeline_metrics_source,
     imports_and_functions) = self.parse_notebook()
    self.parse_pipeline_parameters(pipeline_parameters_source)
    self.pipeline.set_volume_pipeline_parameters()

    # get a list of variables that need to be logged as pipeline metrics
    pipeline_metrics = ast.parse_metrics_print_statements(
        pipeline_metrics_source)

    # run static analysis over the source code
    dependencies.dependencies_detection(
        self.pipeline, imports_and_functions=imports_and_functions)
    dependencies.assign_metrics(self.pipeline, pipeline_metrics)

    # if there are multiple DAG leaves, add an empty step at the end of the
    # pipeline for final snapshot
    leaf_steps = self.pipeline.get_leaf_steps()
    if self.config.autosnapshot and len(leaf_steps) > 1:
        _name = "final_auto_snapshot"
        # the snapshot step carries no source code of its own
        self.pipeline.add_step(Step(name=_name, source=[]))
        # add a link from all the last steps of the pipeline to
        # the final auto snapshot one.
        for step in leaf_steps:
            self.pipeline.add_edge(step.name, _name)

    # FIXME: Move this to a base class Processor, to be executed by default
    # after `to_pipeline`, so that it is agnostic to the type of
    # processor.
    for step in self.pipeline.steps:
        step.config.update(self.pipeline.config.steps_defaults)

    # TODO: Additional action required:
    # Run a static analysis over every step to check that pipeline
    # parameters are not assigned with new values.
    return self.pipeline
def test_dependencies_detection_with_try_except(dummy_nb_config):
    """Functions defined inside a ``try`` block are still detected.

    NOTE: the bare ``except:`` below is deliberate test-fixture source
    fed to the analyzer, not code executed by this test.
    """
    pipeline = Pipeline(dummy_nb_config)

    pipeline.add_step(Step(name="step1", source=['''
x = 5
y = 6
''']))
    pipeline.add_step(Step(name="step2", source=['''
try:
    def foo():
        print(x)

    def bar():
        print(y)
except:
    pass
''']))
    pipeline.add_step(Step(name="step3", source=['''
foo()
bar()
''']))
    for parent, child in (("step1", "step2"), ("step2", "step3")):
        pipeline.add_edge(parent, child)

    dependencies.dependencies_detection(pipeline)

    step1, step2, step3 = (pipeline.get_step(s)
                           for s in ("step1", "step2", "step3"))
    assert sorted(step1.ins) == []
    assert sorted(step1.outs) == ['x', 'y']
    assert sorted(step2.ins) == ['x', 'y']
    assert sorted(step2.outs) == ['bar', 'foo', 'x', 'y']
    assert sorted(step3.ins) == ['bar', 'foo', 'x', 'y']
    assert sorted(step3.outs) == []
def test_dependencies_detection_with_globals(dummy_nb_config):
    """Globals from imports_and_functions are not treated as step data.

    ``math`` comes from the shared imports block, so it must not appear
    in any step's ins/outs.
    """
    imports_and_functions = "import math"
    pipeline = Pipeline(dummy_nb_config)

    step_sources = {
        "step1": ["x = 5"],
        "step2": ['''
def foo(x):
    def bar():
        math.sqrt(x)
    bar()
'''],
        "step3": ["foo(5)"],
    }
    for name, src in step_sources.items():
        pipeline.add_step(
            Step(name=name,
                 source=_prepend_to_source(src, imports_and_functions)))
    for parent, child in (("step1", "step2"), ("step2", "step3")):
        pipeline.add_edge(parent, child)

    dependencies.dependencies_detection(pipeline, imports_and_functions)

    step1, step2, step3 = (pipeline.get_step(s)
                           for s in ("step1", "step2", "step3"))
    assert sorted(step1.ins) == []
    assert sorted(step1.outs) == []
    assert sorted(step2.ins) == []
    assert sorted(step2.outs) == ['foo']
    assert sorted(step3.ins) == ['foo']
    assert sorted(step3.outs) == []
def notebook_to_graph(self): """Convert an annotated Notebook to a Graph.""" # convert notebook to nx graph (pipeline_graph, pipeline_parameters_source, pipeline_metrics_source, imports_and_functions) = parser.parse_notebook(self.notebook) # get a dict from the 'pipeline parameters' cell source code pipeline_parameters_dict = ast.parse_assignments_expressions( pipeline_parameters_source) # get a list of variables that need to be logged as pipeline metrics pipeline_metrics = ast.parse_metrics_print_statements( pipeline_metrics_source) # run static analysis over the source code dependencies.dependencies_detection( pipeline_graph, pipeline_parameters=pipeline_parameters_dict, imports_and_functions=imports_and_functions) dependencies.assign_metrics(pipeline_graph, pipeline_metrics) # add an empty step at the end of the pipeline for final snapshot if self.auto_snapshot: auto_snapshot_name = 'final_auto_snapshot' # add a link from all the last steps of the pipeline to # the final auto snapshot one. leaf_steps = graph_utils.get_leaf_nodes(pipeline_graph) for node in leaf_steps: pipeline_graph.add_edge(node, auto_snapshot_name) data = {auto_snapshot_name: {'source': '', 'ins': [], 'outs': []}} nx.set_node_attributes(pipeline_graph, data) # TODO: Additional Step required: # Run a static analysis over every step to check that pipeline # parameters are not assigned with new values. return pipeline_graph, pipeline_parameters_dict
def notebook_to_graph(self):
    """Convert the annotated notebook into a step graph.

    Returns:
        tuple: ``(pipeline_graph, pipeline_parameters_dict)`` where
            ``pipeline_graph`` is the nx graph of pipeline steps
            (possibly extended with a metrics step and a final snapshot
            step) and ``pipeline_parameters_dict`` maps pipeline
            parameter names to their parsed assignments.
    """
    # convert notebook to nx graph
    (pipeline_graph, pipeline_parameters_source,
     pipeline_metrics_source) = parser.parse_notebook(self.notebook)

    # get a dict from the 'pipeline parameters' cell source code
    pipeline_parameters_dict = ast.parse_assignments_expressions(
        pipeline_parameters_source)
    # get a list of variables that need to be logged as pipeline metrics
    pipeline_metrics = ast.parse_metrics_print_statements(
        pipeline_metrics_source)

    # if there are some pipeline metrics, create an additional step at the
    # end of the pipeline to log them.
    # By adding this step before dependencies detection, we make sure that
    # the necessary variables are marshalled at the beginning of the step.
    if len(pipeline_metrics):
        pipeline_metrics_name = "pipeline_metrics"
        # add a link from all the last steps of the pipeline to
        # the final auto snapshot one.
        leaf_steps = [
            x for x in pipeline_graph.nodes()
            if pipeline_graph.out_degree(x) == 0
        ]
        for node in leaf_steps:
            pipeline_graph.add_edge(node, pipeline_metrics_name)
        # generate the code that dumps the pipeline metrics to file
        template_env = _initialize_templating_env()
        metrics_template = template_env.get_template(
            'pipeline_metrics_template.jinja2')
        # need to be a list since it will be treated as a code cell and
        # passed to the ipykernel
        metrics_source = [
            metrics_template.render(pipeline_metrics=pipeline_metrics)
        ]
        data = {
            pipeline_metrics_name: {
                'source': metrics_source,
                'ins': [],
                'outs': []
            }
        }
        nx.set_node_attributes(pipeline_graph, data)

    # run static analysis over the source code
    dependencies.dependencies_detection(
        pipeline_graph, pipeline_parameters=pipeline_parameters_dict)

    # add an empty step at the end of the pipeline for final snapshot
    if self.auto_snapshot:
        auto_snapshot_name = 'final_auto_snapshot'
        # add a link from all the last steps of the pipeline to
        # the final auto snapshot one.
        # NOTE(review): leaves are recomputed here because the metrics
        # step above may have changed the graph's leaf set
        leaf_steps = [
            x for x in pipeline_graph.nodes()
            if pipeline_graph.out_degree(x) == 0
        ]
        for node in leaf_steps:
            pipeline_graph.add_edge(node, auto_snapshot_name)
        data = {auto_snapshot_name: {'source': '', 'ins': [], 'outs': []}}
        nx.set_node_attributes(pipeline_graph, data)

    # TODO: Additional Step required:
    # Run a static analysis over every step to check that pipeline
    # parameters are not assigned with new values.
    return pipeline_graph, pipeline_parameters_dict
def test_dependencies_detection_recursive_different_steps_branch():
    """Functions arriving at a step from different branches are merged.

    ``foo`` and ``bar`` come from the two sides of a diamond; the merge
    step must receive the dependencies of both.
    """
    imports_and_functions = ""
    params = {}
    graph = nx.DiGraph()

    # NODES: insertion order matters for step identity, keep it stable
    sources = {
        "step0": '''
%s
x = 5
y = 6
''',
        "stepL": '''
%s
def foo():
    print(x)
''',
        "stepR": '''
%s
def bar():
    print(y)
''',
        "stepM": '''
%s
def result():
    foo()
    bar()
''',
        "stepF": '''
%s
result()
''',
    }
    for name, src in sources.items():
        graph.add_node(name, ins=[], outs=[],
                       source=[src % imports_and_functions])

    # EDGES
    graph.add_edges_from([("step0", "stepL"), ("step0", "stepR"),
                          ("stepL", "stepM"), ("stepR", "stepM"),
                          ("stepM", "stepF")])

    dependencies.dependencies_detection(graph, params,
                                        imports_and_functions)

    nodes = graph.nodes(data=True)
    assert nodes['step0']['ins'] == []
    assert nodes['step0']['outs'] == ['x', 'y']
    assert nodes['stepL']['ins'] == ['x']
    assert nodes['stepL']['outs'] == ['foo', 'x']
    assert nodes['stepR']['ins'] == ['y']
    assert nodes['stepR']['outs'] == ['bar', 'y']
    assert nodes['stepM']['ins'] == ['bar', 'foo', 'x', 'y']
    assert nodes['stepM']['outs'] == ['bar', 'foo', 'result', 'x', 'y']
    assert nodes['stepF']['ins'] == ['bar', 'foo', 'result', 'x', 'y']
    assert nodes['stepF']['outs'] == []