def log_component_run(
    component_run: ComponentRun,
    set_dependencies_from_inputs=True,
    staleness_threshold: int = (60 * 60 * 24 * 30),
):
    """Persist a client-facing ComponentRun object to the DB.

    Args:
        component_run: Client-side run; its fields are read through
            ``to_dictionary()``.
        set_dependencies_from_inputs: When truthy, the store is asked to
            set the run's dependencies from its inputs.
        staleness_threshold: Forwarded unchanged to
            ``Store.commit_component_run``. The default is
            ``60 * 60 * 24 * 30`` (presumably seconds, i.e. 30 days --
            confirm against Store).
    """
    store = Store(_db_uri)
    fields = component_run.to_dictionary()
    sql_run = store.initialize_empty_component_run(
        component_run.component_name)

    def as_store_pointers(client_pointers):
        # Resolve each client-side pointer to its store-side counterpart.
        return [
            store.get_io_pointer(
                ptr.name, ptr.value, pointer_type=ptr.pointer_type)
            for ptr in client_pointers
        ]

    # Optional attributes are only copied over when they carry a value.
    started_at = fields["start_timestamp"]
    if started_at:
        sql_run.set_start_timestamp(started_at)
    ended_at = fields["end_timestamp"]
    if ended_at:
        sql_run.set_end_timestamp(ended_at)
    notes = fields["notes"]
    if notes:
        sql_run.add_notes(notes)

    # Code-version attributes are always copied, whatever their value.
    sql_run.set_git_hash(fields["git_hash"])
    sql_run.set_git_tags(fields["git_tags"])
    sql_run.set_code_snapshot(fields["code_snapshot"])

    # Inputs and outputs.
    sql_run.add_inputs(as_store_pointers(fields["inputs"]))
    sql_run.add_outputs(as_store_pointers(fields["outputs"]))

    # Make sure the component exists (empty description and owner).
    create_component(component_run.component_name, "", "")

    # Dependencies derived automatically from the inputs, if requested.
    if set_dependencies_from_inputs:
        store.set_dependencies_from_inputs(sql_run)

    # Dependencies named explicitly on the client-side run. The first
    # entry of a one-item history lookup is used -- presumably the most
    # recent run of that component; confirm against Store.get_history.
    for upstream_name in fields["dependencies"]:
        upstream_run = store.get_history(upstream_name, 1)[0]
        sql_run.set_upstream(upstream_run)

    store.commit_component_run(
        sql_run, staleness_threshold=staleness_threshold)
class TestDags(unittest.TestCase):
    """Tests tracing over differently shaped graphs of component runs.

    The expected traces are written in terms of run IDs counting up from 1,
    so every test relies on the store created in ``setUp`` holding no runs
    from earlier tests.
    """

    def setUp(self):
        self.store = Store("test")

    def _new_timed_run(self, component_name, start=None):
        """Return an uncommitted run with start and end timestamps set.

        ``start`` is passed to ``set_start_timestamp`` only when given;
        otherwise the setter is called without arguments.
        """
        cr = self.store.initialize_empty_component_run(component_name)
        if start is None:
            cr.set_start_timestamp()
        else:
            cr.set_start_timestamp(start)
        cr.set_end_timestamp()
        return cr

    def _commit_with_dependencies(self, cr):
        """Set dependencies from the run's inputs, then commit it."""
        self.store.set_dependencies_from_inputs(cr)
        self.store.commit_component_run(cr)

    def testLinkedList(self):
        # Create chain of component runs: iop_1 -> iop_2 -> ... -> iop_11
        expected_result = []
        num_runs = 10
        for i in range(1, num_runs + 1):
            self.store.create_component(f"mock_component_{i}", "", "")
            inp = self.store.get_io_pointer(f"iop_{i}")
            out = self.store.get_io_pointer(f"iop_{i + 1}")
            cr = self._new_timed_run(f"mock_component_{i}")
            cr.add_input(inp)
            cr.add_output(out)
            self._commit_with_dependencies(cr)
            expected_result.append((num_runs - i, i))

        # The trace starts at the last run (level 0), so reverse.
        expected_result.reverse()

        # Trace the final output
        trace = self.store.trace("iop_11")
        level_id = [(level, cr.id) for level, cr in trace]
        self.assertEqual(expected_result, level_id)

    def testVersionedComputation(self):
        # Run the same computation many times
        self.store.create_component("mock_component", "", "")
        num_runs = 10
        for _ in range(num_runs):
            inp = self.store.get_io_pointer("inp")
            out = self.store.get_io_pointer("out")
            cr = self._new_timed_run("mock_component")
            cr.add_input(inp)
            cr.add_output(out)
            self._commit_with_dependencies(cr)

        # Trace the out pointer. Only most recent run ID should show.
        trace = self.store.trace("out")
        self.assertEqual(len(trace), 1)
        self.assertEqual(trace[0][0], 0)
        self.assertEqual(trace[0][1].id, num_runs)

    def testTree(self):
        # Create a binary tree of component runs, num_levels levels deep
        num_levels = 2
        cr_counter = 1
        iop_counter = 1

        def create_tree(level, inp):
            # Counters live in the enclosing test, not in module globals,
            # so nothing leaks between tests.
            nonlocal cr_counter, iop_counter
            if level == num_levels:
                return

            self.store.create_component(
                f"mock_component_{cr_counter}", "", "")
            cr = self._new_timed_run(f"mock_component_{cr_counter}")
            cr_counter += 1

            # Create output pointers
            out1 = self.store.get_io_pointer(f"iop_{iop_counter}")
            iop_counter += 1
            out2 = self.store.get_io_pointer(f"iop_{iop_counter}")
            iop_counter += 1

            # Add and commit component run
            cr.add_input(inp)
            cr.add_outputs([out1, out2])
            self._commit_with_dependencies(cr)

            # Create left and right trees
            create_tree(level + 1, out1)
            create_tree(level + 1, out2)

        # Create first input pointer and tree of computation
        inp = self.store.get_io_pointer(f"iop_{iop_counter}")
        iop_counter += 1
        create_tree(0, inp)

        # Grab last iop id and trace it
        last_iop_id = f"iop_{iop_counter - 1}"
        trace = self.store.trace(last_iop_id)
        level_id = [(level, cr.id) for level, cr in trace]
        self.assertEqual(level_id, [(0, 3), (1, 1)])

    def testCycle(self):
        # Create cycle. Since dependencies are versioned, we shouldn't run
        # into problems.

        # Create io pointers and components
        iop1 = self.store.get_io_pointer("iop1")
        iop2 = self.store.get_io_pointer("iop2")
        self.store.create_component("component_1", "", "")
        self.store.create_component("component_2", "", "")

        # Create component runs: iop1 -> iop2, then iop2 -> iop1
        cr = self._new_timed_run("component_1")
        cr.add_input(iop1)
        cr.add_output(iop2)
        self._commit_with_dependencies(cr)

        cr = self._new_timed_run("component_2")
        cr.add_input(iop2)
        cr.add_output(iop1)
        self._commit_with_dependencies(cr)

        # Trace both pointers
        trace_1 = [(level, cr.id) for level, cr in self.store.trace("iop1")]
        trace_2 = [(level, cr.id) for level, cr in self.store.trace("iop2")]
        self.assertEqual(trace_1, [(0, 2), (1, 1)])
        self.assertEqual(trace_2, [(0, 1)])

    def testStaleUpdate(self):
        # Create computation with stale update.
        iop1 = self.store.get_io_pointer("iop1")
        iop2 = self.store.get_io_pointer("iop2")
        iop3 = self.store.get_io_pointer("iop3")
        iop4 = self.store.get_io_pointer("iop4")
        self.store.create_component("component_1", "", "")
        self.store.create_component("component_2", "", "")

        # Create first component run
        cr = self._new_timed_run("component_1")
        cr.add_input(iop1)
        cr.add_output(iop2)
        self._commit_with_dependencies(cr)

        # Create second component run of the same component
        cr = self._new_timed_run("component_1")
        cr.add_input(iop1)
        cr.add_output(iop3)
        self._commit_with_dependencies(cr)

        # Create third component run that depends on the first (stale update)
        cr = self._new_timed_run("component_2")
        cr.add_input(iop2)
        cr.add_output(iop4)
        self._commit_with_dependencies(cr)

        # Trace iop4
        trace = [(level, cr.id, cr.stale)
                 for level, cr in self.store.trace("iop4")]
        res = [
            (
                0,
                3,
                [
                    "component_1 (ID 1) has 1 fresher run that began "
                    + "before this component run started."
                ],
            ),
            (1, 1, []),
        ]
        self.assertEqual(trace, res)

    def testStaleTime(self):
        # Create computation where the upstream run is old.
        # Local import: only this test needs timedelta.
        from datetime import timedelta

        iop1 = self.store.get_io_pointer("iop1")
        iop2 = self.store.get_io_pointer("iop2")
        iop3 = self.store.get_io_pointer("iop3")
        self.store.create_component("component_1", "", "")
        self.store.create_component("component_2", "", "")
        now = datetime.utcnow()

        # Create first component run, started roughly two months ago.
        # Subtracting a timedelta is used instead of replacing the month
        # field, because the latter raises ValueError whenever today's day
        # number does not exist in the target month (e.g. Apr 30 -> Feb 30).
        start_date = now - timedelta(days=60)
        cr = self._new_timed_run("component_1", start=start_date)
        cr.add_input(iop1)
        cr.add_output(iop2)
        self._commit_with_dependencies(cr)

        # Create second component run
        cr = self._new_timed_run("component_2")
        cr.add_input(iop2)
        cr.add_output(iop3)
        self._commit_with_dependencies(cr)

        # Trace
        trace = [(level, cr.id, cr.stale)
                 for level, cr in self.store.trace("iop3")]
        res = [
            (
                0,
                2,
                [
                    "component_1 (ID 1) was run "
                    + f"{(now - start_date).days} days"
                    + " ago."
                ],
            ),
            (1, 1, []),
        ]
        self.assertEqual(trace, res)
class TestStore(unittest.TestCase):
    """Exercises the Store API end to end.

    Covers components, component runs, tags, IOPointers, dependency
    inference, tracing, and review of flagged outputs. Expected values are
    expressed as run IDs counting from 1, so each test relies on the store
    built in ``setUp`` containing no earlier runs.
    """

    def setUp(self):
        self.store = Store("test")

    def _timed_run(self, component_name):
        """Return an uncommitted run with both timestamps already set."""
        run = self.store.initialize_empty_component_run(component_name)
        run.set_start_timestamp()
        run.set_end_timestamp()
        return run

    def _commit_with_deps(self, run):
        """Set dependencies from the run's inputs, then commit it."""
        self.store.set_dependencies_from_inputs(run)
        self.store.commit_component_run(run)

    def testComponent(self):
        self.store.create_component(
            "test_component", "test_description", "shreya")
        fetched = self.store.get_component("test_component")
        self.assertEqual(fetched.name, "test_component")

        # Lookup by owner should find exactly that component.
        owned = self.store.get_components(owner="shreya")
        self.assertEqual(1, len(owned))

    def testCompleteComponentRun(self):
        # Component is registered first.
        self.store.create_component(
            "test_component", "test_description", "shreya")

        # A fully populated run commits cleanly.
        run = self._timed_run("test_component")
        run.add_input(IOPointer("inp"))
        run.add_output(IOPointer("out"))
        self.store.commit_component_run(run)

        # ...and is the only entry in the component's history.
        history = self.store.get_history("test_component", limit=None)
        self.assertEqual(1, len(history))
        self.assertEqual(history[0], run)

    def testLogComponentRunWithoutComponentCreated(self):
        # No create_component call: the run alone is committed.
        run = self._timed_run("test_component_new")
        run.add_input(IOPointer("inp"))
        run.add_output(IOPointer("out"))
        self.store.commit_component_run(run)

        history = self.store.get_history("test_component_new", limit=None)
        self.assertEqual(1, len(history))
        self.assertEqual(history[0], run)

    def testIncompleteComponentRun(self):
        self.store.create_component(
            "test_component", "test_description", "shreya")

        # A run without timestamps must be rejected on commit.
        bare_run = self.store.initialize_empty_component_run("test_component")
        with self.assertRaises(RuntimeError):
            self.store.commit_component_run(bare_run)

    def testTags(self):
        # Component starts without tags; two are attached afterwards.
        self.store.create_component(
            "test_component", "test_description", "shreya")
        self.store.add_tags_to_component("test_component", ["tag1", "tag2"])

        component = self.store.get_component("test_component")
        tag_names = {tag.name for tag in component.tags}
        self.assertEqual(component.name, "test_component")
        self.assertEqual(tag_names, {"tag1", "tag2"})

    def testDuplicateTags(self):
        self.store.create_component(
            "test_component", "test_description", "shreya")

        # The same tag twice should end up stored once.
        self.store.add_tags_to_component("test_component", ["tag1", "tag1"])

        component = self.store.get_component("test_component")
        tag_names = [tag.name for tag in component.tags]
        self.assertEqual(component.name, "test_component")
        self.assertEqual(tag_names, ["tag1"])

    def testIOPointer(self):
        # Without create, a missing pointer is an error.
        with self.assertRaises(RuntimeError):
            self.store.get_io_pointer("iop", create=False)

        # With the default, two lookups yield equal pointers.
        first = self.store.get_io_pointer("iop")
        second = self.store.get_io_pointer("iop")
        self.assertEqual(first, second)

    def testIOPointers(self):
        # Bulk lookup of 100 new names, performed twice.
        names = [f"iop_{i}" for i in range(100)]
        first_batch = self.store.get_io_pointers(names)
        second_batch = self.store.get_io_pointers(names)
        self.assertEqual(set(first_batch), set(second_batch))

    def testKVIOPointer(self):
        iop_name = "name"
        iop_value = "value"
        first = self.store.get_io_pointer(iop_name, iop_value)
        second = self.store.get_io_pointer(iop_name, iop_value)
        self.assertEqual(first, second)

    def testSetDependenciesFromInputs(self):
        inp = self.store.get_io_pointer("inp")
        out = self.store.get_io_pointer("out")
        another_out = self.store.get_io_pointer("another_out")

        # For each output pointer: (re-)create the component, then commit
        # two runs that both produce that pointer.
        for produced in (out, another_out):
            self.store.create_component(
                "test_component", "test_description", "shreya")
            for _ in range(2):
                run = self._timed_run("test_component")
                run.add_input(inp)
                run.add_output(produced)
                self.store.commit_component_run(run)

        # A fifth run consumes both output pointers.
        consumer = self._timed_run("test_component")
        consumer.add_inputs([out, another_out])
        self._commit_with_deps(consumer)

        # History index 0 is the consumer; indices 1 and 3 are expected
        # among its dependencies.
        history = self.store.get_history("test_component", limit=None)
        self.assertTrue(history[1] in history[0].dependencies)
        self.assertTrue(history[3] in history[0].dependencies)

    def _set_up_computation(self):
        """Commit four runs forming the graph used by the trace tests."""
        for i in range(1, 5):
            self.store.create_component(
                f"test_component_{i}", "test_description", "shreya")
        iop = [self.store.get_io_pointer(f"iop_{i}") for i in range(1, 5)]

        # Runs 1 and 2 both produce iop_1.
        for producer in ("test_component_1", "test_component_2"):
            run = self._timed_run(producer)
            run.add_output(iop[0])
            self._commit_with_deps(run)

        # Run 3: iop_1 -> iop_2, iop_3
        run3 = self._timed_run("test_component_3")
        run3.add_input(iop[0])
        run3.add_outputs([iop[1], iop[2]])
        self._commit_with_deps(run3)

        # Run 4: iop_3 -> iop_4
        run4 = self._timed_run("test_component_4")
        run4.add_input(iop[2])
        run4.add_output(iop[3])
        self._commit_with_deps(run4)

    def testTrace(self):
        self._set_up_computation()

        levels_and_ids = [
            (level, run.id) for level, run in self.store.trace("iop_4")
        ]
        self.assertEqual(levels_and_ids, [(0, 4), (1, 3), (2, 2)])

    def testEmptyTrace(self):
        # Both trace flavours reject an unknown pointer.
        for trace_fn in (self.store.trace, self.store.web_trace):
            with self.assertRaises(RuntimeError):
                trace_fn("some_weird_pointer")

    def testWebTrace(self):
        self._set_up_computation()

        # Call web trace functionality. The ordering is nondeterministic.
        run_2_node = {
            "id": "componentrun_2",
            "label": "test_component_2",
            "hasCaret": True,
            "isExpanded": True,
            "stale": [],
            "childNodes": [{
                "id": "iopointer_iop_1",
                "label": "iop_1",
                "hasCaret": False,
                "parent": "componentrun_2",
            }],
        }
        run_3_node = {
            "id": "componentrun_3",
            "label": "test_component_3",
            "hasCaret": True,
            "isExpanded": True,
            "stale": [],
            "childNodes": [
                {
                    "id": "iopointer_iop_2",
                    "label": "iop_2",
                    "hasCaret": False,
                    "parent": "componentrun_3",
                },
                {
                    "id": "iopointer_iop_3",
                    "label": "iop_3",
                    "hasCaret": False,
                    "parent": "componentrun_3",
                },
                run_2_node,
            ],
        }
        expected_res = [{
            "id": "componentrun_4",
            "label": "test_component_4",
            "hasCaret": True,
            "isExpanded": True,
            "stale": [],
            "childNodes": [
                {
                    "id": "iopointer_iop_4",
                    "label": "iop_4",
                    "hasCaret": False,
                    "parent": "componentrun_4",
                },
                run_3_node,
            ],
        }]
        self.assertEqual(self.store.web_trace("iop_4"), expected_res)

    def testBasicFlaggedOutputs(self):
        # Components and pointers
        for suffix in ("A", "B"):
            self.store.create_component(
                f"test_component_{suffix}", "test_description", "shreya")
        iop = [self.store.get_io_pointer(f"iop_{i}") for i in range(1, 5)]

        # First pipeline: A produces iop_1 and iop_2, B turns iop_1 into iop_3
        run_a1 = self._timed_run("test_component_A")
        run_a1.add_outputs([iop[0], iop[1]])
        self._commit_with_deps(run_a1)

        run_b1 = self._timed_run("test_component_B")
        run_b1.add_input(iop[0])
        run_b1.add_output(iop[2])
        self._commit_with_deps(run_b1)

        # Second pipeline, which builds off iop_2
        run_b2 = self._timed_run("test_component_B")
        run_b2.add_input(iop[1])
        run_b2.add_output(iop[3])
        self._commit_with_deps(run_b2)

        # Flag iop_3 and iop_4
        for flagged in ("iop_3", "iop_4"):
            self.store.set_io_pointer_flag(flagged, True)

        # Review should report run A (ID 1) twice and each B run once.
        _, reviewed = self.store.review_flagged_outputs()
        id_counts = [(run.id, count) for run, count in reviewed]
        self.assertEqual(id_counts, [(1, 2), (3, 1), (2, 1)])

    def testManyFlaggedOutputs(self):
        # Components and pointers
        for suffix in ("A", "B", "C"):
            self.store.create_component(
                f"test_component_{suffix}", "test_description", "shreya")
        iop = [self.store.get_io_pointer(f"iop_{i}") for i in range(1, 8)]

        # A produces iop_1 and iop_2; B turns iop_1 into iop_3.
        run_a1 = self._timed_run("test_component_A")
        run_a1.add_outputs([iop[0], iop[1]])
        self._commit_with_deps(run_a1)

        run_b1 = self._timed_run("test_component_B")
        run_b1.add_input(iop[0])
        run_b1.add_output(iop[2])
        self._commit_with_deps(run_b1)

        # Four pipelines: C consumes iop_2 and iop_3 each time and writes
        # iop_4 .. iop_7 in turn.
        for out_index in range(3, 7):
            run_c = self._timed_run("test_component_C")
            run_c.add_inputs([iop[1], iop[2]])
            run_c.add_output(iop[out_index])
            self._commit_with_deps(run_c)

        # Flag every output of C
        for flagged in ("iop_4", "iop_5", "iop_6", "iop_7"):
            self.store.set_io_pointer_flag(flagged, True)

        _, reviewed = self.store.review_flagged_outputs()
        summary = [
            (run.component_name, run.id, count) for run, count in reviewed
        ]
        expected_res = [
            ("test_component_B", 2, 4),
            ("test_component_A", 1, 4),
            ("test_component_C", 6, 1),
            ("test_component_C", 5, 1),
            ("test_component_C", 4, 1),
            ("test_component_C", 3, 1),
        ]
        self.assertEqual(summary, expected_res)
class TestDags(unittest.TestCase):
    """Tests tracing over chains, repeated runs, trees and cycles.

    Expected traces are written as ``(level, run_id)`` pairs with run IDs
    counting from 1, so each test relies on the store created in ``setUp``
    holding no runs from earlier tests.
    """

    def setUp(self):
        self.store = Store("test")

    def testLinkedList(self):
        # Create chain of component runs: iop_1 -> iop_2 -> ... -> iop_11
        expected_result = []
        num_runs = 10
        for i in range(1, num_runs + 1):
            self.store.create_component(f"mock_component_{i}", "", "")
            inp = self.store.get_io_pointer(f"iop_{i}")
            out = self.store.get_io_pointer(f"iop_{i + 1}")
            cr = self.store.initialize_empty_component_run(
                f"mock_component_{i}")
            cr.set_start_timestamp()
            cr.set_end_timestamp()
            cr.add_input(inp)
            cr.add_output(out)
            self.store.set_dependencies_from_inputs(cr)
            self.store.commit_component_run(cr)
            expected_result.append((num_runs - i, i))

        # The trace starts at the last run (level 0), so reverse.
        expected_result.reverse()

        # Trace the final output
        trace = self.store.trace("iop_11")
        level_id = [(level, cr.id) for level, cr in trace]
        self.assertEqual(expected_result, level_id)

    def testVersionedComputation(self):
        # Run the same computation many times
        self.store.create_component("mock_component", "", "")
        num_runs = 10
        for _ in range(num_runs):
            inp = self.store.get_io_pointer("inp")
            out = self.store.get_io_pointer("out")
            cr = self.store.initialize_empty_component_run("mock_component")
            cr.set_start_timestamp()
            cr.set_end_timestamp()
            cr.add_input(inp)
            cr.add_output(out)
            self.store.set_dependencies_from_inputs(cr)
            self.store.commit_component_run(cr)

        # Trace the out pointer. Only most recent run ID should show.
        trace = self.store.trace("out")
        self.assertEqual(len(trace), 1)
        self.assertEqual(trace[0][0], 0)
        self.assertEqual(trace[0][1].id, num_runs)

    def testTree(self):
        # Create a binary tree of component runs, num_levels levels deep
        num_levels = 2
        cr_counter = 1
        iop_counter = 1

        def create_tree(level, inp):
            # Counters belong to the enclosing test rather than to module
            # globals, so nothing leaks between tests.
            nonlocal cr_counter, iop_counter
            if level == num_levels:
                return

            self.store.create_component(
                f"mock_component_{cr_counter}", "", "")
            cr = self.store.initialize_empty_component_run(
                f"mock_component_{cr_counter}")
            cr_counter += 1
            cr.set_start_timestamp()
            cr.set_end_timestamp()

            # Create output pointers
            out1 = self.store.get_io_pointer(f"iop_{iop_counter}")
            iop_counter += 1
            out2 = self.store.get_io_pointer(f"iop_{iop_counter}")
            iop_counter += 1

            # Add and commit component run
            cr.add_input(inp)
            cr.add_outputs([out1, out2])
            self.store.set_dependencies_from_inputs(cr)
            self.store.commit_component_run(cr)

            # Create left and right trees
            create_tree(level + 1, out1)
            create_tree(level + 1, out2)

        # Create first input pointer and tree of computation
        inp = self.store.get_io_pointer(f"iop_{iop_counter}")
        iop_counter += 1
        create_tree(0, inp)

        # Grab last iop id and trace it
        last_iop_id = f"iop_{iop_counter - 1}"
        trace = self.store.trace(last_iop_id)
        level_id = [(level, cr.id) for level, cr in trace]
        self.assertEqual(level_id, [(0, 3), (1, 1)])

    def testCycle(self):
        # Create cycle. Since dependencies are versioned, we shouldn't run
        # into problems.

        # Create io pointers and components
        iop1 = self.store.get_io_pointer("iop1")
        iop2 = self.store.get_io_pointer("iop2")
        self.store.create_component("component_1", "", "")
        self.store.create_component("component_2", "", "")

        # Create component runs: iop1 -> iop2, then iop2 -> iop1
        cr = self.store.initialize_empty_component_run("component_1")
        cr.set_start_timestamp()
        cr.set_end_timestamp()
        cr.add_input(iop1)
        cr.add_output(iop2)
        self.store.set_dependencies_from_inputs(cr)
        self.store.commit_component_run(cr)

        cr = self.store.initialize_empty_component_run("component_2")
        cr.set_start_timestamp()
        cr.set_end_timestamp()
        cr.add_input(iop2)
        cr.add_output(iop1)
        self.store.set_dependencies_from_inputs(cr)
        self.store.commit_component_run(cr)

        # Trace both pointers
        trace_1 = [(level, cr.id) for level, cr in self.store.trace("iop1")]
        trace_2 = [(level, cr.id) for level, cr in self.store.trace("iop2")]
        self.assertEqual(trace_1, [(0, 2), (1, 1)])
        self.assertEqual(trace_2, [(0, 1)])
def wrapper(*args, **kwargs):
    """Call ``func`` and log the call as a component run.

    ``func``, ``component_name``, ``input_vars``, ``output_vars``,
    ``inputs``, ``outputs`` and ``endpoint`` are read from the enclosing
    scope (not visible in this block).

    While ``func`` runs, a trace function watches for the return of the
    frame whose code name and filename match ``func`` and turns the values
    of the ``input_vars`` / ``output_vars`` locals into IOPointers. After
    the call, the explicitly named ``inputs`` / ``outputs`` are added,
    dependencies are set from the inputs, the git hash and a source
    snapshot are attached when available, and the run is committed.

    Returns:
        Whatever ``func(*args, **kwargs)`` returns. If ``func`` raises, the
        exception propagates and no run is committed.
    """
    # Identify the wrapped function so the tracer can recognise its frame.
    filename = inspect.getfile(func)
    function_name = func.__name__

    # Construct component run object
    store = Store(_db_uri)
    component_run = store.initialize_empty_component_run(component_name)
    component_run.set_start_timestamp()

    def pointers_for(var_names, local_vars, as_endpoint):
        # Turn the values of the named frame locals into IOPointers.
        # Missing names and None values are skipped with a debug message.
        extra = (PointerTypeEnum.ENDPOINT,) if as_endpoint else ()
        pointers = []
        for var in var_names:
            if var not in local_vars:
                logging.debug(f"Variable {var} not in current stack frame.")
                continue
            val = local_vars[var]
            # Identity check: ``== None`` misbehaves on objects that
            # overload ``==`` (e.g. arrays return an elementwise result).
            if val is None:
                logging.debug(f"Variable {var} has value {val}.")
                continue
            if isinstance(val, list):
                pointers += store.get_io_pointers(val, *extra)
            else:
                pointers.append(store.get_io_pointer(str(val), *extra))
        return pointers

    # Local trace function: only the "return" event of the frame matters.
    def trace_helper(frame, event, arg):
        if event != "return":
            # Keep tracing this frame; the sys.settrace contract asks the
            # local trace function to return itself to stay installed.
            return trace_helper
        logging.info(f"Inspecting {frame.f_code.co_filename}")
        local_vars = frame.f_locals
        input_pointers = pointers_for(input_vars, local_vars, False)
        output_pointers = pointers_for(output_vars, local_vars, endpoint)
        component_run.add_inputs(input_pointers)
        component_run.add_outputs(output_pointers)

    # Global trace function: attach trace_helper to func's frame only.
    def tracer(frame, event, arg):
        if event == "call":
            if (frame.f_code.co_name == function_name
                    and frame.f_code.co_filename == filename):
                return trace_helper
        return

    # Run function under the tracer, then put back whatever tracer was
    # installed before (debugger, coverage, ...) instead of clearing it.
    previous_tracer = sys.gettrace()
    sys.settrace(tracer)
    try:
        value = func(*args, **kwargs)
    finally:
        sys.settrace(previous_tracer)

    # Log relevant info
    component_run.set_end_timestamp()
    endpoint_args = (PointerTypeEnum.ENDPOINT,) if endpoint else ()
    input_pointers = [store.get_io_pointer(inp) for inp in inputs]
    output_pointers = [
        store.get_io_pointer(out, *endpoint_args) for out in outputs
    ]
    component_run.add_inputs(input_pointers)
    component_run.add_outputs(output_pointers)
    store.set_dependencies_from_inputs(component_run)

    # Add code versions. Best effort: any failure to read the repo is
    # logged and ignored, but KeyboardInterrupt/SystemExit now propagate.
    try:
        repo = git.Repo(search_parent_directories=True)
        component_run.set_git_hash(str(repo.head.object.hexsha))
    except Exception:
        logging.info("No git repo found.")

    # Add source code if less than 2^16 bytes. UTF-8 is used so that
    # non-ASCII source does not raise, and the limit is checked on the
    # encoded size because that is what gets stored.
    try:
        func_source_code = inspect.getsource(func)
    except (OSError, TypeError):
        # Source is unavailable (e.g. defined interactively); the call
        # itself already succeeded, so log and carry on without a snapshot.
        logging.info("Source code not available for snapshot.")
    else:
        snapshot = func_source_code.encode("utf-8")
        if len(snapshot) < 2**16:
            component_run.set_code_snapshot(snapshot)

    # Commit component run object to the DB
    store.commit_component_run(component_run)
    return value
class TestStore(unittest.TestCase):
    """Tests the Store API: components, runs, tags, IOPointers and tracing.

    Expected traces use run IDs counting from 1, so each test relies on the
    store created in ``setUp`` holding no runs from earlier tests.
    """

    def setUp(self):
        self.store = Store("test")

    def _new_timed_run(self, component_name="test_component"):
        """Return an uncommitted run with start and end timestamps set."""
        cr = self.store.initialize_empty_component_run(component_name)
        cr.set_start_timestamp()
        cr.set_end_timestamp()
        return cr

    def testComponent(self):
        self.store.create_component(
            "test_component", "test_description", "shreya")
        component = self.store.get_component("test_component")
        self.assertEqual(component.name, "test_component")

        # Retrieve components with owner
        components = self.store.get_components_with_owner("shreya")
        self.assertEqual(1, len(components))

    def testCompleteComponentRun(self):
        # Create component
        self.store.create_component(
            "test_component", "test_description", "shreya")

        # Create component run
        cr = self._new_timed_run()
        cr.add_input(IOPointer("inp"))
        cr.add_output(IOPointer("out"))
        self.store.commit_component_run(cr)

        # Test retrieval
        component_runs = self.store.get_history("test_component", limit=None)
        self.assertEqual(1, len(component_runs))
        self.assertEqual(component_runs[0], cr)

    def testIncompleteComponentRun(self):
        # Create component
        self.store.create_component(
            "test_component", "test_description", "shreya")

        # A run without timestamps must be rejected on commit
        cr = self.store.initialize_empty_component_run("test_component")
        with self.assertRaises(RuntimeError):
            self.store.commit_component_run(cr)

    def testTags(self):
        # Create component without tags
        self.store.create_component(
            "test_component", "test_description", "shreya")

        # Add tags
        self.store.add_tags_to_component("test_component", ["tag1", "tag2"])

        # Test retrieval
        component = self.store.get_component("test_component")
        tags = [t.name for t in component.tags]
        self.assertEqual(component.name, "test_component")
        self.assertEqual(set(tags), {"tag1", "tag2"})

    def testDuplicateTags(self):
        # Create component without tags
        self.store.create_component(
            "test_component", "test_description", "shreya")

        # Add duplicate tags; only one should be stored
        self.store.add_tags_to_component("test_component", ["tag1", "tag1"])

        # Test retrieval
        component = self.store.get_component("test_component")
        tags = [t.name for t in component.tags]
        self.assertEqual(component.name, "test_component")
        self.assertEqual(tags, ["tag1"])

    def testIOPointer(self):
        # Test there is no IOPointer
        with self.assertRaises(RuntimeError):
            self.store.get_io_pointer("iop", create=False)

        # Create IOPointer; a second lookup must yield an equal pointer
        iop = self.store.get_io_pointer("iop")
        iop2 = self.store.get_io_pointer("iop")
        self.assertEqual(iop, iop2)

    def testIOPointers(self):
        # Create new IOPointers from scratch
        iop_names = [f"iop_{i}" for i in range(100)]
        iops = self.store.get_io_pointers(iop_names)
        iops2 = self.store.get_io_pointers(iop_names)
        self.assertEqual(set(iops), set(iops2))

    def testSetDependenciesFromInputs(self):
        # Create IO pointers
        inp = self.store.get_io_pointer("inp")
        out = self.store.get_io_pointer("out")
        another_out = self.store.get_io_pointer("another_out")

        # Create two component runs that have the same output
        self.store.create_component(
            "test_component", "test_description", "shreya")
        for _ in range(2):
            cr = self._new_timed_run()
            cr.add_input(inp)
            cr.add_output(out)
            self.store.commit_component_run(cr)

        # Create another two component runs that have the same output
        self.store.create_component(
            "test_component", "test_description", "shreya")
        for _ in range(2):
            cr = self._new_timed_run()
            cr.add_input(inp)
            cr.add_output(another_out)
            self.store.commit_component_run(cr)

        # Create new component run that depends on both output pointers
        cr = self._new_timed_run()
        cr.add_inputs([out, another_out])
        self.store.set_dependencies_from_inputs(cr)
        self.store.commit_component_run(cr)

        # Retrieve latest component run and check dependencies: history
        # index 0 is the newest run; indices 1 and 3 are expected among
        # its dependencies.
        component_runs = self.store.get_history("test_component", limit=None)
        self.assertTrue(component_runs[1] in component_runs[0].dependencies)
        self.assertTrue(component_runs[3] in component_runs[0].dependencies)

    def _set_up_computation(self):
        """Commit four runs of one component forming the traced graph."""
        # Create component and IOPointers
        self.store.create_component(
            "test_component", "test_description", "shreya")
        iop = [self.store.get_io_pointer(f"iop_{i}") for i in range(1, 5)]

        # Runs 1 and 2 both produce iop_1
        cr1 = self._new_timed_run()
        cr1.add_output(iop[0])
        self.store.set_dependencies_from_inputs(cr1)
        self.store.commit_component_run(cr1)

        cr2 = self._new_timed_run()
        cr2.add_output(iop[0])
        self.store.set_dependencies_from_inputs(cr2)
        self.store.commit_component_run(cr2)

        # Run 3: iop_1 -> iop_2, iop_3
        cr3 = self._new_timed_run()
        cr3.add_input(iop[0])
        cr3.add_outputs([iop[1], iop[2]])
        self.store.set_dependencies_from_inputs(cr3)
        self.store.commit_component_run(cr3)

        # Run 4: iop_3 -> iop_4
        cr4 = self._new_timed_run()
        cr4.add_input(iop[2])
        cr4.add_output(iop[3])
        self.store.set_dependencies_from_inputs(cr4)
        self.store.commit_component_run(cr4)

    def testTrace(self):
        self._set_up_computation()

        # Call trace functionality
        trace = self.store.trace("iop_4")
        level_id = [(level, cr.id) for level, cr in trace]
        self.assertEqual(level_id, [(0, 4), (1, 3), (2, 2)])

    def testEmptyTrace(self):
        with self.assertRaises(RuntimeError):
            self.store.trace("some_weird_pointer")
        with self.assertRaises(RuntimeError):
            self.store.web_trace("some_weird_pointer")

    def testWebTrace(self):
        self._set_up_computation()

        # Call web trace functionality. The ordering is nondeterministic.
        # NOTE(review): the assertion below compares lists, so child order
        # matters -- confirm whether web_trace ordering is in fact stable.
        expected_res = [
            {
                "id": "componentrun_4",
                "label": "test_component",
                "hasCaret": True,
                "isExpanded": True,
                "childNodes": [
                    {
                        "id": "iopointer_iop_4",
                        "label": "iop_4",
                        "hasCaret": False,
                        "parent": "componentrun_4",
                    },
                    {
                        "id": "componentrun_3",
                        "label": "test_component",
                        "hasCaret": True,
                        "isExpanded": True,
                        "childNodes": [
                            {
                                "id": "iopointer_iop_2",
                                "label": "iop_2",
                                "hasCaret": False,
                                "parent": "componentrun_3",
                            },
                            {
                                "id": "iopointer_iop_3",
                                "label": "iop_3",
                                "hasCaret": False,
                                "parent": "componentrun_3",
                            },
                            {
                                "id": "componentrun_2",
                                "label": "test_component",
                                "hasCaret": True,
                                "isExpanded": True,
                                "childNodes": [
                                    {
                                        "id": "iopointer_iop_1",
                                        "label": "iop_1",
                                        "hasCaret": False,
                                        "parent": "componentrun_2",
                                    }
                                ],
                            },
                        ],
                    },
                ],
            }
        ]
        web_trace = self.store.web_trace("iop_4")
        self.assertEqual(web_trace, expected_res)
def wrapper(*args, **kwargs):
    """Run ``func`` and log the call to the store as a component run.

    ``func`` is executed under a temporary trace function that captures
    the frame of the first "call" event; that frame's locals are then used
    to resolve the variable names given to the enclosing decorator into
    input and output IOPointers.

    Returns:
        Whatever ``func`` returned.

    Raises:
        ValueError: If a name listed in ``input_vars``, ``output_vars``,
            ``input_kwargs`` or ``output_kwargs`` is missing from the
            captured locals, or if a list-valued key and its value list
            differ in length.
    """
    # Construct component run object
    store = Store(_db_uri)
    component_run = store.initialize_empty_component_run(component_name)
    component_run.set_start_timestamp()

    # Define trace helper. It grabs the first frame it sees and then
    # immediately reinstates whatever trace function was active before.
    frame = None
    trace = sys.gettrace()

    def trace_helper(_frame, event, arg):
        nonlocal frame
        if frame is None and event == "call":
            frame = _frame
            sys.settrace(trace)
        return trace

    # Run function under the tracer
    sys.settrace(trace_helper)
    try:
        # merge with existing run
        value = func(*args, **kwargs)
    finally:
        sys.settrace(trace)

    component_run.set_end_timestamp()

    # Do logging here
    logging.info(f"Inspecting {frame.f_code.co_filename}")
    input_pointers = []
    output_pointers = []
    local_vars = frame.f_locals

    # Outputs of an endpoint component are requested with the ENDPOINT
    # pointer type; otherwise no pointer type is passed to the store.
    output_type = ({"pointer_type": PointerTypeEnum.ENDPOINT}
                   if endpoint else {})

    # Auto log inputs
    if auto_log:
        # Get IOPointers corresponding to args and f_locals. Later entries
        # win: declared defaults < positional args < keyword args.
        all_input_args = {
            k: v.default
            for k, v in inspect.signature(func).parameters.items()
            if v.default is not inspect.Parameter.empty
        }
        all_input_args = {
            **all_input_args,
            **dict(zip(inspect.getfullargspec(func).args, args)),
        }
        all_input_args = {**all_input_args, **kwargs}
        input_pointers += store.get_io_pointers_from_args(**all_input_args)

    # Add input_vars and output_vars as pointers
    for var in input_vars:
        if var not in local_vars:
            raise ValueError(f"Variable {var} not in current stack frame.")
        val = local_vars[var]
        if val is None:
            logging.debug(f"Variable {var} has value {val}.")
            continue
        if isinstance(val, list):
            input_pointers += store.get_io_pointers(val)
        else:
            input_pointers.append(store.get_io_pointer(str(val)))

    for var in output_vars:
        if var not in local_vars:
            raise ValueError(f"Variable {var} not in current stack frame.")
        val = local_vars[var]
        if val is None:
            logging.debug(f"Variable {var} has value {val}.")
            continue
        if isinstance(val, list):
            output_pointers += store.get_io_pointers(val, **output_type)
        else:
            output_pointers.append(
                store.get_io_pointer(str(val), **output_type))

    # Add input_kwargs and output_kwargs as pointers. Each entry maps the
    # name of the local holding the pointer name(s) to the name of the
    # local holding the matching value(s).
    for key, val in input_kwargs.items():
        if key not in local_vars or val not in local_vars:
            raise ValueError(
                f"Variables ({key}, {val}) not in current stack frame.")
        names = local_vars[key]
        values = local_vars[val]
        if names is None:
            logging.debug(f"Variable {key} has value {names}.")
            continue
        if isinstance(names, list):
            if not isinstance(values, list) or len(names) != len(values):
                raise ValueError(
                    f'Value "{val}" does not have the same length as'
                    + f' the key "{key}."')
            input_pointers += store.get_io_pointers(names, values=values)
        else:
            input_pointers.append(store.get_io_pointer(str(names), values))

    for key, val in output_kwargs.items():
        if key not in local_vars or val not in local_vars:
            raise ValueError(
                f"Variables ({key}, {val}) not in current stack frame.")
        names = local_vars[key]
        values = local_vars[val]
        if names is None:
            logging.debug(f"Variable {key} has value {names}.")
            continue
        if isinstance(names, list):
            if not isinstance(values, list) or len(names) != len(values):
                raise ValueError(
                    f'Value "{val}" does not have the same length as'
                    + f' the key "{key}."')
            output_pointers += store.get_io_pointers(names, values,
                                                     **output_type)
        else:
            output_pointers.append(
                store.get_io_pointer(str(names), values, **output_type))

    # Directly specified I/O. Inputs are registered exactly once, and only
    # when ``inputs`` is not a callable (which cannot be iterated).
    if not callable(inputs):
        input_pointers += [store.get_io_pointer(inp) for inp in inputs]
    output_pointers += [
        store.get_io_pointer(out, **output_type) for out in outputs
    ]

    # If there were calls to mltrace.load and mltrace.save, log them
    if "_mltrace_loaded_artifacts" in local_vars:
        input_pointers += [
            store.get_io_pointer(name, val)
            for name, val in local_vars["_mltrace_loaded_artifacts"].items()
        ]
    if "_mltrace_saved_artifacts" in local_vars:
        output_pointers += [
            store.get_io_pointer(name, val)
            for name, val in local_vars["_mltrace_saved_artifacts"].items()
        ]

    func_source_code = inspect.getsource(func)

    if auto_log:
        # Get IOPointers corresponding to args and f_locals: every local
        # that was not one of the inputs is treated as an output.
        all_output_args = {
            k: v for k, v in local_vars.items() if k not in all_input_args
        }
        output_pointers += store.get_io_pointers_from_args(**all_output_args)

    component_run.add_inputs(input_pointers)
    component_run.add_outputs(output_pointers)

    # Add code versions. Best effort: any failure is logged and the run is
    # recorded without a git hash.
    try:
        repo = git.Repo(search_parent_directories=True)
        component_run.set_git_hash(str(repo.head.object.hexsha))
    except Exception:
        logging.info("No git repo found.")

    # Add git tags
    git_tags = get_git_tags()
    if git_tags is not None:
        component_run.set_git_tags(git_tags)

    # Add source code if less than 2^16
    # NOTE(review): ASCII encoding raises UnicodeEncodeError when the
    # source contains non-ASCII characters -- confirm whether readers of
    # the snapshot could accept UTF-8 instead.
    if len(func_source_code) < 2**16:
        component_run.set_code_snapshot(bytes(func_source_code, "ascii"))

    # Create component if it does not exist
    create_component(component_run.component_name, "", "")

    store.set_dependencies_from_inputs(component_run)

    # Commit component run object to the DB
    store.commit_component_run(component_run,
                               staleness_threshold=staleness_threshold)

    return value
def wrapper(*args, **kwargs):
    """Run ``func`` as a run of this component and log it to the store.

    Executes the component's ``beforeRun`` tests (unless ``skip_before``
    was given), runs ``func`` while recording the id of any MLflow run it
    starts, turns the requested locals into input/output IOPointers, then
    executes the ``afterRun`` tests (unless ``skip_after`` was given) and
    commits the component run.

    Returns:
        Whatever ``func`` returned.

    Raises:
        ValueError: If ``func`` takes an argument named ``skip_before`` or
            ``skip_after``, or if a requested variable or label variable
            is missing from the captured locals.
    """
    # Construct component run object
    store = Store(clientUtils.get_db_uri())
    component_run = store.initialize_empty_component_run(self.name)

    # Assert key names are not in args or kwargs
    func_arg_names = inspect.getfullargspec(func).args
    if (set(key_names) & set(func_arg_names)) or (
            set(key_names) & set(kwargs.keys())):
        raise ValueError(
            "skip_before or skip_after cannot be in "
            + f"the arguments of the function {func.__name__}")

    # Make Dictionary of test status
    status = {}

    # Run before tests
    if not user_kwargs.get("skip_before"):
        all_args = dict(zip(func_arg_names, args))
        # Rename arguments through the inverted user mapping when present.
        all_args = {
            k if k not in inv_user_kwargs else inv_user_kwargs[k]: v
            for k, v in all_args.items()
        }
        all_args = {**all_args, **kwargs}
        status.update(self.beforeRun(**all_args))

    # Create input and output pointers
    input_pointers = []
    output_pointers = []

    # Auto log inputs
    if auto_log:
        # Get IOPointers corresponding to args and f_locals. Later entries
        # win: declared defaults < positional args < keyword args.
        all_input_args = {
            k: v.default
            for k, v in inspect.signature(func).parameters.items()
            if v.default is not inspect.Parameter.empty
        }
        all_input_args = {
            **all_input_args,
            **dict(zip(func_arg_names, args)),
        }
        all_input_args = {**all_input_args, **kwargs}
        input_pointers += store.get_io_pointers_from_args(
            should_filter=True, **all_input_args)

    # monkey patching mlflow.start_run method
    mlflow_run_id = None
    mlflow_start_run_copy = mlflow.start_run

    def mlflow_start_run_id(*run_args, **run_kwargs):
        # Forward everything to the real start_run so callers can still
        # pass its arguments, then remember the id of the active run.
        nonlocal mlflow_run_id
        res = mlflow_start_run_copy(*run_args, **run_kwargs)
        active_run = mlflow.active_run()
        if active_run:
            mlflow_run_id = active_run.info.run_id
        return res

    mlflow.start_run = mlflow_start_run_id
    try:
        component_run.set_start_timestamp()

        # Run function
        local_vars, value = utils.run_func_capture_locals(
            func, *args, **kwargs)

        component_run.set_end_timestamp()
    finally:
        # Always undo the patch, even when func raises, so the module is
        # not left pointing at this closure.
        mlflow.start_run = mlflow_start_run_copy

    if mlflow_run_id is not None:
        # Best effort: a failed lookup is logged and the run is still
        # recorded, just without MLflow details.
        try:
            mlflow_run = mlflow.get_run(mlflow_run_id)
            component_run.set_mlflow_run_id(mlflow_run_id)
            metrics = mlflow_run.data.metrics
            params = mlflow_run.data.params
            component_run.set_mlflow_run_metrics(metrics)
            component_run.set_mlflow_run_params(params)
        except Exception:
            logging.warning(f"Mlflow.get_run {mlflow_run_id} failed.")

    if not callable(input_vars):
        # Log input and output vars
        duplicate = input_vars
        if not isinstance(duplicate, dict):
            # A plain sequence of names carries no label variables.
            duplicate = {vname: None for vname in input_vars}

        for var, label_vars in duplicate.items():
            if var not in local_vars:
                raise ValueError(
                    f"Variable {var} not in current stack frame.")
            val = local_vars[var]
            labels = None
            if label_vars is not None:
                try:
                    labels = ([local_vars[lv] for lv in label_vars]
                              if isinstance(label_vars, list)
                              else local_vars[label_vars])
                    if isinstance(labels, str):
                        labels = [labels]
                except KeyError as err:
                    raise ValueError(
                        f"Variable {label_vars} not "
                        + "in current stack frame.") from err
            if val is None:
                logging.debug(f"Variable {var} has value {val}.")
                continue
            input_pointers += store.get_io_pointers_from_args(
                should_filter=False, labels=labels, **{var: val})

        for var in output_vars:
            if var not in local_vars:
                raise ValueError(
                    f"Variable {var} not in current stack frame.")
            val = local_vars[var]
            if val is None:
                logging.debug(f"Variable {var} has value {val}.")
                continue
            output_pointers += store.get_io_pointers_from_args(
                should_filter=False, **{var: val})

    # If there were calls to mltrace.load and mltrace.save, log
    if "_mltrace_loaded_artifacts" in local_vars:
        input_pointers += [
            store.get_io_pointer(name, val)
            for name, val in local_vars["_mltrace_loaded_artifacts"].items()
        ]
    if "_mltrace_saved_artifacts" in local_vars:
        output_pointers += [
            store.get_io_pointer(name, val)
            for name, val in local_vars["_mltrace_saved_artifacts"].items()
        ]

    func_source_code = inspect.getsource(func)

    if auto_log:
        # Get IOPointers corresponding to args and f_locals: every local
        # that was not one of the inputs is treated as an output.
        all_output_args = {
            k: v for k, v in local_vars.items() if k not in all_input_args
        }
        output_pointers += store.get_io_pointers_from_args(
            should_filter=True, **all_output_args)

    # Check that none of the labels in the inputs are deleted
    store.assert_not_deleted_labels(
        input_pointers, staleness_threshold=staleness_threshold)

    # Propagate labels
    store.propagate_labels(input_pointers, output_pointers)

    component_run.add_inputs(input_pointers)
    component_run.add_outputs(output_pointers)

    # Add code versions. Best effort: any failure is logged and the run is
    # recorded without a git hash.
    try:
        repo = git.Repo(search_parent_directories=True)
        component_run.set_git_hash(str(repo.head.object.hexsha))
    except Exception:
        logging.info("No git repo found.")

    # Add git tags
    git_tags = client.get_git_tags()
    if git_tags is not None:
        component_run.set_git_tags(git_tags)

    # Add source code if less than 2^16
    # NOTE(review): ASCII encoding raises UnicodeEncodeError when the
    # source contains non-ASCII characters -- confirm whether readers of
    # the snapshot could accept UTF-8 instead.
    if len(func_source_code) < 2**16:
        component_run.set_code_snapshot(bytes(func_source_code, "ascii"))

    # Create component if it does not exist
    client.create_component(self.name, self.description, self.owner,
                            self.tags)

    # Set dependencies
    store.set_dependencies_from_inputs(component_run)

    # Perform after run tests
    if not user_kwargs.get("skip_after"):
        after_run_args = {
            k if k not in inv_user_kwargs else inv_user_kwargs[k]: v
            for k, v in local_vars.items()
        }
        status.update(self.afterRun(**after_run_args))

    # update the component's testStatus, convert status to a json
    component_run.set_test_result(status)

    # Commit component run object to the DB
    store.commit_component_run(
        component_run, staleness_threshold=staleness_threshold)

    return value