def setUp(self):
    """Create the fixtures shared by the tests in this case.

    Sets up a bare ComponentRun, the dictionary its serialization is
    compared against, and two input / two output IOPointers.
    """
    run_name = "mock_component_run"
    self.mock_component_run = ComponentRun(run_name)

    # Expected serialized form of a ComponentRun that only has a name.
    expected = {
        "component_name": run_name,
        "notes": "",
        "inputs": [],
        "outputs": [],
        "git_hash": None,
        "git_tags": None,
        "code_snapshot": None,
        "start_timestamp": None,
        "end_timestamp": None,
        "dependencies": [],
        "id": None,
        "stale": [],
        "test_result": None,
        "mlflow_run_id": None,
        "mlflow_run_params": None,
        "mlflow_run_metrics": None,
    }
    self.mock_component_run_dict = expected

    self.mock_inputs = [
        IOPointer(name) for name in ("mock_input_1", "mock_input_2")
    ]
    self.mock_outputs = [
        IOPointer(name) for name in ("mock_output_1", "mock_output_2")
    ]
def get_history(component_name: str, limit: int = 10) -> typing.List[ComponentRun]:
    """Return the component's history as client-facing ComponentRuns.

    Args:
        component_name: Name passed to the store's history lookup.
        limit: Second argument passed to the store's history lookup.

    Returns:
        One ComponentRun per record returned by the store, in the same order.
    """

    def _as_dicts(pointers):
        # Round-trip each stored pointer through IOPointer to get a plain dict.
        return [
            IOPointer.from_dictionary(ptr.__dict__).to_dictionary()
            for ptr in pointers
        ]

    def _to_client(record):
        # Read the relationships before copying __dict__, as the original did.
        ins = _as_dicts(record.inputs)
        outs = _as_dicts(record.outputs)
        deps = [upstream.component_name for upstream in record.dependencies]

        fields = copy.deepcopy(record.__dict__)
        fields["inputs"] = ins
        fields["outputs"] = outs
        fields["dependencies"] = deps
        return ComponentRun.from_dictionary(fields)

    store = Store(_db_uri)
    records = store.get_history(component_name, limit)
    return [_to_client(record) for record in records]
def setUp(self):
    """Create the ComponentRun, expected dictionary and IOPointer fixtures."""
    self.mock_component_run = ComponentRun("mock_component_run")

    # Serialized form expected for a ComponentRun that only has a name.
    self.mock_component_run_dict = dict(
        component_name="mock_component_run",
        inputs=[],
        outputs=[],
        git_hash=None,
        code_snapshot=None,
        start_timestamp=None,
        end_timestamp=None,
        dependencies=[],
        id=None,
        stale=[],
    )

    input_names = ("mock_input_1", "mock_input_2")
    output_names = ("mock_output_1", "mock_output_2")
    self.mock_inputs = [IOPointer(name) for name in input_names]
    self.mock_outputs = [IOPointer(name) for name in output_names]
def log_component_run(
    component_run: ComponentRun,
    set_dependencies_from_inputs=True,
    staleness_threshold: int = (60 * 60 * 24 * 30),
):
    """Log a client-facing ComponentRun to the DB.

    Args:
        component_run: The run to persist; it is read via ``to_dictionary()``.
        set_dependencies_from_inputs: When truthy, the store is asked to set
            dependencies from the run's inputs.
        staleness_threshold: Forwarded unchanged to
            ``store.commit_component_run``.
    """
    store = Store(_db_uri)

    fields = component_run.to_dictionary()
    sql_run = store.initialize_empty_component_run(
        component_run.component_name)

    # Timestamps and notes are only copied over when they are truthy.
    started = fields["start_timestamp"]
    if started:
        sql_run.set_start_timestamp(started)
    ended = fields["end_timestamp"]
    if ended:
        sql_run.set_end_timestamp(ended)
    notes = fields["notes"]
    if notes:
        sql_run.add_notes(notes)

    # Git / code metadata is always set, even when the value is None.
    sql_run.set_git_hash(fields["git_hash"])
    sql_run.set_git_tags(fields["git_tags"])
    sql_run.set_code_snapshot(fields["code_snapshot"])

    def _stored_pointers(pointers):
        # Look each client pointer up in the store.
        return [
            store.get_io_pointer(
                ptr.name, ptr.value, pointer_type=ptr.pointer_type)
            for ptr in pointers
        ]

    sql_run.add_inputs(_stored_pointers(fields["inputs"]))
    sql_run.add_outputs(_stored_pointers(fields["outputs"]))

    # Create component if it does not exist
    create_component(component_run.component_name, "", "")

    if set_dependencies_from_inputs:
        store.set_dependencies_from_inputs(sql_run)

    # Explicit dependencies: link to the first run the store returns for each.
    for upstream_name in fields["dependencies"]:
        upstream_run = store.get_history(upstream_name, 1)[0]
        sql_run.set_upstream(upstream_run)

    store.commit_component_run(
        sql_run, staleness_threshold=staleness_threshold)
def testLogEmptyComponentRun(self):
    """Logging a ComponentRun with only a name set must raise RuntimeError."""
    # Register the component first so only the run itself is incomplete.
    create_component("test_component", "test_description", "shreya")

    empty_run = ComponentRun("test_component")

    self.assertRaises(RuntimeError, log_component_run, empty_run)
def testLogKVComponentRun(self):
    """Log a run whose input and output IOPointers carry list values."""
    create_component(
        name="valtest",
        description="Tests implementation of values in iopointer.",
        owner="me",
    )

    # Pointers that carry a value in addition to their name.
    input_pointer = IOPointer(
        name="iop1", value=["this", "is", "the", "first"])
    output_pointer = IOPointer(
        name="iop2", value=["this", "is", "the", "second"])

    run = ComponentRun("valtest")
    run.set_start_timestamp()
    run.set_end_timestamp()
    run.add_input(input_pointer)
    run.add_output(output_pointer)

    log_component_run(run)
def backtrace(output_pointer: str):
    """Return the store's trace for an output pointer.

    Returns:
        A list of ``(depth, ComponentRun)`` tuples, one per entry the store's
        ``trace`` call yields, with each run converted to the client-facing
        ComponentRun type.
    """

    def _as_pointers(stored):
        # Rebuild client-side IOPointer objects from the stored ones.
        return [IOPointer.from_dictionary(ptr.__dict__) for ptr in stored]

    store = Store(_db_uri)

    result = []
    for level, record in store.trace(output_pointer):
        ins = _as_pointers(record.inputs)
        outs = _as_pointers(record.outputs)
        deps = [upstream.component_name for upstream in record.dependencies]

        fields = copy.deepcopy(record.__dict__)
        fields["inputs"] = ins
        fields["outputs"] = outs
        fields["dependencies"] = deps

        result.append((level, ComponentRun.from_dictionary(fields)))

    return result
def get_component_run_information(component_run_id: str) -> ComponentRun:
    """Return the client-facing ComponentRun for a stored run id.

    Args:
        component_run_id: Identifier passed to ``store.get_component_run``.

    Returns:
        A ComponentRun built from the stored record, with inputs and outputs
        as dictionaries, dependencies as component names, and a truthy
        ``code_snapshot`` decoded from UTF-8 bytes.

    Raises:
        RuntimeError: If the store returns a falsy value for the id.
    """
    store = Store(_db_uri)
    cr = store.get_component_run(component_run_id)
    if not cr:
        # Report the requested id; the previous message interpolated the
        # builtin ``id`` function instead of the argument.
        raise RuntimeError(
            f"Component run with id {component_run_id} not found.")

    inputs = [
        IOPointer.from_dictionary(iop.__dict__).to_dictionary()
        for iop in cr.inputs
    ]
    outputs = [
        IOPointer.from_dictionary(iop.__dict__).to_dictionary()
        for iop in cr.outputs
    ]
    dependencies = [dep.component_name for dep in cr.dependencies]

    d = copy.deepcopy(cr.__dict__)
    if cr.code_snapshot:
        # bytes.decode already returns str, so no extra str() is needed.
        d["code_snapshot"] = cr.code_snapshot.decode("utf-8")
    d.update({
        "inputs": inputs,
        "outputs": outputs,
        "dependencies": dependencies,
    })

    return ComponentRun.from_dictionary(d)
def get_history(
    component_name: str,
    limit: int = 10,
    date_lower: typing.Union[datetime, str] = datetime.min,
    date_upper: typing.Union[datetime, str] = datetime.max,
) -> typing.List[ComponentRun]:
    """Return the component's history as client-facing ComponentRuns.

    Args:
        component_name: Name passed to the store's history lookup.
        limit: Passed to the store's history lookup.
        date_lower: Lower date bound; a falsy value becomes ``datetime.min``.
        date_upper: Upper date bound; a falsy value becomes ``datetime.max``.

    Returns:
        One ComponentRun per record returned by the store, in the same order.
    """
    store = Store(_db_uri)

    # Falsy bounds (e.g. None or "") fall back to the widest possible range.
    date_lower = date_lower or datetime.min
    date_upper = date_upper or datetime.max

    def _as_dicts(pointers):
        # Round-trip each stored pointer through IOPointer to get a plain dict.
        return [
            IOPointer.from_dictionary(ptr.__dict__).to_dictionary()
            for ptr in pointers
        ]

    converted = []
    for record in store.get_history(
            component_name, limit, date_lower, date_upper):
        ins = _as_dicts(record.inputs)
        outs = _as_dicts(record.outputs)
        deps = [upstream.component_name for upstream in record.dependencies]

        fields = copy.deepcopy(record.__dict__)
        fields["inputs"] = ins
        fields["outputs"] = outs
        fields["dependencies"] = deps

        converted.append(ComponentRun.from_dictionary(fields))

    return converted
class TestComponentRun(unittest.TestCase):
    """Unit tests for the client-facing ComponentRun entity."""

    def setUp(self):
        """Create the ComponentRun, expected dictionary and IOPointers."""
        run_name = "mock_component_run"
        self.mock_component_run = ComponentRun(run_name)
        # Expected serialized form of a ComponentRun that only has a name.
        self.mock_component_run_dict = {
            "component_name": run_name,
            "notes": "",
            "inputs": [],
            "outputs": [],
            "git_hash": None,
            "git_tags": None,
            "code_snapshot": None,
            "start_timestamp": None,
            "end_timestamp": None,
            "dependencies": [],
            "id": None,
            "stale": [],
            "test_result": None,
            "mlflow_run_id": None,
            "mlflow_run_params": None,
            "mlflow_run_metrics": None,
        }
        self.mock_inputs = [
            IOPointer(name) for name in ("mock_input_1", "mock_input_2")
        ]
        self.mock_outputs = [
            IOPointer(name) for name in ("mock_output_1", "mock_output_2")
        ]

    def _fresh_run(self):
        # Work on a deep copy so the shared fixture is never mutated.
        return copy.deepcopy(self.mock_component_run)

    def testSerialize(self):
        """
        to_dictionary() of the bare run equals the expected dictionary.
        """
        serialized = self.mock_component_run.to_dictionary()
        self.assertEqual(serialized, self.mock_component_run_dict)

    def testSetStartEndError(self):
        """
        Passing a string to either timestamp setter raises TypeError.
        """
        run = self.mock_component_run
        # Start is checked before end, as in the sequential form.
        for setter in (run.set_start_timestamp, run.set_end_timestamp):
            with self.assertRaises(TypeError):
                setter("incorrect_type")

    def testAddInputOutput(self):
        """
        Pointers added one at a time are compared against list(set(...)).
        """
        run = self._fresh_run()
        for pointer in self.mock_inputs:
            run.add_input(pointer)
        for pointer in self.mock_outputs:
            run.add_output(pointer)

        expected_inputs = list(set(self.mock_inputs))
        expected_outputs = list(set(self.mock_outputs))
        self.assertEqual(run.inputs, expected_inputs)
        self.assertEqual(run.outputs, expected_outputs)

    def testAddInputsOutputs(self):
        """
        Pointers added in bulk are compared against list(set(...)).
        """
        run = self._fresh_run()
        run.add_inputs(self.mock_inputs)
        run.add_outputs(self.mock_outputs)

        expected_inputs = list(set(self.mock_inputs))
        expected_outputs = list(set(self.mock_outputs))
        self.assertEqual(run.inputs, expected_inputs)
        self.assertEqual(run.outputs, expected_outputs)

    def testAddDuplicateInputs(self):
        """
        Adding the same inputs twice still equals list(set(...)).
        """
        run = self._fresh_run()
        for _ in range(2):
            run.add_inputs(self.mock_inputs)

        self.assertEqual(run.inputs, list(set(self.mock_inputs)))

    def testAddNotes(self):
        """
        A string assigned to notes is read back unchanged.
        """
        run = self._fresh_run()
        note = "this is a test note"
        run.notes = note

        self.assertEqual(run.notes, note)

    def testAddNotesError(self):
        """
        Assigning a non-str value to notes raises TypeError.
        """
        with self.assertRaises(TypeError):
            self.mock_component_run.notes = ["incorrect_type"]
def inference(model_files) -> str:
    """Return a random identifier of 10 lowercase ASCII letters.

    ``model_files`` is accepted but not read by this function body.
    """
    return "".join(random.choice(string.ascii_lowercase) for _ in range(10))


if __name__ == "__main__":
    # Imported here because this script block is the only user of timedelta.
    from datetime import timedelta

    # Run training once
    version = "0"
    first_model_file = training(version)

    # Fake a component run from about 2 months ago. A fixed 60-day offset is
    # used instead of ``now.replace(month=now.month - 2)``, which raises
    # ValueError in January/February (month 0 or -1) and whenever the current
    # day does not exist in the target month.
    now = datetime.utcnow()
    cr = ComponentRun(
        "some_old_component",
        start_timestamp=now - timedelta(days=60),
        end_timestamp=now,
    )
    second_model_file = "model_1"
    cr.add_input("1")
    cr.add_output(second_model_file)
    log_component_run(cr)

    # Run training again
    version = "2"
    third_model_file = training(version)

    # Run inference on old model file. This should be stale!
    first_identifier = inference([first_model_file, second_model_file])
    print(first_identifier)
class TestComponentRun(unittest.TestCase):
    """Unit tests for the client-facing ComponentRun entity."""

    def setUp(self):
        """Create the ComponentRun, expected dictionary and IOPointers."""
        self.mock_component_run = ComponentRun("mock_component_run")
        # Expected serialized form of a ComponentRun that only has a name.
        self.mock_component_run_dict = dict(
            component_name="mock_component_run",
            inputs=[],
            outputs=[],
            git_hash=None,
            code_snapshot=None,
            start_timestamp=None,
            end_timestamp=None,
            dependencies=[],
            id=None,
            stale=[],
        )
        input_names = ("mock_input_1", "mock_input_2")
        output_names = ("mock_output_1", "mock_output_2")
        self.mock_inputs = [IOPointer(name) for name in input_names]
        self.mock_outputs = [IOPointer(name) for name in output_names]

    def testSerialize(self):
        """
        to_dictionary() of the bare run equals the expected dictionary.
        """
        actual = self.mock_component_run.to_dictionary()
        self.assertEqual(actual, self.mock_component_run_dict)

    def testSetStartEndError(self):
        """
        Passing a string to either timestamp setter raises TypeError.
        """
        bad_value = "incorrect_type"
        self.assertRaises(
            TypeError, self.mock_component_run.set_start_timestamp, bad_value
        )
        self.assertRaises(
            TypeError, self.mock_component_run.set_end_timestamp, bad_value
        )

    def testAddInputOutput(self):
        """
        Pointers added one at a time are compared against list(set(...)).
        """
        run = copy.deepcopy(self.mock_component_run)
        for pointer in self.mock_inputs:
            run.add_input(pointer)
        for pointer in self.mock_outputs:
            run.add_output(pointer)

        unique_inputs = list(set(self.mock_inputs))
        unique_outputs = list(set(self.mock_outputs))
        self.assertEqual(run.inputs, unique_inputs)
        self.assertEqual(run.outputs, unique_outputs)

    def testAddInputsOutputs(self):
        """
        Pointers added in bulk are compared against list(set(...)).
        """
        run = copy.deepcopy(self.mock_component_run)
        run.add_inputs(self.mock_inputs)
        run.add_outputs(self.mock_outputs)

        unique_inputs = list(set(self.mock_inputs))
        unique_outputs = list(set(self.mock_outputs))
        self.assertEqual(run.inputs, unique_inputs)
        self.assertEqual(run.outputs, unique_outputs)

    def testAddDuplicateInputs(self):
        """
        Adding the same inputs twice still equals list(set(...)).
        """
        run = copy.deepcopy(self.mock_component_run)
        run.add_inputs(self.mock_inputs)
        run.add_inputs(self.mock_inputs)

        unique_inputs = list(set(self.mock_inputs))
        self.assertEqual(run.inputs, unique_inputs)
def testLogBasicComponentRun(self):
    """Log a run with timestamps, a code snapshot and duplicated I/O names."""
    create_component("test_component", "test_description", "shreya")

    duplicated_inputs = ["duplicate_input"] * 2
    duplicated_outputs = ["duplicate_output"] * 2

    # Populate the run in the same order a real component would: start,
    # snapshot, I/O, end.
    run = ComponentRun(component_name="test_component")
    run.set_start_timestamp()
    run.code_snapshot = b"def main(): return"
    run.add_inputs(duplicated_inputs)
    run.add_outputs(duplicated_outputs)
    run.set_end_timestamp()

    log_component_run(run)