def set_dependencies_from_inputs(self, component_run: ComponentRun):
    """Set component_run's upstream runs from the producers of its inputs.

    Collects the names of component_run's inputs and, for every output
    name in the output association table that equals one of them, takes
    the largest ComponentRun id recorded for that name. The ComponentRuns
    with those ids are loaded and all of them are handed to
    ``component_run.set_upstream``.

    Returns without touching the session when component_run has no
    inputs, and without setting anything when no run matches.
    """
    input_names = [pointer.name for pointer in component_run.inputs]
    if not input_names:
        return

    # One row per matching output name, carrying the highest id among the
    # runs that wrote it (presumably the most recent run -- confirm that
    # ids grow over time).
    output_name = component_run_output_association.c.output_path_name
    newest_run_id = func.max(
        component_run_output_association.c.component_run_id,
    )
    rows = (
        self.session.query(newest_run_id)
        .group_by(output_name)
        .filter(output_name.in_(input_names))
        .all()
    )
    producer_ids = [row[0] for row in rows]

    producers = (
        self.session.query(ComponentRun)
        .filter(ComponentRun.id.in_(producer_ids))
        .all()
    )
    if not producers:
        return

    # Every matched run becomes an upstream dependency.
    component_run.set_upstream(producers)
def commit_component_run(
    self,
    component_run: ComponentRun,
    staleness_threshold: int = (60 * 60 * 24 * 30),
):
    """Validate a component run, record staleness notes, and commit it.

    Args:
        component_run: The run to store. Its ``check_completeness()``
            result must report success.
        staleness_threshold: Number of seconds between the start of a
            dependency and the start of this run beyond which the
            dependency is reported as stale. Defaults to 30 days.

    Raises:
        RuntimeError: If ``check_completeness()`` reports failure; the
            exception carries the reported message.
    """
    completeness = component_run.check_completeness()
    if not completeness["success"]:
        raise RuntimeError(completeness["msg"])
    if completeness["msg"]:
        logging.info(completeness["msg"])

    seconds_per_day = 60 * 60 * 24

    for dep in component_run.dependencies:
        # Case 1: the dependency started longer ago than the threshold.
        elapsed = (
            component_run.start_timestamp - dep.start_timestamp
        ).total_seconds()
        if elapsed > staleness_threshold:
            age_in_days = int(elapsed // seconds_per_day)
            component_run.add_staleness_message(
                f"{dep.component_name} (ID {dep.id}) was run "
                f"{age_in_days} days ago."
            )

        # Case 2: the dependency's component ran again between the
        # dependency and this run.
        history = self.get_history(
            dep.component_name,
            limit=None,
            date_lower=dep.start_timestamp,
            date_upper=component_run.start_timestamp,
        )
        other_runs = [run for run in history if component_run.id != run.id]
        # One entry is discounted -- presumably the dependency itself,
        # since the history window starts at its own start time (confirm
        # against get_history).
        fresher_count = len(other_runs) - 1
        if fresher_count > 0:
            noun = "run" if fresher_count == 1 else "runs"
            component_run.add_staleness_message(
                f"{dep.component_name} (ID {dep.id}) has "
                f"{fresher_count} fresher {noun} that "
                "began before this component run started."
            )

    # Surface any collected staleness messages to the user.
    if len(component_run.stale) > 0:
        logging.warning(component_run.stale)

    # Remove duplicate labels on inputs first, then on outputs.
    for pointers in (component_run.inputs, component_run.outputs):
        for pointer in pointers:
            pointer.dedup_labels()

    self.session.add(component_run)
    logging.info(
        f"Committing ComponentRun {component_run.id} of type "
        f'"{component_run.component_name}" to the database.'
    )
    self.session.commit()
def commit_component_run(
    self,
    component_run: ComponentRun,
    staleness_threshold: int = (60 * 60 * 24 * 30),
):
    """Commits a fully initialized component run to the DB.

    Before committing, each dependency is checked for staleness and a
    message is added to component_run for every problem found.

    Args:
        component_run: The run to store. Its ``check_completeness()``
            result must report success.
        staleness_threshold: Number of seconds between the start of a
            dependency and the start of this run beyond which the
            dependency is reported as stale. Defaults to 30 days.

    Raises:
        RuntimeError: If ``check_completeness()`` reports failure; the
            exception carries the reported message.
    """
    status_dict = component_run.check_completeness()
    if not status_dict["success"]:
        raise RuntimeError(status_dict["msg"])
    if status_dict["msg"]:
        logging.info(status_dict["msg"])

    # Check for staleness.
    # https://github.com/loglabs/mltrace/issues/165#issue-891397631
    for dep in component_run.dependencies:
        # First case: the dependency started longer ago than the threshold
        time_diff = (
            component_run.start_timestamp - dep.start_timestamp
        ).total_seconds()
        if time_diff > staleness_threshold:
            days_diff = int(time_diff // (60 * 60 * 24))
            component_run.add_staleness_message(
                f"{dep.component_name} (ID {dep.id}) was run {days_diff} days ago."
            )

        # Second case: there is a newer run of the dependency
        fresher_runs = self.get_history(
            dep.component_name,
            limit=None,
            date_lower=dep.start_timestamp,
            date_upper=component_run.start_timestamp,
        )
        # One history entry is discounted (presumably the dependency
        # itself -- confirm against get_history). Only report when more
        # than that one entry exists; comparing with "!= 1" also fired on
        # an empty history and reported "-1 fresher run(s)".
        if len(fresher_runs) > 1:
            component_run.add_staleness_message(
                f"{dep.component_name} (ID {dep.id}) has {len(fresher_runs) - 1} fresher run(s) that began before this component run started."
            )

    # Commit to DB
    self.session.add(component_run)
    logging.info(
        f'Committing ComponentRun of type "{component_run.component_name}" to the database.'
    )
    self.session.commit()
def commit_component_run(self, component_run: ComponentRun):
    """Validate a component run and write it to the database.

    Args:
        component_run: The run to store. Its ``check_completeness()``
            result must report success.

    Raises:
        RuntimeError: If ``check_completeness()`` reports failure; the
            exception carries the reported message.
    """
    completeness = component_run.check_completeness()
    note = completeness["msg"] if completeness["success"] else None
    if note is None:
        raise RuntimeError(completeness["msg"])

    # A non-empty message on success is informational only.
    if note:
        logging.info(note)

    session = self.session
    session.add(component_run)
    logging.info(
        f'Committing ComponentRun of type "{component_run.component_name}" to the database.'
    )
    session.commit()
def testCompletenessWithStartEndDeps(self):
    """A run with timestamps, I/O and an upstream run passes with no message."""
    run = copy.deepcopy(self.mock_component_run)
    run.set_start_timestamp()
    run.set_end_timestamp()

    # Attach one input, one output and one upstream run
    run.add_input(IOPointer("input"))
    run.add_output(IOPointer("output"))
    upstream = ComponentRun("mock_upstream")
    run.set_upstream(upstream)

    report = run.check_completeness()
    self.assertTrue(report["success"])

    # Assert that there are no warning statements
    self.assertTrue(len(report["msg"]) == 0)
class TestComponentRun(unittest.TestCase):
    """Checks of ComponentRun.check_completeness at several levels of setup."""

    def setUp(self):
        """Create the ComponentRun named "mock" that each test starts from."""
        self.mock_component_run = ComponentRun("mock")

    def _copy_with_timestamps(self):
        """Return a deep copy of the mock run with start and end set."""
        run = copy.deepcopy(self.mock_component_run)
        run.set_start_timestamp()
        run.set_end_timestamp()
        return run

    def testCompletenessWithNoInfo(self):
        """A run with nothing set is reported as not complete."""
        report = self.mock_component_run.check_completeness()
        self.assertFalse(report["success"])

    def testCompletenessWithStartEnd(self):
        """Timestamps alone pass the check but produce a message."""
        report = self._copy_with_timestamps().check_completeness()
        self.assertTrue(report["success"])

        # Assert that there are warning statements
        # because of no I/O or dependencies
        self.assertTrue(len(report["msg"]) > 0)

    def testCompletenessWithStartEndDeps(self):
        """Timestamps plus I/O and an upstream run pass with no message."""
        run = self._copy_with_timestamps()

        # Attach one input, one output and one upstream run
        run.add_input(IOPointer("input"))
        run.add_output(IOPointer("output"))
        upstream = ComponentRun("another_component")
        run.set_upstream(upstream)

        report = run.check_completeness()
        self.assertTrue(report["success"])

        # Assert that there are no warning statements
        self.assertTrue(len(report["msg"]) == 0)
def setUp(self):
    """Create the ComponentRun named "mock" and keep it on the test case."""
    baseline_run = ComponentRun("mock")
    self.mock_component_run = baseline_run
def initialize_empty_component_run(self, component_name: str) -> ComponentRun:
    """Create a new ComponentRun for the given component name.

    The run is only constructed and returned; this method does not add
    it to the session or commit anything.

    Args:
        component_name: Name passed to ComponentRun as ``component_name``.

    Returns:
        The newly constructed ComponentRun.
    """
    return ComponentRun(component_name=component_name)