def replace(self, result: Result):
    """Replace an already-registered result and persist it to disk.

    Args:
        result: Result to store; its ``id`` must already be registered.

    Returns:
        The stored result.

    Raises:
        Exception: ("replace_result", ...) if ``result.id`` is unknown.
    """
    # Membership test instead of truthiness on the stored value: a
    # falsy-but-present entry must still count as "known".
    if result.id not in self._results:
        raise Exception("replace_result",
                        f"Result with ID {result.id} is not known")
    # Write changes to file
    write_json_file(
        str(result.storage_path / self._config.result_filename),
        transform_dataclass_to_dict(result))
    # Replace result object (plain assignment overwrites; no pop needed)
    self._results[result.id] = result
    return result
def replace(self, snap):
    """Replace an already-registered model snapshot and persist it to disk.

    Args:
        snap: Snapshot to store; its ``id`` must already be registered.

    Returns:
        The stored snapshot.

    Raises:
        Exception: ("replace_snapshot", ...) if ``snap.id`` is unknown.
    """
    # Membership test instead of truthiness on the stored value: a
    # falsy-but-present entry must still count as "known".
    if snap.id not in self._snapshots:
        raise Exception("replace_snapshot",
                        f"Snapshot with ID {snap.id} is not known")
    # Update snapshot file
    write_json_file(
        str(Path(snap.storage_path) / self._config.model_snapshot_filename),
        transform_dataclass_to_dict(snap))
    # Replace snapshot object (plain assignment overwrites; no pop needed)
    self._snapshots[snap.id] = snap
    return snap
def create_result(self, result: Result) -> Union[Exception, Result]:
    """Register a new result: create its storage directory and persist it.

    Args:
        result: Result to register; its ``id`` must not be known yet.

    Returns:
        A copy of ``result`` with ``storage_path`` filled in.

    Raises:
        Exception: ("create_result", ...) if ``result.id`` already exists.
        FileExistsError: if the storage directory already exists on disk.
    """
    # Membership test instead of truthiness: a falsy-but-present entry
    # must still count as "already known".
    if result.id in self._results:
        raise Exception("create_result",
                        f"Result with ID {result.id} is already known")
    # Create result directory on the filesystem
    storage_path = (self._config.platform_base_dir /
                    self._config.result_base_dir / str(result.id))
    Path(storage_path).mkdir(parents=True)
    # Record where the result lives, then persist it as JSON
    result = dataclasses.replace(result, storage_path=storage_path)
    write_json_file(str(storage_path / self._config.result_filename),
                    transform_dataclass_to_dict(result))
    # Add result object to the in-memory registry
    self._results[result.id] = result
    return result
def create_snapshot(
        self, snap: ModelSnapshot) -> Union[Exception, ModelSnapshot]:
    """Register a new model snapshot: create its directory and persist it.

    Args:
        snap: Snapshot to register; its ``id`` must not be known yet and
            its ``storage_path`` must already be set by the caller.

    Returns:
        The registered snapshot (unchanged).

    Raises:
        Exception: ("create_snapshot", ...) if ``snap.id`` already exists.
        FileExistsError: if the storage directory already exists on disk.
    """
    # Membership test instead of truthiness: a falsy-but-present entry
    # must still count as "already known".
    if snap.id in self._snapshots:
        raise Exception("create_snapshot",
                        f"Snapshot with ID {snap.id} is already known")
    # Create snapshot directory on the filesystem and persist as JSON
    Path(snap.storage_path).mkdir(parents=True)
    write_json_file(
        str(Path(snap.storage_path) / self._config.model_snapshot_filename),
        transform_dataclass_to_dict(snap))
    # Add snapshot object to the in-memory registry
    self._snapshots[snap.id] = snap
    return snap
def monitor_wait_container_execution_train(
        self, context: ModelSnapshot) -> ModelSnapshot:
    """Block until a training container exits, then persist logs and stats.

    Polls the Docker container referenced by ``context.container_id`` every
    10 seconds until it leaves a running state, sampling performance
    statistics on each poll. After exit, writes the build log, runtime log
    and statistics into ``context.storage_path`` and sends a success or
    failure notification email (success mails are suppressed for monitoring
    images whose name starts with 'mon_').

    Args:
        context: Snapshot describing the running container; must carry
            ``container_id``, ``storage_path`` and the prepare-phase logs.

    Returns:
        A copy of ``context`` updated with ``container_performance_statistics``,
        ``container_info``, ``container_run_logs`` and the ``success`` flag.
    """
    container_statistics = []
    log_container_running = ""
    # Get Container Object
    running_container = self._d.containers.get(context.container_id)
    # Wait for termination and collect performance statistics
    while running_container.status in CONTAINER_RUNNING_STATUSES:
        # One-shot stats sample (stream=False returns a single dict)
        container_statistics.append(running_container.stats(stream=False))
        time.sleep(10)
        # Refresh the cached container attributes (status, State, ...)
        running_container.reload()
    context = dataclasses.replace(
        context, container_performance_statistics=container_statistics)

    # Container exited: capture the final state for the report
    state = running_container.attrs["State"]
    container_info = {
        "start_time": state["StartedAt"],
        "end_time": state["FinishedAt"],
        "exit_code": state["ExitCode"],
        "exit_message": state["Error"],
        "OOMKilled": state["OOMKilled"],
        "Dead": state["Dead"],
    }
    # Prefix the error message with the kill reason when Docker reports one
    if container_info["OOMKilled"]:
        container_info["exit_message"] = "OOMKilled {}".format(
            container_info["exit_message"])
    elif container_info["Dead"]:
        container_info["exit_message"] = "Dead {}".format(
            container_info["exit_message"])
    context = dataclasses.replace(context, container_info=container_info)

    # Preserve the prepare-phase logs (pull, build, push)
    log_container_build = context.container_build_logs
    log_container_pull = context.container_pull_logs
    log_container_push = f"Started: {get_timestamp()}\n"
    log_container_push += context.container_push_logs
    log_container_prepare = (f"Pull Log: \n{log_container_pull}\n"
                            f"Build Log:\n{log_container_build}\n"
                            f"Push Log:\n{log_container_push}")
    # If pre-processing was performed, add its logs to the build log/email
    if context.container_pre_processing_logs:
        log_container_prepare += f"\nPre-processing Log: {context.container_pre_processing_logs}"
    # Write build log to disk
    write_to_file(
        Path(context.storage_path) /
        f"Container_Build_{get_timestamp(date_format='filename')}.log",
        log_container_prepare)

    # Assemble the runtime report. Fetch the container logs exactly once:
    # the original fetched them twice, so the on-disk log and
    # context.container_run_logs could diverge if lines arrived in between.
    container_logs = running_container.logs(timestamps=True).decode("utf-8")
    succeeded = container_info["exit_code"] == 0
    log_container_running += "Result: {}\n".format(succeeded)
    if container_info["exit_code"] != 0 and container_info["exit_message"]:
        log_container_running += "Errormessage: {}\n".format(
            container_info["exit_message"])
    log_container_running += "Started: {}\n".format(container_info["start_time"])
    log_container_running += "Finished: {}\n\nOutput:\n\n".format(
        container_info["end_time"])
    log_container_running += container_logs
    context = dataclasses.replace(context, container_run_logs=container_logs)
    # Write runtime log to disk
    write_to_file(
        Path(context.storage_path) /
        f"Container_Output_{get_timestamp(date_format='filename')}.log",
        remove_ansi_escape_tags(log_container_running))
    # Write performance statistics to file
    write_json_file(
        filename=Path(context.storage_path) / "Container_Performance_statistics.json",
        content=container_statistics)

    if succeeded:
        context = dataclasses.replace(context, success=True)
        # Monitoring images (prefix 'mon_') get no success mail
        if not context.container_image_name.startswith('mon_'):
            # Send Email to inform user
            # NOTE(review): success subject uses context.id while the
            # failure subject uses context.container_name — confirm which
            # identifier is intended.
            send_email_config(
                self._config,
                subject=f"Successful Model Training Pipeline: {context.id}",
                body="Congratulations!\n"
                     "Your model training pipeline succeeded\n"
                     "Check the attached logs for further details.\n",
                attachments=[
                    {
                        "Filename": f"Container-Build_{context.container_name}_{get_timestamp(date_format='filename')}.log",
                        "Content": log_container_prepare
                    },
                    {
                        "Filename": f"Container-Output_{context.container_name}_{get_timestamp(date_format='filename')}.log",
                        "Content": remove_ansi_escape_tags(log_container_running)
                    },
                    {
                        "Filename": f"Container-PerformanceStatistics_{context.container_name}_{get_timestamp(date_format='filename')}.json",
                        "Content": json.dumps(container_statistics)
                    },
                ])
    else:
        context = dataclasses.replace(context, success=False)
        send_email_config(
            self._config,
            subject=f"Failed Model Training Pipeline: {context.container_name}",
            body="Unfortunately, your training pipeline failed.\n"
                 "Check the attached logs to solve the issues.\n"
                 "If you have questions, please ask your administrator.",
            attachments=[
                {
                    "Filename": f"Container-Build_{context.container_name}_{get_timestamp(date_format='filename')}.log",
                    "Content": log_container_prepare
                },
                {
                    "Filename": f"Container-Output_{context.container_name}_{get_timestamp(date_format='filename')}.log",
                    "Content": remove_ansi_escape_tags(log_container_running)
                },
            ])
    return context