def build_container_context(self, context: ModelSnapshot) -> ModelSnapshot:
    """Build the Docker image for this snapshot and record the build log.

    Snapshots with a parent_id re-use the parent's container and are
    returned untouched.
    """
    # container must be re-used
    if context.parent_id:
        return context

    context = update_instance_status_rest(instance=context, new_status='Building Container')

    image, raw_logs = self._d.images.build(
        path=str(context.code_directory),
        rm=False,
        tag=context.container_image_name,
    )

    # Flatten the log generator (a stream of single-key dicts) into text.
    # Repeated keys become tab-indented continuation lines under one header.
    pieces = ["Started: {}\n".format(get_timestamp())]
    previous_key = ""
    for entry in raw_logs:
        for key in entry:
            if previous_key == key:
                pieces.append("\t{}".format(entry[key]))
            else:
                previous_key = key
                pieces.append("{}\t{}\n".format(key, entry[key]))

    return dataclasses.replace(
        context,
        container_image_id=image.id,
        container_build_logs="".join(pieces),
    )
def push_container_trained_image_context(self, context: ModelSnapshot) -> ModelSnapshot:
    """Push the trained snapshot image to the configured registry.

    No-op unless push_trained_images_to_registry is enabled in the config.
    The registry-qualified tag becomes the snapshot's default image name.
    """
    if not self._config.push_trained_images_to_registry:
        return context

    context = update_instance_status_rest(instance=context, new_status=f'Push Trained ModelSnapshot to Registry')

    registry_tag = f"{self._config.docker_registry_address}{context.new_container_image_name}"
    image = self._d.images.get(context.new_container_image_name)
    # create new tag for the registry, refresh attributes, then push
    image.tag(registry_tag)
    image.reload()
    logs = self._d_low.push(registry_tag)

    # Save new image tag as default and append the push output to the log
    context = dataclasses.replace(
        context,
        new_container_image_name=registry_tag,
        container_push_logs=f'{context.container_push_logs}\n\nPush new trained model Image:\n{logs}',  # TODO: include in email
    )

    # Write Push log to the snapshot's storage directory
    write_to_file(
        Path(context.storage_path) / f"Container_Push_Trained_Model_{get_timestamp(date_format='filename')}.log",
        logs,
    )
    return context
def perform_pre_processing_context(self, context: Union[ModelSnapshot, Result]) -> Union[Exception, ModelSnapshot, Result]:
    """Run the configured pre-processing container and capture its output.

    Skipped entirely when 'pre_processing_application' is not set in
    context.pre_processing. Otherwise pulls and starts the pre-processing
    container, blocks (polling every 10s) until it exits, archives its
    logs to disk and into the context, and removes the container.
    """
    if not context.pre_processing.get('pre_processing_application', False):
        return context

    context = update_instance_status_rest(instance=context, new_status=f'Pre-Processing Input Data')

    # Pull pre processing container
    log_text = "Preprocessing Container Pull Log: \n"
    log_text += self.pull_container(
        container_image_registry="",
        container_image_name=context.pre_processing['pre_processing_container_image_name'])

    # Run pre processing container
    worker = self.run_container(
        container_image_name=context.pre_processing['pre_processing_container_image_name'],
        container_name=context.pre_processing['pre_processing_container_name'],
        container_autoremove=False,
        mount_volumes=context.pre_processing['container_mount_volumes'])

    # Wait for termination: poll until the container leaves all running states
    while worker.status in CONTAINER_RUNNING_STATUSES:
        time.sleep(10)
        worker.reload()  # refresh cached container attributes

    # Container exited — collect the relevant state fields
    state = worker.attrs["State"]
    container_info = {
        "start_time": state["StartedAt"],
        "end_time": state["FinishedAt"],
        "exit_code": state["ExitCode"],
        "exit_message": state["Error"],
        "OOMKilled": state["OOMKilled"],
        "Dead": state["Dead"],
    }

    log_text += "\nResult: {}\n".format(True if container_info["exit_code"] == 0 else False)
    if container_info["exit_code"] != 0 and container_info["exit_message"]:
        log_text += "Errormessage: {}\n".format(container_info["exit_message"])
    log_text += "Started: {}\n".format(container_info["start_time"])
    log_text += "Finished: {}\n\nOutput:\n\n".format(container_info["end_time"])
    log_text += worker.logs(timestamps=True).decode("utf-8")

    # Write runtime log to disk (ANSI escape sequences stripped)
    write_to_file(
        Path(context.storage_path) / f"Pre_processing_Output_{get_timestamp(date_format='filename')}.log",
        remove_ansi_escape_tags(log_text))

    context = dataclasses.replace(context, container_pre_processing_logs=log_text)

    # remove pre-processing container
    self.remove_container(container_name=context.pre_processing['pre_processing_container_name'])
    return context
def pull_container_context(self, context: Union[ModelSnapshot, Result]) -> Union[ModelSnapshot, Result]:
    """Pull the model container image from the registry, tolerating failure.

    A failed pull is logged as 'image not present' instead of raising,
    because the image may simply not exist yet and will be built later.
    """
    context = update_instance_status_rest(instance=context, new_status=f'Pull Model-Container from Registry')
    pull_log = "Started: {}\n".format(get_timestamp())
    # Catch the exception if the image is not found (image will be build later)
    # Originally only NotFound was meant to be caught; for debugging we ignore
    # all registry errors and continue. TODO: Apply error handling
    try:
        pull_log += self._d_low.pull(context.container_image_name)
    except Exception:
        pull_log += 'image not present'
    return dataclasses.replace(context, container_pull_logs=pull_log)
def run_container_context(self, context: Union[ModelSnapshot, Result]) -> Union[ModelSnapshot, Result]:
    """Start the snapshot's container and store its id in the context.

    If a container with context.container_name already exists, the context
    is returned unchanged (idempotent re-entry — nothing more to do).

    Fix: the original constructed `Exception(f"...already running...")`
    without ever raising it — a dead statement (pylint W0133). Since
    control returned the context immediately afterwards, the evident
    intent was a no-op; the dead statement is removed.
    """
    # check if container is already running
    if self.get_container(context.container_name):
        # Container already exists: treat as an intentional idempotent no-op.
        return context

    context = update_instance_status_rest(instance=context, new_status=f'Running')
    container = self._con.run(
        image=context.container_image_name,
        name=context.container_name,
        environment=context.container_environment_variables,
        auto_remove=False,
        volumes=context.container_mount_volumes,
        ports=context.container_ports,
        shm_size="2G",
        detach=True)
    return dataclasses.replace(context, container_id=container.id)
def save_container_state(self, context: Union[ModelSnapshot, Result]):
    """Commit the container's current state as a new image 'snap_<id>'.

    On an unsuccessful snapshot, new_container_image_name falls back to
    the original image so downstream steps still see a valid name.
    """
    if not context.success:
        # Prevent methods with broken snapshots from producing errors (debugging)
        return dataclasses.replace(context, new_container_image_name=context.container_image_name)

    context = update_instance_status_rest(instance=context, new_status=f'Export Container State')

    # commit container state under a snapshot-specific repository name
    snapshot_image = f'snap_{context.id}'
    self._d.containers.get(context.container_id).commit(
        repository=snapshot_image,
        tag=f'latest',
        message=f"MMLP: trained model snapshot",
        author=f"Matthias Greiner")

    # save the new container name
    return dataclasses.replace(context, new_container_image_name=snapshot_image)
def archive_and_remove_container(self, context: Union[ModelSnapshot, Result]):
    """Export the container's /data directory to an archive, then remove it.

    Fix: the archive file was opened with a bare `open(...)` and never
    closed (resource leak; buffered data could be lost on error). The
    write now happens inside a `with` block so the handle is always closed.
    """
    # Create archive of container
    context = update_instance_status_rest(instance=context, new_status=f'Export Container Filesystem')
    container = self._d.containers.get(context.container_id)

    # Export full filesystem
    # data_stream = container.export()

    # Export specific path: get_archive returns (stream, stat) — keep the stream
    data_stream = container.get_archive('/data')[0]

    # Write archive file; the context manager guarantees the handle is closed
    with open(str(context.container_archive_path), "wb") as archive_file:
        for chunk in data_stream:
            archive_file.write(chunk)

    # Cleanup
    # Remove the stopped container
    self.remove_container(str(context.id))
    return context
def push_container_context(self, context: ModelSnapshot) -> ModelSnapshot:
    """Push the snapshot's container image to the registry.

    Returns a copy of the context carrying the raw push log output.
    """
    updated = update_instance_status_rest(instance=context, new_status=f'Push Model-Container to Registry')
    push_logs = self._d_low.push(updated.container_image_name)
    return dataclasses.replace(updated, container_push_logs=push_logs)