def test_all_dependencies(self):
    """Register two different dependency keys and expect all_dependencies to hold two entries."""
    # (dependent uuid, parent uuid, parent path) for each registration, in order.
    registrations = (
        ("0x2", "0x1", "parent"),
        ("0x4", "0x3", "parent2"),
    )
    for dependent_uuid, parent_uuid, parent_path in registrations:
        key = DependencyKey(parent_uuid=parent_uuid, parent_path=parent_path)
        self.dependency_manager.get(dependent_uuid, key)

    known_keys = self.dependency_manager.all_dependencies
    self.assertEqual(len(known_keys), 2)
def test_get_has(self):
    """
    Request a dependency once and check both the returned state and has().

    The returned state is expected to be in stage "DOWNLOADING", to have the
    path "0x1_parent" and to list the requesting uuid as its only dependent.
    """
    key = DependencyKey(parent_uuid="0x1", parent_path="parent")
    requester = "0x2"

    fetched = self.dependency_manager.get(requester, key)

    self.assertTrue(self.dependency_manager.has(key))
    self.assertEqual(fetched.stage, "DOWNLOADING")
    self.assertEqual(fetched.path, "0x1_parent")
    self.assertEqual(fetched.dependents, {requester})
def task(work_dir, state_path, random_file_path):
    """
    Runs the end-to-end workflow of the Dependency Manager.

    Note: ProcessPoolExecutor must serialize everything before sending it to the worker,
    so this function needs to be defined at the top-level.

    :param work_dir: directory handed to the DependencyManager as ``worker_dir``
    :param state_path: path handed to the DependencyManager as ``commit_file``
    :param random_file_path: path of a file that is opened in binary mode and returned
        by the mocked bundle service's ``get_bundle_contents``
    :raises AssertionError: if the run is not registered as a dependent after ``get``,
        or is still listed as a dependent after ``release``
    """
    process_id = os.getpid()

    # The file object is what the mocked bundle service hands out, so it has to stay
    # open for as long as the manager is running. The ``with`` block guarantees it is
    # closed afterwards, even when an assertion below fails.
    with open(random_file_path, "rb") as file_obj:
        # Mock Bundle Service to return a random file object
        mock_bundle_service = MagicMock()
        mock_bundle_service.get_bundle_info = MagicMock(return_value={'type': "file"})
        mock_bundle_service.get_bundle_contents = MagicMock(return_value=file_obj)

        # Create and start a dependency manager
        print(f"{process_id}: Starting a DependencyManager...")
        dependency_manager = DependencyManager(
            commit_file=state_path,
            bundle_service=mock_bundle_service,
            worker_dir=work_dir,
            max_cache_size_bytes=2048,
            download_dependencies_max_retries=1,
        )
        dependency_manager.start()
        print(f"{process_id}: Started with work directory: {work_dir}.")

        try:
            # Register a run's UUID as a dependent of a parent bundle with UUID 0x1
            dependency_key = DependencyKey(parent_uuid="0x1", parent_path="parent")
            run_uuid = f"0x{process_id}"
            state = dependency_manager.get(run_uuid, dependency_key)
            assert (
                run_uuid in state.dependents
            ), f"{process_id}: Expected {run_uuid} as one of the dependents."

            # Release the run bundle as a dependent
            dependency_manager.release(run_uuid, dependency_key)
            dependencies = dependency_manager._fetch_dependencies()
            if dependency_key in dependencies:
                state = dependencies[dependency_key]
                print(f"{process_id}: Checking {run_uuid} in {state.dependents}")
                assert (
                    run_uuid not in state.dependents
                ), f"{process_id}: Dependent should not be in the list of dependents after unregistering."

            # Keep the dependency manager running for some time to test the loop
            time.sleep(30)
        finally:
            # Always stop a manager that was started, so a failed assertion does not
            # leave it running inside the pool worker process.
            print(f"{process_id}: Stopping DependencyManger...")
            dependency_manager.stop()

    print(f"{process_id}: Done.")
def _get_dependency_path(self, run_state, dependency):
    """
    Work out the path at which the contents of ``dependency`` can be found.

    :param run_state: run whose bundle uuid is registered with the dependency manager
    :param dependency: dependency providing parent_uuid, parent_path and, for the
        shared file system case, location
    :return: the resolved real path on a shared file system, otherwise the path of
        the dependency below the dependency manager's dependencies_dir
    """
    if not self.shared_file_system:
        # On a dependency_manager setup, ask the manager where the dependency is
        cache_root = self.dependency_manager.dependencies_dir
        key = DependencyKey(dependency.parent_uuid, dependency.parent_path)
        cached_state = self.dependency_manager.get(run_state.bundle.uuid, key)
        return os.path.join(cache_root, cached_state.path)

    # On a shared FS, we know where the dependency is stored and can get the contents directly
    stored_at = os.path.join(dependency.location, dependency.parent_path)
    return os.path.realpath(stored_at)
def _transition_from_CLEANING_UP(self, run_state):
    """
    Tear down a run and decide which stage follows.

    Steps, in order:
      1. While the container still exists, remove it once it has finished,
         otherwise try to kill it and check again.
      2. For every dependency, release it in the dependency manager (skipped on
         a shared file system) and remove its path inside the bundle directory.
      3. A restaged run moves to RESTAGED.
      4. A run with contents on a non-shared file system moves to
         UPLOADING_RESULTS; every other run is handed to finalize_run.

    :param run_state: the current run state
    :return: the run state to continue with
    """
    if run_state.container_id is not None:
        while docker_utils.container_exists(run_state.container):
            try:
                has_exited, _, _ = docker_utils.check_finished(run_state.container)
                if not has_exited:
                    # Still running: ask Docker to kill it, then poll again.
                    try:
                        run_state.container.kill()
                    except docker.errors.APIError:
                        logger.error(traceback.format_exc())
                        time.sleep(1)
                    continue
                run_state.container.remove(force=True)
                run_state = run_state._replace(container=None, container_id=None)
                break
            except docker.errors.APIError:
                logger.error(traceback.format_exc())
                time.sleep(1)

    for dependency in run_state.bundle.dependencies:
        key = DependencyKey(dependency.parent_uuid, dependency.parent_path)
        if not self.shared_file_system:
            # No dependencies if shared fs worker
            self.dependency_manager.release(run_state.bundle.uuid, key)

        mounted_at = os.path.join(run_state.bundle_path, dependency.child_path)
        try:
            remove_path(mounted_at)
        except Exception:
            # Best effort: a failed removal is logged and the cleanup carries on.
            logger.error(traceback.format_exc())

    if run_state.is_restaged:
        return run_state._replace(stage=RunStage.RESTAGED)

    if self.shared_file_system or not run_state.has_contents:
        # Nothing to upload from here; go straight to finalization.
        return self.finalize_run(run_state)

    return run_state._replace(
        stage=RunStage.UPLOADING_RESULTS, run_status='Uploading results', container=None
    )
def test_release(self):
    """
    Register two dependents on one dependency key, then release them one by one.

    After each release the stored state is re-read under the manager's state
    lock and its set of dependents is checked.
    """
    manager = self.dependency_manager
    key = DependencyKey(parent_uuid="0x1", parent_path="parent")

    def stored_state():
        # Read the persisted state for the key while holding the state lock.
        with manager._state_lock:
            return manager._fetch_dependencies()[key]

    manager.get("0x2", key)
    latest = manager.get("0x3", key)
    # Passing in the same dependency key with a different dependent, will just add the dependent
    self.assertEqual(latest.dependents, {"0x2", "0x3"})

    # Release 0x2 as a dependent
    manager.release("0x2", key)
    self.assertEqual(stored_state().dependents, {"0x3"})

    # Release 0x3 as a dependent - should be left with no dependents
    manager.release("0x3", key)
    self.assertEqual(len(stored_state().dependents), 0)
def _transition_from_CLEANING_UP(self, run_state):
    """
    Tear down a run and decide which stage follows.

    Steps, in order:
      1. While the container still exists, remove it once it has finished,
         otherwise try to kill it and check again.
      2. Release every dependency in the dependency manager (skipped on a
         shared file system).
      3. Remove every path recorded in run_state.paths_to_remove and reset the
         list.
      4. A restaged run moves to RESTAGED.
      5. A run with contents on a non-shared file system moves to
         UPLOADING_RESULTS. Every other run has the files matching the bundle's
         exclude_patterns deleted and is handed to finalize_run.

    :param run_state: the current run state
    :return: the run state to continue with
    """
    if run_state.container_id is not None:
        while docker_utils.container_exists(run_state.container):
            try:
                has_exited, _, _ = docker_utils.check_finished(run_state.container)
                if not has_exited:
                    # Still running: ask Docker to kill it, then poll again.
                    try:
                        run_state.container.kill()
                    except docker.errors.APIError:
                        logger.error(traceback.format_exc())
                        time.sleep(1)
                    continue
                run_state.container.remove(force=True)
                run_state = run_state._replace(container=None, container_id=None)
                break
            except docker.errors.APIError:
                logger.error(traceback.format_exc())
                time.sleep(1)

    for dependency in run_state.bundle.dependencies:
        if self.shared_file_system:
            # No dependencies if shared fs worker
            continue
        key = DependencyKey(dependency.parent_uuid, dependency.parent_path)
        self.dependency_manager.release(run_state.bundle.uuid, key)

    # Clean up dependencies paths; a failed removal is logged and skipped.
    for stale_path in run_state.paths_to_remove or []:
        try:
            remove_path(stale_path)
        except Exception:
            logger.error(traceback.format_exc())
    run_state = run_state._replace(paths_to_remove=[])

    if run_state.is_restaged:
        log_bundle_transition(
            bundle_uuid=run_state.bundle.uuid,
            previous_stage=run_state.stage,
            next_stage=RunStage.RESTAGED,
            reason=self.RESTAGED_REASON,
        )
        return run_state._replace(stage=RunStage.RESTAGED)

    if not self.shared_file_system and run_state.has_contents:
        log_bundle_transition(
            bundle_uuid=run_state.bundle.uuid,
            previous_stage=run_state.stage,
            next_stage=RunStage.UPLOADING_RESULTS,
        )
        return run_state._replace(
            stage=RunStage.UPLOADING_RESULTS, run_status='Uploading results', container=None
        )

    # No upload happens on this path.
    # Delete any files that match the exclude_patterns.
    for exclude_pattern in run_state.bundle.metadata["exclude_patterns"]:
        full_pattern = os.path.join(run_state.bundle_path, exclude_pattern)
        for matched_path in glob.glob(full_pattern, recursive=True):
            # Only remove files that are subpaths of run_state.bundle_path, in case
            # that exclude_pattern is something like "../../../".
            if path_is_parent(parent_path=run_state.bundle_path, child_path=matched_path):
                remove_path(matched_path)
    return self.finalize_run(run_state)
def _transition_from_PREPARING(self, run_state):
    """
    Advance a run that is in the PREPARING stage.

    1- Request the docker image from docker image manager
        - if image is failed, move to CLEANING_UP state
    2- Request the dependencies from dependency manager
        - if any are failed, move to CLEANING_UP state
    3- If all dependencies and docker image are ready:
        - Set up the local filesystem for the run
        - Create symlinks to dependencies
        - Allocate resources and prepare the docker container
        - Start the docker container
    4- If all is successful, move to RUNNING state

    :param run_state: the current run state
    :return: a new run state. The stage is CLEANING_UP on kill, restage or failure,
        RUNNING once the container has been started, and unchanged (only
        run_status and possibly bundle_dir_wait_num_tries updated) while the run
        is still waiting for resources, downloads or the bundle directory.
    :raises Exception: any error from starting the container other than
        docker_utils.DockerUserErrorException is logged and re-raised.
    """

    def mount_dependency(dependency, shared_file_system):
        """
        Record the volume binding for one dependency and, when not on a shared
        file system, create its symlink inside the bundle directory.

        Appends to docker_dependencies, a list the enclosing function creates
        before the first call.
        """
        if not shared_file_system:
            # Set up symlinks for the content at dependency path
            Path(dependency.child_path).parent.mkdir(parents=True, exist_ok=True)
            # The link is created at child_path and points at docker_path.
            os.symlink(dependency.docker_path, dependency.child_path)
        # The following will be converted into a Docker volume binding like:
        #   dependency_path:docker_dependency_path:ro
        docker_dependencies.append((dependency.parent_path, dependency.docker_path))

    # A killed or restaged run goes straight to cleanup.
    if run_state.is_killed or run_state.is_restaged:
        log_bundle_transition(
            bundle_uuid=run_state.bundle.uuid,
            previous_stage=run_state.stage,
            next_stage=RunStage.CLEANING_UP,
            reason=f'the bundle was {"killed" if run_state.is_killed else "restaged"}',
        )
        return run_state._replace(stage=RunStage.CLEANING_UP)

    # Check CPU and GPU availability
    try:
        cpuset, gpuset = self.assign_cpu_and_gpu_sets_fn(
            run_state.resources.cpus, run_state.resources.gpus
        )
    except Exception as e:
        message = "Unexpectedly unable to assign enough resources to bundle {}: {}".format(
            run_state.bundle.uuid, str(e)
        )
        logger.error(message)
        logger.error(traceback.format_exc())
        # The stage is left unchanged; only the status message is updated.
        return run_state._replace(run_status=message)

    # dependencies_ready turns False as soon as any dependency or the image is
    # still downloading; status_messages collects one message per pending download.
    dependencies_ready = True
    status_messages = []

    if not self.shared_file_system:
        # No need to download dependencies if we're in the shared FS,
        # since they're already in our FS
        for dep in run_state.bundle.dependencies:
            dep_key = DependencyKey(dep.parent_uuid, dep.parent_path)
            dependency_state = self.dependency_manager.get(run_state.bundle.uuid, dep_key)
            if dependency_state.stage == DependencyStage.DOWNLOADING:
                status_messages.append(
                    'Downloading dependency %s: %s done (archived size)'
                    % (dep.child_path, size_str(dependency_state.size_bytes))
                )
                dependencies_ready = False
            elif dependency_state.stage == DependencyStage.FAILED:
                # Failed to download dependency; -> CLEANING_UP
                log_bundle_transition(
                    bundle_uuid=run_state.bundle.uuid,
                    previous_stage=run_state.stage,
                    next_stage=RunStage.CLEANING_UP,
                    reason=f'Dependency has failed for this bundle. Dependency child uuid: {dep.child_uuid}. Dependency child path: {dep.child_path}',
                )
                return run_state._replace(
                    stage=RunStage.CLEANING_UP,
                    failure_message='Failed to download dependency %s: %s'
                    % (dep.child_path, dependency_state.message),
                )

    # get the docker image
    docker_image = run_state.resources.docker_image
    image_state = self.image_manager.get(docker_image)
    if image_state.stage == DependencyStage.DOWNLOADING:
        status_messages.append(
            'Pulling docker image %s %s' % (docker_image, image_state.message)
        )
        dependencies_ready = False
    elif image_state.stage == DependencyStage.FAILED:
        # Failed to pull image; -> CLEANING_UP
        message = 'Failed to download Docker image: %s' % image_state.message
        logger.error(message)
        return run_state._replace(stage=RunStage.CLEANING_UP, failure_message=message)

    # stop proceeding if dependency and image downloads aren't all done
    if not dependencies_ready:
        # Report the most recently added message and summarise the rest as a count.
        status_message = status_messages.pop()
        if status_messages:
            status_message += "(and downloading %d other dependencies and docker images)" % len(
                status_messages
            )
        logger.info(
            f'bundle is not ready yet. uuid: {run_state.bundle.uuid}. status message: {status_message}'
        )
        return run_state._replace(run_status=status_message)

    # All dependencies ready! Set up directories, symlinks and container. Start container.
    # 1) Set up a directory to store the bundle.
    if self.shared_file_system:
        # On a shared file system the directory is expected to appear on its own
        # (the status message below says the server creates it). Each call while it
        # is missing decrements bundle_dir_wait_num_tries, and reaching zero fails
        # the run.
        if not os.path.exists(run_state.bundle_path):
            if run_state.bundle_dir_wait_num_tries == 0:
                message = (
                    "Bundle directory cannot be found on the shared filesystem. "
                    "Please ensure the shared fileystem between the server and "
                    "your worker is mounted properly or contact your administrators."
                )
                log_bundle_transition(
                    bundle_uuid=run_state.bundle.uuid,
                    previous_stage=run_state.stage,
                    next_stage=RunStage.CLEANING_UP,
                    reason="Bundle directory cannot be found on the shared filesystem.",
                )
                return run_state._replace(stage=RunStage.CLEANING_UP, failure_message=message)
            next_bundle_dir_wait_num_tries = run_state.bundle_dir_wait_num_tries - 1
            logger.info(
                f'Waiting for bundle directory to be created by the server, uuid: {run_state.bundle.uuid}, bundle_dir_wait_num_tries: {next_bundle_dir_wait_num_tries}'
            )
            return run_state._replace(
                run_status="Waiting for bundle directory to be created by the server",
                bundle_dir_wait_num_tries=next_bundle_dir_wait_num_tries,
            )
    else:
        # Not on a shared file system: start from a freshly created bundle directory.
        remove_path(run_state.bundle_path)
        os.makedirs(run_state.bundle_path)

    # 2) Set up symlinks
    docker_dependencies = []
    # Path prefix used for the dependencies' docker_path values; it gets a
    # '_dependencies' suffix when not on a shared file system.
    docker_dependencies_path = (
        RunStateMachine._ROOT
        + run_state.bundle.uuid
        + ('_dependencies' if not self.shared_file_system else '')
    )

    for dep in run_state.bundle.dependencies:
        full_child_path = os.path.normpath(os.path.join(run_state.bundle_path, dep.child_path))
        to_mount = []
        dependency_path = self._get_dependency_path(run_state, dep)

        if dep.child_path == RunStateMachine._CURRENT_DIRECTORY:
            # Mount all the content of the dependency_path to the top-level of the bundle
            for child in os.listdir(dependency_path):
                child_path = os.path.normpath(os.path.join(run_state.bundle_path, child))
                to_mount.append(
                    DependencyToMount(
                        docker_path=os.path.join(docker_dependencies_path, child),
                        child_path=child_path,
                        parent_path=os.path.join(dependency_path, child),
                    )
                )
                # Record every mounted entry in paths_to_remove.
                run_state = run_state._replace(
                    paths_to_remove=(run_state.paths_to_remove or []) + [child_path]
                )
        else:
            to_mount.append(
                DependencyToMount(
                    docker_path=os.path.join(docker_dependencies_path, dep.child_path),
                    child_path=full_child_path,
                    parent_path=dependency_path,
                )
            )

            first_element_of_path = Path(dep.child_path).parts[0]
            if first_element_of_path == RunStateMachine._ROOT:
                run_state = run_state._replace(
                    paths_to_remove=(run_state.paths_to_remove or []) + [full_child_path]
                )
            else:
                # child_path can be a nested path, so later remove everything from the first element of the path
                path_to_remove = os.path.join(run_state.bundle_path, first_element_of_path)
                run_state = run_state._replace(
                    paths_to_remove=(run_state.paths_to_remove or []) + [path_to_remove]
                )
        for dependency in to_mount:
            try:
                mount_dependency(dependency, self.shared_file_system)
            except OSError as e:
                # A file system error while mounting fails the run.
                log_bundle_transition(
                    bundle_uuid=run_state.bundle.uuid,
                    previous_stage=run_state.stage,
                    next_stage=RunStage.CLEANING_UP,
                    reason=str(e.__class__),
                    level=logging.ERROR,
                )
                return run_state._replace(stage=RunStage.CLEANING_UP, failure_message=str(e))

    # Pick the external or internal docker network, depending on whether the run
    # requested network access.
    if run_state.resources.network:
        docker_network = self.docker_network_external.name
    else:
        docker_network = self.docker_network_internal.name

    # 3) Start container
    try:
        container = docker_utils.start_bundle_container(
            run_state.bundle_path,
            run_state.bundle.uuid,
            docker_dependencies,
            run_state.bundle.command,
            run_state.resources.docker_image,
            network=docker_network,
            cpuset=cpuset,
            gpuset=gpuset,
            memory_bytes=run_state.resources.memory,
            runtime=self.docker_runtime,
        )
        self.worker_docker_network.connect(container)
    except docker_utils.DockerUserErrorException as e:
        # This exception type fails the run instead of propagating.
        message = 'Cannot start Docker container: {}'.format(e)
        log_bundle_transition(
            bundle_uuid=run_state.bundle.uuid,
            previous_stage=run_state.stage,
            next_stage=RunStage.CLEANING_UP,
            reason='Cannot start Docker container.',
            level=logging.ERROR,
        )
        return run_state._replace(stage=RunStage.CLEANING_UP, failure_message=message)
    except Exception as e:
        # Anything else is logged and re-raised to the caller.
        message = 'Cannot start container: {}'.format(e)
        logger.error(message)
        logger.error(traceback.format_exc())
        raise

    return run_state._replace(
        stage=RunStage.RUNNING,
        run_status='Running job in container',
        container_id=container.id,
        container=container,
        docker_image=image_state.digest,
        has_contents=True,
        cpuset=cpuset,
        gpuset=gpuset,
    )
def _transition_from_PREPARING(self, run_state):
    """
    Advance a run that is in the PREPARING stage.

    1- Request the docker image from docker image manager
        - if image is failed, move to CLEANING_UP state
    2- Request the dependencies from dependency manager
        - if any are failed, move to CLEANING_UP state
    3- If all dependencies and docker image are ready:
        - Set up the local filesystem for the run
        - Create symlinks to dependencies
        - Allocate resources and prepare the docker container
        - Start the docker container
    4- If all is successful, move to RUNNING state

    :param run_state: the current run state
    :return: a new run state. The stage is CLEANING_UP on kill or failure, RUNNING
        once the container has been started, and unchanged (only run_status and
        possibly bundle_dir_wait_num_tries updated) while the run is still waiting
        for resources, downloads or the bundle directory.
    :raises Exception: any error from starting the container other than
        docker_utils.DockerUserErrorException is logged and re-raised.
    """
    if run_state.is_killed:
        return run_state._replace(stage=RunStage.CLEANING_UP)

    # Check CPU and GPU availability
    try:
        cpuset, gpuset = self.assign_cpu_and_gpu_sets_fn(
            run_state.resources.cpus, run_state.resources.gpus
        )
    except Exception as e:
        message = "Unexpectedly unable to assign enough resources: %s" % str(e)
        logger.error(message)
        logger.error(traceback.format_exc())
        # The stage is left unchanged; only the status message is updated.
        return run_state._replace(run_status=message)

    # dependencies_ready turns False as soon as any dependency or the image is
    # still downloading; status_messages collects one message per pending download.
    dependencies_ready = True
    status_messages = []

    if not self.shared_file_system:
        # No need to download dependencies if we're in the shared FS since they're already in our FS
        for dep in run_state.bundle.dependencies:
            dep_key = DependencyKey(dep.parent_uuid, dep.parent_path)
            dependency_state = self.dependency_manager.get(run_state.bundle.uuid, dep_key)
            if dependency_state.stage == DependencyStage.DOWNLOADING:
                status_messages.append(
                    'Downloading dependency %s: %s done (archived size)'
                    % (dep.child_path, size_str(dependency_state.size_bytes))
                )
                dependencies_ready = False
            elif dependency_state.stage == DependencyStage.FAILED:
                # Failed to download dependency; -> CLEANING_UP
                return run_state._replace(
                    stage=RunStage.CLEANING_UP,
                    failure_message='Failed to download dependency %s: %s'
                    % (dep.child_path, dependency_state.message),
                )

    # get the docker image
    docker_image = run_state.resources.docker_image
    image_state = self.docker_image_manager.get(docker_image)
    if image_state.stage == DependencyStage.DOWNLOADING:
        status_messages.append(
            'Pulling docker image: ' + (image_state.message or docker_image or "")
        )
        dependencies_ready = False
    elif image_state.stage == DependencyStage.FAILED:
        # Failed to pull image; -> CLEANING_UP
        message = 'Failed to download Docker image: %s' % image_state.message
        logger.error(message)
        return run_state._replace(stage=RunStage.CLEANING_UP, failure_message=message)

    # stop proceeding if dependency and image downloads aren't all done
    if not dependencies_ready:
        # Report the most recently added message and summarise the rest as a count.
        status_message = status_messages.pop()
        if status_messages:
            status_message += "(and downloading %d other dependencies and docker images)" % len(
                status_messages
            )
        return run_state._replace(run_status=status_message)

    # All dependencies ready! Set up directories, symlinks and container. Start container.
    # 1) Set up a directory to store the bundle.
    if self.shared_file_system:
        # On a shared file system the directory is expected to appear on its own
        # (the status message below says the server creates it). Each call while it
        # is missing decrements bundle_dir_wait_num_tries, and reaching zero fails
        # the run.
        if not os.path.exists(run_state.bundle_path):
            if run_state.bundle_dir_wait_num_tries == 0:
                message = (
                    "Bundle directory cannot be found on the shared filesystem. "
                    "Please ensure the shared fileystem between the server and "
                    "your worker is mounted properly or contact your administrators."
                )
                logger.error(message)
                return run_state._replace(stage=RunStage.CLEANING_UP, failure_message=message)
            return run_state._replace(
                run_status="Waiting for bundle directory to be created by the server",
                bundle_dir_wait_num_tries=run_state.bundle_dir_wait_num_tries - 1,
            )
    else:
        # Not on a shared file system: start from a freshly created bundle directory.
        remove_path(run_state.bundle_path)
        os.mkdir(run_state.bundle_path)

    # 2) Set up symlinks
    docker_dependencies = []
    docker_dependencies_path = (
        '/' + run_state.bundle.uuid + ('_dependencies' if not self.shared_file_system else '')
    )
    # Normalised bundle directory, plus the same path with a trailing separator.
    # Comparing against the separator-terminated form makes the containment check
    # below work on whole path components: a plain string-prefix test would also
    # accept a sibling such as "<root>/0x12" for the bundle "<root>/0x1".
    bundle_root = os.path.normpath(run_state.bundle_path)
    bundle_root_prefix = os.path.join(bundle_root, '')
    for dep in run_state.bundle.dependencies:
        dep_key = DependencyKey(dep.parent_uuid, dep.parent_path)
        full_child_path = os.path.normpath(os.path.join(run_state.bundle_path, dep.child_path))
        if full_child_path != bundle_root and not full_child_path.startswith(
            bundle_root_prefix
        ):
            # Dependencies should end up in their bundles (ie prevent using relative paths like ..
            # to get out of their parent bundles
            message = 'Invalid key for dependency: %s' % (dep.child_path)
            logger.error(message)
            return run_state._replace(stage=RunStage.CLEANING_UP, failure_message=message)

        docker_dependency_path = os.path.join(docker_dependencies_path, dep.child_path)
        if self.shared_file_system:
            # On a shared FS, we know where the dep is stored and can get the contents directly
            dependency_path = os.path.realpath(os.path.join(dep.location, dep.parent_path))
        else:
            # On a dependency_manager setup ask the manager where the dependency is
            dependency_path = os.path.join(
                self.dependency_manager.dependencies_dir,
                self.dependency_manager.get(run_state.bundle.uuid, dep_key).path,
            )
        # The link is created at full_child_path and points at the path the
        # dependency is bound to inside the container.
        os.symlink(docker_dependency_path, full_child_path)
        # These are turned into docker volume bindings like:
        #   dependency_path:docker_dependency_path:ro
        docker_dependencies.append((dependency_path, docker_dependency_path))

    # Pick the external or internal docker network, depending on whether the run
    # requested network access.
    if run_state.resources.network:
        docker_network = self.docker_network_external.name
    else:
        docker_network = self.docker_network_internal.name

    # 3) Start container
    try:
        container = docker_utils.start_bundle_container(
            run_state.bundle_path,
            run_state.bundle.uuid,
            docker_dependencies,
            run_state.bundle.command,
            run_state.resources.docker_image,
            network=docker_network,
            cpuset=cpuset,
            gpuset=gpuset,
            memory_bytes=run_state.resources.memory,
            runtime=self.docker_runtime,
        )
        self.worker_docker_network.connect(container)
    except docker_utils.DockerUserErrorException as e:
        # This exception type fails the run instead of propagating.
        message = 'Cannot start Docker container: {}'.format(e)
        logger.warning(message)
        return run_state._replace(stage=RunStage.CLEANING_UP, failure_message=message)
    except Exception as e:
        # Anything else is logged and re-raised to the caller.
        message = 'Cannot start Docker container: {}'.format(e)
        logger.error(message)
        logger.error(traceback.format_exc())
        raise

    return run_state._replace(
        stage=RunStage.RUNNING,
        run_status='Running job in Docker container',
        container_id=container.id,
        container=container,
        docker_image=image_state.digest,
        has_contents=True,
        cpuset=cpuset,
        gpuset=gpuset,
    )