 def _cleanup(self):
     """
     Prune failed dependencies older than 10 seconds
     Limit the disk usage of the dependencies (both the bundle files and the serialized state file size)
     Deletes oldest failed dependencies first and then oldest finished dependencies.
     Doesn't touch downloading dependencies.
     """
     self._prune_failed_dependencies()
     # Hold all the locks: this should be fast when no cleanup is needed; otherwise it ensures the state is not corrupted while entries are deleted.
     while True:
         with self._global_lock:
             self._acquire_all_locks()
             bytes_used = sum(dep.size_bytes for dep in self._dependencies.values())
             serialized_length = len(codalabworker.pyjson.dumps(self._dependencies))
             if (
                 bytes_used > self._max_cache_size_bytes
                 or serialized_length > LocalFileSystemDependencyManager.MAX_SERIALIZED_LEN
             ):
                 logger.debug(
                     '%d dependencies in cache, disk usage: %s (max %s), serialized size: %s (max %s)',
                     len(self._dependencies),
                     size_str(bytes_used),
                     size_str(self._max_cache_size_bytes),
                     size_str(serialized_length),
                     LocalFileSystemDependencyManager.MAX_SERIALIZED_LEN,
                 )
                 ready_deps = {
                     dep: state
                     for dep, state in self._dependencies.items()
                     if state.stage == DependencyStage.READY and not state.dependents
                 }
                 failed_deps = {
                     dep: state
                     for dep, state in self._dependencies.items()
                     if state.stage == DependencyStage.FAILED
                 }
                 if failed_deps:
                     dep_to_remove = min(
                         failed_deps.items(), key=lambda dep_state: dep_state[1].last_used
                     )[0]
                 elif ready_deps:
                     dep_to_remove = min(
                         ready_deps.items(), key=lambda dep_state: dep_state[1].last_used
                     )[0]
                 else:
                     logger.info(
                         'Dependency quota full but there are only downloading dependencies, not cleaning up until downloads are over'
                     )
                     break
                 if dep_to_remove:
                     self._delete_dependency(dep_to_remove)
                 self._release_all_locks()
             else:
                 self._release_all_locks()
                 break
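
A minimal sketch of how a cleanup routine like this is typically driven, assuming the manager runs a periodic background loop; the method name, interval_secs, and the _stop flag below are illustrative, not taken from the source.

import threading
import time

def _start_cleanup_loop(self, interval_secs=10):
    # Hypothetical driver: call _cleanup periodically until the manager is stopped.
    def loop():
        while not self._stop:  # assumed stop flag on the manager
            try:
                self._cleanup()
            except Exception:
                logger.exception('Dependency cleanup failed')
            time.sleep(interval_secs)

    threading.Thread(target=loop, daemon=True).start()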
Example #3
        def check_resource_utilization(run_state):
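            # Refresh time, memory, and disk usage on the run state, then flag the
            # run for killing if any requested resource limit has been exceeded.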
            kill_messages = []

            run_stats = docker_utils.get_container_stats(run_state.container)
            time_used = time.time() - run_state.start_time

            run_state = run_state._replace(time_used=time_used)
            run_state = run_state._replace(
                max_memory=max(run_state.max_memory, run_stats.get('memory', 0))
            )
            run_state = run_state._replace(
                disk_utilization=self.disk_utilization[bundle_uuid]['disk_utilization']
            )

            if (
                run_state.resources['request_time']
                and run_state.time_used > run_state.resources['request_time']
            ):
                kill_messages.append(
                    'Time limit %s exceeded.' % duration_str(run_state.resources['request_time'])
                )

            if (
                run_state.max_memory > run_state.resources['request_memory']
                or run_state.info.get('exitcode', '0') == '137'
            ):
                kill_messages.append(
                    'Memory limit %s exceeded.' % size_str(run_state.resources['request_memory'])
                )

            if (
                run_state.resources['request_disk']
                and run_state.disk_utilization > run_state.resources['request_disk']
            ):
                kill_messages.append(
                    'Disk limit %sb exceeded.' % size_str(run_state.resources['request_disk'])
                )

            if kill_messages:
                new_info = run_state.info
                new_info['kill_message'] = ' '.join(kill_messages)
                run_state = run_state._replace(info=new_info, is_killed=True)

            return run_state
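
A hedged usage sketch: the enclosing worker presumably applies this check on each monitoring pass and then acts on the kill flag. The wrapper name and logging below are illustrative, not the worker's actual API.

def _monitor_run(self, run_state):
    # Illustrative only: fold the resource check into the run state and report kills.
    run_state = check_resource_utilization(run_state)
    if run_state.is_killed:
        logger.info(
            'Run %s flagged for killing: %s',
            run_state.bundle['uuid'],
            run_state.info.get('kill_message'),
        )
    return run_state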
Example #6
 def update_state_and_check_killed(bytes_downloaded):
     """
     Callback method for bundle service client updates dependency state and
     raises DownloadAbortedException if download is killed by dep. manager
     """
     with self._dependency_locks[dependency]:
         state = self._dependencies[dependency]
         if state.killed:
             raise DownloadAbortedException("Aborted by user")
         self._dependencies[dependency] = state._replace(
             size_bytes=bytes_downloaded,
             message="Downloading dependency: %s downloaded"
             % size_str(bytes_downloaded),
         )
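
A minimal sketch of how this callback might be driven by a chunked download loop; target_path and source_fileobj are assumptions for illustration, not the bundle service client's actual API.

bytes_downloaded = 0
with open(target_path, 'wb') as f:    # target_path: assumed local destination
    for chunk in source_fileobj:      # source_fileobj: assumed iterable of byte chunks
        f.write(chunk)
        bytes_downloaded += len(chunk)
        # Updates the tracked size/message and raises DownloadAbortedException
        # if the dependency manager has marked the download as killed.
        update_state_and_check_killed(bytes_downloaded)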
Example #7
 def progress_callback(bytes_uploaded):
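     # Reports archived-upload progress back to the worker's bookkeeping; the True
     # return value presumably signals the caller to continue the upload.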
     run_status = 'Uploading results: %s done (archived size)' % size_str(
         bytes_uploaded)
     self.uploading[bundle_uuid]['run_status'] = run_status
     return True
Example #8
    def _transition_from_PREPARING(self, run_state):
        """
        1- Request the docker image from docker image manager
            - if image is failed, move to CLEANING_UP state
        2- Request the dependencies from dependency manager
            - if any are failed, move to CLEANING_UP state
        3- If all dependencies and docker image are ready:
            - Set up the local filesystem for the run
            - Create symlinks to dependencies
            - Allocate resources and prepare the docker container
            - Start the docker container
        4- If all is successful, move to RUNNING state
        """
        if run_state.is_killed:
            return run_state._replace(stage=LocalRunStage.CLEANING_UP,
                                      container_id=None)

        dependencies_ready = True
        status_messages = []
        bundle_uuid = run_state.bundle['uuid']

        # get dependencies
        for dep in run_state.bundle['dependencies']:
            dependency = (dep['parent_uuid'], dep['parent_path'])
            dependency_state = self.dependency_manager.get(
                bundle_uuid, dependency)
            if dependency_state.stage == DependencyStage.DOWNLOADING:
                status_messages.append(
                    'Downloading dependency %s: %s done (archived size)' %
                    (dep['child_path'], size_str(dependency_state.size_bytes)))
                dependencies_ready = False
            elif dependency_state.stage == DependencyStage.FAILED:
                # Failed to download dependency; -> CLEANING_UP
                run_state.info[
                    'failure_message'] = 'Failed to download dependency %s: %s' % (
                        dep['child_path'],
                        '',
                    )
                return run_state._replace(stage=LocalRunStage.CLEANING_UP,
                                          info=run_state.info)

        # get the docker image
        docker_image = run_state.resources['docker_image']
        image_state = self.docker_image_manager.get(docker_image)
        if image_state.stage == DependencyStage.DOWNLOADING:
            status_messages.append('Pulling docker image: ' +
                                   (image_state.message or docker_image or ""))
            dependencies_ready = False
        elif image_state.stage == DependencyStage.FAILED:
            # Failed to pull image; -> CLEANING_UP
            run_state.info['failure_message'] = image_state.message
            return run_state._replace(stage=LocalRunStage.CLEANING_UP,
                                      info=run_state.info)

        # stop proceeding if dependency and image downloads aren't all done
        if not dependencies_ready:
            status_message = status_messages.pop()
            if status_messages:
                status_message += "(and downloading %d other dependencies and docker images)" % len(
                    status_messages)
            return run_state._replace(run_status=status_message)

        # All dependencies ready! Set up directories, symlinks and container. Start container.
        # 1) Set up a directory to store the bundle.
        remove_path(run_state.bundle_path)
        os.mkdir(run_state.bundle_path)

        # 2) Set up symlinks
        dependencies = []
        docker_dependencies_path = '/' + bundle_uuid + '_dependencies'
        for dep in run_state.bundle['dependencies']:
            child_path = os.path.normpath(
                os.path.join(run_state.bundle_path, dep['child_path']))
            if not child_path.startswith(run_state.bundle_path):
                raise Exception('Invalid key for dependency: %s' %
                                (dep['child_path']))

            dependency_path = self.dependency_manager.get(
                bundle_uuid, (dep['parent_uuid'], dep['parent_path'])).path
            dependency_path = os.path.join(
                self.dependency_manager.dependencies_dir, dependency_path)

            docker_dependency_path = os.path.join(docker_dependencies_path,
                                                  dep['child_path'])

            os.symlink(docker_dependency_path, child_path)
            # These are turned into docker volume bindings like:
            #   dependency_path:docker_dependency_path:ro
            dependencies.append((dependency_path, docker_dependency_path))

        # 3) Set up container
        if run_state.resources['request_network']:
            docker_network = self.docker_network_external_name
        else:
            docker_network = self.docker_network_internal_name

        try:
            cpuset, gpuset = self.assign_cpu_and_gpu_sets_fn(
                run_state.resources['request_cpus'],
                run_state.resources['request_gpus'])
        except Exception:
            run_state.info[
                'failure_message'] = "Cannot assign enough resources"
            return run_state._replace(stage=LocalRunStage.CLEANING_UP,
                                      info=run_state.info)

        # 4) Start container
        try:
            container = docker_utils.start_bundle_container(
                run_state.bundle_path,
                bundle_uuid,
                dependencies,
                run_state.bundle['command'],
                run_state.resources['docker_image'],
                network=docker_network,
                cpuset=cpuset,
                gpuset=gpuset,
                memory_bytes=run_state.resources['request_memory'],
                runtime=self.docker_runtime,
            )
        except docker_utils.DockerException as e:
            run_state.info[
                'failure_message'] = 'Cannot start Docker container: {}'.format(
                    e)
            return run_state._replace(stage=LocalRunStage.CLEANING_UP,
                                      info=run_state.info)

        return run_state._replace(
            stage=LocalRunStage.RUNNING,
            start_time=time.time(),
            run_status='Running job in Docker container',
            container_id=container.id,
            container=container,
            docker_image=image_state.digest,
            has_contents=True,
            cpuset=cpuset,
            gpuset=gpuset,
        )
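
As the comment above notes, the (dependency_path, docker_dependency_path) pairs are later turned into read-only Docker volume bindings. Below is a hedged sketch of what that mapping could look like in docker-py's volumes format; the helper name and the bundle mount path argument are illustrative.

def _to_volume_bindings(dependencies, bundle_path, docker_bundle_path):
    # Illustrative only: build a docker-py style volumes dict from the pairs above.
    volumes = {bundle_path: {'bind': docker_bundle_path, 'mode': 'rw'}}
    for host_path, container_path in dependencies:
        volumes[host_path] = {'bind': container_path, 'mode': 'ro'}
    return volumes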
Example #10
    def _transition_from_PREPARING(self, run_state):
        """
        1- Request the docker image from docker image manager
            - if image is failed, move to CLEANING_UP state
        2- Request the dependencies from dependency manager
            - if any are failed, move to CLEANING_UP state
        3- If all dependencies and docker image are ready:
            - Set up the local filesystem for the run
            - Create symlinks to dependencies
            - Allocate resources and prepare the docker container
            - Start the docker container
        4- If all is successful, move to RUNNING state
        """
        if run_state.is_killed:
            return run_state._replace(stage=LocalRunStage.CLEANING_UP, container_id=None)

        dependencies_ready = True
        status_messages = []
        bundle_uuid = run_state.bundle['uuid']

        # get dependencies
        for dep in run_state.bundle['dependencies']:
            dependency = (dep['parent_uuid'], dep['parent_path'])
            dependency_state = self.dependency_manager.get(bundle_uuid, dependency)
            if dependency_state.stage == DependencyStage.DOWNLOADING:
                status_messages.append(
                    'Downloading dependency %s: %s done (archived size)'
                    % (dep['child_path'], size_str(dependency_state.size_bytes))
                )
                dependencies_ready = False
            elif dependency_state.stage == DependencyStage.FAILED:
                # Failed to download dependency; -> CLEANING_UP
                run_state.info['failure_message'] = 'Failed to download dependency %s: %s' % (
                    dep['child_path'],
                    '',
                )
                return run_state._replace(stage=LocalRunStage.CLEANING_UP, info=run_state.info)

        # get the docker image
        docker_image = run_state.resources['docker_image']
        image_state = self.docker_image_manager.get(docker_image)
        if image_state.stage == DependencyStage.DOWNLOADING:
            status_messages.append(
                'Pulling docker image: ' + (image_state.message or docker_image or "")
            )
            dependencies_ready = False
        elif image_state.stage == DependencyStage.FAILED:
            # Failed to pull image; -> CLEANING_UP
            run_state.info['failure_message'] = image_state.message
            return run_state._replace(stage=LocalRunStage.CLEANING_UP, info=run_state.info)

        # stop proceeding if dependency and image downloads aren't all done
        if not dependencies_ready:
            status_message = status_messages.pop()
            if status_messages:
                status_message += "(and downloading %d other dependencies and docker images)" % len(
                    status_messages
                )
            return run_state._replace(run_status=status_message)

        # All dependencies ready! Set up directories, symlinks and container. Start container.
        # 1) Set up a directory to store the bundle.
        remove_path(run_state.bundle_path)
        os.mkdir(run_state.bundle_path)

        # 2) Set up symlinks
        dependencies = []
        docker_dependencies_path = '/' + bundle_uuid + '_dependencies'
        for dep in run_state.bundle['dependencies']:
            child_path = os.path.normpath(os.path.join(run_state.bundle_path, dep['child_path']))
            if not child_path.startswith(run_state.bundle_path):
                raise Exception('Invalid key for dependency: %s' % (dep['child_path']))

            dependency_path = self.dependency_manager.get(
                bundle_uuid, (dep['parent_uuid'], dep['parent_path'])
            ).path
            dependency_path = os.path.join(
                self.dependency_manager.dependencies_dir, dependency_path
            )

            docker_dependency_path = os.path.join(docker_dependencies_path, dep['child_path'])

            os.symlink(docker_dependency_path, child_path)
            # These are turned into docker volume bindings like:
            #   dependency_path:docker_dependency_path:ro
            dependencies.append((dependency_path, docker_dependency_path))

        # 3) Set up container
        if run_state.resources['request_network']:
            docker_network = self.docker_network_external.name
        else:
            docker_network = self.docker_network_internal.name

        try:
            cpuset, gpuset = self.assign_cpu_and_gpu_sets_fn(
                run_state.resources['request_cpus'], run_state.resources['request_gpus']
            )
        except Exception:
            run_state.info['failure_message'] = "Cannot assign enough resources"
            return run_state._replace(stage=LocalRunStage.CLEANING_UP, info=run_state.info)

        # 4) Start container
        try:
            container = docker_utils.start_bundle_container(
                run_state.bundle_path,
                bundle_uuid,
                dependencies,
                run_state.bundle['command'],
                run_state.resources['docker_image'],
                network=docker_network,
                cpuset=cpuset,
                gpuset=gpuset,
                memory_bytes=run_state.resources['request_memory'],
                runtime=self.docker_runtime,
            )
            self.worker_docker_network.connect(container)
        except docker_utils.DockerException as e:
            run_state.info['failure_message'] = 'Cannot start Docker container: {}'.format(e)
            return run_state._replace(stage=LocalRunStage.CLEANING_UP, info=run_state.info)

        return run_state._replace(
            stage=LocalRunStage.RUNNING,
            start_time=time.time(),
            run_status='Running job in Docker container',
            container_id=container.id,
            container=container,
            docker_image=image_state.digest,
            has_contents=True,
            cpuset=cpuset,
            gpuset=gpuset,
        )
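
A hedged sketch of how a stage transition like this typically plugs into the worker's run-state machine; the dispatcher and the sibling transition methods are assumed for illustration, not taken from the source.

def _transition(self, run_state):
    # Illustrative dispatcher: route each run state to its stage-specific handler.
    handlers = {
        LocalRunStage.PREPARING: self._transition_from_PREPARING,
        LocalRunStage.RUNNING: self._transition_from_RUNNING,  # assumed sibling
        LocalRunStage.CLEANING_UP: self._transition_from_CLEANING_UP,  # assumed sibling
    }
    return handlers[run_state.stage](run_state)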