def check_and_report_finished(run_state):
     try:
         finished, exitcode, failure_msg = docker_utils.check_finished(run_state.container)
     except docker_utils.DockerException:
         logger.error(traceback.format_exc())
         finished, exitcode, failure_msg = False, None, None
     return run_state._replace(
         finished=finished, exitcode=exitcode, failure_message=failure_msg
     )
Пример #2
0
    def _transition_from_CLEANING_UP(self, run_state):
        """
        1- delete the container if still existent
        2- clean up the dependencies from bundle directory
        3- release the dependencies in dependency manager
        4- If bundle has contents to upload (i.e. was RUNNING at some point),
            move to UPLOADING_RESULTS state
           Otherwise move to FINALIZING state
        """
        if run_state.container_id is not None:
            while docker_utils.container_exists(run_state.container):
                try:
                    finished, _, _ = docker_utils.check_finished(
                        run_state.container)
                    if finished:
                        run_state.container.remove(force=True)
                        run_state = run_state._replace(container=None,
                                                       container_id=None)
                        break
                    else:
                        try:
                            run_state.container.kill()
                        except docker.errors.APIError:
                            logger.error(traceback.format_exc())
                            time.sleep(1)
                except docker.errors.APIError:
                    logger.error(traceback.format_exc())
                    time.sleep(1)

        for dep in run_state.bundle.dependencies:
            dep_key = DependencyKey(dep.parent_uuid, dep.parent_path)
            if not self.shared_file_system:  # No dependencies if shared fs worker
                self.dependency_manager.release(run_state.bundle.uuid, dep_key)

            child_path = os.path.join(run_state.bundle_path, dep.child_path)
            try:
                remove_path(child_path)
            except Exception:
                logger.error(traceback.format_exc())

        if run_state.is_restaged:
            return run_state._replace(stage=RunStage.RESTAGED)

        if not self.shared_file_system and run_state.has_contents:
            # No need to upload results since results are directly written to bundle store
            return run_state._replace(stage=RunStage.UPLOADING_RESULTS,
                                      run_status='Uploading results',
                                      container=None)
        else:
            return self.finalize_run(run_state)
Пример #3
0
    def _transition_from_CLEANING_UP(self, run_state):
        """
        1- delete the container if still existent
        2- clean up the dependencies from bundle directory
        3- release the dependencies in dependency manager
        4- If bundle has contents to upload (i.e. was RUNNING at some point),
            move to UPLOADING_RESULTS state
           Otherwise move to FINALIZING state
        """
        def remove_path_no_fail(path):
            try:
                remove_path(path)
            except Exception:
                logger.error(traceback.format_exc())

        if run_state.container_id is not None:
            while docker_utils.container_exists(run_state.container):
                try:
                    finished, _, _ = docker_utils.check_finished(
                        run_state.container)
                    if finished:
                        run_state.container.remove(force=True)
                        run_state = run_state._replace(container=None,
                                                       container_id=None)
                        break
                    else:
                        try:
                            run_state.container.kill()
                        except docker.errors.APIError:
                            logger.error(traceback.format_exc())
                            time.sleep(1)
                except docker.errors.APIError:
                    logger.error(traceback.format_exc())
                    time.sleep(1)

        for dep in run_state.bundle.dependencies:
            if not self.shared_file_system:  # No dependencies if shared fs worker
                dep_key = DependencyKey(dep.parent_uuid, dep.parent_path)
                self.dependency_manager.release(run_state.bundle.uuid, dep_key)

        # Clean up dependencies paths
        for path in run_state.paths_to_remove or []:
            remove_path_no_fail(path)
        run_state = run_state._replace(paths_to_remove=[])

        if run_state.is_restaged:
            log_bundle_transition(
                bundle_uuid=run_state.bundle.uuid,
                previous_stage=run_state.stage,
                next_stage=RunStage.RESTAGED,
                reason=self.RESTAGED_REASON,
            )
            return run_state._replace(stage=RunStage.RESTAGED)

        if not self.shared_file_system and run_state.has_contents:
            log_bundle_transition(
                bundle_uuid=run_state.bundle.uuid,
                previous_stage=run_state.stage,
                next_stage=RunStage.UPLOADING_RESULTS,
            )
            return run_state._replace(stage=RunStage.UPLOADING_RESULTS,
                                      run_status='Uploading results',
                                      container=None)
        else:
            # No need to upload results since results are directly written to bundle store
            # Delete any files that match the exclude_patterns .
            for exclude_pattern in run_state.bundle.metadata[
                    "exclude_patterns"]:
                full_pattern = os.path.join(run_state.bundle_path,
                                            exclude_pattern)
                for file_path in glob.glob(full_pattern, recursive=True):
                    # Only remove files that are subpaths of run_state.bundle_path, in case
                    # that exclude_pattern is something like "../../../".
                    if path_is_parent(parent_path=run_state.bundle_path,
                                      child_path=file_path):
                        remove_path(file_path)
            return self.finalize_run(run_state)
Пример #4
0
    def _transition_from_RUNNING(self, run_state):
        """
        1- Check run status of the docker container
        2- If run is killed, kill the container
        3- If run is finished, move to CLEANING_UP state
        """
        def check_and_report_finished(run_state):
            try:
                finished, exitcode, failure_msg = docker_utils.check_finished(
                    run_state.container)
            except docker_utils.DockerException:
                logger.error(traceback.format_exc())
                finished, exitcode, failure_msg = False, None, None
            return run_state._replace(finished=finished,
                                      exitcode=exitcode,
                                      failure_message=failure_msg)

        def check_resource_utilization(run_state: RunState):
            logger.info(
                f'Checking resource utilization for bundle. uuid: {run_state.bundle.uuid}'
            )
            cpu_usage, memory_usage = docker_utils.get_container_stats_with_docker_stats(
                run_state.container)
            run_state = run_state._replace(cpu_usage=cpu_usage,
                                           memory_usage=memory_usage)
            run_state = run_state._replace(memory_usage=memory_usage)

            kill_messages = []

            run_stats = docker_utils.get_container_stats(run_state.container)

            run_state = run_state._replace(max_memory=max(
                run_state.max_memory, run_stats.get('memory', 0)))
            run_state = run_state._replace(
                disk_utilization=self.disk_utilization[
                    run_state.bundle.uuid]['disk_utilization'])

            container_time_total = docker_utils.get_container_running_time(
                run_state.container)
            run_state = run_state._replace(
                container_time_total=container_time_total,
                container_time_user=run_stats.get(
                    'container_time_user', run_state.container_time_user),
                container_time_system=run_stats.get(
                    'container_time_system', run_state.container_time_system),
            )

            if run_state.resources.time and container_time_total > run_state.resources.time:
                kill_messages.append(
                    'Time limit exceeded. (Container uptime %s > time limit %s)'
                    % (duration_str(container_time_total),
                       duration_str(run_state.resources.time)))

            if run_state.max_memory > run_state.resources.memory or run_state.exitcode == 137:
                kill_messages.append('Memory limit %s exceeded.' %
                                     size_str(run_state.resources.memory))

            if run_state.resources.disk and run_state.disk_utilization > run_state.resources.disk:
                kill_messages.append('Disk limit %sb exceeded.' %
                                     size_str(run_state.resources.disk))

            if kill_messages:
                run_state = run_state._replace(
                    kill_message=' '.join(kill_messages), is_killed=True)
            return run_state

        def check_disk_utilization():
            logger.info(
                f'Checking disk utilization for bundle. uuid: {run_state.bundle.uuid}'
            )
            running = True
            while running:
                start_time = time.time()
                try:
                    disk_utilization = get_path_size(run_state.bundle_path)
                    self.disk_utilization[run_state.bundle.uuid][
                        'disk_utilization'] = disk_utilization
                    running = self.disk_utilization[
                        run_state.bundle.uuid]['running']
                except Exception:
                    logger.error(traceback.format_exc())
                end_time = time.time()

                # To ensure that we don't hammer the disk for this computation when
                # there are lots of files, we run it at most 10% of the time.
                time.sleep(max((end_time - start_time) * 10, 1.0))

        self.disk_utilization.add_if_new(
            run_state.bundle.uuid,
            threading.Thread(target=check_disk_utilization, args=[]))
        run_state = check_and_report_finished(run_state)
        run_state = check_resource_utilization(run_state)

        if run_state.is_killed or run_state.is_restaged:
            log_bundle_transition(
                bundle_uuid=run_state.bundle.uuid,
                previous_stage=run_state.stage,
                next_stage=RunStage.CLEANING_UP,
                reason=
                f'the bundle was {"killed" if run_state.is_killed else "restaged"}',
            )
            if docker_utils.container_exists(run_state.container):
                try:
                    run_state.container.kill()
                except docker.errors.APIError:
                    finished, _, _ = docker_utils.check_finished(
                        run_state.container)
                    if not finished:
                        logger.error(traceback.format_exc())
            self.disk_utilization[run_state.bundle.uuid]['running'] = False
            self.disk_utilization.remove(run_state.bundle.uuid)
            return run_state._replace(stage=RunStage.CLEANING_UP)
        if run_state.finished:
            logger.debug(
                'Finished run with UUID %s, exitcode %s, failure_message %s',
                run_state.bundle.uuid,
                run_state.exitcode,
                run_state.failure_message,
            )
            self.disk_utilization[run_state.bundle.uuid]['running'] = False
            self.disk_utilization.remove(run_state.bundle.uuid)
            return run_state._replace(stage=RunStage.CLEANING_UP,
                                      run_status='Uploading results')
        else:
            return run_state
    def _transition_from_RUNNING(self, run_state):
        """
        1- Check run status of the docker container
        2- If run is killed, kill the container
        3- If run is finished, move to CLEANING_UP state
        """

        def check_and_report_finished(run_state):
            try:
                finished, exitcode, failure_msg = docker_utils.check_finished(run_state.container)
            except docker_utils.DockerException:
                logger.error(traceback.format_exc())
                finished, exitcode, failure_msg = False, None, None
            return run_state._replace(
                finished=finished, exitcode=exitcode, failure_message=failure_msg
            )

        def check_resource_utilization(run_state):
            kill_messages = []

            run_stats = docker_utils.get_container_stats(run_state.container)

            container_time_total = docker_utils.get_container_running_time(run_state.container)
            run_state = run_state._replace(
                container_time_total=container_time_total,
                container_time_user=run_stats.get(
                    'container_time_user', run_state.container_time_user
                ),
                container_time_system=run_stats.get(
                    'container_time_system', run_state.container_time_system
                ),
            )

            run_state = run_state._replace(
                max_memory=max(run_state.max_memory, run_stats.get('memory', 0))
            )

            run_state = check_disk_utilization(run_state)

            if run_state.resources.time and container_time_total > run_state.resources.time:
                kill_messages.append(
                    'Time limit exceeded. (Container uptime %s > time limit %s)'
                    % (duration_str(container_time_total), duration_str(run_state.resources.time))
                )

            if run_state.max_memory > run_state.resources.memory or run_state.exitcode == 137:
                kill_messages.append(
                    'Memory limit %s exceeded.' % size_str(run_state.resources.memory)
                )

            if run_state.resources.disk and run_state.disk_utilization > run_state.resources.disk:
                kill_messages.append(
                    'Disk limit %sb exceeded.' % size_str(run_state.resources.disk)
                )

            if kill_messages:
                run_state = run_state._replace(kill_message=' '.join(kill_messages), is_killed=True)
            return run_state

        def check_disk_utilization(run_state):
            try:
                disk_utilization = get_path_size(run_state.bundle_path)
                run_state._replace(disk_utilization=disk_utilization)
            except Exception:
                logger.error(traceback.format_exc())
            return run_state

        run_state = check_and_report_finished(run_state)
        run_state = check_resource_utilization(run_state)

        if run_state.is_killed:
            if docker_utils.container_exists(run_state.container):
                try:
                    run_state.container.kill()
                except docker.errors.APIError:
                    finished, _, _ = docker_utils.check_finished(run_state.container)
                    if not finished:
                        logger.error(traceback.format_exc())
            return run_state._replace(stage=RunStage.CLEANING_UP)
        if run_state.finished:
            logger.debug(
                'Finished run with UUID %s, exitcode %s, failure_message %s',
                run_state.bundle.uuid,
                run_state.exitcode,
                run_state.failure_message,
            )
            return run_state._replace(stage=RunStage.CLEANING_UP, run_status='Uploading results')
        else:
            return run_state