예제 #1
0
    def start(self):
        """
        Begin executing this bundle on the worker.

        The bundle service is told that the run is starting. If its
        start_bundle call comes back falsy (the bundle is no longer assigned
        to this worker), nothing else is done and False is returned.
        Otherwise the bundle directory is prepared, the run is launched in a
        new thread and True is returned.
        """
        # The start time comes from the local clock so that time accounting
        # stays accurate even when the worker's clock and the bundle
        # service's clock disagree.
        started_at = time.time()
        self._start_time = started_at
        start_message = {
            'hostname': socket.gethostname(),
            'start_time': int(started_at),
        }
        still_assigned = self._bundle_service.start_bundle(
            self._worker.id, self._uuid, start_message)
        if not still_assigned:
            return False

        if not self._worker.shared_file_system:
            # Local storage: set up a fresh directory to hold the bundle.
            remove_path(self._bundle_path)
            os.mkdir(self._bundle_path)
        else:
            # Shared file system: the bundle manager creates the path (to
            # avoid NFS directory cache issues), so poll until the cache on
            # this machine expires and the path becomes visible.
            while True:
                if os.path.exists(self._bundle_path):
                    break
                time.sleep(0.5)

        # Hand the actual run over to its own thread.
        run_thread = threading.Thread(target=Run._start, args=[self])
        run_thread.start()
        return True
예제 #2
0
파일: run.py 프로젝트: ppasupat/codalab-cli
    def run(self):
        """
        Check in with the bundle service and, if allowed, launch the run.

        Returns False when start_bundle reports that the bundle is no longer
        assigned to this worker. Otherwise prepares the bundle directory,
        starts the run in a new thread and returns True.
        """
        # Record the start time locally: the bundle service's clock and the
        # worker's clock may differ, and time accounting should follow the
        # worker.
        now = time.time()
        self._start_time = now
        hostname = socket.gethostname()
        start_message = {'hostname': hostname, 'start_time': int(now)}
        accepted = self._bundle_service.start_bundle(
            self._worker.id, self._uuid, start_message)
        if not accepted:
            return False

        if self._worker.shared_file_system:
            # The bundle manager creates the path on a shared file system to
            # avoid NFS directory cache issues; wait here for this machine's
            # cache to expire and for the path to appear.
            path_visible = os.path.exists(self._bundle_path)
            while not path_visible:
                time.sleep(0.5)
                path_visible = os.path.exists(self._bundle_path)
        else:
            # No shared file system: create an empty directory for the bundle.
            remove_path(self._bundle_path)
            os.mkdir(self._bundle_path)

        # The run itself executes on a separate thread.
        worker_thread = threading.Thread(target=Run._start, args=[self])
        worker_thread.start()

        return True
    def __init__(self,
                 work_dir,
                 max_work_dir_size_bytes,
                 max_dependencies_serialized_length,
                 previous_runs=None):
        """
        Initialise in-memory state and the on-disk working directory.

        work_dir: directory that holds the state file and the 'bundles'
            subdirectory.
        max_work_dir_size_bytes: stored as self._max_work_dir_size_bytes.
        max_dependencies_serialized_length: stored as given; a falsy value
            (e.g. None or 0) is replaced by infinity.
        previous_runs: optional iterable forwarded to _load_state when a
            state file already exists. Omitting it is the same as passing an
            empty list.
        """
        # A fresh list per call avoids sharing one mutable default object
        # between every instance constructed without this argument.
        if previous_runs is None:
            previous_runs = []

        self._max_work_dir_size_bytes = max_work_dir_size_bytes
        self._max_dependencies_serialized_length = (
            max_dependencies_serialized_length or float('inf'))
        self._state_file = os.path.join(work_dir, self.STATE_FILENAME)
        self._work_dir = work_dir
        self._bundles_dir = os.path.join(work_dir, 'bundles')
        self._lock = threading.Lock()
        self._stop_cleanup = False
        self._cleanup_thread = None
        self._cleanup_sleep_secs = 10
        self._dependencies = {}
        self._paths = set()

        if os.path.exists(self._state_file):
            self._load_state(previous_runs)
        else:
            # No saved state: wipe whatever is at work_dir and recreate the
            # directory layout. 0o770 is the octal spelling accepted by both
            # Python 2.6+ and Python 3 (the old 0770 form is Python 2 only).
            remove_path(self._work_dir)
            os.makedirs(self._work_dir, 0o770)
            os.makedirs(self._bundles_dir, 0o770)
            self._save_state()
예제 #4
0
    def start(self):
        """
        Report the run as started, prepare its bundle path and start the FSM.

        Returns False if the bundle service's start_bundle call returns a
        falsy value; otherwise calls self._start_fsm() and returns True.
        Raises AssertionError if self._dependencies is still None.
        """
        assert self._dependencies is not None, \
            "Tried to start FSM before dependencies were setup. Probably pre_start was not called."

        # TODO Much of this setup logic needs to be deduplicated with Run.
        # The start time is reported from this machine because the clocks on
        # the bundle service and on the worker could be different.
        start_message = {
            'hostname': socket.gethostname(),
            'start_time': current_time(),
        }
        accepted = self._bundle_service.start_bundle(
            self._worker.id, self._uuid, start_message)
        if not accepted:
            return False

        if not self.is_shared_file_system:
            # Set up a fresh directory to store the bundle.
            remove_path(self._bundle_path)
            os.mkdir(self._bundle_path)
        else:
            # On a shared file system the bundle manager creates the path to
            # avoid NFS directory cache issues; wait for the cache on this
            # machine to expire and for the path to appear.
            while True:
                if os.path.exists(self._bundle_path):
                    break
                time.sleep(0.5)

        self._start_fsm()
        return True
예제 #5
0
    def _do_cleanup(self):
        """
        Loop that keeps the dependency cache under its size limit.

        Until self._should_stop_cleanup() returns true, repeatedly sums the
        sizes of all non-downloading dependencies and, while the total
        exceeds self._max_work_dir_size_bytes, removes the least recently
        used dependency that has no children. Between passes it sleeps for
        self._cleanup_sleep_secs seconds, checking for a stop request once
        per second.

        NOTE(review): presumably runs on self._cleanup_thread; confirm
        against the code that starts the thread.
        """
        while not self._should_stop_cleanup():
            # Each iteration of this inner loop is one full scan followed by
            # at most one eviction; it repeats until nothing needs evicting.
            while True:
                # If the total size of all dependencies exceeds
                # self._max_work_dir_size_bytes, remove the oldest unused
                # dependency. Otherwise, break out of the loop.
                total_size_bytes = 0
                first_used_time = float('inf')
                first_used_target = None
                self._lock.acquire()
                # NOTE(review): the lock is dropped and re-taken inside this
                # loop, so this relies on items() returning a snapshot list
                # (Python 2 semantics; the xrange below suggests Python 2).
                for target, dependency in self._dependencies.items():
                    # Dependencies still being downloaded are neither counted
                    # towards the total nor considered for eviction.
                    if dependency.downloading:
                        continue

                    # We compute the size of dependencies here to keep the code
                    # that adds new bundles to the dependency manager simpler.
                    if dependency.size_bytes is None:
                        # Measure the path without holding the lock, then
                        # re-acquire it before recording and saving the result.
                        self._lock.release()
                        size_bytes = get_path_size(
                            os.path.join(self._bundles_dir, dependency.path))
                        self._lock.acquire()
                        dependency.size_bytes = size_bytes
                        self._save_state()

                    total_size_bytes += dependency.size_bytes
                    # Track the childless dependency with the smallest
                    # last_used value as the eviction candidate.
                    if (not dependency.has_children()
                            and dependency.last_used < first_used_time):
                        first_used_time = dependency.last_used
                        first_used_target = target
                self._lock.release()

                if (total_size_bytes > self._max_work_dir_size_bytes
                        and first_used_target is not None):
                    logger.info(
                        'used ({}) exceeds capacity ({}), removing oldest bundle from cache'
                        .format(size_str(total_size_bytes),
                                size_str(self._max_work_dir_size_bytes)))
                    with self._lock:
                        # NOTE(review): this lookup raises KeyError if the
                        # candidate was removed from self._dependencies while
                        # the lock was released; confirm that cannot happen.
                        dependency = self._dependencies[first_used_target]
                        if dependency.has_children():
                            # Since we released the lock there could be new
                            # children. Leaving the with block releases the
                            # lock and the scan restarts from the top.
                            continue
                        # Forget the dependency, save the new state, then
                        # delete its files (all while holding the lock).
                        del self._dependencies[first_used_target]
                        self._paths.remove(dependency.path)
                        self._save_state()
                        remove_path(
                            os.path.join(self._bundles_dir, dependency.path))
                else:
                    # Under the limit, or nothing is eligible for eviction.
                    break

            # Sleep for self._cleanup_sleep_secs seconds in one-second steps
            # so that a stop request is noticed within about a second.
            for _ in xrange(0, self._cleanup_sleep_secs):
                time.sleep(1)
                if self._should_stop_cleanup():
                    break
예제 #6
0
 def _store_dependency(self, dependency_path, fileobj, target_type):
     """
     Write the contents of fileobj to dependency_path.

     When target_type is 'directory', fileobj is unpacked with
     un_tar_directory(..., 'gz'). Any other target_type copies fileobj
     byte-for-byte into a single file. On any failure
     remove_path(dependency_path) is called and the error is re-raised.
     """
     try:
         if target_type != 'directory':
             with open(dependency_path, 'wb') as destination:
                 shutil.copyfileobj(fileobj, destination)
         else:
             un_tar_directory(fileobj, dependency_path, 'gz')
     except:
         # The bare except is deliberate: clean up after any kind of
         # failure, then let the original error propagate unchanged.
         remove_path(dependency_path)
         raise
예제 #7
0
 def _store_dependency(self, dependency_path, fileobj, filename):
     """
     Write the contents of fileobj to dependency_path.

     A filename ending in '.tar.gz' is unpacked with
     un_tar_directory(..., 'gz'). Anything else is passed through
     un_gzip_stream and written to a single file. On any failure
     remove_path(dependency_path) is called and the error is re-raised.
     """
     try:
         if not filename.endswith('.tar.gz'):
             with open(dependency_path, 'wb') as destination:
                 shutil.copyfileobj(un_gzip_stream(fileobj), destination)
         else:
             un_tar_directory(fileobj, dependency_path, 'gz')
     except:
         # The bare except is deliberate: clean up after any kind of
         # failure, then let the original error propagate unchanged.
         remove_path(dependency_path)
         raise
예제 #8
0
    def _finish(self, exitcode=None, failure_message=None):
        """
        Wrap up a run and report its outcome to the bundle service.

        Marks the run finished, deletes the container (retrying until that
        succeeds), releases dependencies and removes their paths inside the
        bundle, uploads the bundle contents when there is no shared file
        system, and finalizes the bundle with exitcode and failure_message.
        Any exception raised along the way is printed, not propagated.
        """
        logger.debug(
            'Finished run with UUID %s, exitcode %s, failure_message %s',
            self._uuid, exitcode, failure_message)
        self._set_finished()
        try:
            # Delete the container, retrying once a second for as long as
            # Docker keeps raising DockerException.
            container_deleted = self._container_id is None
            while not container_deleted:
                try:
                    self._docker.delete_container(self._container_id)
                    container_deleted = True
                except DockerException:
                    traceback.print_exc()
                    time.sleep(1)

            # Clean up dependencies.
            for dependency in self._bundle['dependencies']:
                if not self._worker.shared_file_system:
                    self._worker.remove_dependency(
                        dependency['parent_uuid'],
                        dependency['parent_path'],
                        self._uuid)
                # Clean up the symlinks we created.
                remove_path(
                    os.path.join(self._bundle_path, dependency['child_path']))

            if not self._worker.shared_file_system:
                logger.debug('Uploading results for run with UUID %s',
                             self._uuid)
                report = self._throttled_updater()

                def report_progress(bytes_uploaded):
                    report('Uploading results: %s done (archived size)' %
                           size_str(bytes_uploaded))

                def upload_contents():
                    return self._bundle_service.update_bundle_contents(
                        self._worker.id, self._uuid, self._bundle_path,
                        report_progress)

                self._execute_bundle_service_command_with_retry(
                    upload_contents)

            logger.debug('Finalizing run with UUID %s', self._uuid)
            # This status update also reports the finish time.
            self._safe_update_run_status('Finished')
            if failure_message is None and self._is_killed():
                failure_message = self._get_kill_message()
            finalize_message = {
                'exitcode': exitcode,
                'failure_message': failure_message,
            }

            def finalize():
                return self._bundle_service.finalize_bundle(
                    self._worker.id, self._uuid, finalize_message)

            self._execute_bundle_service_command_with_retry(finalize)
        except Exception:
            traceback.print_exc()
예제 #9
0
    def _load_state(self):
        """
        Rebuild self._dependencies and self._paths from the state file, then
        remove every entry in the work directory that is not accounted for.
        """
        with open(self._state_file, 'r') as state_file:
            saved_entries = json.load(state_file)

        # Initialize self._dependencies, keyed by the saved target as a tuple.
        for entry in saved_entries:
            restored = Dependency.load(entry, self._lock)
            self._dependencies[tuple(entry['target'])] = restored
            self._paths.add(restored.path)

        # Anything on disk that is neither a known dependency path nor the
        # state file is incomplete (e.g. an interrupted download or run).
        on_disk = set(os.listdir(self._work_dir))
        leftovers = on_disk - self._paths - set(['state.json'])
        for name in leftovers:
            remove_path(os.path.join(self._work_dir, name))
예제 #10
0
    def _do_cleanup(self):
        """
        Loop that keeps the dependency cache under its size limit.

        Until self._should_stop_cleanup() returns true, repeatedly sums the
        sizes of all non-downloading dependencies and, while the total
        exceeds self._max_work_dir_size_mb megabytes, removes the least
        recently used dependency that has no children. Between passes it
        sleeps for self._cleanup_sleep_secs seconds, checking for a stop
        request once per second.

        NOTE(review): presumably runs on self._cleanup_thread; confirm
        against the code that starts the thread.
        """
        while not self._should_stop_cleanup():
            # Each iteration of this inner loop is one full scan followed by
            # at most one eviction; it repeats until nothing needs evicting.
            while True:
                # If the total size of all dependencies exceeds
                # self._max_work_dir_size_mb, remove the oldest unused
                # dependency. Otherwise, break out of the loop.
                total_size_bytes = 0
                first_used_time = float('inf')
                first_used_target = None
                self._lock.acquire()
                # NOTE(review): the lock is dropped and re-taken inside this
                # loop, so this relies on items() returning a snapshot list
                # (Python 2 semantics; the xrange below suggests Python 2).
                for target, dependency in self._dependencies.items():
                    # Dependencies still being downloaded are neither counted
                    # towards the total nor considered for eviction.
                    if dependency.downloading:
                        continue

                    # We compute the size of dependencies here to keep the code
                    # that adds new bundles to the dependency manager simpler.
                    if dependency.size_bytes is None:
                        # Measure the path without holding the lock, then
                        # re-acquire it before recording and saving the result.
                        self._lock.release()
                        size_bytes = get_path_size(os.path.join(self._work_dir,
                                                                dependency.path))
                        self._lock.acquire()
                        dependency.size_bytes = size_bytes
                        self._save_state()

                    total_size_bytes += dependency.size_bytes
                    # Track the childless dependency with the smallest
                    # last_used value as the eviction candidate.
                    if (not dependency.has_children() and
                        dependency.last_used < first_used_time):
                        first_used_time = dependency.last_used
                        first_used_target = target
                self._lock.release()

                # The limit is configured in megabytes; convert it to bytes
                # for the comparison.
                if (total_size_bytes > self._max_work_dir_size_mb * 1024 * 1024 and
                    first_used_target is not None):
                    with self._lock:
                        # NOTE(review): this lookup raises KeyError if the
                        # candidate was removed from self._dependencies while
                        # the lock was released; confirm that cannot happen.
                        dependency = self._dependencies[first_used_target]
                        if dependency.has_children():
                            # Since we released the lock there could be new
                            # children. Leaving the with block releases the
                            # lock and the scan restarts from the top.
                            continue
                        # Forget the dependency, save the new state, then
                        # delete its files (all while holding the lock).
                        del self._dependencies[first_used_target]
                        self._paths.remove(dependency.path)
                        self._save_state()
                        remove_path(os.path.join(self._work_dir, dependency.path))
                else:
                    # Under the limit, or nothing is eligible for eviction.
                    break

            # Sleep for self._cleanup_sleep_secs seconds in one-second steps
            # so that a stop request is noticed within about a second.
            for _ in xrange(0, self._cleanup_sleep_secs):
                time.sleep(1)
                if self._should_stop_cleanup():
                    break
예제 #11
0
    def _upgrade(self):
        """
        Replace this worker's code with the copy served by the bundle
        service, then terminate the process with exit status 123.

        The download-and-unpack step is retried once a second until it
        succeeds; each failure is printed with its traceback. This method
        never returns normally: it always ends by raising SystemExit(123).

        NOTE(review): presumably a supervising script restarts the worker
        when it sees exit status 123; confirm against the launcher.
        """
        # Imported locally so this method does not depend on the file's
        # top-level imports for the one call it needs.
        import sys

        logger.debug('Upgrading')
        # The directory containing this source file is what gets replaced.
        worker_dir = os.path.dirname(os.path.realpath(__file__))

        while True:
            try:
                with closing(self._bundle_service.get_code()) as code:
                    remove_path(worker_dir)
                    un_tar_directory(code, worker_dir, 'gz')
                    break
            except Exception:
                traceback.print_exc()
                time.sleep(1)

        # sys.exit is used rather than the bare exit() helper, which is
        # injected by the site module for interactive use and is not
        # guaranteed to exist (e.g. under `python -S`).
        sys.exit(123)
예제 #12
0
파일: run.py 프로젝트: ppasupat/codalab-cli
    def _finish(self, exitcode=None, failure_message=None):
        """
        Clean up after a run and send its final result to the bundle service.

        Steps, in order: mark the run finished; delete the container,
        retrying until that succeeds; release each dependency and remove its
        path inside the bundle; upload the bundle contents unless the worker
        uses a shared file system; finalize the bundle with exitcode and
        failure_message. Exceptions are printed and swallowed.
        """
        logger.debug('Finished run with UUID %s, exitcode %s, failure_message %s',
                     self._uuid, exitcode, failure_message)
        self._set_finished()
        try:
            # Delete the container; keep trying once a second while Docker
            # raises DockerException.
            needs_delete = self._container_id is not None
            while needs_delete:
                try:
                    self._docker.delete_container(self._container_id)
                    needs_delete = False
                except DockerException:
                    traceback.print_exc()
                    time.sleep(1)

            # Clean up dependencies.
            for dependency in self._bundle['dependencies']:
                if not self._worker.shared_file_system:
                    self._worker.remove_dependency(
                        dependency['parent_uuid'],
                        dependency['parent_path'],
                        self._uuid)
                # Clean up the symlinks we created.
                symlink_path = os.path.join(
                    self._bundle_path, dependency['child_path'])
                remove_path(symlink_path)

            if not self._worker.shared_file_system:
                logger.debug('Uploading results for run with UUID %s', self._uuid)
                send_status = self._throttled_updater()

                def on_progress(bytes_uploaded):
                    send_status('Uploading results: %s done (archived size)' %
                                size_str(bytes_uploaded))

                def upload_contents():
                    return self._bundle_service.update_bundle_contents(
                        self._worker.id, self._uuid, self._bundle_path,
                        on_progress)

                self._execute_bundle_service_command_with_retry(upload_contents)

            logger.debug('Finalizing run with UUID %s', self._uuid)
            # This status update also reports the finish time.
            self._safe_update_run_status('Finished')
            if failure_message is None and self._is_killed():
                failure_message = self._get_kill_message()
            finalize_message = {
                'exitcode': exitcode,
                'failure_message': failure_message,
            }

            def finalize():
                return self._bundle_service.finalize_bundle(
                    self._worker.id, self._uuid, finalize_message)

            self._execute_bundle_service_command_with_retry(finalize)
        except Exception:
            traceback.print_exc()
예제 #13
0
    def __init__(self, work_dir, max_work_dir_size_bytes):
        """
        Initialise in-memory state and the on-disk working directory.

        work_dir: directory that holds the state file (self.STATE_FILENAME).
        max_work_dir_size_bytes: stored as self._max_work_dir_size_bytes.

        If the state file already exists it is loaded via _load_state();
        otherwise work_dir is wiped, recreated, and an initial state is
        saved.
        """
        self._work_dir = work_dir
        self._max_work_dir_size_bytes = max_work_dir_size_bytes
        self._state_file = os.path.join(work_dir, self.STATE_FILENAME)
        self._lock = threading.Lock()
        self._stop_cleanup = False
        self._cleanup_thread = None
        self._cleanup_sleep_secs = 10
        self._dependencies = {}
        self._paths = set()

        if os.path.exists(self._state_file):
            self._load_state()
        else:
            # No saved state: start from an empty directory. 0o770 is the
            # octal spelling accepted by both Python 2.6+ and Python 3 (the
            # old 0770 form is Python 2 only).
            remove_path(work_dir)
            os.makedirs(work_dir, 0o770)
            self._save_state()
예제 #14
0
    def __init__(self, work_dir, max_work_dir_size_mb):
        """
        Initialise in-memory state and the on-disk working directory.

        work_dir: directory that holds the 'state.json' state file.
        max_work_dir_size_mb: stored as self._max_work_dir_size_mb.

        If the state file already exists it is loaded via _load_state();
        otherwise work_dir is wiped, recreated, and an initial state is
        saved.
        """
        self._work_dir = work_dir
        self._max_work_dir_size_mb = max_work_dir_size_mb
        self._state_file = os.path.join(work_dir, 'state.json')
        self._lock = threading.Lock()
        self._stop_cleanup = False
        self._cleanup_thread = None
        self._cleanup_sleep_secs = 10
        self._dependencies = {}
        self._paths = set()

        if os.path.exists(self._state_file):
            self._load_state()
        else:
            # No saved state: start from an empty directory. 0o770 is the
            # octal spelling accepted by both Python 2.6+ and Python 3 (the
            # old 0770 form is Python 2 only).
            remove_path(work_dir)
            os.makedirs(work_dir, 0o770)
            self._save_state()
예제 #15
0
    def _load_state(self):
        """
        Rebuild self._dependencies and self._paths from the state file, log
        how many dependencies were restored, and remove every entry in the
        work directory that is neither a restored path nor a state file.
        """
        with open(self._state_file, 'r') as state_file:
            saved_entries = json.load(state_file)

        # Initialize self._dependencies, keyed by the saved target as a tuple.
        for entry in saved_entries:
            restored = Dependency.load(entry, self._lock)
            self._dependencies[tuple(entry['target'])] = restored
            self._paths.add(restored.path)
        dependency_count = len(self._dependencies)
        logger.info('{} dependencies in cache.'.format(dependency_count))

        # Remove paths that aren't complete (e.g. interrupted downloads and
        # runs); the two managers' state files are always kept.
        on_disk = set(os.listdir(self._work_dir))
        leftovers = on_disk - self._paths - {
            DependencyManager.STATE_FILENAME,
            DockerImageManager.STATE_FILENAME,
        }
        for name in leftovers:
            remove_path(os.path.join(self._work_dir, name))
    def _load_state(self, previous_runs):
        """
        Rebuild self._dependencies and self._paths from the state file, then
        remove every entry in the bundles directory that is not accounted
        for.

        previous_runs: iterable of names added to self._paths so that the
            matching entries in the bundles directory are kept.
        """
        with open(self._state_file, 'r') as state_file:
            saved_entries = json.load(state_file)

        # Initialize self._dependencies, keyed by the saved target as a tuple.
        for entry in saved_entries:
            restored = Dependency.load(entry, self._lock)
            self._dependencies[tuple(entry['target'])] = restored
            self._paths.add(restored.path)
        dependency_count = len(self._dependencies)
        logger.info('{} dependencies in cache.'.format(dependency_count))

        # Entries named after previous runs must survive the purge below.
        self._paths.update(previous_runs)

        # Remove paths that aren't complete (e.g. interrupted downloads and
        # runs).
        leftovers = set(os.listdir(self._bundles_dir)) - self._paths
        for name in leftovers:
            remove_path(os.path.join(self._bundles_dir, name))