class WorkerManager(object):
    """
    The abstract class for a worker manager.  Different backends like AWS,
    Azure, Slurm should override `get_worker_jobs` and `start_worker_job`.

    The basic architecture of the WorkerManager is extremely simple: to a
    first-order approximation, it simply launches `cl-worker`s as AWS/Azure
    batch jobs as long as there are staged bundles.

    The simplicity means that we don't need to manage state about how workers
    and bundles are related - that logic is complex and is done by the usual
    worker system, and we don't want to duplicate that for every possible
    worker backend.

    More specifically, a worker manager will monitor a job queue to see how
    many worker jobs are running, and try to keep that between `min_workers`
    and `max_workers`.  It will also monitor the staged bundles that satisfy a
    certain `search` criterion.  If there are staged bundles then it will issue
    a `start_worker_job()` call, provided some other conditions are met (e.g.,
    don't start workers too fast).

    The WorkerManager is all client-side code, so it can be customized as one
    sees fit.

    Notes:
    - The worker manager is not visible via CodaLab (i.e., CodaLab has no
      notion of a worker manager or what it's trying to do - all it sees is
      bundles and workers).  One needs to monitor the AWS/Azure Batch system
      separately.
    - Resource handling is not currently supported.  Generally, the safe thing
      is to create a separate queue for different resource needs and put the
      burden of deciding on the user.
    """

    # Subcommand name to use for this worker manager type
    NAME = 'worker-manager'
    DESCRIPTION = 'Base class for worker managers; implement one for your deployment'

    @staticmethod
    def add_arguments_to_subparser(subparser):
        """
        Add any arguments specific to this worker manager to the given subparser
        """
        raise NotImplementedError

    def __init__(self, args):
        self.args = args
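        # Descriptive note (not in the original): CodaLabManager holds the local
        # CLI configuration/credentials, and client(server) returns an
        # authenticated client for the target CodaLab server.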
        self.codalab_manager = CodaLabManager()
        self.codalab_client = self.codalab_manager.client(args.server)
        self.staged_uuids = []
        self.last_worker_start_time = 0
        logger.info('Started worker manager.')

    def get_worker_jobs(self):
        """Return a list of `WorkerJob`s."""
        raise NotImplementedError

    def start_worker_job(self):
        """Start a new `WorkerJob`."""
        raise NotImplementedError

    def build_command(self, worker_id, work_dir):
        command = [
            self.args.worker_executable,
            '--server',
            self.args.server,
            '--verbose',
            '--exit-when-idle',
            '--idle-seconds',
            str(self.args.worker_idle_seconds),
            '--work-dir',
            work_dir,
            '--id',
            f'$(hostname -s)-{worker_id}',
            '--network-prefix',
            f'cl_worker_{worker_id}_network',
        ]

        # Additional optional arguments
        if self.args.worker_tag:
            command.extend(['--tag', self.args.worker_tag])
        if self.args.worker_group:
            command.extend(['--group', self.args.worker_group])
        if self.args.worker_exit_after_num_runs and self.args.worker_exit_after_num_runs > 0:
            command.extend([
                '--exit-after-num-runs',
                str(self.args.worker_exit_after_num_runs)
            ])
        if self.args.worker_max_work_dir_size:
            command.extend(
                ['--max-work-dir-size', self.args.worker_max_work_dir_size])
        if self.args.worker_delete_work_dir_on_exit:
            command.extend(['--delete-work-dir-on-exit'])
        if self.args.worker_exit_on_exception:
            command.extend(['--exit-on-exception'])
        if self.args.worker_tag_exclusive:
            command.extend(['--tag-exclusive'])
        if self.args.worker_pass_down_termination:
            command.extend(['--pass-down-termination'])

        return command
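
    # For illustration (not from the original source): with hypothetical values
    # worker_id='0', work_dir='/tmp/cl-worker', server='https://example.org',
    # and worker_idle_seconds=600, the base command above would be roughly:
    #   cl-worker --server https://example.org --verbose --exit-when-idle \
    #     --idle-seconds 600 --work-dir /tmp/cl-worker \
    #     --id $(hostname -s)-0 --network-prefix cl_worker_0_network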

    def run_loop(self):
        while True:
            try:
                self.run_one_iteration()
            except (
                    urllib.error.URLError,
                    http.client.HTTPException,
                    socket.error,
                    NotFoundError,
                    JsonApiException,
            ):
                # Network errors sometimes occur while the WorkerManager is running.
                # These are often transient, and retrying usually succeeds, so we log
                # these exceptions and keep going rather than exiting fatally.
                traceback.print_exc()
            if self.args.once:
                break
            logger.debug('Sleeping {} seconds'.format(self.args.sleep_time))
            time.sleep(self.args.sleep_time)

    def run_one_iteration(self):
        # Get staged bundles for the current user. The principle here is that we want all
        # of the staged bundles that can be run by this user.
        keywords = ['state=' + State.STAGED] + self.args.search
        # If the current user is "codalab", don't filter by .mine because the workers owned
        # by "codalab" can be shared by all users. But, for all other users, we only
        # want to see their staged bundles.
        if os.environ.get('CODALAB_USERNAME') != "codalab":
            keywords += [".mine"]
        # The keyword below matches both `request_queue=<worker tag>` and
        # `request_queue=tag=<worker tag>`. If support for the `tag=` form is dropped
        # so that 'request_queue' is always just '<worker tag>', this search query
        # can be simplified again.
        # NOTE: server/bundle_manager.py has the server-side matching logic that should be synced
        # with this search request.
        if self.args.worker_tag_exclusive and self.args.worker_tag:
            keywords += [
                "request_queue=%s,tag=%s" %
                (self.args.worker_tag, self.args.worker_tag)
            ]
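
        # For example (illustrative, not from the original source): with
        # worker_tag='gpu' and search=['request_gpus=1'], a non-"codalab" user
        # ends up with keywords like
        # ['state=staged', 'request_gpus=1', '.mine', 'request_queue=gpu,tag=gpu'].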

        bundles = self.codalab_client.fetch('bundles',
                                            params={
                                                'worksheet': None,
                                                'keywords': keywords,
                                                'include': ['owner']
                                            })
        new_staged_uuids = [bundle['uuid'] for bundle in bundles]
        old_staged_uuids = self.staged_uuids
        # Bundles that were staged but now aren't
        removed_uuids = [
            uuid for uuid in old_staged_uuids if uuid not in new_staged_uuids
        ]
        self.staged_uuids = new_staged_uuids
        logger.info('Staged bundles [{}]: {}'.format(
            ' '.join(keywords), ' '.join(self.staged_uuids) or '(none)'))

        # Get worker jobs
        worker_jobs = self.get_worker_jobs()
        pending_worker_jobs, active_worker_jobs = [], []

        for job in worker_jobs:
            if job.active:
                active_worker_jobs.append(job)
            else:
                pending_worker_jobs.append(job)

        # Print status
        logger.info(
            '{} staged bundles ({} removed since last time), {} worker jobs (min={}, max={}) ({} active, {} pending)'
            .format(
                len(self.staged_uuids),
                len(removed_uuids),
                len(worker_jobs),
                self.args.min_workers,
                self.args.max_workers,
                len(active_worker_jobs),
                len(pending_worker_jobs),
            ))

        want_workers = False

        # There is at least one staged bundle, so tentatively ask for a worker;
        # the checks below (rate limit, pending workers, max_workers) may veto this.
        if len(self.staged_uuids) > 0:
            logger.info(
                'Want to launch a worker because we have {} > 0 staged bundles'
                .format(len(self.staged_uuids)))
            want_workers = True

        if want_workers:
            # Make sure we don't launch workers too quickly.
            seconds_since_last_worker = int(time.time() -
                                            self.last_worker_start_time)
            if seconds_since_last_worker < self.args.min_seconds_between_workers:
                logger.info(
                    'Don\'t launch because we waited {} < {} seconds since the last worker'
                    .format(seconds_since_last_worker,
                            self.args.min_seconds_between_workers))
                want_workers = False

            # Don't queue up more pending (still booting) workers than there
            # are staged bundles.
            if len(pending_worker_jobs) >= len(self.staged_uuids):
                logger.info(
                    'Don\'t launch because still more pending workers than staged bundles ({} >= {})'
                    .format(len(pending_worker_jobs), len(self.staged_uuids)))
                want_workers = False

            # Don't launch more than `max_workers`.
            # For now, only the number of workers is used to determine what workers
            # we launch.
            if len(worker_jobs) >= self.args.max_workers:
                logger.info(
                    'Don\'t launch because too many workers already ({} >= {})'
                    .format(len(worker_jobs), self.args.max_workers))
                want_workers = False

        # We have fewer than min_workers, so launch one regardless of other constraints
        if len(worker_jobs) < self.args.min_workers:
            logger.info(
                'Launch a worker because we are under the minimum ({} < {})'.
                format(len(worker_jobs), self.args.min_workers))
            want_workers = True

        if want_workers:
            logger.info('Starting a worker!')
            self.start_worker_job()
            self.last_worker_start_time = time.time()
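
# --- Illustrative sketch (not from the original source) ---
# A minimal WorkerManager backend that launches `cl-worker` processes on the
# local machine, just to show the shape of a subclass. It assumes `WorkerJob`
# is a simple record with an `active` attribute, as `run_one_iteration`
# expects; the class itself is hypothetical.
import subprocess
import tempfile

class LocalWorkerManager(WorkerManager):
    NAME = 'local'
    DESCRIPTION = 'Runs workers as local subprocesses (sketch only)'

    @staticmethod
    def add_arguments_to_subparser(subparser):
        # This sketch adds no backend-specific arguments.
        pass

    def __init__(self, args):
        super().__init__(args)
        self.processes = []  # Popen handles for the workers we launched

    def get_worker_jobs(self):
        # A local worker counts as active while its process is still running.
        return [WorkerJob(active=p.poll() is None) for p in self.processes]

    def start_worker_job(self):
        work_dir = tempfile.mkdtemp(prefix='cl-worker-')
        command = self.build_command(str(len(self.processes)), work_dir)
        # build_command embeds $(hostname -s), which needs a shell to expand.
        self.processes.append(subprocess.Popen(' '.join(command), shell=True))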
Example #2
class WorkerManager(object):
    """
    The abstract class for a worker manager.  Different backends like AWS,
    Azure, Slurm should override `get_worker_jobs` and `start_worker_job`.

    The basic architecture of the WorkerManager is extremely simple: to a
    first-order approximation, it simply launches `cl-worker`s as AWS/Azure
    batch jobs as long as there are staged bundles.

    The simplicity means that we don't need to manage state about how workers
    and bundles are related - that logic is complex and is done by the usual
    worker system, and we don't want to duplicate that for every possible
    worker backend.

    More specifically, a worker manager will monitor a job queue to see how
    many worker jobs are running, and try to keep that between `min_workers`
    and `max_workers`.  It will also monitor the staged bundles that satisfy a
    certain `search` criterion.  If there are staged bundles then it will issue
    a `start_worker_job()` call, provided some other conditions are met (e.g.,
    don't start workers too fast).

    The WorkerManager is all client-side code, so it can be customized as one
    sees fit.

    Notes:
    - The worker manager is not visible via CodaLab (i.e., CodaLab has no
      notion of a worker manager or what it's trying to do - all it sees is
      bundles and workers).  One needs to monitor the AWS/Azure Batch system
      separately.
    - Resource handling is not currently supported.  Generally, the safe thing
      is to create a separate queue for different resource needs and put the
      burden of deciding on the user.
    """

    # Subcommand name to use for this worker manager type
    NAME = 'worker-manager'
    DESCRIPTION = 'Base class for worker managers; implement one for your deployment'

    @staticmethod
    def add_arguments_to_subparser(subparser):
        """
        Add any arguments specific to this worker manager to the given subparser
        """
        raise NotImplementedError

    def __init__(self, args):
        self.args = args
        self.codalab_manager = CodaLabManager()
        self.codalab_client = self.codalab_manager.client(args.server)
        self.staged_uuids = []
        self.last_worker_start_time = 0
        logger.info('Started worker manager.')

    def get_worker_jobs(self):
        """Return a list of `WorkerJob`s."""
        raise NotImplementedError

    def start_worker_job(self):
        """Start a new `WorkerJob`."""
        raise NotImplementedError

    def run_loop(self):
        while True:
            self.run_one_iteration()
            if self.args.once:
                break
            logger.debug('Sleeping {} seconds'.format(self.args.sleep_time))
            time.sleep(self.args.sleep_time)

    def run_one_iteration(self):
        # Get staged bundles for the current user.
        keywords = ['state=' + State.STAGED] + [".mine"] + self.args.search
        if self.args.worker_tag:
            keywords.append('request_queue=tag=' + self.args.worker_tag)
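        # For example (illustrative, not from the original source): with
        # worker_tag='gpu' and no extra --search terms, keywords would be
        # ['state=staged', '.mine', 'request_queue=tag=gpu'].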
        bundles = self.codalab_client.fetch('bundles',
                                            params={
                                                'worksheet': None,
                                                'keywords': keywords,
                                                'include': ['owner']
                                            })
        new_staged_uuids = [bundle['uuid'] for bundle in bundles]
        old_staged_uuids = self.staged_uuids
        # Bundles that were staged but now aren't
        removed_uuids = [
            uuid for uuid in old_staged_uuids if uuid not in new_staged_uuids
        ]
        self.staged_uuids = new_staged_uuids
        logger.info('Staged bundles [{}]: {}'.format(
            ' '.join(keywords), ' '.join(self.staged_uuids) or '(none)'))

        # Get worker jobs
        worker_jobs = self.get_worker_jobs()
        pending_worker_jobs, active_worker_jobs = [], []

        for job in worker_jobs:
            if job.active:
                active_worker_jobs.append(job)
            else:
                pending_worker_jobs.append(job)

        # Print status
        logger.info(
            '{} staged bundles ({} removed since last time), {} worker jobs (min={}, max={}) ({} active, {} pending)'
            .format(
                len(self.staged_uuids),
                len(removed_uuids),
                len(worker_jobs),
                self.args.min_workers,
                self.args.max_workers,
                len(active_worker_jobs),
                len(pending_worker_jobs),
            ))

        want_workers = False

        # There is at least one staged bundle, so tentatively ask for a worker;
        # the checks below (rate limit, pending workers, max_workers) may veto this.
        if len(self.staged_uuids) > 0:
            logger.info(
                'Want to launch a worker because we have {} > 0 staged bundles'
                .format(len(self.staged_uuids)))
            want_workers = True

        if want_workers:
            # Make sure we don't launch workers too quickly.
            seconds_since_last_worker = int(time.time() -
                                            self.last_worker_start_time)
            if seconds_since_last_worker < self.args.min_seconds_between_workers:
                logger.info(
                    'Don\'t launch because we waited {} < {} seconds since the last worker'
                    .format(seconds_since_last_worker,
                            self.args.min_seconds_between_workers))
                want_workers = False

            # Don't queue up more pending (still booting) workers than there
            # are staged bundles.
            if len(pending_worker_jobs) >= len(self.staged_uuids):
                logger.info(
                    'Don\'t launch because still more pending workers than staged bundles ({} >= {})'
                    .format(len(pending_worker_jobs), len(self.staged_uuids)))
                want_workers = False

            # Don't launch more than `max_workers`.
            # For now, only the number of workers is used to determine what workers
            # we launch.
            if len(worker_jobs) >= self.args.max_workers:
                logger.info(
                    'Don\'t launch because too many workers already ({} >= {})'
                    .format(len(worker_jobs), self.args.max_workers))
                want_workers = False

        # We have fewer than min_workers, so launch one regardless of other constraints
        if len(worker_jobs) < self.args.min_workers:
            logger.info(
                'Launch a worker because we are under the minimum ({} < {})'.
                format(len(worker_jobs), self.args.min_workers))
            want_workers = True

        if want_workers:
            logger.info('Starting a worker!')
            self.start_worker_job()
            self.last_worker_start_time = time.time()
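
# --- Illustrative usage note (not from the original source) ---
# A concrete subclass is typically wired up as a CLI subcommand: the parser
# calls add_arguments_to_subparser(), the parsed arguments are handed to the
# subclass constructor, and run_loop() does the rest, e.g. (hypothetical):
#   manager = LocalWorkerManager(parsed_args)
#   manager.run_loop()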