Example #1
    def start(self):
        """Run the worker's main check-in loop until termination, then clean up."""
        self.load_state()
        self.sync_state()
        self.image_manager.start()
        if not self.shared_file_system:
            self.dependency_manager.start()
        while not self.terminate:
            try:
                self.process_runs()
                self.save_state()
                self.checkin()
                self.check_termination()
                self.save_state()
                if self.check_idle_stop() or self.check_num_runs_stop():
                    self.terminate = True
                else:
                    time.sleep(self.checkin_frequency_seconds)
            except Exception:
                self.last_checkin_successful = False
                if using_sentry():
                    capture_exception()
                traceback.print_exc()
                if self.exit_on_exception:
                    logger.warning(
                        'Encountered exception, terminating the worker after sleeping for 5 minutes...'
                    )
                    self.terminate = True
                    # Sleep for 5 minutes
                    time.sleep(5 * 60)
                else:
                    # Sleep for a long time so we don't keep on failing.
                    # We sleep in 5-second increments to check
                    # if the worker needs to terminate (say, if it's received
                    # a SIGTERM signal).
                    logger.warning(
                        'Sleeping for 1 hour due to exception...please help me!'
                    )
                    for _ in range(12 * 60):
                        # We run this here, instead of going through another iteration of the
                        # while loop, to minimize the code that's run---the reason we ended up here
                        # in the first place is because of an exception, so we don't want to
                        # re-trigger that exception.
                        if self.terminate_and_restage:
                            # If self.terminate_and_restage is true, self.check_termination()
                            # restages bundles. We surround this in a try-except block,
                            # so we can still properly terminate and clean up
                            # even if self.check_termination() fails for some reason.
                            try:
                                self.check_termination()
                            except Exception:
                                traceback.print_exc()
                            self.terminate = True
                        if self.terminate:
                            break
                        time.sleep(5)
        self.cleanup()
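
The hour-long backoff above sleeps in 5-second slices (12 * 60 iterations of 5 s = 3600 s) so that a termination request is noticed within seconds rather than after the full hour. A minimal standalone sketch of that pattern, with hypothetical names:

import time

def interruptible_sleep(total_seconds, should_stop, increment=5):
    """Sleep up to total_seconds, polling should_stop() between increments."""
    for _ in range(total_seconds // increment):
        if should_stop():
            return False  # Interrupted early.
        time.sleep(increment)
    return True  # Slept the full duration.

# interruptible_sleep(3600, lambda: worker.terminate) mirrors the loop above.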
Example #2
    def image_availability_state(image_spec, success_message, failure_message):
        """
        Try to get the image specified by image_spec from the host machine.
        Return an ImageAvailabilityState.
        """
        try:
            image = self._docker.images.get(image_spec)
            digests = image.attrs.get('RepoDigests', [image_spec])
            digest = digests[0] if len(digests) > 0 else None
            new_timestamp = str(time.time())
            image.tag(self.CACHE_TAG, tag=new_timestamp)
            for tag in image.tags:
                tag_label, timestamp = tag.split(":")
                # Remove any other cache timestamp, but keep the current one.
                if tag_label == self.CACHE_TAG and timestamp != new_timestamp:
                    try:
                        self._docker.images.remove(tag)
                    except docker.errors.NotFound as err:
                        # It's possible that we get a 404 not found error here when removing
                        # the image, since another worker on the same system has already done
                        # so. We just ignore this 404, since any extraneous tags will be
                        # removed during the next iteration.
                        logger.warning(
                            "Attempted to remove image %s from cache, but image was not found: %s",
                            tag,
                            err,
                        )

            return ImageAvailabilityState(
                digest=digest, stage=DependencyStage.READY, message=success_message
            )
        except Exception as ex:
            if using_sentry():
                capture_exception()
            return ImageAvailabilityState(
                digest=None, stage=DependencyStage.FAILED, message=failure_message % ex
            )
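
A hedged usage sketch (illustrative only, since the helper closes over self and is normally called inside the image manager): the image name and messages below are placeholders, and the %s in failure_message is filled with the caught exception, as in the code above.

state = image_availability_state(
    'codalab/default-cpu:latest',  # Placeholder image spec.
    success_message='Image ready',
    failure_message='Fetching image failed: %s',
)
if state.stage == DependencyStage.READY:
    print('Using image digest', state.digest)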
Example #3
def main():
    args = parse_args()

    # Configure logging
    logging.basicConfig(
        format='%(asctime)s %(message)s',
        level=(logging.DEBUG if args.verbose else logging.INFO))

    # Initialize sentry logging
    if using_sentry():
        initialize_sentry()

    # This quits if connection unsuccessful
    bundle_service = connect_to_codalab_server(args.server, args.password_file)

    # Load some data into sentry
    if using_sentry():
        load_sentry_data(username=bundle_service._username, **vars(args))

    if args.shared_file_system:
        # No need to store bundles locally if filesystems are shared
        local_bundles_dir = None
        # Also no need to download dependencies if they're on the filesystem already
        dependency_manager = None
    else:
        local_bundles_dir = os.path.join(args.work_dir, 'runs')
        dependency_manager = DependencyManager(
            os.path.join(args.work_dir, 'dependencies-state.json'),
            bundle_service,
            args.work_dir,
            args.max_work_dir_size,
        )
    # Set up local directories
    if not os.path.exists(args.work_dir):
        logging.debug('Work dir %s doesn\'t exist, creating.', args.work_dir)
        os.makedirs(args.work_dir, 0o770)
    if local_bundles_dir and not os.path.exists(local_bundles_dir):
        logger.info('%s doesn\'t exist, creating.', local_bundles_dir)
        os.makedirs(local_bundles_dir, 0o770)

    docker_runtime = docker_utils.get_available_runtime()
    image_manager = DockerImageManager(
        os.path.join(args.work_dir, 'images-state.json'),
        args.max_image_cache_size,
        args.max_image_size,
    )

    worker = Worker(
        image_manager,
        dependency_manager,
        os.path.join(args.work_dir, 'worker-state.json'),
        args.cpuset,
        args.gpuset,
        args.max_memory,
        args.id,
        args.tag,
        args.work_dir,
        local_bundles_dir,
        args.exit_when_idle,
        args.exit_after_num_runs,
        args.idle_seconds,
        args.checkin_frequency_seconds,
        bundle_service,
        args.shared_file_system,
        args.tag_exclusive,
        args.group,
        docker_runtime=docker_runtime,
        docker_network_prefix=args.network_prefix,
        pass_down_termination=args.pass_down_termination,
        delete_work_dir_on_exit=args.delete_work_dir_on_exit,
        exit_on_exception=args.exit_on_exception,
    )

    # Register a signal handler to ensure safe shutdown.
    for sig in [signal.SIGTERM, signal.SIGINT, signal.SIGHUP]:
        signal.signal(sig, lambda signum, frame: worker.signal())

    # BEGIN: DO NOT CHANGE THIS LINE UNLESS YOU KNOW WHAT YOU ARE DOING
    # THIS IS HERE TO KEEP TEST-CLI FROM HANGING
    logger.info('Worker started!')
    # END

    worker.start()
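
parse_args() is not shown in this example. A minimal sketch of the flags this main() actually reads, with names inferred from the attribute accesses above and every default being an assumption:

import argparse

def parse_args():
    parser = argparse.ArgumentParser(description='CodaLab worker')
    parser.add_argument('--server', help='URL of the CodaLab server to connect to')
    parser.add_argument('--password-file', help='Path to a file holding CodaLab credentials')
    parser.add_argument('--work-dir', default='/tmp/codalab-worker', help='Assumed default')
    parser.add_argument('--shared-file-system', action='store_true')
    parser.add_argument('--verbose', action='store_true')
    # Remaining flags read by main() and Worker(): --max-work-dir-size, --cpuset,
    # --gpuset, --max-memory, --id, --tag, --exit-when-idle, and so on.
    return parser.parse_args()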
Example #4
    def start_worker_job(self):
        image = 'codalab/worker:' + os.environ.get('CODALAB_VERSION', 'latest')
        worker_id = uuid.uuid4().hex
        logger.debug('Starting worker %s with image %s', worker_id, image)
        work_dir_prefix = (self.args.worker_work_dir_prefix
                           if self.args.worker_work_dir_prefix else "/tmp/")
        # This needs to be a unique directory since Batch jobs may share a host
        work_dir = os.path.join(work_dir_prefix,
                                'cl_worker_{}_work_dir'.format(worker_id))
        command = self.build_command(worker_id, work_dir)

        # https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-batch-jobdefinition.html
        # Need to mount:
        # - docker.sock to enable us to start docker in docker
        # - work_dir so that the run bundle's output is visible to the worker
        job_definition = {
            'jobDefinitionName': self.args.job_definition_name,
            'type': 'container',
            'parameters': {},
            'containerProperties': {
                'image': image,
                'vcpus': self.args.cpus,
                'memory': self.args.memory_mb,
                'command': [
                    "/bin/bash",
                    "-c",
                    "/opt/scripts/detect-ec2-spot-preemption.sh & " +
                    " ".join(quote(arg) for arg in command),
                ],
                'environment': [
                    {
                        'name': 'CODALAB_USERNAME',
                        'value': os.environ.get('CODALAB_USERNAME')
                    },
                    {
                        'name': 'CODALAB_PASSWORD',
                        'value': os.environ.get('CODALAB_PASSWORD')
                    },
                ],
                'volumes': [
                    {
                        'host': {
                            'sourcePath': '/var/run/docker.sock'
                        },
                        'name': 'var_run_docker_sock'
                    },
                    {
                        'host': {
                            'sourcePath': work_dir
                        },
                        'name': 'work_dir'
                    },
                ],
                'mountPoints': [
                    {
                        'sourceVolume': 'var_run_docker_sock',
                        'containerPath': '/var/run/docker.sock',
                        'readOnly': False,
                    },
                    {
                        'sourceVolume': 'work_dir',
                        'containerPath': work_dir,
                        'readOnly': False
                    },
                ],
                'readonlyRootFilesystem': False,
                'user': self.args.user,
            },
            'retryStrategy': {
                'attempts': 1
            },
        }
        if self.args.gpus:
            job_definition["containerProperties"]["resourceRequirements"] = [{
                "value":
                str(self.args.gpus),
                "type":
                "GPU"
            }]

        # Allow worker to directly mount a directory.  Note that the worker
        # needs to be set up a priori with this shared filesystem.
        if os.environ.get('CODALAB_SHARED_FILE_SYSTEM') == 'true':
            command.append('--shared-file-system')
            bundle_mount = os.environ.get('CODALAB_BUNDLE_MOUNT')
            job_definition['containerProperties']['volumes'].append({
                'host': {'sourcePath': bundle_mount},
                'name': 'shared_dir',
            })
            job_definition['containerProperties']['mountPoints'].append({
                'sourceVolume': 'shared_dir',
                'containerPath': bundle_mount,
                'readOnly': False,
            })

        if using_sentry():
            job_definition["containerProperties"]["environment"].append({
                'name':
                'CODALAB_SENTRY_INGEST_URL',
                'value':
                CODALAB_SENTRY_INGEST
            })
            job_definition["containerProperties"]["environment"].append({
                'name':
                'CODALAB_SENTRY_ENVIRONMENT',
                'value':
                CODALAB_SENTRY_ENVIRONMENT
            })
        # Register the job definition with AWS Batch
        response = self.batch_client.register_job_definition(**job_definition)
        logger.info('register_job_definition: %s', response)

        # Submit the job
        response = self.batch_client.submit_job(
            jobName=self.args.job_definition_name,
            jobQueue=self.args.job_queue,
            jobDefinition=self.args.job_definition_name,
        )
        logger.info('submit_job: %s', response)
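
register_job_definition and submit_job are standard boto3 Batch APIs, so self.batch_client here is presumably a boto3 client. A minimal construction sketch (region and credentials are placeholders; boto3 also picks these up from the environment or ~/.aws/config in a real deployment):

import boto3

batch_client = boto3.client('batch', region_name='us-east-1')  # Placeholder region.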
Example #5
    def start_worker_job(self) -> None:
        worker_image: str = 'codalab/worker:' + os.environ.get('CODALAB_VERSION', 'latest')
        worker_id: str = uuid.uuid4().hex
        logger.debug('Starting worker {} with image {}'.format(worker_id, worker_image))
        work_dir_prefix: str = (
            self.args.worker_work_dir_prefix if self.args.worker_work_dir_prefix else "/tmp/"
        )

        # This needs to be a unique directory since Batch jobs may share a host
        work_dir: str = os.path.join(work_dir_prefix, 'cl_worker_{}_work_dir'.format(worker_id))
        command: List[str] = self.build_command(worker_id, work_dir)

        task_container_run_options: List[str] = [
            '--cpus %d' % self.args.cpus,
            '--memory %dM' % self.args.memory_mb,
            '--volume /var/run/docker.sock:/var/run/docker.sock',
            '--volume %s:%s' % (work_dir, work_dir),
            '--user %s' % self.args.user,
        ]

        if os.environ.get('CODALAB_USERNAME') and os.environ.get('CODALAB_PASSWORD'):
            task_container_run_options.extend(
                [
                    '--env CODALAB_USERNAME=%s' % os.environ.get('CODALAB_USERNAME'),
                    '--env CODALAB_PASSWORD=%s' % os.environ.get('CODALAB_PASSWORD'),
                ]
            )
        else:
            raise EnvironmentError(
                'Valid credentials need to be set as environment variables: CODALAB_USERNAME and CODALAB_PASSWORD'
            )

        if os.environ.get('CODALAB_SHARED_FILE_SYSTEM') == 'true':
            # Allow workers to directly mount a directory
            command.append('--shared-file-system')
            task_container_run_options.append(
                '--volume shared_dir:%s' % os.environ.get('CODALAB_BUNDLE_MOUNT')
            )

        # Configure Sentry
        if using_sentry():
            task_container_run_options.append(
                '--env CODALAB_SENTRY_INGEST_URL=%s' % CODALAB_SENTRY_INGEST
            )
            task_container_run_options.append(
                '--env CODALAB_SENTRY_ENVIRONMENT=%s' % CODALAB_SENTRY_ENVIRONMENT
            )

        command_line: str = "/bin/bash -c '{}'".format(' '.join(command))
        logger.debug("Running the following as an Azure Batch task: {}".format(command_line))

        task_id: str = 'cl_worker_{}'.format(worker_id)
        task: TaskAddParameter = TaskAddParameter(
            id=task_id,
            command_line=command_line,
            container_settings=TaskContainerSettings(
                image_name=worker_image, container_run_options=' '.join(task_container_run_options)
            ),
            output_files=[
                OutputFile(
                    file_pattern='../stderr.txt',
                    destination=OutputFileDestination(
                        container=OutputFileBlobContainerDestination(
                            path=task_id, container_url=self.args.log_container_url
                        )
                    ),
                    upload_options=OutputFileUploadOptions(
                        # Upload worker logs once the task completes
                        upload_condition=OutputFileUploadCondition.task_completion
                    ),
                )
            ],
        )

        try:
            # Create a task under the Azure Batch job.
            # Catch request errors to keep the worker manager running.
            self._batch_client.task.add(self.args.job_id, task)
        except (ClientRequestError, BatchErrorException) as e:
            logger.error(
                'Batch request to add task {} to job {} failed: {}'.format(
                    task_id, self.args.job_id, str(e)
                )
            )
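
The task objects above (TaskAddParameter, TaskContainerSettings, OutputFile, and friends) come from the azure-batch SDK's models module, and self._batch_client is presumably a BatchServiceClient. A hedged construction sketch with placeholder account name, key, and URL (the batch_url keyword shown matches recent azure-batch releases; older versions used base_url):

from azure.batch import BatchServiceClient
from azure.batch.batch_auth import SharedKeyCredentials

credentials = SharedKeyCredentials(account_name='mybatchaccount', key='<account-key>')
batch_client = BatchServiceClient(
    credentials, batch_url='https://mybatchaccount.eastus.batch.azure.com'
)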
Example #6
def main():
    args = parse_args()

    if args.tag and not args.tag.replace("-", "").isalnum():
        raise argparse.ArgumentTypeError(
            "Worker tag must only contain letters, numbers or hyphens."
        )

    # Configure logging
    log_format: str = '%(asctime)s %(message)s'
    if args.verbose:
        log_format += ' %(pathname)s %(lineno)d'
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    logging.basicConfig(format=log_format, level=log_level)

    logging.getLogger('urllib3').setLevel(logging.INFO)
    # Initialize sentry logging
    if using_sentry():
        initialize_sentry()

    # This quits if connection unsuccessful
    bundle_service = connect_to_codalab_server(args.server, args.password_file)

    # Load some data into sentry
    if using_sentry():
        load_sentry_data(username=bundle_service._username, **vars(args))

    if args.shared_file_system:
        # No need to store bundles locally if filesystems are shared
        local_bundles_dir = None
        # Also no need to download dependencies if they're on the filesystem already
        dependency_manager = None
    else:
        local_bundles_dir = os.path.join(args.work_dir, 'runs')
        dependency_manager = DependencyManager(
            os.path.join(args.work_dir, 'dependencies-state.json'),
            bundle_service,
            args.work_dir,
            args.max_work_dir_size,
            args.download_dependencies_max_retries,
        )

    if args.container_runtime == "singularity":
        singularity_folder = os.path.join(args.work_dir, 'codalab_singularity_images')
        if not os.path.exists(singularity_folder):
            logger.info(
                'Local singularity image location %s doesn\'t exist, creating.', singularity_folder,
            )
            os.makedirs(singularity_folder, 0o770)
        image_manager = SingularityImageManager(
            args.max_image_size, args.max_image_cache_size, singularity_folder,
        )
        # TODO: Workers using Singularity don't work because docker_runtime is set to None -- handle this.
        docker_runtime = None
    else:
        image_manager = DockerImageManager(
            os.path.join(args.work_dir, 'images-state.json'),
            args.max_image_cache_size,
            args.max_image_size,
        )
        docker_runtime = docker_utils.get_available_runtime()
    # Set up local directories
    if not os.path.exists(args.work_dir):
        logging.debug('Work dir %s doesn\'t exist, creating.', args.work_dir)
        os.makedirs(args.work_dir, 0o770)
    if local_bundles_dir and not os.path.exists(local_bundles_dir):
        logger.info('%s doesn\'t exist, creating.', local_bundles_dir)
        os.makedirs(local_bundles_dir, 0o770)

    worker = Worker(
        image_manager,
        dependency_manager,
        # Include the worker ID in the worker state JSON path, so multiple workers
        # sharing the same work directory maintain their own state.
        os.path.join(args.work_dir, f'worker-state-{args.id}.json'),
        args.cpuset,
        args.gpuset,
        args.max_memory,
        args.id,
        args.tag,
        args.work_dir,
        local_bundles_dir,
        args.exit_when_idle,
        args.exit_after_num_runs,
        args.idle_seconds,
        args.checkin_frequency_seconds,
        bundle_service,
        args.shared_file_system,
        args.tag_exclusive,
        args.group,
        docker_runtime=docker_runtime,
        docker_network_prefix=args.network_prefix,
        pass_down_termination=args.pass_down_termination,
        delete_work_dir_on_exit=args.delete_work_dir_on_exit,
        exit_on_exception=args.exit_on_exception,
        shared_memory_size_gb=args.shared_memory_size_gb,
        preemptible=args.preemptible,
    )

    # Register a signal handler to ensure safe shutdown.
    for sig in [signal.SIGTERM, signal.SIGINT, signal.SIGHUP]:
        signal.signal(sig, lambda signum, frame: worker.signal())

    # BEGIN: DO NOT CHANGE THIS LINE UNLESS YOU KNOW WHAT YOU ARE DOING
    # THIS IS HERE TO KEEP TEST-CLI FROM HANGING
    logger.info('Worker started!')
    # END

    worker.start()
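
The lambda registered above forwards any of the three signals to worker.signal(), which presumably sets the termination flags that the loops in Example #1 poll between sleep increments. A self-contained sketch of that wiring, with a hypothetical stand-in for Worker:

import os
import signal
import time

class DummyWorker:  # Hypothetical stand-in for Worker.
    def __init__(self):
        self.terminate = False

    def signal(self):
        self.terminate = True

worker = DummyWorker()
for sig in [signal.SIGTERM, signal.SIGINT, signal.SIGHUP]:
    signal.signal(sig, lambda signum, frame: worker.signal())

os.kill(os.getpid(), signal.SIGTERM)  # Simulate an external shutdown request.
time.sleep(0.1)
assert worker.terminate  # A polling loop would now exit gracefully.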