示例#1
0
 def create_job_uuids(self):
     """Extend the parent's job-uuid mapping with worker and server entries.

     Starts from the mapping returned by the parent implementation and adds
     one list of uuids per task type, with one uuid per pod reported by
     ``self.get_n_pods`` for that task type.

     Returns:
         The mapping from task type to the list of generated job uuids.
     """
     uuids_by_task = super().create_job_uuids()

     worker_uuids = []
     for index in range(self.get_n_pods(task_type=TaskType.WORKER)):
         worker_uuids.append(
             get_experiment_job_uuid(self.experiment_uuid_instance,
                                     TaskType.WORKER,
                                     index))
     uuids_by_task[TaskType.WORKER] = worker_uuids

     # NOTE(review): the entry is stored under TaskType.SERVER but both the
     # uuid and the pod count are derived from TaskType.PS -- confirm that
     # this mix is intentional.
     server_uuids = []
     for index in range(self.get_n_pods(task_type=TaskType.PS)):
         server_uuids.append(
             get_experiment_job_uuid(self.experiment_uuid_instance,
                                     TaskType.PS,
                                     index))
     uuids_by_task[TaskType.SERVER] = server_uuids

     return uuids_by_task
示例#2
0
 def create_job_uuids(self):
     """Return the job-uuid mapping for an experiment with a single master.

     Returns:
         A dict with one entry, ``TaskType.MASTER``, whose value is a
         one-element list holding the uuid generated for task type
         ``'master'`` and task index ``'0'``.
     """
     master_uuid = get_experiment_job_uuid(self.experiment_uuid_instance,
                                           'master',
                                           '0')
     return {TaskType.MASTER: [master_uuid]}
示例#3
0
 def create_job_uuids(self):
     """Return the job-uuid mapping containing only worker jobs.

     Returns:
         A dict with one entry, ``TaskType.WORKER``, listing one generated
         uuid per pod reported by ``self.get_n_pods`` for workers.
     """
     worker_uuids = []
     for index in range(self.get_n_pods(task_type=TaskType.WORKER)):
         worker_uuids.append(
             get_experiment_job_uuid(self.experiment_uuid_instance,
                                     TaskType.WORKER,
                                     index))
     return {TaskType.WORKER: worker_uuids}
 def test_get_experiment_job_uuid(self):
     """Check the generated job uuid for three spellings of the experiment uuid.

     The experiment uuid is passed as a ``uuid.UUID`` instance, as a hex
     string and as the canonical dashed string. In every case the result
     must equal the hex form of ``uuid5(<experiment uuid>, 'worker-<i>')``.
     """
     experiment_uuids = (
         uuid.uuid4(),        # UUID instance
         uuid.uuid4().hex,    # 32-character hex string
         str(uuid.uuid4()),   # canonical dashed string
     )
     for index, experiment_uuid in enumerate(experiment_uuids):
         if isinstance(experiment_uuid, uuid.UUID):
             namespace = experiment_uuid
         else:
             namespace = uuid.UUID(experiment_uuid)
         expected = uuid.uuid5(namespace, 'worker-{}'.format(index)).hex

         actual = get_experiment_job_uuid(experiment_uuid=experiment_uuid,
                                          task_type='worker',
                                          task_index=index)
         assert actual == expected
示例#5
0
def run(k8s_manager: 'K8SManager') -> None:
    # pylint:disable=too-many-branches
    """Watch pod events and dispatch each pod state to its status handler.

    Iterates over the ``(event_object, pod_state)`` pairs yielded by
    ``ocular.monitor`` and classifies every non-empty pod state using the
    pod's ``app`` label, its container statuses and a few marker labels
    (``job_uuid``, ``tf-replica-index``, ``pytorch-replica-index``,
    ``mpi_job_name``). The state is then passed to ``handle_job_condition``
    with the matching container name and celery task name.

    For tf/pytorch/mpi experiment pods the ``job_uuid`` label is added to the
    payload before it is dispatched.

    Args:
        k8s_manager: Provides the ``k8s_api`` client and the ``namespace``
            that are handed to ``ocular.monitor``.
    """

    # Local cache: configuration is read once, before the watch loop starts.
    label_selector = get_label_selector()
    container_name_experiment_job = conf.get(CONTAINER_NAME_EXPERIMENT_JOBS)
    container_name_tf_job = conf.get(CONTAINER_NAME_TF_JOBS)
    container_name_pytorch_job = conf.get(CONTAINER_NAME_PYTORCH_JOBS)
    container_name_plugin_job = conf.get(CONTAINER_NAME_PLUGIN_JOBS)
    container_name_job = conf.get(CONTAINER_NAME_JOBS)
    container_name_build_job = conf.get(CONTAINER_NAME_BUILD_JOBS)
    watch_ttl = conf.get(TTL_WATCH_STATUSES)
    app_labels_experiment = conf.get(APP_LABELS_EXPERIMENT)
    app_labels_job = conf.get(APP_LABELS_JOB)
    app_labels_build_job = conf.get(APP_LABELS_DOCKERIZER)
    app_labels_tensorboard = conf.get(APP_LABELS_TENSORBOARD)
    app_labels_notebook = conf.get(APP_LABELS_NOTEBOOK)

    for (event_object, pod_state) in ocular.monitor(
            k8s_manager.k8s_api,
            namespace=k8s_manager.namespace,
            container_names=(container_name_experiment_job,
                             container_name_tf_job, container_name_pytorch_job,
                             container_name_plugin_job, container_name_job,
                             container_name_build_job),
            label_selector=label_selector,
            return_event=True,
            watch_ttl=watch_ttl):
        logger.debug('-------------------------------------------\n%s\n',
                     pod_state)
        if not pod_state:
            continue

        status = pod_state['status']
        details = pod_state['details']
        labels = details['labels'] if details and details['labels'] else None
        logger.info("Updating job container %s, %s", status, labels)

        if labels is None:
            # Every check below subscripts or searches the labels; without
            # them the state cannot be routed, and evaluating the checks
            # would raise and terminate the watch loop.
            logger.info("Skipping pod state without labels %s, %s",
                        status, pod_state)
            continue

        container_statuses = details['container_statuses']
        # ``get`` so that a pod without an ``app`` label falls through to the
        # container-name checks instead of raising KeyError.
        app_label = labels.get('app')

        experiment_condition = status and app_label == app_labels_experiment

        experiment_job_condition = (
            container_name_experiment_job in container_statuses
            or 'job_uuid' in labels)

        tf_job_condition = (container_name_tf_job in container_statuses
                            or 'tf-replica-index' in labels)

        mpi_job_condition = 'mpi_job_name' in labels

        # The label key must match the one read in the pytorch branch below.
        pytorch_job_condition = (
            container_name_pytorch_job in container_statuses
            or 'pytorch-replica-index' in labels)

        job_condition = (container_name_job in container_statuses
                         or (status and app_label == app_labels_job))

        plugin_job_condition = (
            container_name_plugin_job in container_statuses
            or (status and app_label
                in (app_labels_tensorboard, app_labels_notebook)))

        dockerizer_job_condition = (
            container_name_build_job in container_statuses
            or (status and app_label == app_labels_build_job))

        if experiment_condition:
            if tf_job_condition:
                # We augment the payload with standard Polyaxon requirement
                pod_state['details']['labels'][
                    'job_uuid'] = get_experiment_job_uuid(
                        experiment_uuid=labels['experiment_uuid'],
                        task_type=labels['task_type'],
                        task_index=labels['tf-replica-index'])
                handle_job_condition(event_object=event_object,
                                     pod_state=pod_state,
                                     status=status,
                                     labels=labels,
                                     container_name=container_name_tf_job,
                                     task_name=K8SEventsCeleryTasks.
                                     K8S_EVENTS_HANDLE_EXPERIMENT_JOB_STATUSES,
                                     update_containers=False)

            elif pytorch_job_condition:
                # We augment the payload with standard Polyaxon requirement
                pod_state['details']['labels'][
                    'job_uuid'] = get_experiment_job_uuid(
                        experiment_uuid=labels['experiment_uuid'],
                        task_type=labels['task_type'],
                        task_index=labels['pytorch-replica-index'])
                handle_job_condition(event_object=event_object,
                                     pod_state=pod_state,
                                     status=status,
                                     labels=labels,
                                     container_name=container_name_pytorch_job,
                                     task_name=K8SEventsCeleryTasks.
                                     K8S_EVENTS_HANDLE_EXPERIMENT_JOB_STATUSES,
                                     update_containers=False)

            elif mpi_job_condition:
                # The task index is taken from the last dash-separated part
                # of the pod name; names without exactly four parts are
                # ignored.
                job_name = pod_state['details']['pod_name']
                parts = job_name.split('-')
                if len(parts) != 4:
                    continue

                # We augment the payload with standard Polyaxon requirement
                pod_state['details']['labels'][
                    'job_uuid'] = get_experiment_job_uuid(
                        experiment_uuid=labels['experiment_uuid'],
                        task_type=labels['task_type'],
                        task_index=parts[-1])

                handle_job_condition(
                    event_object=event_object,
                    pod_state=pod_state,
                    status=status,
                    labels=labels,
                    container_name=container_name_experiment_job,
                    task_name=K8SEventsCeleryTasks.
                    K8S_EVENTS_HANDLE_EXPERIMENT_JOB_STATUSES,
                    update_containers=False)

            elif experiment_job_condition:
                handle_job_condition(
                    event_object=event_object,
                    pod_state=pod_state,
                    status=status,
                    labels=labels,
                    container_name=container_name_experiment_job,
                    task_name=K8SEventsCeleryTasks.
                    K8S_EVENTS_HANDLE_EXPERIMENT_JOB_STATUSES,
                    update_containers=False)

        elif job_condition:
            handle_job_condition(
                event_object=event_object,
                pod_state=pod_state,
                status=status,
                labels=labels,
                container_name=container_name_job,
                task_name=K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_JOB_STATUSES,
                update_containers=False)

        elif plugin_job_condition:
            handle_job_condition(event_object=event_object,
                                 pod_state=pod_state,
                                 status=status,
                                 labels=labels,
                                 container_name=container_name_plugin_job,
                                 task_name=K8SEventsCeleryTasks.
                                 K8S_EVENTS_HANDLE_PLUGIN_JOB_STATUSES,
                                 update_containers=False)

        elif dockerizer_job_condition:
            handle_job_condition(event_object=event_object,
                                 pod_state=pod_state,
                                 status=status,
                                 labels=labels,
                                 container_name=container_name_build_job,
                                 task_name=K8SEventsCeleryTasks.
                                 K8S_EVENTS_HANDLE_BUILD_JOB_STATUSES,
                                 update_containers=False)
        else:
            logger.info("Lost state %s, %s", status, pod_state)
示例#6
0
def run(k8s_manager: 'K8SManager') -> None:
    # pylint:disable=too-many-branches
    """Watch pod events and dispatch each pod state to its status handler.

    Iterates over the ``(event_object, pod_state)`` pairs yielded by
    ``ocular.monitor`` and classifies every non-empty pod state using the
    pod's ``app`` label, its container statuses and a few marker labels
    (``job_uuid``, ``tf-replica-index``, ``pytorch-replica-index``,
    ``mpi_job_name``).

    Experiment pods go through ``handle_experiment_job_condition``; for
    tf/pytorch/mpi pods the ``job_uuid`` label is added to the payload
    first. Job, plugin and build pods are sent to their celery task via
    ``workers.send`` when ``should_handle_job_status`` allows it.

    Args:
        k8s_manager: Provides the ``k8s_api`` client handed to
            ``ocular.monitor``.
    """
    for (event_object, pod_state) in ocular.monitor(
            k8s_manager.k8s_api,
            namespace=conf.get(K8S_NAMESPACE),
            container_names=(conf.get(CONTAINER_NAME_EXPERIMENT_JOBS),
                             conf.get(CONTAINER_NAME_TF_JOBS),
                             conf.get(CONTAINER_NAME_PYTORCH_JOBS),
                             conf.get(CONTAINER_NAME_PLUGIN_JOBS),
                             conf.get(CONTAINER_NAME_JOBS),
                             conf.get(CONTAINER_NAME_BUILD_JOBS)),
            label_selector=get_label_selector(),
            return_event=True,
            watch_ttl=conf.get(TTL_WATCH_STATUSES)):
        logger.debug('-------------------------------------------\n%s\n',
                     pod_state)
        if not pod_state:
            continue

        status = pod_state['status']
        details = pod_state['details']
        labels = details['labels'] if details and details['labels'] else None
        logger.info("Updating job container %s, %s", status, labels)

        if labels is None:
            # Every check below subscripts or searches the labels; without
            # them the state cannot be routed, and evaluating the checks
            # would raise and terminate the watch loop.
            logger.info("Skipping pod state without labels %s, %s",
                        status, pod_state)
            continue

        container_statuses = details['container_statuses']
        # ``get`` so that a pod without an ``app`` label falls through to the
        # container-name checks instead of raising KeyError.
        app_label = labels.get('app')

        experiment_condition = status and app_label == conf.get(
            APP_LABELS_EXPERIMENT)

        experiment_job_condition = (
            conf.get(CONTAINER_NAME_EXPERIMENT_JOBS) in container_statuses
            or 'job_uuid' in labels)

        tf_job_condition = (conf.get(CONTAINER_NAME_TF_JOBS)
                            in container_statuses
                            or 'tf-replica-index' in labels)

        mpi_job_condition = 'mpi_job_name' in labels

        # The label key must match the one read in the pytorch branch below.
        pytorch_job_condition = (conf.get(CONTAINER_NAME_PYTORCH_JOBS)
                                 in container_statuses
                                 or 'pytorch-replica-index' in labels)

        job_condition = (conf.get(CONTAINER_NAME_JOBS) in container_statuses
                         or (status
                             and app_label == conf.get(APP_LABELS_JOB)))

        plugin_job_condition = (
            conf.get(CONTAINER_NAME_PLUGIN_JOBS) in container_statuses
            or (status and app_label in (conf.get(APP_LABELS_TENSORBOARD),
                                         conf.get(APP_LABELS_NOTEBOOK))))

        dockerizer_job_condition = (
            conf.get(CONTAINER_NAME_BUILD_JOBS) in container_statuses
            or (status and app_label == conf.get(APP_LABELS_DOCKERIZER)))

        if experiment_condition:
            if tf_job_condition:
                # We augment the payload with standard Polyaxon requirement
                pod_state['details']['labels'][
                    'job_uuid'] = get_experiment_job_uuid(
                        experiment_uuid=labels['experiment_uuid'],
                        task_type=labels['task_type'],
                        task_index=labels['tf-replica-index'])
                handle_experiment_job_condition(
                    event_object=event_object,
                    pod_state=pod_state,
                    status=status,
                    labels=labels,
                    container_name=conf.get(CONTAINER_NAME_TF_JOBS))

            elif pytorch_job_condition:
                # We augment the payload with standard Polyaxon requirement
                pod_state['details']['labels'][
                    'job_uuid'] = get_experiment_job_uuid(
                        experiment_uuid=labels['experiment_uuid'],
                        task_type=labels['task_type'],
                        task_index=labels['pytorch-replica-index'])
                handle_experiment_job_condition(
                    event_object=event_object,
                    pod_state=pod_state,
                    status=status,
                    labels=labels,
                    container_name=conf.get(CONTAINER_NAME_PYTORCH_JOBS))

            elif mpi_job_condition:
                # The task index is taken from the last dash-separated part
                # of the pod name; names without exactly four parts are
                # ignored.
                job_name = pod_state['details']['pod_name']
                parts = job_name.split('-')
                if len(parts) != 4:
                    continue

                # We augment the payload with standard Polyaxon requirement
                pod_state['details']['labels'][
                    'job_uuid'] = get_experiment_job_uuid(
                        experiment_uuid=labels['experiment_uuid'],
                        task_type=labels['task_type'],
                        task_index=parts[-1])

                handle_experiment_job_condition(
                    event_object=event_object,
                    pod_state=pod_state,
                    status=status,
                    labels=labels,
                    container_name=conf.get(CONTAINER_NAME_EXPERIMENT_JOBS))

            elif experiment_job_condition:
                handle_experiment_job_condition(
                    event_object=event_object,
                    pod_state=pod_state,
                    status=status,
                    labels=labels,
                    container_name=conf.get(CONTAINER_NAME_EXPERIMENT_JOBS))

        elif job_condition:
            update_job_containers(event_object, status,
                                  conf.get(CONTAINER_NAME_JOBS))
            logger.debug("Sending state to handler %s, %s", status, labels)
            # Handle job statuses
            if should_handle_job_status(pod_state=pod_state, status=status):
                workers.send(
                    K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_JOB_STATUSES,
                    kwargs={'payload': pod_state},
                    countdown=None)

        elif plugin_job_condition:
            logger.debug("Sending state to handler %s, %s", status, labels)
            # Handle plugin job statuses
            if should_handle_job_status(pod_state=pod_state, status=status):
                workers.send(
                    K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_PLUGIN_JOB_STATUSES,
                    kwargs={'payload': pod_state},
                    countdown=None)

        elif dockerizer_job_condition:
            logger.debug("Sending state to handler %s, %s", status, labels)
            # Handle dockerizer job statuses
            if should_handle_job_status(pod_state=pod_state, status=status):
                workers.send(
                    K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_BUILD_JOB_STATUSES,
                    kwargs={'payload': pod_state},
                    countdown=None)
        else:
            logger.info("Lost state %s, %s", status, pod_state)
示例#7
0
def run(k8s_manager: 'K8SManager') -> None:
    """Watch pod events and send each pod state to its celery status task.

    Iterates over the ``(event_object, pod_state)`` pairs yielded by
    ``ocular.monitor`` and classifies every non-empty pod state using the
    pod's ``app`` label, its container statuses and a few marker labels
    (``job_uuid``, ``tf-replica-index``, ``pytorch-replica-index``).

    Experiment and job pods first have their containers updated through
    ``update_job_containers``; every recognised state is then sent with
    ``celery_app.send_task`` to the task matching its kind. For tf/pytorch
    pods the ``job_uuid`` label is added to the payload first.

    Args:
        k8s_manager: Provides the ``k8s_api`` client handed to
            ``ocular.monitor``.
    """
    for (event_object, pod_state) in ocular.monitor(
            k8s_manager.k8s_api,
            namespace=conf.get('K8S_NAMESPACE'),
            container_names=(conf.get('CONTAINER_NAME_EXPERIMENT_JOB'),
                             conf.get('CONTAINER_NAME_TF_JOB'),
                             conf.get('CONTAINER_NAME_PYTORCH_JOB'),
                             conf.get('CONTAINER_NAME_PLUGIN_JOB'),
                             conf.get('CONTAINER_NAME_JOB'),
                             conf.get('CONTAINER_NAME_DOCKERIZER_JOB')),
            label_selector=get_label_selector(),
            return_event=True,
            watch_ttl=conf.get('TTL_WATCH_STATUSES')):
        logger.debug('-------------------------------------------\n%s\n',
                     pod_state)
        if not pod_state:
            continue

        status = pod_state['status']
        details = pod_state['details']
        labels = details['labels'] if details and details['labels'] else None
        logger.info("Updating job container %s, %s", status, labels)

        if labels is None:
            # Every check below subscripts or searches the labels; without
            # them the state cannot be routed, and evaluating the checks
            # would raise and terminate the watch loop.
            logger.info("Skipping pod state without labels %s, %s",
                        status, pod_state)
            continue

        container_statuses = details['container_statuses']
        # ``get`` so that a pod without an ``app`` label falls through to the
        # container-name checks instead of raising KeyError.
        app_label = labels.get('app')

        experiment_condition = status and app_label == conf.get(
            'APP_LABELS_EXPERIMENT')

        experiment_job_condition = (
            conf.get('CONTAINER_NAME_EXPERIMENT_JOB') in container_statuses
            or 'job_uuid' in labels)

        tf_job_condition = (conf.get('CONTAINER_NAME_TF_JOB')
                            in container_statuses
                            or 'tf-replica-index' in labels)

        # The label key must match the one read in the pytorch branch below.
        pytorch_job_condition = (conf.get('CONTAINER_NAME_PYTORCH_JOB')
                                 in container_statuses
                                 or 'pytorch-replica-index' in labels)

        job_condition = (conf.get('CONTAINER_NAME_JOB') in container_statuses
                         or (status
                             and app_label == conf.get('APP_LABELS_JOB')))

        plugin_job_condition = (
            conf.get('CONTAINER_NAME_PLUGIN_JOB') in container_statuses
            or (status and app_label in (conf.get('APP_LABELS_TENSORBOARD'),
                                         conf.get('APP_LABELS_NOTEBOOK'))))

        dockerizer_job_condition = (
            conf.get('CONTAINER_NAME_DOCKERIZER_JOB') in container_statuses
            or (status and app_label == conf.get('APP_LABELS_DOCKERIZER')))

        if experiment_condition:
            if tf_job_condition:
                # We augment the payload with standard Polyaxon requirement
                pod_state['details']['labels'][
                    'job_uuid'] = get_experiment_job_uuid(
                        experiment_uuid=labels['experiment_uuid'],
                        task_type=labels['task_type'],
                        task_index=labels['tf-replica-index'])
                update_job_containers(event_object, status,
                                      conf.get('CONTAINER_NAME_TF_JOB'))
                logger.debug("Sending state to handler %s, %s", status, labels)
                # Handle experiment job statuses
                celery_app.send_task(K8SEventsCeleryTasks.
                                     K8S_EVENTS_HANDLE_EXPERIMENT_JOB_STATUSES,
                                     kwargs={'payload': pod_state})

            elif pytorch_job_condition:
                # We augment the payload with standard Polyaxon requirement
                pod_state['details']['labels'][
                    'job_uuid'] = get_experiment_job_uuid(
                        experiment_uuid=labels['experiment_uuid'],
                        task_type=labels['task_type'],
                        task_index=labels['pytorch-replica-index'])
                update_job_containers(event_object, status,
                                      conf.get('CONTAINER_NAME_PYTORCH_JOB'))
                logger.debug("Sending state to handler %s, %s", status, labels)
                # Handle experiment job statuses
                celery_app.send_task(K8SEventsCeleryTasks.
                                     K8S_EVENTS_HANDLE_EXPERIMENT_JOB_STATUSES,
                                     kwargs={'payload': pod_state})

            # ``elif`` keeps the branches mutually exclusive, so a tf/pytorch
            # pod that also satisfies the generic experiment-job check is not
            # dispatched a second time for the same event.
            elif experiment_job_condition:
                update_job_containers(
                    event_object, status,
                    conf.get('CONTAINER_NAME_EXPERIMENT_JOB'))
                logger.debug("Sending state to handler %s, %s", status, labels)
                # Handle experiment job statuses
                celery_app.send_task(K8SEventsCeleryTasks.
                                     K8S_EVENTS_HANDLE_EXPERIMENT_JOB_STATUSES,
                                     kwargs={'payload': pod_state})

        elif job_condition:
            update_job_containers(event_object, status,
                                  conf.get('CONTAINER_NAME_JOB'))
            logger.debug("Sending state to handler %s, %s", status, labels)
            # Handle job statuses
            celery_app.send_task(
                K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_JOB_STATUSES,
                kwargs={'payload': pod_state})

        elif plugin_job_condition:
            logger.debug("Sending state to handler %s, %s", status, labels)
            # Handle plugin job statuses
            celery_app.send_task(
                K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_PLUGIN_JOB_STATUSES,
                kwargs={'payload': pod_state})

        elif dockerizer_job_condition:
            logger.debug("Sending state to handler %s, %s", status, labels)
            # Handle dockerizer job statuses
            celery_app.send_task(
                K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_BUILD_JOB_STATUSES,
                kwargs={'payload': pod_state})
        else:
            logger.info("Lost state %s, %s", status, pod_state)