def create_job_uuids(self):
    """Extend the base mapping with uuids for worker and server tasks.

    Returns the dict produced by the parent class, augmented with one
    job uuid per worker pod and one per server pod.
    """
    uuids = super().create_job_uuids()

    def _uuids_for(task_type):
        # One deterministic uuid per pod of the given task type.
        pod_count = self.get_n_pods(task_type=task_type)
        return [
            get_experiment_job_uuid(self.experiment_uuid_instance, task_type, index)
            for index in range(pod_count)
        ]

    uuids[TaskType.WORKER] = _uuids_for(TaskType.WORKER)
    # NOTE(review): server uuids are generated with TaskType.PS but stored
    # under TaskType.SERVER — presumably a framework naming mapping; kept as-is.
    uuids[TaskType.SERVER] = _uuids_for(TaskType.PS)
    return uuids
def create_job_uuids(self):
    """Return the job-uuid mapping for a single-master experiment."""
    master_uuid = get_experiment_job_uuid(
        self.experiment_uuid_instance, 'master', '0')
    return {TaskType.MASTER: [master_uuid]}
def create_job_uuids(self):
    """Return one job uuid per worker pod, keyed by the worker task type."""
    worker_count = self.get_n_pods(task_type=TaskType.WORKER)
    worker_uuids = []
    for index in range(worker_count):
        worker_uuids.append(
            get_experiment_job_uuid(
                self.experiment_uuid_instance, TaskType.WORKER, index))
    return {TaskType.WORKER: worker_uuids}
def test_get_experiment_job_uuid(self):
    """The helper accepts UUID objects, hex strings and canonical strings alike."""
    as_object = uuid.uuid4()
    as_hex = uuid.uuid4().hex
    as_canonical = str(uuid.uuid4())
    # (input experiment uuid, expected uuid5 hex) — index encodes the task index.
    cases = [
        (as_object, uuid.uuid5(as_object, 'worker-0').hex),
        (as_hex, uuid.uuid5(uuid.UUID(as_hex), 'worker-1').hex),
        (as_canonical, uuid.uuid5(uuid.UUID(as_canonical), 'worker-2').hex),
    ]
    for index, (experiment_uuid, expected) in enumerate(cases):
        result = get_experiment_job_uuid(
            experiment_uuid=experiment_uuid, task_type='worker', task_index=index)
        assert result == expected
def run(k8s_manager: 'K8SManager') -> None:  # pylint:disable=too-many-branches
    """Watch pod state changes and dispatch each one to its status handler.

    Pod states from ``ocular.monitor`` are classified — by container name
    and pod labels — as experiment jobs (tf / pytorch / mpi / plain),
    generic jobs, plugin jobs (tensorboard / notebook) or dockerizer
    (build) jobs, then forwarded through ``handle_job_condition`` with the
    matching celery task name.

    :param k8s_manager: provides the k8s API client and target namespace.
    """
    # Local cache: read every config value once per run, not once per event.
    label_selector = get_label_selector()
    container_name_experiment_job = conf.get(CONTAINER_NAME_EXPERIMENT_JOBS)
    container_name_tf_job = conf.get(CONTAINER_NAME_TF_JOBS)
    container_name_pytorch_job = conf.get(CONTAINER_NAME_PYTORCH_JOBS)
    container_name_plugin_job = conf.get(CONTAINER_NAME_PLUGIN_JOBS)
    container_name_job = conf.get(CONTAINER_NAME_JOBS)
    container_name_build_job = conf.get(CONTAINER_NAME_BUILD_JOBS)
    watch_ttl = conf.get(TTL_WATCH_STATUSES)
    app_labels_experiment = conf.get(APP_LABELS_EXPERIMENT)
    app_labels_job = conf.get(APP_LABELS_JOB)
    app_labels_build_job = conf.get(APP_LABELS_DOCKERIZER)
    app_labels_tensorboard = conf.get(APP_LABELS_TENSORBOARD)
    app_labels_notebook = conf.get(APP_LABELS_NOTEBOOK)
    for (event_object, pod_state) in ocular.monitor(
            k8s_manager.k8s_api,
            namespace=k8s_manager.namespace,
            container_names=(container_name_experiment_job,
                             container_name_tf_job,
                             container_name_pytorch_job,
                             container_name_plugin_job,
                             container_name_job,
                             container_name_build_job),
            label_selector=label_selector,
            return_event=True,
            watch_ttl=watch_ttl):
        logger.debug('-------------------------------------------\n%s\n', pod_state)
        if not pod_state:
            continue

        status = pod_state['status']
        labels = None
        if pod_state['details'] and pod_state['details']['labels']:
            labels = pod_state['details']['labels']
        logger.info("Updating job container %s, %s", status, labels)

        # NOTE(review): labels can still be None here; the lookups below would
        # then raise — presumably the monitor always supplies labels. TODO confirm.
        experiment_condition = status and labels['app'] == app_labels_experiment
        experiment_job_condition = (
            container_name_experiment_job in pod_state['details']['container_statuses'] or
            'job_uuid' in labels)
        tf_job_condition = (
            container_name_tf_job in pod_state['details']['container_statuses'] or
            'tf-replica-index' in labels)
        mpi_job_condition = 'mpi_job_name' in labels
        # Fix: the label key was misspelled 'pytroch-replica-index', which could
        # never agree with the 'pytorch-replica-index' key read in the pytorch
        # branch below (missed detection, or KeyError when the check matched).
        pytorch_job_condition = (
            container_name_pytorch_job in pod_state['details']['container_statuses'] or
            'pytorch-replica-index' in labels)
        job_condition = (
            container_name_job in pod_state['details']['container_statuses'] or
            (status and labels['app'] == app_labels_job))
        plugin_job_condition = (
            container_name_plugin_job in pod_state['details']['container_statuses'] or
            (status and labels['app'] in (app_labels_tensorboard, app_labels_notebook)))
        dockerizer_job_condition = (
            container_name_build_job in pod_state['details']['container_statuses'] or
            (status and labels['app'] == app_labels_build_job))

        if experiment_condition:
            if tf_job_condition:
                # We augment the payload with standard Polyaxon requirement
                pod_state['details']['labels']['job_uuid'] = get_experiment_job_uuid(
                    experiment_uuid=labels['experiment_uuid'],
                    task_type=labels['task_type'],
                    task_index=labels['tf-replica-index'])
                handle_job_condition(
                    event_object=event_object,
                    pod_state=pod_state,
                    status=status,
                    labels=labels,
                    container_name=container_name_tf_job,
                    task_name=K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_EXPERIMENT_JOB_STATUSES,
                    update_containers=False)
            elif pytorch_job_condition:
                # We augment the payload with standard Polyaxon requirement
                pod_state['details']['labels']['job_uuid'] = get_experiment_job_uuid(
                    experiment_uuid=labels['experiment_uuid'],
                    task_type=labels['task_type'],
                    task_index=labels['pytorch-replica-index'])
                handle_job_condition(
                    event_object=event_object,
                    pod_state=pod_state,
                    status=status,
                    labels=labels,
                    container_name=container_name_pytorch_job,
                    task_name=K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_EXPERIMENT_JOB_STATUSES,
                    update_containers=False)
            elif mpi_job_condition:
                # MPI pods carry the task index as the last '-'-separated
                # segment of the pod name; other name shapes are skipped.
                job_name = pod_state['details']['pod_name']
                parts = job_name.split('-')
                if len(parts) != 4:
                    continue
                # We augment the payload with standard Polyaxon requirement
                pod_state['details']['labels']['job_uuid'] = get_experiment_job_uuid(
                    experiment_uuid=labels['experiment_uuid'],
                    task_type=labels['task_type'],
                    task_index=parts[-1])
                handle_job_condition(
                    event_object=event_object,
                    pod_state=pod_state,
                    status=status,
                    labels=labels,
                    container_name=container_name_experiment_job,
                    task_name=K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_EXPERIMENT_JOB_STATUSES,
                    update_containers=False)
            elif experiment_job_condition:
                handle_job_condition(
                    event_object=event_object,
                    pod_state=pod_state,
                    status=status,
                    labels=labels,
                    container_name=container_name_experiment_job,
                    task_name=K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_EXPERIMENT_JOB_STATUSES,
                    update_containers=False)
        elif job_condition:
            handle_job_condition(
                event_object=event_object,
                pod_state=pod_state,
                status=status,
                labels=labels,
                container_name=container_name_job,
                task_name=K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_JOB_STATUSES,
                update_containers=False)
        elif plugin_job_condition:
            handle_job_condition(
                event_object=event_object,
                pod_state=pod_state,
                status=status,
                labels=labels,
                container_name=container_name_plugin_job,
                task_name=K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_PLUGIN_JOB_STATUSES,
                update_containers=False)
        elif dockerizer_job_condition:
            handle_job_condition(
                event_object=event_object,
                pod_state=pod_state,
                status=status,
                labels=labels,
                container_name=container_name_build_job,
                task_name=K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_BUILD_JOB_STATUSES,
                update_containers=False)
        else:
            logger.info("Lost state %s, %s", status, pod_state)
def run(k8s_manager: 'K8SManager') -> None:  # pylint:disable=too-many-branches
    """Watch pod state changes and dispatch each one to its status handler.

    Experiment pods (tf / pytorch / mpi / plain) go through
    ``handle_experiment_job_condition``; generic, plugin and dockerizer job
    pods have their containers updated and their state sent via
    ``workers.send`` to the matching celery task.

    :param k8s_manager: provides the k8s API client; namespace comes from conf.
    """
    for (event_object, pod_state) in ocular.monitor(
            k8s_manager.k8s_api,
            namespace=conf.get(K8S_NAMESPACE),
            container_names=(conf.get(CONTAINER_NAME_EXPERIMENT_JOBS),
                             conf.get(CONTAINER_NAME_TF_JOBS),
                             conf.get(CONTAINER_NAME_PYTORCH_JOBS),
                             conf.get(CONTAINER_NAME_PLUGIN_JOBS),
                             conf.get(CONTAINER_NAME_JOBS),
                             conf.get(CONTAINER_NAME_BUILD_JOBS)),
            label_selector=get_label_selector(),
            return_event=True,
            watch_ttl=conf.get(TTL_WATCH_STATUSES)):
        logger.debug('-------------------------------------------\n%s\n', pod_state)
        if not pod_state:
            continue

        status = pod_state['status']
        labels = None
        if pod_state['details'] and pod_state['details']['labels']:
            labels = pod_state['details']['labels']
        logger.info("Updating job container %s, %s", status, labels)

        # NOTE(review): labels can still be None here; the lookups below would
        # then raise — presumably the monitor always supplies labels. TODO confirm.
        experiment_condition = status and labels['app'] == conf.get(
            APP_LABELS_EXPERIMENT)
        experiment_job_condition = (
            conf.get(CONTAINER_NAME_EXPERIMENT_JOBS) in
            pod_state['details']['container_statuses'] or
            'job_uuid' in labels)
        tf_job_condition = (
            conf.get(CONTAINER_NAME_TF_JOBS) in
            pod_state['details']['container_statuses'] or
            'tf-replica-index' in labels)
        mpi_job_condition = 'mpi_job_name' in labels
        # Fix: the label key was misspelled 'pytroch-replica-index', which could
        # never agree with the 'pytorch-replica-index' key read in the pytorch
        # branch below (missed detection, or KeyError when the check matched).
        pytorch_job_condition = (
            conf.get(CONTAINER_NAME_PYTORCH_JOBS) in
            pod_state['details']['container_statuses'] or
            'pytorch-replica-index' in labels)
        job_condition = (
            conf.get(CONTAINER_NAME_JOBS) in
            pod_state['details']['container_statuses'] or
            (status and labels['app'] == conf.get(APP_LABELS_JOB)))
        plugin_job_condition = (
            conf.get(CONTAINER_NAME_PLUGIN_JOBS) in
            pod_state['details']['container_statuses'] or
            (status and labels['app'] in (conf.get(APP_LABELS_TENSORBOARD),
                                          conf.get(APP_LABELS_NOTEBOOK))))
        dockerizer_job_condition = (
            conf.get(CONTAINER_NAME_BUILD_JOBS) in
            pod_state['details']['container_statuses'] or
            (status and labels['app'] == conf.get(APP_LABELS_DOCKERIZER)))

        if experiment_condition:
            if tf_job_condition:
                # We augment the payload with standard Polyaxon requirement
                pod_state['details']['labels']['job_uuid'] = get_experiment_job_uuid(
                    experiment_uuid=labels['experiment_uuid'],
                    task_type=labels['task_type'],
                    task_index=labels['tf-replica-index'])
                handle_experiment_job_condition(
                    event_object=event_object,
                    pod_state=pod_state,
                    status=status,
                    labels=labels,
                    container_name=conf.get(CONTAINER_NAME_TF_JOBS))
            elif pytorch_job_condition:
                # We augment the payload with standard Polyaxon requirement
                pod_state['details']['labels']['job_uuid'] = get_experiment_job_uuid(
                    experiment_uuid=labels['experiment_uuid'],
                    task_type=labels['task_type'],
                    task_index=labels['pytorch-replica-index'])
                handle_experiment_job_condition(
                    event_object=event_object,
                    pod_state=pod_state,
                    status=status,
                    labels=labels,
                    container_name=conf.get(CONTAINER_NAME_PYTORCH_JOBS))
            elif mpi_job_condition:
                # MPI pods carry the task index as the last '-'-separated
                # segment of the pod name; other name shapes are skipped.
                job_name = pod_state['details']['pod_name']
                parts = job_name.split('-')
                if len(parts) != 4:
                    continue
                # We augment the payload with standard Polyaxon requirement
                pod_state['details']['labels']['job_uuid'] = get_experiment_job_uuid(
                    experiment_uuid=labels['experiment_uuid'],
                    task_type=labels['task_type'],
                    task_index=parts[-1])
                handle_experiment_job_condition(
                    event_object=event_object,
                    pod_state=pod_state,
                    status=status,
                    labels=labels,
                    container_name=conf.get(CONTAINER_NAME_EXPERIMENT_JOBS))
            elif experiment_job_condition:
                handle_experiment_job_condition(
                    event_object=event_object,
                    pod_state=pod_state,
                    status=status,
                    labels=labels,
                    container_name=conf.get(CONTAINER_NAME_EXPERIMENT_JOBS))
        elif job_condition:
            update_job_containers(event_object, status, conf.get(CONTAINER_NAME_JOBS))
            logger.debug("Sending state to handler %s, %s", status, labels)
            # Handle job statuses
            if should_handle_job_status(pod_state=pod_state, status=status):
                workers.send(
                    K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_JOB_STATUSES,
                    kwargs={'payload': pod_state},
                    countdown=None)
        elif plugin_job_condition:
            logger.debug("Sending state to handler %s, %s", status, labels)
            # Handle plugin job statuses
            if should_handle_job_status(pod_state=pod_state, status=status):
                workers.send(
                    K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_PLUGIN_JOB_STATUSES,
                    kwargs={'payload': pod_state},
                    countdown=None)
        elif dockerizer_job_condition:
            logger.debug("Sending state to handler %s, %s", status, labels)
            # Handle dockerizer job statuses
            if should_handle_job_status(pod_state=pod_state, status=status):
                workers.send(
                    K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_BUILD_JOB_STATUSES,
                    kwargs={'payload': pod_state},
                    countdown=None)
        else:
            logger.info("Lost state %s, %s", status, pod_state)
def run(k8s_manager: 'K8SManager') -> None:
    """Watch pod state changes and dispatch each one to its status handler.

    Classifies pod states by container name / labels into experiment
    (tf / pytorch / plain), generic, plugin (tensorboard / notebook) and
    dockerizer jobs, updates the job containers where applicable and sends
    the payload to the matching celery task via ``celery_app.send_task``.

    :param k8s_manager: provides the k8s API client; namespace comes from conf.
    """
    for (event_object, pod_state) in ocular.monitor(
            k8s_manager.k8s_api,
            namespace=conf.get('K8S_NAMESPACE'),
            container_names=(conf.get('CONTAINER_NAME_EXPERIMENT_JOB'),
                             conf.get('CONTAINER_NAME_TF_JOB'),
                             conf.get('CONTAINER_NAME_PYTORCH_JOB'),
                             conf.get('CONTAINER_NAME_PLUGIN_JOB'),
                             conf.get('CONTAINER_NAME_JOB'),
                             conf.get('CONTAINER_NAME_DOCKERIZER_JOB')),
            label_selector=get_label_selector(),
            return_event=True,
            watch_ttl=conf.get('TTL_WATCH_STATUSES')):
        logger.debug('-------------------------------------------\n%s\n', pod_state)
        if not pod_state:
            continue

        status = pod_state['status']
        labels = None
        if pod_state['details'] and pod_state['details']['labels']:
            labels = pod_state['details']['labels']
        logger.info("Updating job container %s, %s", status, labels)

        # NOTE(review): labels can still be None here; the lookups below would
        # then raise — presumably the monitor always supplies labels. TODO confirm.
        experiment_condition = status and labels['app'] == conf.get(
            'APP_LABELS_EXPERIMENT')
        experiment_job_condition = (
            conf.get('CONTAINER_NAME_EXPERIMENT_JOB') in
            pod_state['details']['container_statuses'] or
            'job_uuid' in labels)
        tf_job_condition = (
            conf.get('CONTAINER_NAME_TF_JOB') in
            pod_state['details']['container_statuses'] or
            'tf-replica-index' in labels)
        # Fix: the label key was misspelled 'pytroch-replica-index', which could
        # never agree with the 'pytorch-replica-index' key read in the pytorch
        # branch below (missed detection, or KeyError when the check matched).
        pytorch_job_condition = (
            conf.get('CONTAINER_NAME_PYTORCH_JOB') in
            pod_state['details']['container_statuses'] or
            'pytorch-replica-index' in labels)
        job_condition = (
            conf.get('CONTAINER_NAME_JOB') in
            pod_state['details']['container_statuses'] or
            (status and labels['app'] == conf.get('APP_LABELS_JOB')))
        plugin_job_condition = (
            conf.get('CONTAINER_NAME_PLUGIN_JOB') in
            pod_state['details']['container_statuses'] or
            (status and labels['app'] in (conf.get('APP_LABELS_TENSORBOARD'),
                                          conf.get('APP_LABELS_NOTEBOOK'))))
        dockerizer_job_condition = (
            conf.get('CONTAINER_NAME_DOCKERIZER_JOB') in
            pod_state['details']['container_statuses'] or
            (status and labels['app'] == conf.get('APP_LABELS_DOCKERIZER')))

        if experiment_condition:
            if tf_job_condition:
                # We augment the payload with standard Polyaxon requirement
                pod_state['details']['labels']['job_uuid'] = get_experiment_job_uuid(
                    experiment_uuid=labels['experiment_uuid'],
                    task_type=labels['task_type'],
                    task_index=labels['tf-replica-index'])
                update_job_containers(event_object, status,
                                      conf.get('CONTAINER_NAME_TF_JOB'))
                logger.debug("Sending state to handler %s, %s", status, labels)
                # Handle experiment job statuses
                celery_app.send_task(
                    K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_EXPERIMENT_JOB_STATUSES,
                    kwargs={'payload': pod_state})
            elif pytorch_job_condition:
                # We augment the payload with standard Polyaxon requirement
                pod_state['details']['labels']['job_uuid'] = get_experiment_job_uuid(
                    experiment_uuid=labels['experiment_uuid'],
                    task_type=labels['task_type'],
                    task_index=labels['pytorch-replica-index'])
                update_job_containers(event_object, status,
                                      conf.get('CONTAINER_NAME_PYTORCH_JOB'))
                logger.debug("Sending state to handler %s, %s", status, labels)
                # Handle experiment job statuses
                celery_app.send_task(
                    K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_EXPERIMENT_JOB_STATUSES,
                    kwargs={'payload': pod_state})
            # Fix: this was a plain `if`, so a pod that matched the tf/pytorch
            # branch AND the experiment-job condition was dispatched twice;
            # sibling implementations of this watcher use `elif` here.
            elif experiment_job_condition:
                update_job_containers(
                    event_object, status, conf.get('CONTAINER_NAME_EXPERIMENT_JOB'))
                logger.debug("Sending state to handler %s, %s", status, labels)
                # Handle experiment job statuses
                celery_app.send_task(
                    K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_EXPERIMENT_JOB_STATUSES,
                    kwargs={'payload': pod_state})
        elif job_condition:
            update_job_containers(event_object, status, conf.get('CONTAINER_NAME_JOB'))
            logger.debug("Sending state to handler %s, %s", status, labels)
            # Handle job statuses
            celery_app.send_task(
                K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_JOB_STATUSES,
                kwargs={'payload': pod_state})
        elif plugin_job_condition:
            logger.debug("Sending state to handler %s, %s", status, labels)
            # Handle plugin job statuses
            celery_app.send_task(
                K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_PLUGIN_JOB_STATUSES,
                kwargs={'payload': pod_state})
        elif dockerizer_job_condition:
            logger.debug("Sending state to handler %s, %s", status, labels)
            # Handle dockerizer job statuses
            celery_app.send_task(
                K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_BUILD_JOB_STATUSES,
                kwargs={'payload': pod_state})
        else:
            logger.info("Lost state %s, %s", status, pod_state)