def run(k8s_manager: 'K8SManager') -> None:
    """Watch job pods and forward each reported pod state to its handler task.

    Iterates over the ``(event_object, pod_state)`` pairs produced by
    ``ocular.monitor`` and classifies every non-empty pod state as an
    experiment job, a generic job, a plugin job (tensorboard/notebook) or a
    dockerizer (build) job. A pod belongs to a kind when the kind's container
    name appears in the pod's container statuses, or when the pod has a status
    and its ``app`` label equals the kind's configured app label. The first
    matching kind, in that order, gets the payload through
    ``celery_app.send_task``; pod states matching no kind are logged as lost.

    Args:
        k8s_manager: Manager whose ``k8s_api`` client is handed to
            ``ocular.monitor``.
    """
    # Settings do not change while a watch is running, so resolve them once
    # instead of on every event.
    container_name_experiment_job = conf.get('CONTAINER_NAME_EXPERIMENT_JOB')
    container_name_plugin_job = conf.get('CONTAINER_NAME_PLUGIN_JOB')
    container_name_job = conf.get('CONTAINER_NAME_JOB')
    container_name_dockerizer_job = conf.get('CONTAINER_NAME_DOCKERIZER_JOB')
    app_labels_experiment = conf.get('APP_LABELS_EXPERIMENT')
    app_labels_job = conf.get('APP_LABELS_JOB')
    app_labels_dockerizer = conf.get('APP_LABELS_DOCKERIZER')
    plugin_app_labels = (conf.get('APP_LABELS_TENSORBOARD'),
                         conf.get('APP_LABELS_NOTEBOOK'))

    for (event_object, pod_state) in ocular.monitor(
            k8s_manager.k8s_api,
            namespace=conf.get('K8S_NAMESPACE'),
            container_names=(container_name_experiment_job,
                             container_name_plugin_job,
                             container_name_job,
                             container_name_dockerizer_job),
            label_selector=get_label_selector(),
            return_event=True,
            watch_ttl=conf.get('TTL_WATCH_STATUSES')):
        logger.debug('-------------------------------------------\n%s\n', pod_state)
        if not pod_state:
            continue

        status = pod_state['status']
        # ``details`` and its labels may be empty; normalise them so the
        # classification below cannot fail on a ``None`` subscript.
        details = pod_state['details'] or {}
        labels = details.get('labels') or None
        container_statuses = details.get('container_statuses') or ()
        app_label = labels.get('app') if labels else None
        # Label-based matching only applies to pods that report a status and
        # actually carry an ``app`` label.
        has_app_label = bool(status) and app_label is not None
        logger.info("Updating job container %s, %s", status, labels)

        experiment_job_condition = (
            container_name_experiment_job in container_statuses or
            (has_app_label and app_label == app_labels_experiment))
        job_condition = (
            container_name_job in container_statuses or
            (has_app_label and app_label == app_labels_job))
        plugin_job_condition = (
            container_name_plugin_job in container_statuses or
            (has_app_label and app_label in plugin_app_labels))
        dockerizer_job_condition = (
            container_name_dockerizer_job in container_statuses or
            (has_app_label and app_label == app_labels_dockerizer))

        if experiment_job_condition:
            update_job_containers(event_object, status, container_name_experiment_job)
            logger.debug("Sending state to handler %s, %s", status, labels)
            # Handle experiment job statuses
            celery_app.send_task(
                K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_EXPERIMENT_JOB_STATUSES,
                kwargs={'payload': pod_state})
        elif job_condition:
            update_job_containers(event_object, status, container_name_job)
            logger.debug("Sending state to handler %s, %s", status, labels)
            # Handle job statuses
            celery_app.send_task(
                K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_JOB_STATUSES,
                kwargs={'payload': pod_state})
        elif plugin_job_condition:
            logger.debug("Sending state to handler %s, %s", status, labels)
            # Handle plugin job statuses
            celery_app.send_task(
                K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_PLUGIN_JOB_STATUSES,
                kwargs={'payload': pod_state})
        elif dockerizer_job_condition:
            logger.debug("Sending state to handler %s, %s", status, labels)
            # Handle dockerizer job statuses
            celery_app.send_task(
                K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_BUILD_JOB_STATUSES,
                kwargs={'payload': pod_state})
        else:
            logger.info("Lost state %s, %s", status, pod_state)
def run(k8s_manager: 'K8SManager') -> None:  # pylint:disable=too-many-branches
    """Watch job pods and route each reported pod state to its handler.

    Iterates over the ``(event_object, pod_state)`` pairs produced by
    ``ocular.monitor``. Pods whose ``app`` label equals the experiment app
    label are sub-classified as TF, PyTorch, MPI or plain experiment jobs;
    for the first three a ``job_uuid`` label is computed with
    ``get_experiment_job_uuid`` and written into the payload before
    ``handle_experiment_job_condition`` is called. Other pods are matched, in
    order, against the generic job, plugin job and build job kinds and sent
    through ``workers.send`` when ``should_handle_job_status`` allows it.
    Pod states matching no kind are logged as lost.

    Args:
        k8s_manager: Manager whose ``k8s_api`` client is handed to
            ``ocular.monitor``.
    """
    # Settings do not change while a watch is running, so resolve them once
    # instead of on every event.
    container_name_experiment_job = conf.get(CONTAINER_NAME_EXPERIMENT_JOBS)
    container_name_tf_job = conf.get(CONTAINER_NAME_TF_JOBS)
    container_name_pytorch_job = conf.get(CONTAINER_NAME_PYTORCH_JOBS)
    container_name_plugin_job = conf.get(CONTAINER_NAME_PLUGIN_JOBS)
    container_name_job = conf.get(CONTAINER_NAME_JOBS)
    container_name_build_job = conf.get(CONTAINER_NAME_BUILD_JOBS)
    app_labels_experiment = conf.get(APP_LABELS_EXPERIMENT)
    app_labels_job = conf.get(APP_LABELS_JOB)
    app_labels_build_job = conf.get(APP_LABELS_DOCKERIZER)
    plugin_app_labels = (conf.get(APP_LABELS_TENSORBOARD),
                         conf.get(APP_LABELS_NOTEBOOK))

    for (event_object, pod_state) in ocular.monitor(
            k8s_manager.k8s_api,
            namespace=conf.get(K8S_NAMESPACE),
            container_names=(container_name_experiment_job,
                             container_name_tf_job,
                             container_name_pytorch_job,
                             container_name_plugin_job,
                             container_name_job,
                             container_name_build_job),
            label_selector=get_label_selector(),
            return_event=True,
            watch_ttl=conf.get(TTL_WATCH_STATUSES)):
        logger.debug('-------------------------------------------\n%s\n', pod_state)
        if not pod_state:
            continue

        status = pod_state['status']
        # ``details`` and its labels may be empty; normalise them so the
        # classification below cannot fail on a ``None`` subscript or
        # membership test.
        details = pod_state['details'] or {}
        labels = details.get('labels') or None
        label_map = labels or {}
        container_statuses = details.get('container_statuses') or ()
        app_label = label_map.get('app')
        # Label-based matching only applies to pods that report a status and
        # actually carry an ``app`` label.
        has_app_label = bool(status) and app_label is not None
        logger.info("Updating job container %s, %s", status, labels)

        experiment_condition = has_app_label and app_label == app_labels_experiment
        experiment_job_condition = (
            container_name_experiment_job in container_statuses or
            'job_uuid' in label_map)
        tf_job_condition = (
            container_name_tf_job in container_statuses or
            'tf-replica-index' in label_map)
        mpi_job_condition = 'mpi_job_name' in label_map
        # The label tested here must be the one read in the PyTorch branch
        # below ('pytorch-replica-index').
        pytorch_job_condition = (
            container_name_pytorch_job in container_statuses or
            'pytorch-replica-index' in label_map)
        job_condition = (
            container_name_job in container_statuses or
            (has_app_label and app_label == app_labels_job))
        plugin_job_condition = (
            container_name_plugin_job in container_statuses or
            (has_app_label and app_label in plugin_app_labels))
        dockerizer_job_condition = (
            container_name_build_job in container_statuses or
            (has_app_label and app_label == app_labels_build_job))

        if experiment_condition:
            # ``labels`` is a non-empty dict here: the experiment condition
            # requires an ``app`` label.
            container_name = container_name_experiment_job
            task_index = None  # None: no job_uuid needs to be computed
            if tf_job_condition:
                task_index = labels['tf-replica-index']
                container_name = container_name_tf_job
            elif pytorch_job_condition:
                task_index = labels['pytorch-replica-index']
                container_name = container_name_pytorch_job
            elif mpi_job_condition:
                # The task index is taken from the last of the four
                # dash-separated parts of the pod name; other shapes are
                # skipped.
                parts = details['pod_name'].split('-')
                if len(parts) != 4:
                    continue
                task_index = parts[-1]
            elif not experiment_job_condition:
                # Experiment pod matching no sub-kind: nothing to dispatch.
                continue

            if task_index is not None:
                # We augment the payload with standard Polyaxon requirement
                details['labels']['job_uuid'] = get_experiment_job_uuid(
                    experiment_uuid=labels['experiment_uuid'],
                    task_type=labels['task_type'],
                    task_index=task_index)
            handle_experiment_job_condition(
                event_object=event_object,
                pod_state=pod_state,
                status=status,
                labels=labels,
                container_name=container_name)
        elif job_condition:
            update_job_containers(event_object, status, container_name_job)
            logger.debug("Sending state to handler %s, %s", status, labels)
            # Handle job statuses
            if should_handle_job_status(pod_state=pod_state, status=status):
                workers.send(
                    K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_JOB_STATUSES,
                    kwargs={'payload': pod_state},
                    countdown=None)
        elif plugin_job_condition:
            logger.debug("Sending state to handler %s, %s", status, labels)
            # Handle plugin job statuses
            if should_handle_job_status(pod_state=pod_state, status=status):
                workers.send(
                    K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_PLUGIN_JOB_STATUSES,
                    kwargs={'payload': pod_state},
                    countdown=None)
        elif dockerizer_job_condition:
            logger.debug("Sending state to handler %s, %s", status, labels)
            # Handle dockerizer job statuses
            if should_handle_job_status(pod_state=pod_state, status=status):
                workers.send(
                    K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_BUILD_JOB_STATUSES,
                    kwargs={'payload': pod_state},
                    countdown=None)
        else:
            logger.info("Lost state %s, %s", status, pod_state)
def run(k8s_manager: 'K8SManager') -> None:  # pylint:disable=too-many-branches
    """Watch job pods and route each reported pod state to ``handle_job_condition``.

    Iterates over the ``(event_object, pod_state)`` pairs produced by
    ``ocular.monitor`` in ``k8s_manager.namespace``. Pods whose ``app`` label
    equals the experiment app label are sub-classified as TF, PyTorch, MPI or
    plain experiment jobs; for the first three a ``job_uuid`` label is
    computed with ``get_experiment_job_uuid`` and written into the payload.
    Other pods are matched, in order, against the generic job, plugin job and
    build job kinds. Every matched pod state is passed to
    ``handle_job_condition`` with the kind's container name and task name and
    ``update_containers=False``; pod states matching no kind are logged as
    lost.

    Args:
        k8s_manager: Manager providing the ``k8s_api`` client and the
            ``namespace`` to watch.
    """
    # Local cache
    label_selector = get_label_selector()
    container_name_experiment_job = conf.get(CONTAINER_NAME_EXPERIMENT_JOBS)
    container_name_tf_job = conf.get(CONTAINER_NAME_TF_JOBS)
    container_name_pytorch_job = conf.get(CONTAINER_NAME_PYTORCH_JOBS)
    container_name_plugin_job = conf.get(CONTAINER_NAME_PLUGIN_JOBS)
    container_name_job = conf.get(CONTAINER_NAME_JOBS)
    container_name_build_job = conf.get(CONTAINER_NAME_BUILD_JOBS)
    watch_ttl = conf.get(TTL_WATCH_STATUSES)
    app_labels_experiment = conf.get(APP_LABELS_EXPERIMENT)
    app_labels_job = conf.get(APP_LABELS_JOB)
    app_labels_build_job = conf.get(APP_LABELS_DOCKERIZER)
    app_labels_tensorboard = conf.get(APP_LABELS_TENSORBOARD)
    app_labels_notebook = conf.get(APP_LABELS_NOTEBOOK)

    for (event_object, pod_state) in ocular.monitor(
            k8s_manager.k8s_api,
            namespace=k8s_manager.namespace,
            container_names=(container_name_experiment_job,
                             container_name_tf_job,
                             container_name_pytorch_job,
                             container_name_plugin_job,
                             container_name_job,
                             container_name_build_job),
            label_selector=label_selector,
            return_event=True,
            watch_ttl=watch_ttl):
        logger.debug('-------------------------------------------\n%s\n', pod_state)
        if not pod_state:
            continue

        status = pod_state['status']
        # ``details`` and its labels may be empty; normalise them so the
        # classification below cannot fail on a ``None`` subscript or
        # membership test.
        details = pod_state['details'] or {}
        labels = details.get('labels') or None
        label_map = labels or {}
        container_statuses = details.get('container_statuses') or ()
        app_label = label_map.get('app')
        # Label-based matching only applies to pods that report a status and
        # actually carry an ``app`` label.
        has_app_label = bool(status) and app_label is not None
        logger.info("Updating job container %s, %s", status, labels)

        experiment_condition = has_app_label and app_label == app_labels_experiment
        experiment_job_condition = (
            container_name_experiment_job in container_statuses or
            'job_uuid' in label_map)
        tf_job_condition = (
            container_name_tf_job in container_statuses or
            'tf-replica-index' in label_map)
        mpi_job_condition = 'mpi_job_name' in label_map
        # The label tested here must be the one read in the PyTorch branch
        # below ('pytorch-replica-index').
        pytorch_job_condition = (
            container_name_pytorch_job in container_statuses or
            'pytorch-replica-index' in label_map)
        job_condition = (
            container_name_job in container_statuses or
            (has_app_label and app_label == app_labels_job))
        plugin_job_condition = (
            container_name_plugin_job in container_statuses or
            (has_app_label and
             app_label in (app_labels_tensorboard, app_labels_notebook)))
        dockerizer_job_condition = (
            container_name_build_job in container_statuses or
            (has_app_label and app_label == app_labels_build_job))

        if experiment_condition:
            # ``labels`` is a non-empty dict here: the experiment condition
            # requires an ``app`` label.
            task_name = K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_EXPERIMENT_JOB_STATUSES
            container_name = container_name_experiment_job
            task_index = None  # None: no job_uuid needs to be computed
            if tf_job_condition:
                task_index = labels['tf-replica-index']
                container_name = container_name_tf_job
            elif pytorch_job_condition:
                task_index = labels['pytorch-replica-index']
                container_name = container_name_pytorch_job
            elif mpi_job_condition:
                # The task index is taken from the last of the four
                # dash-separated parts of the pod name; other shapes are
                # skipped.
                parts = details['pod_name'].split('-')
                if len(parts) != 4:
                    continue
                task_index = parts[-1]
            elif not experiment_job_condition:
                # Experiment pod matching no sub-kind: nothing to dispatch.
                continue

            if task_index is not None:
                # We augment the payload with standard Polyaxon requirement
                details['labels']['job_uuid'] = get_experiment_job_uuid(
                    experiment_uuid=labels['experiment_uuid'],
                    task_type=labels['task_type'],
                    task_index=task_index)
        elif job_condition:
            container_name = container_name_job
            task_name = K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_JOB_STATUSES
        elif plugin_job_condition:
            container_name = container_name_plugin_job
            task_name = K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_PLUGIN_JOB_STATUSES
        elif dockerizer_job_condition:
            container_name = container_name_build_job
            task_name = K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_BUILD_JOB_STATUSES
        else:
            logger.info("Lost state %s, %s", status, pod_state)
            continue

        # Every recognised kind goes through the same handler; only the
        # container name and the task name differ.
        handle_job_condition(
            event_object=event_object,
            pod_state=pod_state,
            status=status,
            labels=labels,
            container_name=container_name,
            task_name=task_name,
            update_containers=False)
import ocular
from kubernetes import client

# Example driver: build a Kubernetes API client, then print every pod state
# that ``ocular.monitor`` yields for the selected containers.
# NOTE(review): ``configuration`` receives a plain string here; the kubernetes
# client presumably expects a Configuration object -- confirm before reuse.
k8s_configuration = 'c29d119df3b14fb7a82207f29c8a2156c505a5948f3e4dcba6229c92b35c9006'
watched_containers = (
    'plx-notebook-be90630d9d0740ada845276f0e3f70a4-749dc96cd-zr29h',
)
pod_selector = 'app in (workers,dashboard),type=runner'

api_client = client.api_client.ApiClient(configuration=k8s_configuration)
pod_states = ocular.monitor(
    api_client,
    namespace='polyaxon',
    container_names=watched_containers,
    label_selector=pod_selector)
for pod_state in pod_states:
    print(pod_state)