def run(containers, node, persist):
    """Collect resources for every monitored container and cache them for streaming.

    The monitored container ids are read from ``RedisJobContainers``. GPU
    resources are fetched once, re-keyed by their ``'index'`` field when any
    are reported, and handed to ``update_cluster_node``. For each container
    that ``get_container`` resolves, its resources are gathered and, when the
    owning job or its experiment is currently monitored, stored through
    ``RedisToStream.set_latest_job_resources``.

    Args:
        containers: Mapping indexed by container id; passed to
            ``get_container`` and indexed directly when gathering resources.
        node: Forwarded unchanged to ``get_container_resources``.
        persist: Currently unused, because event publishing is disabled
            (see the TODO in the loop body).
    """
    monitored_ids = RedisJobContainers.get_containers()

    gpu_resources = get_gpu_resources()
    if gpu_resources:
        gpu_resources = {gpu['index']: gpu for gpu in gpu_resources}
    # NOTE(review): invoked whether or not any GPU resources were reported --
    # confirm update_cluster_node tolerates a falsy argument.
    update_cluster_node(gpu_resources)

    for container_id in monitored_ids:
        if not get_container(containers, container_id):
            continue

        # A KeyError while gathering resources is treated as "nothing to report".
        try:
            resources = get_container_resources(node,
                                                containers[container_id],
                                                gpu_resources)
        except KeyError:
            resources = None
        if not resources:
            continue

        payload = resources.to_dict()
        # todo: Re-enable publishing
        # logger.debug("Publishing resources event")
        # celery_app.send_task(
        #     K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_RESOURCES,
        #     kwargs={'payload': payload, 'persist': persist})

        job_uuid = payload['job_uuid']
        experiment_uuid = RedisJobContainers.get_experiment_for_job(job_uuid)
        # Only cache the payload when the job itself, or the experiment it
        # belongs to, is being monitored.
        if (RedisToStream.is_monitored_job_resources(job_uuid) or
                RedisToStream.is_monitored_experiment_resources(experiment_uuid)):
            RedisToStream.set_latest_job_resources(job_uuid, payload)
def test_update_job_containers(self):
    """A container is only tracked once a job with the event's job_uuid exists."""
    event = status_experiment_job_event_with_conditions['object']
    call_kwargs = dict(event=event,
                       status=JobLifeCycle.BUILDING,
                       job_container_name=settings.CONTAINER_NAME_EXPERIMENT_JOB)

    # First pass: no job exists for the event's job_uuid, so nothing is tracked.
    update_job_containers(**call_kwargs)
    assert len(RedisJobContainers.get_containers()) == 0  # pylint:disable=len-as-condition

    # Create the job the event refers to, then read it back from the database.
    expected_job_uuid = event['metadata']['labels']['job_uuid']
    ExperimentJobFactory(uuid=expected_job_uuid)
    job = ExperimentJob.objects.get(uuid=expected_job_uuid)

    # Second pass: the same event now registers the container for monitoring.
    update_job_containers(**call_kwargs)
    assert len(RedisJobContainers.get_containers()) == 1
    container_id = '539e6a6f4209997094802b0657f90576fe129b7f81697120172836073d9bbd75'
    assert RedisJobContainers.get_containers() == [container_id]

    # The stored mapping points back at the job and its experiment.
    stored_job_uuid, stored_experiment_uuid = RedisJobContainers.get_job(container_id)
    assert job.uuid.hex == stored_job_uuid
    assert job.experiment.uuid.hex == stored_experiment_uuid
def test_update_job_containers_with_no_container_statuses(self):
    """Processing the plain status event leaves no containers tracked."""
    event = status_experiment_job_event['object']
    update_job_containers(event=event,
                          status=JobLifeCycle.BUILDING,
                          job_container_name=settings.CONTAINER_NAME_EXPERIMENT_JOB)

    tracked = RedisJobContainers.get_containers()
    assert len(tracked) == 0  # pylint:disable=len-as-condition