def test_master_success_influences_other_experiment_workers_status(self):
    """A succeeded master job should mark workers and the experiment succeeded."""
    with patch('scheduler.tasks.experiments.experiments_build.apply_async') as _:  # noqa
        with patch.object(Experiment, 'set_status') as _:  # noqa
            experiment = ExperimentFactory()
            assert ExperimentLifeCycle.is_done(experiment.last_status) is False

            # Add jobs: one master and two workers, none of them done yet.
            master = ExperimentJobFactory(experiment=experiment, role=TaskType.MASTER)
            assert JobLifeCycle.is_done(master.last_status) is False
            worker_jobs = [
                ExperimentJobFactory(experiment=experiment, role=TaskType.WORKER)
                for _ in range(2)
            ]
            for worker_job in worker_jobs:
                worker_job.refresh_from_db()
                assert JobLifeCycle.is_done(worker_job.last_status) is False

            # Set master to succeeded.
            ExperimentJobStatusFactory(job=master, status=JobLifeCycle.SUCCEEDED)

            # Every worker should now report a success status.
            for worker_job in worker_jobs:
                worker_job.refresh_from_db()
                assert worker_job.last_status == JobLifeCycle.SUCCEEDED

            # The experiment's last status should be success as well.
            experiment.refresh_from_db()
            assert experiment.last_status == ExperimentLifeCycle.SUCCEEDED
async def log_job(request, ws, job, pod_id, namespace, container):
    """Stream a job's phase changes to *ws*, then tail the pod's logs.

    Polls the job status until it is RUNNING or terminal, notifying the
    socket on every change; terminal jobs get a final status message and no
    log tailing. NOTE(review): assumes this runs inside the cluster
    (`load_incluster_config`) — confirm deployment context.
    """
    job_uuid = job.uuid.hex
    # One SocketManager per job uuid so multiple clients share a stream.
    if job_uuid in request.app.job_logs_ws_managers:
        ws_manager = request.app.job_logs_ws_managers[job_uuid]
    else:
        ws_manager = SocketManager()
        request.app.job_logs_ws_managers[job_uuid] = ws_manager

    ws_manager.add_socket(ws)

    # Stream phase changes
    status = None
    while status != JobLifeCycle.RUNNING and not JobLifeCycle.is_done(status):
        job.refresh_from_db()
        if status != job.last_status:
            status = job.last_status
            await notify_ws(ws=ws, message=get_status_message(status))
        # Bail out when the last socket disconnected.
        if should_disconnect(ws=ws, ws_manager=ws_manager):
            return
        await asyncio.sleep(SOCKET_SLEEP)

    if JobLifeCycle.is_done(status):
        # Terminal state: report it and stop; there are no live logs to tail.
        await notify_ws(ws=ws, message=get_status_message(status))
        return

    # Job is running: tail its pod logs through the k8s API.
    config.load_incluster_config()
    k8s_api = client.CoreV1Api()
    await log_job_pod(k8s_api=k8s_api,
                      ws=ws,
                      ws_manager=ws_manager,
                      pod_id=pod_id,
                      container=container,
                      namespace=namespace)
def update_job_containers(event: Mapping,
                          status: str,
                          job_container_name: str) -> None:
    """Sync Redis container tracking with a k8s pod *event* for a job.

    Args:
        event: pod event payload; `metadata.labels.job_uuid` identifies the
            job and `status.container_statuses` lists its containers.
        status: lifecycle status derived from the event.
        job_container_name: name of the container running the job workload.
    """
    if JobLifeCycle.is_done(status):
        # Terminal status: the job no longer needs monitoring.
        job_uuid = event['metadata']['labels']['job_uuid']
        logger.info('Stop monitoring job_uuid: %s', job_uuid)
        RedisJobContainers.remove_job(job_uuid)

    # Fix: `container_statuses` can be entirely absent (not just None) while
    # the pod is pending; `.get` avoids a KeyError in that case.
    container_statuses = event['status'].get('container_statuses')
    if not container_statuses:
        return

    def get_container_id(container_id):
        # Container ids arrive runtime-prefixed, e.g. `docker://<id>`.
        if not container_id:
            return None
        if container_id.startswith('docker://'):
            return container_id[len('docker://'):]
        return container_id

    for container_status in container_statuses:
        if container_status['name'] != job_container_name:
            continue
        container_id = get_container_id(container_status['container_id'])
        if container_id:
            job_uuid = event['metadata']['labels']['job_uuid']
            if container_status['state']['running'] is not None:
                logger.info('Monitoring (container_id, job_uuid): (%s, %s)',
                            container_id, job_uuid)
                RedisJobContainers.monitor(container_id=container_id,
                                           job_uuid=job_uuid)
            else:
                RedisJobContainers.remove_container(container_id=container_id)
def build_job_status_post_save(sender, **kwargs):
    """Persist a build job's new status and emit the matching audit events."""
    instance = kwargs['instance']
    job = instance.job
    previous_status = job.last_status

    # Sync the job's cached status and its start/finish timestamps.
    job.status = instance
    set_job_started_at(instance=job, status=instance.status)
    set_job_finished_at(instance=job, status=instance.status)
    job.save(update_fields=['status', 'started_at', 'finished_at'])

    auditor.record(event_type=BUILD_JOB_NEW_STATUS,
                   instance=job,
                   previous_status=previous_status)

    if instance.status == JobLifeCycle.CREATED:
        auditor.record(event_type=BUILD_JOB_CREATED, instance=job)
    else:
        # Dispatch the per-status audit event, if any.
        status_events = {
            JobLifeCycle.STOPPED: BUILD_JOB_STOPPED,
            JobLifeCycle.FAILED: BUILD_JOB_FAILED,
            JobLifeCycle.SUCCEEDED: BUILD_JOB_SUCCEEDED,
        }
        event_type = status_events.get(instance.status)
        if event_type is not None:
            auditor.record(event_type=event_type,
                           instance=job,
                           previous_status=previous_status)

    # Any terminal status also gets the generic "done" event.
    if JobLifeCycle.is_done(instance.status):
        auditor.record(event_type=BUILD_JOB_DONE,
                       instance=job,
                       previous_status=previous_status)
def build_handle_done_status(sender, **kwargs):
    """Notify the scheduler when a build job reaches a terminal status."""
    instance = kwargs['instance']
    build_job_id = instance.job_id
    # Only terminal statuses trigger the notification task.
    if not JobLifeCycle.is_done(instance.status):
        return
    celery_app.send_task(SchedulerCeleryTasks.BUILD_JOBS_NOTIFY_DONE,
                         kwargs={'build_job_id': build_job_id})
def update_job_containers(event, status, job_container_name):
    """Keep Redis container tracking in sync with a pod event for a job."""
    if JobLifeCycle.is_done(status):
        # Terminal status: drop the whole job from monitoring.
        done_job_uuid = event['metadata']['labels']['job_uuid']
        logger.info('Stop monitoring job_uuid: %s', done_job_uuid)
        RedisJobContainers.remove_job(done_job_uuid)

    statuses = event['status']['container_statuses']
    if statuses is None:
        return

    def strip_runtime_prefix(raw_id):
        # Container ids arrive runtime-prefixed, e.g. `docker://<id>`.
        if not raw_id:
            return None
        prefix = 'docker://'
        return raw_id[len(prefix):] if raw_id.startswith(prefix) else raw_id

    for entry in statuses:
        if entry['name'] != job_container_name:
            continue
        container_id = strip_runtime_prefix(entry['container_id'])
        if not container_id:
            continue
        job_uuid = event['metadata']['labels']['job_uuid']
        if entry['state']['running'] is not None:
            logger.info('Monitoring (container_id, job_uuid): (%s, %s)',
                        container_id, job_uuid)
            RedisJobContainers.monitor(container_id=container_id,
                                       job_uuid=job_uuid)
        else:
            RedisJobContainers.remove_container(container_id=container_id)
def calculated_status(self):
    """Compute the experiment's status from its jobs.

    The master job's status wins once it is terminal; otherwise the status
    is aggregated from all job statuses, falling back to the current
    `last_status` when nothing conclusive is available.
    """
    # Fix: `self.jobs.filter(...)[0]` raised IndexError when no master job
    # exists yet; `.first()` returns None and we fall through gracefully.
    master_job = self.jobs.filter(role=TaskType.MASTER).first()
    master_status = master_job.last_status if master_job is not None else None
    calculated_status = master_status if JobLifeCycle.is_done(master_status) else None
    if calculated_status is None:
        calculated_status = ExperimentLifeCycle.jobs_status(self.last_job_statuses)
    if calculated_status is None:
        return self.last_status
    return calculated_status
def build_handle_done_status(sender, **kwargs):
    """Send the BUILD_JOBS_NOTIFY_DONE task once a build status is terminal."""
    status_instance = kwargs['instance']
    job_pk = status_instance.job_id
    is_terminal = JobLifeCycle.is_done(status_instance.status)
    if is_terminal:
        celery_app.send_task(
            SchedulerCeleryTasks.BUILD_JOBS_NOTIFY_DONE,
            kwargs={'build_job_id': job_pk})
def calculated_status(self) -> str:
    """Compute the experiment status, preferring the earliest job's terminal status.

    Falls back to aggregating all job statuses, then to the current
    `last_status` when nothing conclusive is available.
    """
    # Fix: `.first()` returns None for an experiment with no jobs, which
    # previously crashed with AttributeError on `.last_status`.
    first_job = self.jobs.order_by('created_at').first()
    master_status = first_job.last_status if first_job is not None else None
    calculated_status = master_status if JobLifeCycle.is_done(master_status) else None
    if calculated_status is None:
        calculated_status = ExperimentLifeCycle.jobs_status(self.last_job_statuses)
    if calculated_status is None:
        return self.last_status
    return calculated_status
def build_job_status_post_save(sender, **kwargs):
    """Post-save handler for build-job status rows.

    Syncs the owning job's cached status and start/finish timestamps,
    records audit events for the transition, and — on a terminal status —
    schedules a delayed stop task and a done-notification task.
    """
    instance = kwargs['instance']
    job = instance.job
    previous_status = job.last_status

    # Update job last_status
    job.status = instance
    set_job_started_at(instance=job, status=instance.status)
    set_job_finished_at(instance=job, status=instance.status)
    job.save(update_fields=['status', 'started_at', 'finished_at'])

    auditor.record(event_type=BUILD_JOB_NEW_STATUS,
                   instance=job,
                   previous_status=previous_status)
    if instance.status == JobLifeCycle.STOPPED:
        auditor.record(event_type=BUILD_JOB_STOPPED,
                       instance=job,
                       previous_status=previous_status)
    if instance.status == JobLifeCycle.FAILED:
        auditor.record(event_type=BUILD_JOB_FAILED,
                       instance=job,
                       previous_status=previous_status)
    if instance.status == JobLifeCycle.SUCCEEDED:
        auditor.record(event_type=BUILD_JOB_SUCCEEDED,
                       instance=job,
                       previous_status=previous_status)

    # Check if we need to schedule a job stop
    if instance.status in (JobLifeCycle.FAILED, JobLifeCycle.SUCCEEDED):
        _logger.info(
            'The build job `%s` failed or is done, '
            'send signal to stop.', job.unique_name)
        # Schedule stop for this job
        # NOTE(review): the countdown presumably delays the stop so logs can
        # still be collected (`collect_logs=True`) — confirm against RedisTTL.
        celery_app.send_task(SchedulerCeleryTasks.BUILD_JOBS_STOP,
                             kwargs={
                                 'project_name': job.project.unique_name,
                                 'project_uuid': job.project.uuid.hex,
                                 'build_job_name': job.unique_name,
                                 'build_job_uuid': job.uuid.hex,
                                 'update_status': False,
                                 'collect_logs': True,
                             },
                             countdown=RedisTTL.get_for_build(build_id=job.id))

    # handle done status
    if JobLifeCycle.is_done(instance.status):
        auditor.record(event_type=BUILD_JOB_DONE,
                       instance=job,
                       previous_status=previous_status)
        celery_app.send_task(SchedulerCeleryTasks.BUILD_JOBS_NOTIFY_DONE,
                             kwargs={'build_job_id': job.id})
def job_status_post_save(sender, **kwargs):
    """Post-save handler for job status rows.

    Syncs the owning job's cached status and start/finish timestamps,
    records audit events for the transition, and schedules a stop task
    once a job with a specification reaches a terminal status.
    """
    instance = kwargs['instance']
    job = instance.job
    previous_status = job.last_status

    # Update job last_status and its start/finish timestamps.
    job.status = instance
    set_job_started_at(instance=job, status=instance.status)
    set_job_finished_at(instance=job, status=instance.status)
    # Fix: started_at/finished_at were set above but never persisted because
    # update_fields only listed 'status'; save all three mutated fields.
    job.save(update_fields=['status', 'started_at', 'finished_at'])

    auditor.record(event_type=JOB_NEW_STATUS,
                   instance=job,
                   previous_status=previous_status)
    if instance.status == JobLifeCycle.STOPPED:
        auditor.record(event_type=JOB_STOPPED,
                       instance=job,
                       previous_status=previous_status)
    if instance.status == JobLifeCycle.FAILED:
        auditor.record(event_type=JOB_FAILED,
                       instance=job,
                       previous_status=previous_status)
    if instance.status == JobLifeCycle.SUCCEEDED:
        auditor.record(event_type=JOB_SUCCEEDED,
                       instance=job,
                       previous_status=previous_status)
    if JobLifeCycle.is_done(instance.status):
        auditor.record(event_type=JOB_DONE,
                       instance=job,
                       previous_status=previous_status)

    # Check if we need to schedule a job stop; jobs without a specification
    # have nothing to tear down.
    if not job.specification:
        return
    if instance.status in (JobLifeCycle.FAILED, JobLifeCycle.SUCCEEDED):
        _logger.debug('The build job `%s` failed or is done, '
                      'send signal to stop.', job.unique_name)
        # Schedule a stop for this job to clean up its resources.
        celery_app.send_task(
            SchedulerCeleryTasks.JOBS_STOP,
            kwargs={
                'project_name': job.project.unique_name,
                'project_uuid': job.project.uuid.hex,
                'job_name': job.unique_name,
                'job_uuid': job.uuid.hex,
                'specification': job.config,
                'update_status': False
            },
            countdown=RedisTTL.get_for_job(job_id=job.id))
def build_job_status_post_save(sender, **kwargs):
    """Post-save handler for build-job status rows (project-targeted audits).

    Syncs the owning job's cached status, records audit events for the
    transition, and schedules stop/notify tasks on terminal statuses.
    """
    instance = kwargs['instance']
    job = instance.job
    previous_status = job.last_status

    # Update job last_status
    job.status = instance
    job.save()

    auditor.record(event_type=BUILD_JOB_NEW_STATUS,
                   instance=job,
                   previous_status=previous_status,
                   target='project')
    if instance.status == JobLifeCycle.STOPPED:
        auditor.record(event_type=BUILD_JOB_STOPPED,
                       instance=job,
                       previous_status=previous_status,
                       target='project')
    if instance.status == JobLifeCycle.FAILED:
        auditor.record(event_type=BUILD_JOB_FAILED,
                       instance=job,
                       previous_status=previous_status,
                       target='project')
    # Fix: this branch previously re-checked STOPPED (copy/paste), so the
    # SUCCEEDED audit event fired on stop and never on success.
    if instance.status == JobLifeCycle.SUCCEEDED:
        auditor.record(event_type=BUILD_JOB_SUCCEEDED,
                       instance=job,
                       previous_status=previous_status,
                       target='project')

    # Check if we need to schedule a job stop
    if instance.status in (JobLifeCycle.FAILED, JobLifeCycle.SUCCEEDED):
        _logger.info(
            'The build job `%s` failed or is done, '
            'send signal to stop.', job.unique_name)
        # Schedule stop for this job
        celery_app.send_task(SchedulerCeleryTasks.BUILD_JOBS_STOP,
                             kwargs={
                                 'project_name': job.project.unique_name,
                                 'project_uuid': job.project.uuid.hex,
                                 'build_job_name': job.unique_name,
                                 'build_job_uuid': job.uuid.hex,
                                 'update_status': False
                             })

    # handle done status
    if JobLifeCycle.is_done(instance.status):
        celery_app.send_task(SchedulerCeleryTasks.BUILD_JOBS_NOTIFY_DONE,
                             kwargs={'build_job_id': job.id})
async def job_logs(
        request,  # pylint:disable=too-many-branches
        ws,
        username,
        project_name,
        job_id):
    """Stream a job's status changes and then its live logs over *ws*.

    Validates the job, registers a per-job AMQP log consumer, streams
    status messages until the job is running or terminal, then forwards
    consumer messages until the socket closes or the job finishes.
    """
    job, message = validate_job(request=request,
                                username=username,
                                project_name=project_name,
                                job_id=job_id)
    if job is None:
        await ws.send(get_error_message(message))
        return

    job_uuid = job.uuid.hex

    auditor.record(event_type=JOB_LOGS_VIEWED,
                   instance=job,
                   actor_id=request.app.user.id,
                   actor_name=request.app.user.username)

    # Mark the job's logs as monitored so sidecars publish them.
    if not RedisToStream.is_monitored_job_logs(job_uuid=job_uuid):
        logger.info('Job uuid `%s` logs is now being monitored', job_uuid)
        RedisToStream.monitor_job_logs(job_uuid=job_uuid)

    # start consumer — one shared consumer per job uuid.
    if job_uuid in request.app.job_logs_consumers:
        consumer = request.app.job_logs_consumers[job_uuid]
    else:
        logger.info('Add job log consumer for %s', job_uuid)
        consumer = Consumer(
            routing_key='{}.{}'.format(RoutingKeys.STREAM_LOGS_SIDECARS_JOBS,
                                       job_uuid),
            queue='{}.{}'.format(CeleryQueues.STREAM_LOGS_SIDECARS, job_uuid))
        request.app.job_logs_consumers[job_uuid] = consumer
        consumer.run()

    def should_disconnect():
        # True when no sockets remain on the consumer; also stops log monitoring.
        if not consumer.ws:
            logger.info('Stopping logs monitor for job uuid %s', job_uuid)
            RedisToStream.remove_job_logs(job_uuid=job_uuid)
            # if job_uuid in request.app.job_logs_consumers:
            #     consumer = request.app.job_logs_consumers.pop(job_uuid, None)
            #     if consumer:
            #         consumer.stop()
            return True
        return False

    # add socket manager
    consumer.add_socket(ws)
    should_quite = False
    num_message_retries = 0

    # Stream phase changes until the job is running or terminal.
    status = None
    while status != JobLifeCycle.RUNNING and not JobLifeCycle.is_done(status):
        job.refresh_from_db()
        if status != job.last_status:
            status = job.last_status
            await notify(consumer=consumer, message=get_status_message(status))
        if should_disconnect():
            return
        await asyncio.sleep(SOCKET_SLEEP)

    if JobLifeCycle.is_done(status):
        # Terminal state: send the final status and stop monitoring.
        await notify(consumer=consumer, message=get_status_message(status))
        RedisToStream.remove_job_logs(job_uuid=job_uuid)
        return

    # Forward log messages until the socket closes or the job finishes.
    while True:
        num_message_retries += 1
        for message in consumer.get_messages():
            num_message_retries = 0
            await notify(consumer=consumer, message=message)

        # After trying a couple of time, we must check the status of the experiment
        if num_message_retries > MAX_RETRIES:
            job.refresh_from_db()
            if job.is_done:
                logger.info('removing all socket because the job `%s` is done',
                            job_uuid)
                consumer.ws = set([])
            else:
                # Back off the counter so the status check repeats periodically.
                num_message_retries -= CHECK_DELAY

        # Just to check if connection closed
        if ws._connection_lost:  # pylint:disable=protected-access
            logger.info('Quitting logs socket for job uuid %s', job_uuid)
            consumer.remove_sockets({ws, })
            should_quite = True

        if should_disconnect():
            should_quite = True

        if should_quite:
            return

        await asyncio.sleep(SOCKET_SLEEP)