def _handle_terminating_task(self, task_id):
    keyt = 'task:%s' % task_id
    _, service = self._get_service(keyt=keyt)
    data = self._redis.hget(keyt, 'job')
    nxpus = Capacity(self._redis.hget(keyt, 'ngpus'),
                     self._redis.hget(keyt, 'ncpus'))
    if data is not None:
        container_id = self._redis.hget(keyt, 'container_id')
        data = json.loads(data)
        data['container_id'] = container_id
        self._logger.info('%s: terminating task (job: %s)', task_id, json.dumps(data))
        try:
            service.terminate(data)
            self._logger.info('%s: terminated', task_id)
        except Exception:
            self._logger.warning('%s: failed to terminate', task_id)
            self._logger.info(traceback.format_exc())
    else:
        self._logger.info('%s: terminating task (on error)', task_id)
    resource = self._redis.hget(keyt, 'alloc_resource')
    if resource:
        self._release_resource(service, resource, task_id, nxpus)
    task.set_status(self._redis, keyt, 'stopped')
    task.disable(self._redis, task_id)
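# --- Illustrative sketch (not in the original source): the Capacity value built
# above from the 'ngpus'/'ncpus' hash fields is assumed to be a small helper
# pairing a GPU count with a CPU count. The attribute names below are an
# assumption; the real class may differ.
class Capacity(object):
    def __init__(self, ngpus=0, ncpus=0):
        # redis hget returns strings (or None), so normalize the counts to int
        self.ngpus = int(ngpus or 0)
        self.ncpus = int(ncpus or 0)

    def __repr__(self):
        return 'Capacity(ngpus=%d, ncpus=%d)' % (self.ngpus, self.ncpus)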
def _advance_task(self, task_id):
    """Tries to advance the task to the next status.

    If it can, re-queue it immediately to process the next stage.
    Otherwise, re-queue it after some delay to try again.
    """
    keyt = 'task:%s' % task_id
    with self._redis.acquire_lock(keyt, acquire_timeout=1, expire_time=600):
        status = self._redis.hget(keyt, 'status')
        if status == 'stopped':
            return

        service_name = self._redis.hget(keyt, 'service')
        if service_name not in self._services:
            raise ValueError('unknown service %s' % service_name)
        service = self._services[service_name]

        self._logger.info('%s: trying to advance from status %s', task_id, status)

        if status == 'queued':
            resource = self._redis.hget(keyt, 'resource')
            parent = self._redis.hget(keyt, 'parent')
            if parent:
                keyp = 'task:%s' % parent
                # if the parent task is in the database, check for dependencies
                if self._redis.exists(keyp):
                    status = self._redis.hget(keyp, 'status')
                    if status == 'stopped':
                        if self._redis.hget(keyp, 'message') != 'completed':
                            task.terminate(self._redis, task_id, phase='dependency_error')
                            return
                    else:
                        self._logger.warning('%s: depending on other task, waiting', task_id)
                        task.service_queue(self._redis, task_id, service.name)
                        return
            ngpus = int(self._redis.hget(keyt, 'ngpus'))
            resource, available_gpus = self._allocate_resource(task_id, resource, service, ngpus)
            if resource is not None:
                self._logger.info('%s: resource %s reserved (%d/%d)',
                                  task_id, resource, available_gpus, ngpus)
                self._redis.hset(keyt, 'alloc_resource', resource)
                if ngpus == available_gpus:
                    task.set_status(self._redis, keyt, 'allocated')
                else:
                    task.set_status(self._redis, keyt, 'allocating')
                task.work_queue(self._redis, task_id, service_name)
            else:
                self._logger.warning('%s: no resources available, waiting', task_id)
                task.service_queue(self._redis, task_id, service.name)

        elif status == 'allocating':
            resource = self._redis.hget(keyt, 'alloc_resource')
            keyr = 'resource:%s:%s' % (service.name, resource)
            ngpus = int(self._redis.hget(keyt, 'ngpus'))
            already_allocated_gpus = 0
            for k, v in six.iteritems(self._redis.hgetall(keyr)):
                if v == task_id:
                    already_allocated_gpus += 1
            capacity = service.list_resources()[resource]
            available_gpus, remaining_gpus = self._reserve_resource(
                service, resource, capacity, task_id,
                ngpus - already_allocated_gpus, 0, -1, True)
            self._logger.warning(
                'task: %s - resource: %s (capacity %d) - already %d - available %d',
                task_id, resource, capacity, already_allocated_gpus, available_gpus)
            if available_gpus == ngpus - already_allocated_gpus:
                task.set_status(self._redis, keyt, 'allocated')
                key_reserved = 'reserved:%s:%s' % (service.name, resource)
                self._redis.delete(key_reserved)
                task.work_queue(self._redis, task_id, service.name)
            else:
                task.work_queue(self._redis, task_id, service.name,
                                delay=service.is_notifying_activity and 120 or 30)

        elif status == 'allocated':
            content = json.loads(self._redis.hget(keyt, 'content'))
            resource = self._redis.hget(keyt, 'alloc_resource')
            self._logger.info('%s: launching on %s', task_id, service.name)
            try:
                keyr = 'resource:%s:%s' % (service.name, resource)
                lgpu = []
                for k, v in six.iteritems(self._redis.hgetall(keyr)):
                    if v == task_id:
                        lgpu.append(k)
                self._redis.hset(keyt, 'alloc_lgpu', ",".join(lgpu))
                data = service.launch(
                    task_id,
                    content['options'],
                    lgpu,
                    resource,
                    content['docker']['registry'],
                    content['docker']['image'],
                    content['docker']['tag'],
                    content['docker']['command'],
                    task.file_list(self._redis, task_id),
                    content['wait_after_launch'])
            except EnvironmentError as e:
                # the resource is not available and will be set busy
                self._block_resource(resource, service, str(e))
                # set the task as queued again
                self._redis.hdel(keyt, 'alloc_resource')
                self._release_resource(service, resource, task_id)
                task.set_status(self._redis, keyt, 'queued')
                task.service_queue(self._redis, task_id, service.name)
                self._logger.info('could not launch [%s] %s on %s: blocking resource',
                                  str(e), task_id, resource)
                return
            except Exception as e:
                # all other errors make the task fail
                task.append_log(self._redis, task_id, str(e))
                task.terminate(self._redis, task_id, phase='launch_error')
                return
            self._logger.info('%s: task started on %s', task_id, service.name)
            self._redis.hset(keyt, 'job', json.dumps(data))
            task.set_status(self._redis, keyt, 'running')
            # For services that do not notify their activity, we should
            # poll the task status more regularly.
            task.work_queue(self._redis, task_id, service.name,
                            delay=service.is_notifying_activity and 120 or 30)

        elif status == 'running':
            self._logger.debug('- checking activity of task: %s', task_id)
            data = json.loads(self._redis.hget(keyt, 'job'))
            status = service.status(task_id, data)
            if status == 'dead':
                self._logger.info('%s: task no longer running on %s, request termination',
                                  task_id, service.name)
                task.terminate(self._redis, task_id, phase='exited')
            else:
                task.work_queue(self._redis, task_id, service.name,
                                delay=service.is_notifying_activity and 120 or 30)

        elif status == 'terminating':
            data = self._redis.hget(keyt, 'job')
            if data is not None:
                container_id = self._redis.hget(keyt, 'container_id')
                data = json.loads(data)
                data['container_id'] = container_id
                self._logger.info('%s: terminating task (%s)', task_id, json.dumps(data))
                try:
                    service.terminate(data)
                    self._logger.info('%s: terminated', task_id)
                except Exception:
                    self._logger.warning('%s: failed to terminate', task_id)
            resource = self._redis.hget(keyt, 'alloc_resource')
            self._release_resource(service, resource, task_id)
            task.set_status(self._redis, keyt, 'stopped')
            task.disable(self._redis, task_id)
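# --- Illustrative note (not in the original source): the status values handled
# above form the task lifecycle driven by _advance_task:
#   queued      -> request resources; a full GPU reservation moves to 'allocated',
#                  a partial one to 'allocating', none re-queues on the service queue
#   allocating  -> keep reserving GPUs until all ngpus are held, then 'allocated'
#   allocated   -> service.launch() is called; success moves to 'running',
#                  an EnvironmentError blocks the resource and re-queues the task,
#                  any other error terminates it with phase 'launch_error'
#   running     -> service.status() is polled; 'dead' terminates with phase 'exited'
#   terminating -> service.terminate() is attempted, the resource is released,
#                  and the task ends up 'stopped'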
def _advance_task(self, task_id):
    """Tries to advance the task to the next status.

    If it can, re-queue it immediately to process the next stage.
    Otherwise, re-queue it after some delay to try again.
    """
    keyt = 'task:%s' % task_id
    with self._redis.acquire_lock(keyt, acquire_timeout=1, expire_time=600):
        status = self._redis.hget(keyt, 'status')
        if status == 'stopped':
            return

        service_name = self._redis.hget(keyt, 'service')
        if service_name not in self._services:
            raise ValueError('unknown service %s' % service_name)
        service = self._services[service_name]

        self._logger.info('%s: trying to advance from status %s', task_id, status)

        if status == 'queued':
            resource = self._redis.hget(keyt, 'resource')
            resource = self._allocate_resource(task_id, resource, service)
            if resource is not None:
                self._logger.info('%s: resource %s reserved', task_id, resource)
                self._redis.hset(keyt, 'resource', resource)
                task.set_status(self._redis, keyt, 'allocated')
                task.queue(self._redis, task_id)
            else:
                self._logger.warning('%s: no resources available, waiting', task_id)
                self._wait_for_resource(service, task_id)

        elif status == 'allocated':
            content = json.loads(self._redis.hget(keyt, 'content'))
            resource = self._redis.hget(keyt, 'resource')
            self._logger.info('%s: launching on %s', task_id, service.name)
            data = service.launch(
                task_id,
                content['options'],
                resource,
                content['docker']['registry'],
                content['docker']['image'],
                content['docker']['tag'],
                content['docker']['command'],
                task.file_list(self._redis, task_id),
                content['wait_after_launch'])
            self._logger.info('%s: task started on %s', task_id, service.name)
            self._redis.hset(keyt, 'job', json.dumps(data))
            task.set_status(self._redis, keyt, 'running')
            # For services that do not notify their activity, we should
            # poll the task status more regularly.
            task.queue(self._redis, task_id,
                       delay=service.is_notifying_activity and 120 or 30)

        elif status == 'running':
            data = json.loads(self._redis.hget(keyt, 'job'))
            status = service.status(data)
            if status == 'dead':
                self._logger.info('%s: task no longer running on %s, request termination',
                                  task_id, service.name)
                task.terminate(self._redis, task_id, phase='exited')
            else:
                task.queue(self._redis, task_id,
                           delay=service.is_notifying_activity and 120 or 30)

        elif status == 'terminating':
            data = self._redis.hget(keyt, 'job')
            if data is not None:
                data = json.loads(data)
                self._logger.info('%s: terminating task', task_id)
                try:
                    service.terminate(data)
                    self._logger.info('%s: terminated', task_id)
                except Exception:
                    self._logger.warning('%s: failed to terminate', task_id)
            resource = self._redis.hget(keyt, 'resource')
            self._release_resource(service, resource, task_id)
            task.set_status(self._redis, keyt, 'stopped')
            task.disable(self._redis, task_id)
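# --- Illustrative sketch (not in the original source): shape of the 'content'
# hash field consumed by the 'allocated' branches above, inferred from the keys
# the code reads; all values here are made-up placeholders.
example_content = {
    "options": {},                  # service-specific launch options
    "docker": {
        "registry": "myregistry",   # hypothetical registry name
        "image": "myorg/mytask",    # hypothetical image name
        "tag": "latest",
        "command": [],
    },
    "wait_after_launch": 2,
}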