class JobRunner:
    """
    Glue between the job queue and the docker wrapper: new jobs are handed to
    the queue, and whatever tasks the queue releases are started through the
    docker wrapper
    """

    def __init__(self, client: DockerWrapper, job_queue: JobQueue):
        """
        :param client: The wrapper used to start and remove task services
        :param job_queue: The queue that decides which tasks run next
        """
        self._docker = client
        self._job_queue = job_queue
        # Not sure I'm a fan of this, probably need another refactor in the future
        job_queue.run_signal = self._run_tasks
        self._logger = LogManager(__name__)

    def create_new_job(self, image_name, callback, tasks):
        """
        Register a new job and immediately try to start its tasks
        :param image_name: The name of the image used to run each task
        :param callback: The callback URL handed to the job queue
        :param tasks: The individual tasks that make up the job
        :return: The generated unique job identifier
        """
        self._log_operation(
            'Creating new job with image {img} and callback {cb}'.format(
                img=image_name, cb=callback))
        identifier = ulid.new().str
        self._job_queue.add_new_job(identifier, image_name, callback, tasks)
        self._run_tasks()
        return identifier

    def complete_task(self, identifier: str, task_name: str, status: int,
                      result: dict):
        """
        Signal that a task run has been completed
        :param identifier: The unique job identifier
        :param task_name: The name of the individual task
        :param status: The exit status of the task
        :param result: The output from the task as a dict
        with 'stdout' and 'stderr' fields where appropriate
        """
        self._log_operation(
            'Completing task {tn} in job {i} with status {s} and result:\n{r}'.
            format(tn=task_name, i=identifier, s=status, r=json.dumps(result)))
        outcome = self._job_queue.complete_task(identifier, task_name, status,
                                                result)
        if outcome is None:
            # The queue reports (and returns nothing) when it has no record of
            # the task; there is no service to remove and nothing to unpack.
            return
        services, run_more = outcome
        self._docker.remove_service(services)
        if run_more:
            self._run_tasks()

    def get_job(self, identifier: str):
        """
        Retrieve details about a given job
        :param identifier: The unique job identifier
        :return: The job details, if it exists
        """
        self._log_operation('Getting job {i}'.format(i=identifier))
        return self._job_queue.get_job_details(identifier)

    def _run_tasks(self):
        """
        Query the job queue for more tasks to run, start each one and record
        the identifier returned by the docker wrapper against the task
        """
        for task in self._job_queue.get_next_tasks():
            sid = self._docker.start_task(task.identifier, task.image,
                                          task.name, task.args)
            self._job_queue.mark_task_started(task.identifier, task.name, sid)

    def _log_operation(self, message):
        """
        Write an info log entry prefixed with the class name
        :param message: The message to log
        """
        self._logger.info('JobRunner: {msg}'.format(msg=message))
class DockerWrapper:
    """
    Thin wrapper around the docker client that starts one service per task
    and removes services once they are no longer needed
    """

    # Every task service is created with this policy (restart condition 'none')
    DOCKER_RESTART_POLICY = RestartPolicy(condition='none')

    def __init__(self, client: docker.DockerClient, config: RunnerConfig,
                 authenticator: AuthenticationFactory):
        """
        :param client: The docker client used for all service operations
        :param config: Supplies the host, port and network used for services
        :param authenticator: Optional provider of registry logins
        """
        self._client = client
        self._config = config
        self._authenticator = authenticator
        self._logger = LogManager(__name__)

    def start_task(self, job_id: str, image: str, task_name: str,
                   task_args: Iterable[str]) -> str:
        """
        Create a docker service that runs a single task
        :param job_id: The unique job identifier
        :param image: The image the service is created from
        :param task_name: The name of the individual task
        :param task_args: The arguments for the task, passed to the service
        as a comma separated RUN_ARGS environment variable when present
        :return: The id of the created service
        """
        self._logger.info('Starting task {tn} for job {ji}'.format(
            tn=task_name, ji=job_id))
        run_env = [
            'SWARMER_ADDRESS=http://{addr}:{port}/result/{ident}'.format(
                addr=self._config.host,
                port=self._config.port,
                ident=job_id),
            'TASK_NAME={task}'.format(task=task_name),
            'SWARMER_JOB_ID={ident}'.format(ident=job_id)
        ]
        # Materialise the arguments once and test for emptiness rather than
        # truthiness: any() would drop arguments such as 0 and would consume
        # part of a one-shot iterable before the join below.
        args = [] if task_args is None else [str(a) for a in task_args]
        if args:
            run_env.append('RUN_ARGS={args}'.format(args=','.join(args)))

        svc = self._get_client().services.create(
            image,
            env=run_env,
            restart_policy=self.DOCKER_RESTART_POLICY,
            networks=[self._config.network],
            name='{id}-{name}'.format(id=job_id, name=task_name))
        return svc.id

    def remove_service(self, service_ids: Iterable[str]):
        """
        Remove every listed service that still exists
        :param service_ids: The ids of the services to remove
        """
        for sid in service_ids:
            if sid is None:
                # Nothing was ever recorded for this entry
                continue
            try:
                svc = self._client.services.get(sid)
                if svc:
                    svc.remove()
            except docker.errors.NotFound:
                # Already gone; keep going so the remaining ids are still
                # cleaned up instead of aborting the whole batch.
                self._logger.info(
                    'Service {sid} not found, skipping removal'.format(
                        sid=sid))

    def _get_client(self):
        """
        Return the docker client, running any pending registry logins first
        :return: The docker client
        """
        if self._authenticator and self._authenticator.any_require_login:
            self._authenticator.perform_logins(self._client)
        return self._client
class AuthenticationFactory:
    """
    Loads the authentication providers registered under the EXTRAS_KEY entry
    point group and uses them to log a docker client in to its registries
    """

    EXTRAS_KEY = 'swarmer.credentials'
    PROVIDER_KEY = 'provider'
    LAST_LOGIN_KEY = 'last_login'

    def __init__(self):
        # Maps entry point name -> {PROVIDER_KEY: instance, LAST_LOGIN_KEY: time}
        self._providers = dict()
        self._logger = LogManager(__name__)
        self._setup_providers()

    @property
    def has_providers(self) -> bool:
        """
        :return: True when at least one provider was loaded
        """
        return bool(self._providers)

    @property
    def any_require_login(self) -> bool:
        """
        :return: True when at least one provider reports that it should
        authenticate given the time of its last login
        """
        return any(
            entry[self.PROVIDER_KEY].should_authenticate(
                entry[self.LAST_LOGIN_KEY])
            for entry in self._providers.values())

    def perform_logins(self, client: DockerClient):
        """
        Log the client in for every provider that currently requires it and
        record the time of each login
        :param client: The docker client to log in
        """
        self._logger.info('Running logins for docker client')
        if not self.has_providers:
            self._logger.info('No providers present, skipping...')
            return

        for entry in self._providers.values():
            provider = entry[self.PROVIDER_KEY]
            if not provider.should_authenticate(entry[self.LAST_LOGIN_KEY]):
                continue
            (user, password, registry) = provider.obtain_auth()
            client.login(username=user, password=password, registry=registry)
            # Use the shared key constant so this stays in step with the
            # entries created in _setup_providers
            entry[self.LAST_LOGIN_KEY] = datetime.now()

    def _setup_providers(self):
        """
        Load and instantiate every provider registered under EXTRAS_KEY
        """
        for entry_point in iter_entry_points(self.EXTRAS_KEY):
            self._logger.info('Loading authentication providers')
            try:
                provider = entry_point.load()
                provider_instance = provider()
                self._providers[entry_point.name] = {
                    self.PROVIDER_KEY: provider_instance,
                    self.LAST_LOGIN_KEY: None
                }
            except DistributionNotFound:
                # It may be the case that we were not asked to enable/include
                # a particular provider. This is ok and will simply default
                # to either:
                # 1) Only use the enabled ones, or
                # 2) Only have the ability to fetch from public registries
                self._logger.info(
                    "It appears that the feature {feat} was not enabled, skipping".format(
                        feat=entry_point.name))
class JobDb:
    """
    The JobDb is responsible for handling the redis job tracking. Each job is
    a redis hash keyed by the job identifier; its 'tasks' field holds the
    JSON encoded list of task records
    """

    def __init__(self, rd: redis.StrictRedis):
        """
        :param rd: The redis client used to store the job hashes
        """
        self._redis = rd
        self._logger = LogManager(__name__)

    def add_job(self, identifier: str, image_name: str, callback: str):
        """
        Add a new job to the tracking database
        :param identifier: The unique job identifier
        :param image_name: The name of the image that is used to run each job
        :param callback: The URL to POST back all results
        """
        self._log_operation('Adding new job {i}'.format(i=identifier))
        initial_state = {
            '__image': image_name,
            '__callback': callback,
            # Stored as JSON like every other write to this field; a raw
            # Python list is not a valid redis hash value.
            'tasks': json.dumps([])
        }
        self._redis.hmset(identifier, initial_state)

    def add_job_with_tasks(self, identifier: str, image_name: str,
                           callback: str, tasks: list):
        """
        Add a new job and its tasks in one call
        :param identifier: The unique job identifier
        :param image_name: The name of the image that is used to run each job
        :param callback: The URL to POST back all results
        :param tasks: A list of task objects
        """
        self.add_job(identifier, image_name, callback)
        self.add_tasks(identifier, tasks)

    def add_tasks(self, identifier: str, tasks: list):
        """
        Add a list of tasks to the given job, when all tasks are complete,
        the job is considered finished.
        :param identifier: The unique job identifier
        :param tasks: A list of task objects, each with 'task_name' and
        'task_args' entries
        :raises ValueError: If the job does not exist
        """
        self._log_operation('Adding tasks to job {i}:\n{t}'.format(
            i=identifier, t=json.dumps(tasks)))
        if not self._redis.exists(identifier):
            raise ValueError('Can not find item with identifier: {id}'.format(
                id=identifier))

        task_dict = {
            'tasks':
            json.dumps([{
                'args': t['task_args'],
                'status': 500,
                'result': {
                    'stdout': None,
                    'stderr': None
                },
                'name': t['task_name']
            } for t in tasks])
        }
        self._redis.hmset(identifier, task_dict)

    def update_status(self, identifier: str, task_name: str, status: int):
        """
        Update the status of a run
        :param identifier: The unique job identifier
        :param task_name: The individual task name to update the status of
        :param status: The exit status of the task
        :raises ValueError: If the job has no tasks or the task is unknown
        """
        self._log_operation(
            'Updating status of task {tn} for job {i} with {s}'.format(
                tn=task_name, i=identifier, s=status))
        self._update_task(identifier, task_name, 'status', status)

    def update_result(self, identifier: str, task_name: str, result: dict):
        """
        Update the result of a task run
        :param identifier: The unique job identifier
        :param task_name: The individual task name
        :param result: A dict with the stdout and stderr output, if any was present
        :raises ValueError: If the job has no tasks or the task is unknown
        """
        self._log_operation(
            'Updating result of task {tn} for job {i} with {res}'.format(
                tn=task_name, i=identifier, res=json.dumps(result)))
        self._update_task(identifier, task_name, 'result', result)

    def get_job(self, identifier: str):
        """
        Retrieve the tracking dict for the given job
        :param identifier: The unique job identifier
        :return: The job hash with keys and values decoded as UTF-8 where
        they can be decoded
        :raises ValueError: If the job does not exist
        """
        self._log_operation('Getting job {i}'.format(i=identifier))
        if not self._redis.exists(identifier):
            raise ValueError(
                'Can not find job with id: {id}'.format(id=identifier))

        job = self._redis.hgetall(identifier)

        def check_and_decode(value):
            # Values without a usable decode() are returned untouched
            try:
                return value.decode('utf-8')
            except (ValueError, AttributeError):
                return value

        return {
            check_and_decode(k): check_and_decode(v)
            for k, v in job.items()
        }

    def get_task(self, identifier: str, task_name: str):
        """
        Retrieve the status for an individual run in a job
        :param identifier: The unique job identifier
        :param task_name: The name of the individual job
        :return: The task record
        :raises ValueError: If the job has no tasks or the task is unknown
        """
        self._log_operation('Getting task {tn} from job {i}'.format(
            tn=task_name, i=identifier))
        return self._get_task(identifier, task_name)

    def get_tasks(self, identifier: str):
        """
        Get the list of tasks for the specified job
        :param identifier: The unique job identifier
        :returns: The list of all tasks related to the specified job
        :raises ValueError: If the job has no tasks field
        """
        self._log_operation('Getting tasks for {i}'.format(i=identifier))
        return self._get_task_list(identifier)

    def set_task_id(self, identifier: str, task_name: str, task_id: str):
        """
        Set the docker service identifier for the task
        :param identifier: The unique job identifier
        :param task_name: The name of the individual task
        :param task_id: The id of the task service
        :raises ValueError: If the job has no tasks or the task is unknown
        """
        self._log_operation(
            'Setting task id {ti} for task {tn} for job {i}'.format(
                ti=task_id, tn=task_name, i=identifier))
        self._update_task(identifier, task_name, '__task_id', task_id)

    def clear_job(self, identifier: str):
        """
        Remove an entire job from the tracking DB
        :param identifier: The unique job identifier
        :raises ValueError: If the job does not exist
        """
        self._log_operation('Clearing job {i}'.format(i=identifier))
        if not self._redis.exists(identifier):
            raise ValueError(
                'Can not find job with id: {id}'.format(id=identifier))
        self._redis.delete(identifier)

    def _update_task(self, identifier, name, field, value):
        """
        Set a single field on a task record and write the task list back
        :param identifier: The unique job identifier
        :param name: The name of the task to update
        :param field: The key to set on the task record
        :param value: The new value for the field
        """
        tasks = self._get_task_list(identifier)
        task = self._find_task(tasks, identifier, name)
        task[field] = value
        # Every record that shares the name is replaced by the updated one
        update = [task if t['name'] == name else t for t in tasks]
        self._redis.hset(identifier, 'tasks', json.dumps(update))

    def _find_task(self, tasks, identifier, name):
        """
        Return the first task record with the given name
        :raises ValueError: If no task has that name
        """
        task = next((t for t in tasks if t['name'] == name), None)
        if task is None:
            raise ValueError('Unable to locate task {name} in job {id}'.format(
                name=name, id=identifier))
        return task

    def _load_tasks(self, identifier):
        """
        Read and decode the 'tasks' field of the job hash
        :raises ValueError: If the hash has no 'tasks' field
        """
        if not self._redis.hexists(identifier, 'tasks'):
            raise ValueError(
                'Unable to find job with identifier {id} that has any tasks'.
                format(id=identifier))
        return json.loads(self._redis.hget(identifier, 'tasks'))

    def _get_task(self, identifier, name):
        """
        Retrieve a single task record by name
        :raises ValueError: If the job has no tasks or the task is unknown
        """
        self._log_operation('Retrieving task {t} for {i}'.format(
            t=name, i=identifier))
        return self._find_task(self._load_tasks(identifier), identifier, name)

    def _get_task_list(self, identifier):
        """
        Retrieve every task record of a job
        :raises ValueError: If the job has no tasks field
        """
        self._log_operation(
            'Retrieving task list for {ident}'.format(ident=identifier))
        return self._load_tasks(identifier)

    def _log_operation(self, message: str):
        """
        Write an info log entry prefixed with the class name
        :param message: The message to log
        """
        self._logger.info('JobDb: {msg}'.format(msg=message))
class JobQueue:
    """
    The JobQueue is responsible for interacting with the database and telling
    the task manager what task(s) to run next. Currently all background work
    that checks for dead and completed jobs lives in here, but it should be
    moved out in the future
    """

    # We scan for bad tasks every 10 minutes
    DEAD_SCAN_INTERVAL = 600
    # We scan for completed jobs every minute
    COMPLETED_SCAN_INTERVAL = 60
    # For now, anything above 30 minutes is stalled
    DEAD_JOB_INTERVAL = datetime.timedelta(minutes=30)
    # If set, we use this to signal that more tasks should be run
    _run_signal = None

    def __init__(self, job_db: JobDb, queue_len=12, thread_builder=Thread):
        """
        :param job_db: The database used to track jobs and their tasks
        :param queue_len: The maximum number of tasks running at once
        :param thread_builder: Factory for the two background threads; it is
        called with ``target`` and ``args`` keyword arguments
        """
        self._job_db = job_db
        self._queue_len = queue_len
        self._logger = LogManager(__name__)
        # Pending tasks: new entries go in on the left, the next task to run
        # is popped from the right
        self._tasks = deque()
        self._running_tasks = []
        self._jobs = set()
        # Guards _tasks, _running_tasks, _jobs and _overdue_tasks
        self._lock = Lock()
        # Service ids of stalled tasks that still need to be removed
        self._overdue_tasks = set()

        # Set up the cleanup process
        self._bg_cleanup_thread = thread_builder(
            target=self._scan_for_dead_jobs, args=())
        self._bg_cleanup_thread.daemon = True
        self._bg_cleanup_thread.start()

        # Set up the completed job process
        self._bg_completed_thread = thread_builder(
            target=self._scan_for_completed_jobs, args=())
        self._bg_completed_thread.daemon = True
        self._bg_completed_thread.start()

    @property
    def run_signal(self):
        """
        The callable invoked when the queue believes more tasks can be run
        """
        return self._run_signal

    @run_signal.setter
    def run_signal(self, value):
        self._run_signal = value

    def add_new_job(self, identifier, image_name, callback, tasks):
        """
        Add a new job to the job queue
        :param identifier: The identifier for the job
        :param image_name: The name of the image to run each task
        :param callback: The callback URL to report results
        :param tasks: The individual tasks to run
        :raises ValueError: If no tasks were provided
        """
        if not tasks:
            self._logger.error(
                'No tasks provided when submitting job {i}'.format(
                    i=identifier))
            raise ValueError('Tasks must be provided with the job')

        self._logger.info('Adding job {i} to the queue'.format(i=identifier))
        # Persist the job completely before exposing it to the queue, so a
        # task can never be started before the database knows about it.
        self._job_db.add_job(identifier, image_name, callback)
        self._job_db.add_tasks(identifier, tasks)
        # The background scanners iterate these collections under the lock;
        # registering the job and all of its tasks in one locked step also
        # stops a half-added job from being mistaken for a completed one.
        with self._lock:
            self._jobs.add(identifier)
            for t in tasks:
                self._tasks.appendleft(
                    TaskEntry(identifier, t['task_name'], t['task_args'],
                              image_name, None, None))

    def complete_task(self, identifier, name, status,
                      result) -> (List[str], bool):
        """
        Record the outcome of a running task and drop it from the running set
        :param identifier: The identifier of the job the task belongs to
        :param name: The name of the task
        :param status: The exit status of the task
        :param result: The output of the task
        :return: The service ids that should now be removed, and whether more
        tasks should be started. An unknown task yields ``([], False)``
        """
        with self._lock:
            finished = [
                t for t in self._running_tasks
                if t.identifier == identifier and t.name == name
            ]
            if not finished:
                self._logger.error(
                    'Was expected to find task "{tn}" for job "{jn}" but it was not present'
                    .format(tn=name, jn=identifier))
                return [], False

            self._job_db.update_result(identifier, name, result)
            self._job_db.update_status(identifier, name, status)

            # Remove this task from the running tasks; tasks of other jobs
            # that merely share the name are left alone
            self._running_tasks = [
                t for t in self._running_tasks
                if t.identifier != identifier or t.name != name
            ]

            # Return the task id we had recorded and whether to start any more tasks which
            # is based on whether we are already running at capacity and whether we have
            # any more to run. We also use this time to empty out all 'dead' processes
            task_list = [
                t.task_id for t in finished if t.task_id is not None
            ]
            while self._overdue_tasks:
                task_list.append(self._overdue_tasks.pop())
            run_more = (len(self._running_tasks) < self._queue_len
                        and bool(self._tasks))
            return task_list, run_more

    def get_next_tasks(self) -> List[RunnableTask]:
        """
        Query the queue for the next tasks to run
        :return: A list of the next tasks to run
        """
        tasks = []
        with self._lock:
            free_slots = self._queue_len - len(self._running_tasks)
            while free_slots > 0 and self._tasks:
                next_task = self._tasks.pop()
                tasks.append(
                    RunnableTask(next_task.identifier, next_task.name,
                                 next_task.args, next_task.image))
                self._running_tasks.append(next_task)
                free_slots -= 1
        return tasks

    def mark_task_started(self, identifier, name, task_id):
        """
        Record the service id and start time of a running task
        :param identifier: The identifier of the job the task belongs to
        :param name: The name of the task
        :param task_id: The id of the service running the task
        """
        with self._lock:
            self._running_tasks = [
                TaskEntry(entry.identifier, entry.name, entry.args,
                          entry.image, task_id, datetime.datetime.now())
                if entry.identifier == identifier and entry.name == name
                else entry
                for entry in self._running_tasks
            ]

    def get_started_tasks(self):
        """
        :return: A list of ``{'id', 'started'}`` dicts for every running task
        that has both a service id and a start time recorded
        """
        with self._lock:
            return [{
                'id': it.task_id,
                'started': it.started
            } for it in self._running_tasks
                    if it.task_id is not None and it.started is not None]

    def get_job_details(self, identifier):
        """
        Retrieve a job from the database with its task list decoded
        :param identifier: The identifier of the job
        :return: The job details with 'tasks' as a list
        """
        job = self._job_db.get_job(identifier)
        job['tasks'] = json.loads(job['tasks'])
        return job

    def _scan_for_dead_jobs(self):
        """
        Background loop: requeue tasks that have been running for longer than
        DEAD_JOB_INTERVAL and remember their service ids for removal
        """
        while True:
            time.sleep(self.DEAD_SCAN_INTERVAL)
            with self._lock:
                now = datetime.datetime.now()
                # Tasks that were handed out but not yet marked as started
                # have no start time and cannot be judged yet
                overdue = [
                    t for t in self._running_tasks
                    if t.started is not None
                    and now - t.started > self.DEAD_JOB_INTERVAL
                ]
                for task in overdue:
                    self._overdue_tasks.add(task.task_id)
                    self._tasks.appendleft(
                        TaskEntry(task.identifier, task.name, task.args,
                                  task.image, None, None))
                self._running_tasks = [
                    t for t in self._running_tasks
                    if t.task_id not in self._overdue_tasks
                ]
            # Must run without the lock held: the signal calls back into
            # get_next_tasks, which takes the (non-reentrant) lock itself
            self._signal_should_run()

    def _scan_for_completed_jobs(self):
        """
        Background loop: report and clear every job that has neither pending
        nor running tasks left
        """
        while True:
            time.sleep(self.COMPLETED_SCAN_INTERVAL)
            with self._lock:
                active = {it.identifier for it in self._running_tasks}
                active.update(it.identifier for it in self._tasks)
                completed = [jid for jid in self._jobs if jid not in active]
                job_details = []
                for item in completed:
                    self._jobs.remove(item)
                    job_details.append(self._job_db.get_job(item))
                    self._job_db.clear_job(item)
            # Reported after releasing the lock so that sending results does
            # not hold up the queue
            _send_job_results(job_details)
            self._signal_should_run()

    def _signal_should_run(self):
        """
        Invoke the run signal when there is spare capacity and pending work
        """
        if self._run_signal and len(
                self._running_tasks) < self._queue_len and self._tasks:
            self._run_signal()