def __init__(self, parent, backend, job_id, name):
    logger.debug('Starting Job instance constructor with name {0}'.format(name))
    super(Job, self).__init__()

    self.parent = parent
    self.backend = backend
    self.event_handler = self.parent.event_handler
    self.job_id = job_id
    self.name = name
    self.state = JobState()

    # tasks themselves aren't hashable, so we need a secondary lookup
    self.tasks = {}

    self.next_run = None
    self.cron_schedule = None
    self.cron_iter = None
    self.run_log = None
    self.completion_lock = threading.Lock()
    self.notes = None
    self.snapshot = None

    self._set_status('waiting')
    self.commit()

def _complete_task(self, task_name, **kwargs):
    """ Marks this task as completed. Kwargs are stored in the run log. """
    logger.debug('Job {0} marking task {1} as completed'.format(
        self.name, task_name))

    self.run_log['tasks'][task_name] = kwargs

    # a finished task may unblock its downstream tasks
    for node in self.downstream(task_name, self.snapshot):
        self._start_if_ready(node)

    try:
        self.backend.acquire_lock()
        self._commit_run_log()
    except Exception:
        logger.exception('Error committing run log.')
    finally:
        self.backend.release_lock()

    if kwargs.get('success', None) is False:
        task = self.tasks[task_name]
        try:
            self.backend.acquire_lock()
            if self.event_handler:
                self.event_handler.emit(
                    'task_failed',
                    task._serialize(include_run_logs=True))
        except Exception:
            logger.exception('Error in handling events.')
        finally:
            self.backend.release_lock()

    self._on_completion()

def delete(self):
    """ Delete this Dagobah instance from the Backend. """
    logger.debug('Deleting Dagobah instance with ID {0}'.format(
        self.dagobah_id))
    self.jobs = []
    self.created_jobs = 0
    self.backend.delete_dagobah(self.dagobah_id)

def schedule(self, cron_schedule, base_datetime=None):
    """ Schedules the job to run periodically using Cron syntax.

    Passing a cron_schedule of None unschedules the job.
    """
    logger.debug('Scheduling job {0} with cron schedule {1}'.format(
        self.name, cron_schedule))

    if not self.state.allow_change_schedule:
        raise DagobahError("job's schedule cannot be changed in state: %s"
                           % self.state.status)

    if cron_schedule is None:
        self.cron_schedule = None
        self.cron_iter = None
        self.next_run = None
    else:
        if base_datetime is None:
            base_datetime = datetime.utcnow()
        self.cron_schedule = cron_schedule
        self.cron_iter = croniter(cron_schedule, base_datetime)
        self.next_run = self.cron_iter.get_next(datetime)

    logger.debug('Determined job {0} next run of {1}'.format(
        self.name, self.next_run))
    self.commit()

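# Example: scheduling a job to run nightly at 02:00 UTC, then unscheduling it.
# A minimal sketch; 'my_job' is a hypothetical Job instance.
#
#     my_job.schedule('0 2 * * *')   # croniter computes my_job.next_run
#     my_job.schedule(None)          # clears the schedule entirely
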
def start(self):
    """ Begins the job by kicking off all tasks with no dependencies. """
    logger.info('Job {0} starting job run'.format(self.name))

    if not self.state.allow_start:
        raise DagobahError('job cannot be started in its current state; '
                           'it is probably already running')

    self.initialize_snapshot()

    # don't increment if the job was run manually
    if self.cron_iter and datetime.utcnow() > self.next_run:
        self.next_run = self.cron_iter.get_next(datetime)

    self.run_log = {'job_id': self.job_id,
                    'name': self.name,
                    'parent_id': self.parent.dagobah_id,
                    'log_id': self.backend.get_new_log_id(),
                    'start_time': datetime.utcnow(),
                    'tasks': {}}
    self._set_status('running')

    logger.debug('Job {0} resetting all tasks prior to start'.format(
        self.name))
    for task in self.tasks.values():
        task.reset()

    logger.debug('Job {0} seeding run logs'.format(self.name))
    for task_name in self.ind_nodes(self.snapshot):
        self._put_task_in_run_log(task_name)
        self.tasks[task_name].start()

    self._commit_run_log()

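# Example run flow, as a sketch: assume tasks 'extract' and 'load' where
# 'load' depends on 'extract'. Only independent nodes start immediately;
# dependents are kicked off by _complete_task() as their parents succeed.
#
#     my_job.start()   # 'extract' starts now; 'load' starts when it succeeds
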
def _task_complete(self, **kwargs):
    """ Performs cleanup tasks and notifies Job that the Task finished. """
    logger.debug('Running _task_complete for task {0}'.format(self.name))
    with self.parent_job.completion_lock:
        self.completed_at = datetime.utcnow()
        self.successful = kwargs.get('success', None)
        self.parent_job._complete_task(self.name, **kwargs)

def __init__(self, parent_job, command, name,
             soft_timeout=0, hard_timeout=0, hostname=None):
    logger.debug('Starting Task instance constructor with name {0}'.format(name))
    self.parent_job = parent_job
    self.backend = self.parent_job.backend
    self.event_handler = self.parent_job.event_handler
    self.command = command
    self.name = name
    self.hostname = hostname

    self.remote_channel = None
    self.process = None
    self.stdout = ""
    self.stderr = ""
    self.stdout_file = None
    self.stderr_file = None

    self.timer = None
    self.started_at = None
    self.completed_at = None
    self.successful = None

    self.terminate_sent = False
    self.kill_sent = False
    self.remote_failure = False

    self.set_soft_timeout(soft_timeout)
    self.set_hard_timeout(hard_timeout)

    self.parent_job.commit()

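# Example: a task that gets SIGTERM after 60 seconds and SIGKILL after 120.
# A sketch; 'my_job' is a hypothetical Job, and a timeout of 0 disables the
# corresponding check (see _timeout_check below). Extra kwargs to add_task()
# are passed straight through to this constructor.
#
#     my_job.add_task('python etl.py', name='etl',
#                     soft_timeout=60, hard_timeout=120)
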
def check_complete(self):
    """ Runs completion flow for this task if it's finished. """
    logger.debug('Running check_complete for task {0}'.format(self.name))

    # task hasn't finished yet; poll again on the next timer tick
    if self.remote_not_complete() or self.local_not_complete():
        self._start_check_timer()
        return

    return_code = self.completed_task()

    # handle task errors
    if self.terminate_sent:
        self.stderr += '\nDAGOBAH SENT SIGTERM TO THIS PROCESS\n'
    if self.kill_sent:
        self.stderr += '\nDAGOBAH SENT SIGKILL TO THIS PROCESS\n'
    if self.remote_failure:
        return_code = -1
        self.stderr += '\nAn error occurred with the remote machine.\n'

    self.stdout_file = None
    self.stderr_file = None

    self._task_complete(success=(return_code == 0),
                        return_code=return_code,
                        stdout=self.stdout,
                        stderr=self.stderr,
                        start_time=self.started_at,
                        complete_time=datetime.utcnow())

def update_job_notes(self, notes):
    """ Update this Job's notes. """
    logger.debug('Job {0} updating notes'.format(self.name))
    if not self.state.allow_edit_job:
        raise DagobahError('job cannot be edited in its current state')

    self.notes = notes
    self.parent.commit(cascade=True)

def from_backend(self, dagobah_id):
    """ Reconstruct this Dagobah instance from the backend. """
    logger.debug('Reconstructing Dagobah instance from backend with ID {0}'.format(dagobah_id))
    rec = self.backend.get_dagobah_json(dagobah_id)
    if not rec:
        raise DagobahError('dagobah with id %s does not exist '
                           'in backend' % dagobah_id)
    self._construct_from_json(rec)

def _put_task_in_run_log(self, task_name):
    """ Initializes the run log task entry for this task. """
    logger.debug('Job {0} initializing run log entry for task {1}'.format(
        self.name, task_name))
    data = {'start_time': datetime.utcnow(),
            'command': self.tasks[task_name].command}
    self.run_log['tasks'][task_name] = data

def commit(self, cascade=False):
    """ Commit this Dagobah instance to the backend.

    If cascade is True, all child Jobs are committed as well.
    """
    logger.debug('Committing Dagobah instance with cascade={0}'.format(cascade))
    self.backend.commit_dagobah(self._serialize())
    if cascade:
        for job in self.jobs:
            job.commit()

def delete_job(self, job_name):
    """ Delete a job by name, or error out if no such job exists. """
    logger.debug('Deleting job {0}'.format(job_name))
    for idx, job in enumerate(self.jobs):
        if job.name == job_name:
            self.backend.delete_job(job.job_id)
            del self.jobs[idx]
            self.commit()
            return
    raise DagobahError('no job with name %s exists' % job_name)

def _timeout_check(self):
    logger.debug('Running timeout check for task {0}'.format(self.name))

    # a timeout of 0 disables the corresponding check, and each signal is
    # sent at most once per run; use total_seconds() rather than .seconds
    # so tasks running longer than a day don't reset the elapsed count
    elapsed = (datetime.utcnow() - self.started_at).total_seconds()

    if (self.soft_timeout != 0 and elapsed >= self.soft_timeout
            and not self.terminate_sent):
        self.terminate()

    if (self.hard_timeout != 0 and elapsed >= self.hard_timeout
            and not self.kill_sent):
        self.kill()

def add_job_from_json(self, job_json, destructive=False):
    """ Construct a new Job from an imported JSON spec. """
    logger.debug('Importing job from JSON document: {0}'.format(job_json))
    rec = self.backend.decode_import_json(job_json)
    if destructive:
        try:
            self.delete_job(rec['name'])
        except DagobahError:  # expected if no job with this name exists
            pass
    self._add_job_from_spec(rec, use_job_id=False)
    self.commit(cascade=True)

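# Example: importing a job spec while overwriting any same-named job.
# A sketch; 'job_json' is a hypothetical JSON string from a prior export.
#
#     dagobah.add_job_from_json(job_json, destructive=True)
#     # destructive=True deletes an existing job with the same name first,
#     # so the import behaves as an overwrite
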
def add_dependency(self, from_task_name, to_task_name):
    """ Add a dependency between two tasks. """
    logger.debug('Adding dependency from {0} to {1}'.format(
        from_task_name, to_task_name))
    if not self.state.allow_change_graph:
        raise DagobahError("job's graph is immutable in its current state: %s"
                           % self.state.status)

    self.add_edge(from_task_name, to_task_name)
    self.commit()

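# Example: a two-task DAG where 'load' runs only after 'extract' succeeds.
# A sketch; 'my_job' and the commands are hypothetical.
#
#     my_job.add_task('python extract.py', name='extract')
#     my_job.add_task('python load.py', name='load')
#     my_job.add_dependency('extract', 'load')
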
def _start_if_ready(self, task_name):
    """ Start this task if all its dependencies finished successfully. """
    logger.debug('Job {0} running _start_if_ready for task {1}'.format(
        self.name, task_name))

    task = self.tasks[task_name]
    dependencies = self._dependencies(task_name, self.snapshot)
    for dependency in dependencies:
        # a dependency with no 'success' entry hasn't finished yet
        if self.run_log['tasks'].get(dependency, {}).get('success', False) is not True:
            return

    self._put_task_in_run_log(task_name)
    task.start()

def add_job(self, job_name, job_id=None):
    """ Create a new, empty Job. """
    logger.debug('Creating a new job named {0}'.format(job_name))
    if not self._name_is_available(job_name):
        raise DagobahError('name %s is not available' % job_name)

    if not job_id:
        job_id = self.backend.get_new_job_id()
        self.created_jobs += 1

    self.jobs.append(Job(self, self.backend, job_id, job_name))
    job = self.get_job(job_name)
    job.commit()

def __init__(self, backend=BaseBackend(), event_handler=None, ssh_config=None):
    """ Construct a new Dagobah instance with a specified Backend. """
    logger.debug('Starting Dagobah instance constructor')
    self.backend = backend
    self.event_handler = event_handler
    self.dagobah_id = self.backend.get_new_dagobah_id()
    self.jobs = []
    self.created_jobs = 0
    self.scheduler = Scheduler(self)
    self.scheduler.daemon = True
    self.ssh_config = ssh_config

    self.scheduler.start()
    self.commit()

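# Example: bringing up an instance and creating a job. A minimal sketch;
# BaseBackend is only the default placeholder here, so a real persistent
# backend is assumed for anything beyond experimentation.
#
#     dagobah = Dagobah(backend=BaseBackend())
#     dagobah.add_job('nightly_etl')
#     job = dagobah.get_job('nightly_etl')
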
def initialize_snapshot(self):
    """ Copy the DAG and validate it before running. """
    logger.debug('Initializing DAG snapshot for job {0}'.format(self.name))
    if self.snapshot is not None:
        logger.warning('Attempting to initialize DAG snapshot without '
                       'first destroying old snapshot.')

    snapshot_to_validate = deepcopy(self.graph)

    is_valid, reason = self.validate(snapshot_to_validate)
    if not is_valid:
        raise DagobahError(reason)

    self.snapshot = snapshot_to_validate

def delete_task(self, task_name):
    """ Deletes the named Task in this Job. """
    logger.debug('Deleting task {0}'.format(task_name))
    if not self.state.allow_change_graph:
        raise DagobahError("job's graph is immutable in its current state: %s"
                           % self.state.status)

    if task_name not in self.tasks:
        raise DagobahError('task %s does not exist' % task_name)

    self.tasks.pop(task_name)
    self.delete_node(task_name)
    self.commit()

def add_task(self, command, name=None, **kwargs):
    """ Adds a new Task to the graph with no edges. """
    logger.debug('Adding task with command {0} to job {1}'.format(
        command, self.name))
    if not self.state.allow_change_graph:
        raise DagobahError("job's graph is immutable in its current state: %s"
                           % self.state.status)

    if name is None:
        name = command

    new_task = Task(self, command, name, **kwargs)
    self.tasks[name] = new_task
    self.add_node(name)
    self.commit()

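# Example: a task's name defaults to its command string when omitted.
# A sketch using a hypothetical job:
#
#     my_job.add_task('echo hello')                 # name is 'echo hello'
#     my_job.add_task('echo hello', name='greet')   # explicit name
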
def reset(self):
    """ Reset this Task to a clean state prior to execution. """
    logger.debug('Resetting task {0}'.format(self.name))
    self.stdout_file = tempfile.TemporaryFile()
    self.stderr_file = tempfile.TemporaryFile()
    self.stdout = ""
    self.stderr = ""
    self.started_at = None
    self.completed_at = None
    self.successful = None
    self.terminate_sent = False
    self.kill_sent = False
    self.remote_failure = False

def add_task_to_job(self, job_or_job_name, task_command, task_name=None,
                    **kwargs):
    """ Add a task to a job owned by the Dagobah instance. """
    if isinstance(job_or_job_name, Job):
        job = job_or_job_name
    else:
        job = self.get_job(job_or_job_name)

    if not job:
        raise DagobahError('job %s does not exist' % job_or_job_name)

    logger.debug('Adding task with command {0} to job {1}'.format(
        task_command, job.name))

    if not job.state.allow_change_graph:
        raise DagobahError("job's graph is immutable in its current state: %s"
                           % job.state.status)

    job.add_task(task_command, task_name, **kwargs)
    job.commit()

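# Example: this wrapper accepts either a Job instance or a job name.
# A sketch with hypothetical names:
#
#     dagobah.add_task_to_job('nightly_etl', 'python extract.py', 'extract')
#     dagobah.add_task_to_job(job, 'python load.py', 'load')
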
def edit_task(self, task_name, **kwargs):
    """ Edit the attributes of a Task owned by this Job.

    Renaming a Task will affect the historical data available for it,
    e.g. past run logs will no longer be accessible.
    """
    logger.debug('Job {0} editing task {1}'.format(self.name, task_name))
    if not self.state.allow_edit_task:
        raise DagobahError("tasks cannot be edited in this job's "
                           "current state")

    if task_name not in self.tasks:
        raise DagobahError('task %s not found' % task_name)

    if 'name' in kwargs and isinstance(kwargs['name'], str):
        if kwargs['name'] in self.tasks:
            raise DagobahError('task name %s is unavailable' % kwargs['name'])

    task = self.tasks[task_name]
    for key in ['name', 'command']:
        if key in kwargs and isinstance(kwargs[key], str):
            setattr(task, key, kwargs[key])

    if 'soft_timeout' in kwargs:
        task.set_soft_timeout(kwargs['soft_timeout'])

    if 'hard_timeout' in kwargs:
        task.set_hard_timeout(kwargs['hard_timeout'])

    if 'hostname' in kwargs:
        task.set_hostname(kwargs['hostname'])

    if 'name' in kwargs and isinstance(kwargs['name'], str):
        self.rename_edges(task_name, kwargs['name'])
        self.tasks[kwargs['name']] = task
        del self.tasks[task_name]

    self.parent.commit(cascade=True)

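# Example: renaming a task and tightening its soft timeout in one call.
# A sketch; note that renaming a task orphans its past run logs.
#
#     my_job.edit_task('extract', name='extract_v2', soft_timeout=30)
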
def edit(self, **kwargs):
    """ Change this Job's name.

    This will affect the historical data available for this Job,
    e.g. past run logs will no longer be accessible.
    """
    logger.debug('Job {0} changing name to {1}'.format(
        self.name, kwargs.get('name')))
    if not self.state.allow_edit_job:
        raise DagobahError('job cannot be edited in its current state')

    if 'name' in kwargs and isinstance(kwargs['name'], str):
        if not self.parent._name_is_available(kwargs['name']):
            raise DagobahError('new job name %s is not available'
                               % kwargs['name'])

    for key in ['name']:
        if key in kwargs and isinstance(kwargs[key], str):
            setattr(self, key, kwargs[key])

    self.parent.commit(cascade=True)

def retry(self):
    """ Restarts failed tasks of a job. """
    logger.info('Job {0} retrying all failed tasks'.format(self.name))
    self.initialize_snapshot()

    failed_task_names = []
    for task_name, log in self.run_log['tasks'].items():
        if log.get('success', True) is False:
            failed_task_names.append(task_name)

    if not failed_task_names:
        raise DagobahError('no failed tasks to retry')

    self._set_status('running')
    self.run_log['last_retry_time'] = datetime.utcnow()

    logger.debug('Job {0} seeding run logs'.format(self.name))
    for task_name in failed_task_names:
        self._put_task_in_run_log(task_name)
        self.tasks[task_name].start()

    self._commit_run_log()

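# Example: after a failed run, restart only the failed tasks instead of the
# whole DAG. A sketch; retry() raises if the last run had no failures.
#
#     try:
#         my_job.retry()
#     except DagobahError:
#         pass  # nothing failed, so there is nothing to retry
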
def _on_completion(self):
    """ Checks to see if the Job has completed, and cleans up if it has. """
    logger.debug('Job {0} running _on_completion check'.format(self.name))

    if self.state.status != 'running' or (not self._is_complete()):
        return

    for task_name, results in self.run_log['tasks'].items():
        if results.get('success', False) is False:
            self._set_status('failed')
            try:
                self.backend.acquire_lock()
                if self.event_handler:
                    self.event_handler.emit(
                        'job_failed',
                        self._serialize(include_run_logs=True))
            except Exception:
                logger.exception('Error in handling events.')
            finally:
                self.backend.release_lock()
            break

    if self.state.status != 'failed':
        self._set_status('waiting')
        self.run_log = {}
        try:
            self.backend.acquire_lock()
            if self.event_handler:
                self.event_handler.emit(
                    'job_complete',
                    self._serialize(include_run_logs=True))
        except Exception:
            logger.exception('Error in handling events.')
        finally:
            self.backend.release_lock()

    self.destroy_snapshot()

def commit(self):
    """ Store metadata on this Job to the backend. """
    logger.debug('Committing job {0}'.format(self.name))
    self.backend.commit_job(self._serialize())
    self.parent.commit()

def destroy_snapshot(self):
    """ Destroy the active copy of the snapshot. """
    logger.debug('Destroying DAG snapshot for job {0}'.format(self.name))
    self.snapshot = None