def terminate_all(self):
    """ Terminate every task that has started but not yet completed. """
    logger.info('Job {0} terminating all currently running tasks'.format(
        self.name))

    for running in self.tasks.values():
        # only tasks that are mid-run are touched
        if not running.started_at or running.completed_at:
            continue
        running.terminate()
def kill_all(self):
    """ Kill every task that has started but not yet completed. """
    logger.info('Job {0} killing all currently running tasks'.format(
        self.name))

    for running in self.tasks.values():
        # only tasks that are mid-run are touched
        if not running.started_at or running.completed_at:
            continue
        running.kill()
def start(self):
    """ Start a job run by launching every task that has no dependencies.

    Raises DagobahError when the job's state does not allow a start.
    """
    logger.info('Job {0} starting job run'.format(self.name))

    start_allowed = self.state.allow_start
    if not start_allowed:
        raise DagobahError('job cannot be started in its current state; ' +
                           'it is probably already running')

    self.initialize_snapshot()

    # don't increment if the job was run manually
    schedule_is_due = self.cron_iter and datetime.utcnow() > self.next_run
    if schedule_is_due:
        self.next_run = self.cron_iter.get_next(datetime)

    # fresh run log for this run; per-task entries are added as tasks start
    fresh_log = {
        'job_id': self.job_id,
        'name': self.name,
        'parent_id': self.parent.dagobah_id,
        'log_id': self.backend.get_new_log_id(),
        'start_time': datetime.utcnow(),
        'tasks': {},
    }
    self.run_log = fresh_log
    self._set_status('running')

    logger.debug('Job {0} resetting all tasks prior to start'.format(
        self.name))
    for existing_task in self.tasks.values():
        existing_task.reset()

    logger.debug('Job {0} seeding run logs'.format(self.name))
    for root_name in self.ind_nodes(self.snapshot):
        # record the task in the run log before it begins executing
        self._put_task_in_run_log(root_name)
        self.tasks[root_name].start()

    self._commit_run_log()
def remote_ssh(self, host):
    """ Execute this task's command on a remote host over SSH.

    Takes a paramiko host dict. The keys read here are 'hostname', 'user'
    and 'identityfile'; only the first identity file is used.

    Failures while connecting or launching the command are not raised.
    Instead self.remote_failure is set to True, a diagnostic report is
    appended to self.stderr and the SSH client (if one was created) is
    closed.
    """
    logger.info('Starting remote execution of task {0} on host {1}'.format(self.name, host['hostname']))
    try:
        self.remote_client = paramiko.SSHClient()
        self.remote_client.load_system_host_keys()
        # NOTE(review): AutoAddPolicy accepts host keys that are not in the
        # known-hosts file; confirm this is acceptable for the deployment.
        self.remote_client.set_missing_host_key_policy(
            paramiko.AutoAddPolicy())
        self.remote_client.connect(host['hostname'], username=host['user'],
                                   key_filename=host['identityfile'][0],
                                   timeout=82800)
        transport = self.remote_client.get_transport()
        transport.set_keepalive(10)

        self.remote_channel = transport.open_session()
        self.remote_channel.get_pty()
        self.remote_channel.exec_command(self.command)
    except Exception as e:
        # Best effort: record the failure on the task rather than raising.
        logger.warning('Exception encountered in remote task execution')
        self.remote_failure = True
        self.stderr += 'Exception when trying to SSH related to: '
        self.stderr += '{0}: {1}\n'.format(type(e).__name__, str(e))
        self.stderr += 'Was looking for host "{0}"\n'.format(str(host))
        self.stderr += 'Found in config:\n'
        self.stderr += 'host: "{0}"\n'.format(str(host))
        self.stderr += 'hostname: "{0}"\n'.format(str(host.get('hostname')))
        # the literal has no placeholder, so the user name is not emitted
        self.stderr += 'user: "******"\n'.format(str(host.get('user')))
        self.stderr += 'identityfile: "{0}"\n'.format(str(host.get('identityfile')))

        # The client attribute may never have been assigned if creating
        # the SSHClient itself failed; don't raise from the error handler.
        client = getattr(self, 'remote_client', None)
        if client is not None:
            client.close()
def kill(self):
    """ Send SIGKILL to the task's process.

    When a remote client is present, it is closed instead. Raises
    DagobahError when there is neither a remote client nor a process.
    """
    logger.info('Sending SIGKILL to task {0}'.format(self.name))

    # remote case: closing the client stands in for killing a process
    if getattr(self, 'remote_client', None) is not None:
        self.kill_sent = True
        self.remote_client.close()
        return

    if self.process:
        self.kill_sent = True
        self.process.kill()
        return

    raise DagobahError('task does not have a running process')
def start(self):
    """ Launch this task, locally or on its configured remote host. """
    logger.info('Starting task {0}'.format(self.name))

    self.reset()

    if not self.hostname:
        # no hostname configured: run the command in a local shell
        self.process = subprocess.Popen(self.command,
                                        shell=True,
                                        env=os.environ.copy(),
                                        stdout=self.stdout_file,
                                        stderr=self.stderr_file)
    else:
        target_host = self.parent_job.parent.get_host(self.hostname)
        if not target_host:
            # the hostname could not be resolved to a host entry
            self.remote_failure = True
        else:
            self.remote_ssh(target_host)

    self.started_at = datetime.utcnow()
    self._start_check_timer()
def retry(self):
    """ Re-run the tasks recorded as failed in the job's run log.

    Raises DagobahError when the run log contains no failed tasks.
    """
    logger.info('Job {0} retrying all failed tasks'.format(self.name))
    self.initialize_snapshot()

    # entries without a 'success' key are not treated as failures
    to_retry = [name
                for name, entry in self.run_log['tasks'].items()
                if entry.get('success', True) == False]
    if not to_retry:
        raise DagobahError('no failed tasks to retry')

    self._set_status('running')
    self.run_log['last_retry_time'] = datetime.utcnow()

    logger.debug('Job {0} seeding run logs'.format(self.name))
    for name in to_retry:
        # record the task in the run log before it begins executing
        self._put_task_in_run_log(name)
        self.tasks[name].start()

    self._commit_run_log()