def report_tasks(self, report, toplevel_task_id):
    '''
    Apply a worker's task report to the task graph.

    report is an iterable of (parent_id, success, payload) tuples.

    * For a successful entry, payload is a (spawned, published) pair: each
      spawned descriptor becomes a child task of the reporting task, and each
      published reference is recorded against the reporting task.
    * For a failed entry, payload is handed to investigate_task_failure(),
      the worker of the top-level task is marked idle, a 'schedule' event is
      published, and the method returns without committing the update.

    toplevel_task_id is looked up in self.task_graph before anything else.
    '''
    # NOTE(review): the block layout (bare `return` in the failure branch,
    # commit after the loop) was reconstructed from a flattened source line;
    # confirm against the original indentation.
    toplevel_task = self.task_graph.get_task(toplevel_task_id)
    tx = TaskGraphUpdate()

    for (parent_id, success, payload) in report:
        parent_task = self.task_graph.get_task(parent_id)

        if not success:
            # Only one failed task per-report, at the moment.
            self.investigate_task_failure(parent_task, payload)
            # The original read `.worker` from the task id and never used the
            # task it had just fetched.
            # NOTE(review): assumes the task object carries `worker` - confirm.
            self.lazy_task_pool.worker_pool.worker_idle(toplevel_task.worker)
            ciel.engine.publish('schedule')
            return

        (spawned, published) = payload

        for child in spawned:
            child_task = build_taskpool_task_from_descriptor(child, parent_task)
            tx.spawn(child_task)
            parent_task.children.append(child_task)

        for ref in published:
            tx.publish(ref, parent_task)

    tx.commit(self.task_graph)
def create_job_for_task(self, task_descriptor, job_id=None):
    '''
    Build a job around a root task described by task_descriptor and add it.

    If job_id is None a fresh one is allocated. The root task id is
    'root:<job_id>'. The descriptor's 'expected_outputs' entry is filled in
    (the descriptor is mutated) as follows:

    * if 'expected_outputs' is present, each output is registered with the
      global name directory under the root task id;
    * otherwise, if 'num_outputs' is usable, that many global ids are created;
    * otherwise a single global id is created.

    Returns the new Job.
    '''
    if job_id is None:
        job_id = self.allocate_job_id()
    task_id = 'root:%s' % (job_id, )

    # TODO: Here is where we will set up the job journal, etc.
    job_dir = self.make_job_directory(job_id)

    # TODO: Remove the global name directory dependency.
    name_directory = self.global_name_directory
    try:
        # Only the lookup is guarded, so that a KeyError raised while
        # registering outputs is not mistaken for a missing key.
        expected_outputs = task_descriptor['expected_outputs']
    except KeyError:
        try:
            num_outputs = task_descriptor['num_outputs']
            expected_outputs = [name_directory.create_global_id(task_id)
                                for _ in range(0, num_outputs)]
        except Exception:
            # Best-effort fallback kept from the original, but no longer
            # swallowing KeyboardInterrupt/SystemExit.
            expected_outputs = [name_directory.create_global_id()]
    else:
        for output in expected_outputs:
            name_directory.create_global_id(task_id, output)

    task_descriptor['expected_outputs'] = expected_outputs

    task = build_taskpool_task_from_descriptor(task_id, task_descriptor, self, None)
    job = Job(job_id, task, job_dir)
    task.job = job

    self.add_job(job)

    cherrypy.log('Added job: %s' % job.id, 'JOB_POOL', logging.INFO)

    return job
def add_task(self, task_descriptor, parent_task_id=None):
    '''
    Create a task from task_descriptor, register it and publish 'schedule'.

    The task id comes from task_descriptor['task_id'] when present, otherwise
    from generate_task_id(). After its dependencies are checked against the
    global name directory, the task is either recorded under every global id
    it is blocked on, or marked TASK_RUNNABLE and queued. A "CREATED" event
    carrying the long-form descriptor is appended to self.events.

    Returns the new task.
    '''
    # NOTE(review): the extent of the locked region was reconstructed from a
    # flattened source line; here the whole body runs under self._lock.
    with self._lock:
        try:
            task_id = task_descriptor['task_id']
        except KeyError:
            # Only a missing key means "generate an id"; the original bare
            # except also hid unrelated errors.
            task_id = self.generate_task_id()

        task = build_taskpool_task_from_descriptor(task_id, task_descriptor, self, parent_task_id)
        self.tasks[task_id] = task

        add_event = self.new_event(task)
        add_event["task_descriptor"] = task.as_descriptor(long=True)
        add_event["action"] = "CREATED"

        task.check_dependencies(self.global_name_directory)

        if task.is_blocked():
            for global_id in task.blocked_on():
                self.references_blocking_tasks.setdefault(global_id, set()).add(task_id)
        else:
            task.state = TASK_RUNNABLE
            self.add_task_to_queues(task)

        self.events.append(add_event)

        self.bus.publish('schedule')

        return task
def create_job_for_task(self, task_descriptor, job_options, job_id=None):
    '''
    Create a JOB_CREATED job rooted at the task described by task_descriptor.

    Runs under self._lock. A job id is allocated when none is supplied. The
    descriptor is mutated: 'task_id' is set to 'root:<job_id>', and
    'expected_outputs' defaults to ['<job_id>:job_output'] when absent.

    Returns the new Job after adding it via add_job().
    '''
    # NOTE(review): the whole body is assumed to run under the lock; the
    # layout was reconstructed from a flattened source line.
    with self._lock:
        if job_id is None:
            job_id = self.allocate_job_id()

        task_descriptor['task_id'] = 'root:%s' % (job_id, )

        # TODO: Here is where we will set up the job journal, etc.
        job_dir = self.make_job_directory(job_id)

        try:
            outputs = task_descriptor['expected_outputs']
        except KeyError:
            outputs = ['%s:job_output' % job_id]
        task_descriptor['expected_outputs'] = outputs

        root_task = build_taskpool_task_from_descriptor(task_descriptor, None)
        new_job = Job(job_id, root_task, job_dir, JOB_CREATED, self, job_options)
        root_task.job = new_job

        self.add_job(new_job)

        ciel.log('Added job: %s' % new_job.id, 'JOB_POOL', logging.INFO)

        return new_job
def allinone_main(options, args):
    '''
    Run a single script in-process against a local block store.

    args[0] is the script filename; args[1], when given, is the run id used
    to tag the printed progress lines (default 'allinone').

    The block store directory is options.blockstore when set; otherwise a new
    temporary directory is created and stored back into options.blockstore.
    Progress lines and the JSON result are printed to standard output. Errors
    while running the job are logged rather than raised.
    '''
    ciel.log = CielLogger()

    script_filename = args[0]
    run_id = args[1] if len(args) > 1 else 'allinone'

    # Pick the directory first, then log it. The original created a throwaway
    # temporary directory up front and logged that one instead.
    if options.blockstore is not None:
        base_dir = options.blockstore
    else:
        base_dir = tempfile.mkdtemp(prefix=os.getenv('TEMP', default='/tmp/sw-files-'))
        options.blockstore = base_dir
    ciel.log('Writing block store files to %s' % base_dir, 'ALLINONE', logging.INFO)

    block_store = BlockStore(ciel.engine, 'localhost', 8000, base_dir, True)

    initial_task_descriptor, cont_ref = build_initial_task_descriptor(script_filename, block_store, 'root', 'root_cont', 'root_output')
    initial_task_object = build_taskpool_task_from_descriptor(initial_task_descriptor, None)
    task_runner = TaskRunner(initial_task_object, cont_ref, block_store, options)

    try:
        # A single parenthesised string prints the same text as the original
        # comma-separated print statements.
        print('%s SUBMITTED_JOB %s' % (run_id, now_as_timestamp()))
        result = task_runner.run()
        print('%s GOT_RESULT %s' % (run_id, now_as_timestamp()))
        print(block_store.retrieve_object_for_ref(result, 'json'))
    except Exception:
        # Still best-effort (nothing is re-raised), but the failure is now
        # visible instead of being silently discarded.
        ciel.log.error('Error running job %s' % run_id, 'ALLINONE', logging.ERROR, True)
def report_tasks(self, report, toplevel_task_id):
    '''
    Fold a worker's report into the task graph.

    Each element of report is a (parent_id, success, payload) tuple. On
    success the payload is (spawned, published): spawned descriptors become
    children of the reporting task and published references are attributed to
    it. On failure the payload goes to investigate_task_failure(), the
    top-level task's worker is marked idle, 'schedule' is published, and the
    method returns without committing.

    toplevel_task_id is resolved through self.task_graph up front.
    '''
    # NOTE(review): the bare `return` in the failure branch and the commit
    # after the loop are reconstructed from a flattened source line; confirm
    # against the original indentation.
    toplevel_task = self.task_graph.get_task(toplevel_task_id)
    tx = TaskGraphUpdate()

    for (parent_id, success, payload) in report:
        parent_task = self.task_graph.get_task(parent_id)

        if success:
            (spawned, published) = payload

            for child in spawned:
                child_task = build_taskpool_task_from_descriptor(
                    child, parent_task)
                tx.spawn(child_task)
                parent_task.children.append(child_task)

            for ref in published:
                tx.publish(ref, parent_task)
        else:
            # Only one failed task per-report, at the moment.
            self.investigate_task_failure(parent_task, payload)
            # Use the task fetched above; the original read `.worker` from
            # the id and left the fetched task unused.
            # NOTE(review): assumes the task object carries `worker` - confirm.
            self.lazy_task_pool.worker_pool.worker_idle(
                toplevel_task.worker)
            ciel.engine.publish('schedule')
            return

    tx.commit(self.task_graph)
def allinone_main(options, args):
    """
    Run a single script in-process against a local block store.

    args[0] is the script filename; args[1], when given, is the run id used
    to tag the printed progress lines (default "allinone").

    The block store directory is options.blockstore when set; otherwise a new
    temporary directory is created and stored back into options.blockstore.
    Progress lines and the JSON result are printed to standard output. Errors
    while running the job are logged rather than raised.
    """
    ciel.log = CielLogger()

    script_filename = args[0]
    run_id = args[1] if len(args) > 1 else "allinone"

    # Choose the directory before logging it. The original made an extra
    # temporary directory first, logged it, and then never used it.
    if options.blockstore is not None:
        base_dir = options.blockstore
    else:
        base_dir = tempfile.mkdtemp(prefix=os.getenv("TEMP", default="/tmp/sw-files-"))
        options.blockstore = base_dir
    ciel.log("Writing block store files to %s" % base_dir, "ALLINONE", logging.INFO)

    block_store = BlockStore(ciel.engine, "localhost", 8000, base_dir, True)

    initial_task_descriptor, cont_ref = build_initial_task_descriptor(
        script_filename, block_store, "root", "root_cont", "root_output"
    )
    initial_task_object = build_taskpool_task_from_descriptor(initial_task_descriptor, None)
    task_runner = TaskRunner(initial_task_object, cont_ref, block_store, options)

    try:
        # A single parenthesised string prints the same text as the original
        # comma-separated print statements.
        print("%s SUBMITTED_JOB %s" % (run_id, now_as_timestamp()))
        result = task_runner.run()
        print("%s GOT_RESULT %s" % (run_id, now_as_timestamp()))
        print(block_store.retrieve_object_for_ref(result, "json", None))
    except Exception:
        # Still best-effort (nothing is re-raised), but the failure is now
        # logged instead of being silently discarded.
        ciel.log.error("Error running job %s" % run_id, "ALLINONE", logging.ERROR, True)
def spawn_tasks(self, parent_task_id, tasks):
    '''
    Spawn one task per descriptor in tasks and commit them to the task graph.

    parent_task_id is resolved through self.task_graph; the resulting task is
    passed as the third positional argument of
    build_taskpool_task_from_descriptor (the second is None). All spawns are
    collected in one TaskGraphUpdate and committed together.
    '''
    owner = self.task_graph.get_task(parent_task_id)
    update = TaskGraphUpdate()

    for descriptor in tasks:
        update.spawn(build_taskpool_task_from_descriptor(descriptor, None, owner))

    update.commit(self.task_graph)
def spawn_tasks(self, parent_task_id, tasks):
    '''
    Build a task for every descriptor in tasks and commit them in one update.

    The task found for parent_task_id is handed to
    build_taskpool_task_from_descriptor as its third positional argument,
    with None as the second. Nothing is returned.
    '''
    graph = self.task_graph
    owner = graph.get_task(parent_task_id)

    pending = TaskGraphUpdate()
    for descriptor in tasks:
        spawned = build_taskpool_task_from_descriptor(
            descriptor, None, owner)
        pending.spawn(spawned)

    pending.commit(self.task_graph)
def load_other_tasks_for_job(self, job, journal_file):
    '''
    Process the task journal for a recovered job.

    Reads (header, JSON body) records from journal_file until the end of the
    file or a truncated record. 'R' records publish a reference through the
    task pool without re-journalling it; 'T' records rebuild a task, attach
    it to the job and to its parent, and add it to the task pool. Unknown
    record types are logged and skipped.

    Errors are logged and do not propagate. journal_file is always closed.
    '''
    header_size = RECORD_HEADER_STRUCT.size
    try:
        while True:
            record_header = journal_file.read(header_size)
            if not record_header:
                # Clean end of journal. The original reported this normal
                # case as a truncated entry.
                break
            if len(record_header) != header_size:
                ciel.log.error('Journal entry truncated for job %s' % job.id, 'RECOVERY', logging.WARNING, False)
                break

            record_type, record_length = RECORD_HEADER_STRUCT.unpack(record_header)
            record_string = journal_file.read(record_length)
            if len(record_string) != record_length:
                ciel.log.error('Journal entry truncated for job %s' % job.id, 'RECOVERY', logging.WARNING, False)
                break

            rec = simplejson.loads(record_string, object_hook=json_decode_object_hook)

            if record_type == 'R':
                self.task_pool.publish_single_ref(rec['id'], rec['ref'], job, should_journal=False)
            elif record_type == 'T':
                task_id = rec['task_id']
                parent_task = self.task_pool.get_task_by_id(rec['parent'])
                task = build_taskpool_task_from_descriptor(rec, parent_task)
                task.job = job
                task.parent.children.append(task)
                ciel.log.error('Recovered task %s for job %s' % (task_id, job.id), 'RECOVERY', logging.INFO, False)
                self.task_pool.add_task(task)
            else:
                ciel.log.error('Got invalid record type in job %s' % job.id, 'RECOVERY', logging.WARNING, False)
    except Exception:
        # Recovery stays best-effort, but KeyboardInterrupt/SystemExit are no
        # longer swallowed.
        ciel.log.error('Error recovering task_journal for job %s' % job.id, 'RECOVERY', logging.WARNING, True)
    finally:
        # NOTE(review): the job-state check is assumed to belong to this
        # finally block; the layout was reconstructed from a flattened line.
        journal_file.close()
        if job.state == JOB_ACTIVE:
            ciel.log.error('Restarting recovered job %s' % job.id, 'RECOVERY', logging.INFO)
def spawn_and_publish(self, spawns, refs, producer=None, taskset=None):
    '''
    Spawn tasks and publish references in a single task-graph update.

    When producer is given, its task is looked up by producer["task_id"] and
    that task's taskset replaces the taskset argument. Every descriptor in
    spawns is built with the producer task and taskset; every ref in refs is
    published against the producer task (None when no producer was given).
    The update is committed against self.
    '''
    if producer is None:
        producer_task = None
    else:
        producer_task = self.get_task(producer["task_id"])
        # The producer's own taskset takes precedence over the argument.
        taskset = producer_task.taskset

    update = TaskGraphUpdate()

    for descriptor in spawns:
        update.spawn(build_taskpool_task_from_descriptor(descriptor, producer_task, taskset))

    for published_ref in refs:
        update.publish(published_ref, producer_task)

    update.commit(self)
def recover_job_descriptors(self):
    '''
    Rebuild jobs from the per-job directories under the journal root.

    Does nothing when the job pool has no journal root. For each directory:

    * an optional 'result' file is loaded as the job's result;
    * the first record of 'task_journal' must be a 'T' record holding the
      root task descriptor, from which the root task and a JOB_RECOVERED job
      are built and added to the job pool;
    * if there is no result, the rest of the journal is handed to
      load_other_tasks_defer(); otherwise the journal is closed.

    Any error while recovering a job is logged and the job id is registered
    as failed.
    '''
    root = self.job_pool.journal_root
    if root is None:
        return

    for job_id in os.listdir(root):
        # The journal file while this loop still owns it. It is reset to None
        # once the file is closed or handed to the deferred loader.
        journal_file = None
        try:
            job_dir = os.path.join(root, job_id)
            result_path = os.path.join(job_dir, 'result')
            if os.path.exists(result_path):
                with open(result_path, 'r') as result_file:
                    result = simplejson.load(result_file, object_hook=json_decode_object_hook)
            else:
                result = None

            journal_path = os.path.join(job_dir, 'task_journal')
            journal_file = open(journal_path, 'rb')
            record_type, root_task_descriptor_length = RECORD_HEADER_STRUCT.unpack(journal_file.read(RECORD_HEADER_STRUCT.size))
            root_task_descriptor_string = journal_file.read(root_task_descriptor_length)

            # Explicit checks instead of assert, which is stripped under -O.
            if record_type != 'T':
                raise ValueError('First journal record for job %s is not a task record' % job_id)
            if len(root_task_descriptor_string) != root_task_descriptor_length:
                raise ValueError('Root task descriptor truncated for job %s' % job_id)

            root_task_descriptor = simplejson.loads(root_task_descriptor_string, object_hook=json_decode_object_hook)
            root_task = build_taskpool_task_from_descriptor(root_task_descriptor, None)

            # FIXME: Get the job pool to create this job, because it has access to the scheduler queue and task failure investigator.
            # FIXME: Store job options somewhere for recovered job.
            job = Job(job_id, root_task, job_dir, JOB_RECOVERED, self.job_pool, {})
            root_task.job = job
            if result is not None:
                with job._lock:
                    job.completed(result)
            self.job_pool.add_job(job)
            # Adding the job to the job pool should add the root task.
            #self.task_pool.add_task(root_task)

            if result is None:
                # Hand the file over before the call, so the error handler
                # never closes a file the deferred loader may be reading.
                deferred_file, journal_file = journal_file, None
                self.load_other_tasks_defer(job, deferred_file)
                ciel.log.error('Recovered job %s' % job_id, 'RECOVERY', logging.INFO, False)
                # Take the id from the descriptor; the original subscripted
                # the task object.
                ciel.log.error('Recovered task %s for job %s' % (root_task_descriptor['task_id'], job_id), 'RECOVERY', logging.INFO, False)
            else:
                journal_file.close()
                journal_file = None
                ciel.log.error('Found information about job %s' % job_id, 'RECOVERY', logging.INFO, False)
        except Exception:
            # We have lost critical data for the job, so we must fail it.
            ciel.log.error('Error recovering job %s' % job_id, 'RECOVERY', logging.ERROR, True)
            if journal_file is not None:
                journal_file.close()
            self.job_pool.add_failed_job(job_id)
def spawn_and_publish(self, spawns, refs, producer=None, taskset=None):
    '''
    Commit a batch of spawned tasks and published references against self.

    With a producer, the producing task is fetched via producer["task_id"]
    and its taskset overrides the taskset argument. Without one, tasks are
    built and references published with a producer task of None.
    '''
    producer_task = None
    if producer is not None:
        producer_task = self.get_task(producer["task_id"])
        taskset = producer_task.taskset

    batch = TaskGraphUpdate()

    for descriptor in spawns:
        new_task = build_taskpool_task_from_descriptor(
            descriptor, producer_task, taskset)
        batch.spawn(new_task)

    for published_ref in refs:
        batch.publish(published_ref, producer_task)

    batch.commit(self)
def _report_tasks(self, report, toplevel_task, worker):
    '''
    Apply a worker's report for toplevel_task to the task graph.

    The task named by the first report entry is deassigned from every worker
    it was assigned to, and aborted on every worker other than the reporting
    one. Then each (parent_id, success, payload) entry is processed:

    * success: payload is (spawned, published, profiling); the task records
      its profiling, becomes TASK_COMMITTED, has its stats recorded, spawns
      its children and publishes its references;
    * failure: the failure is investigated, schedule() is called, and the
      method returns without committing.

    After a fully successful report the update is committed, the graph is
    reduced for toplevel_task.expected_outputs, and schedule() is called.
    '''
    # NOTE(review): the layout (everything under the lock, bare `return` in
    # the failure branch) was reconstructed from a flattened source line.
    with self._lock:
        update = TaskGraphUpdate()

        root_task = self.task_graph.get_task(report[0][0])
        for assigned_worker in root_task.get_workers():
            if assigned_worker is worker:
                self.workers[worker].deassign_task(root_task)
                continue
            self.workers[assigned_worker].deassign_task(root_task)
            assigned_worker.worker_pool.abort_task_on_worker(
                root_task, assigned_worker)
            # XXX: Need to abort the task running on other workers.

        for entry in report:
            (parent_id, success, payload) = entry
            parent_task = self.task_graph.get_task(parent_id)

            if not success:
                # Only one failed task per-report, at the moment.
                self.investigate_task_failure(parent_task, payload)
                self.schedule()
                return

            (spawned, published, profiling) = payload
            parent_task.set_profiling(profiling)
            parent_task.set_state(TASK_COMMITTED)
            self.record_task_stats(parent_task, worker)

            for child in spawned:
                child_task = build_taskpool_task_from_descriptor(
                    child, parent_task)
                update.spawn(child_task)
                parent_task.children.append(child_task)

            for ref in published:
                update.publish(ref, parent_task)

        update.commit(self.task_graph)
        self.task_graph.reduce_graph_for_references(
            toplevel_task.expected_outputs)

        # XXX: Need to remove assigned task from worker(s).
        self.schedule()
def recover_job_descriptors(self):
    '''
    Rebuild jobs from the per-job directories under the journal root.

    Does nothing when the job pool has no journal root. For each directory
    the optional 'result' file is loaded, the first 'task_journal' record
    (which must be a 'T' record) is decoded into the root task descriptor,
    and a Job is built around the root task. The job is added to the job
    pool and the root task to the task pool. Without a result, the rest of
    the journal is handed to load_other_tasks_defer(); with one, the journal
    is closed.

    Any error while recovering a job is logged and the job id is registered
    as failed.
    '''
    root = self.job_pool.journal_root
    if root is None:
        return

    for job_id in os.listdir(root):
        # The journal file while this loop still owns it. It is reset to None
        # once the file is closed or handed to the deferred loader.
        journal_file = None
        try:
            job_dir = os.path.join(root, job_id)
            result_path = os.path.join(job_dir, 'result')
            if os.path.exists(result_path):
                with open(result_path, 'r') as result_file:
                    result = simplejson.load(result_file, object_hook=json_decode_object_hook)
            else:
                result = None

            journal_path = os.path.join(job_dir, 'task_journal')
            journal_file = open(journal_path, 'rb')
            record_type, root_task_descriptor_length = RECORD_HEADER_STRUCT.unpack(journal_file.read(RECORD_HEADER_STRUCT.size))
            root_task_descriptor_string = journal_file.read(root_task_descriptor_length)

            # Explicit checks instead of assert, which is stripped under -O.
            if record_type != 'T':
                raise ValueError('First journal record for job %s is not a task record' % job_id)
            if len(root_task_descriptor_string) != root_task_descriptor_length:
                raise ValueError('Root task descriptor truncated for job %s' % job_id)

            root_task_descriptor = simplejson.loads(root_task_descriptor_string, object_hook=json_decode_object_hook)
            root_task_id = root_task_descriptor['task_id']
            root_task = build_taskpool_task_from_descriptor(root_task_id, root_task_descriptor, self.task_pool, None)

            job = Job(job_id, root_task, job_dir)
            root_task.job = job
            if result is not None:
                job.completed(result)
            self.job_pool.add_job(job)
            self.task_pool.add_task(root_task)

            if result is None:
                # Hand the file over before the call, so the error handler
                # never closes a file the deferred loader may be reading.
                deferred_file, journal_file = journal_file, None
                self.load_other_tasks_defer(job, deferred_file)
                cherrypy.log.error('Recovered job %s' % job_id, 'RECOVERY', logging.INFO, False)
                cherrypy.log.error('Recovered task %s for job %s' % (root_task_id, job_id), 'RECOVERY', logging.INFO, False)
            else:
                journal_file.close()
                journal_file = None
                cherrypy.log.error('Found information about job %s' % job_id, 'RECOVERY', logging.INFO, False)
        except Exception:
            # We have lost critical data for the job, so we must fail it.
            cherrypy.log.error('Error recovering job %s' % job_id, 'RECOVERY', logging.ERROR, True)
            if journal_file is not None:
                journal_file.close()
            self.job_pool.add_failed_job(job_id)
def add_task(self, task_descriptor, parent_task=None, job=None):
    '''
    Build a task from task_descriptor, attach it to job and add it to the
    lazy task pool.

    The task id comes from task_descriptor['task_id'] when present, otherwise
    from generate_task_id(). The second argument passed to the lazy task pool
    is True exactly when no parent_task was given.

    Returns the new task.
    '''
    try:
        task_id = task_descriptor['task_id']
    except KeyError:
        # Only a missing key means "generate an id"; the original bare except
        # also hid unrelated errors.
        task_id = self.generate_task_id()

    task = build_taskpool_task_from_descriptor(task_id, task_descriptor, self, parent_task)
    task.job = job

    self.lazy_task_pool.add_task(task, parent_task is None)

    #add_event = self.new_event(task)
    #add_event["task_descriptor"] = task.as_descriptor(long=True)
    #add_event["action"] = "CREATED"
    #self.events.append(add_event)

    return task
def _report_tasks(self, report, toplevel_task, worker):
    '''
    Fold a worker's report for toplevel_task into the task graph.

    First the task named by report[0][0] is deassigned from each of its
    workers, and aborted on every worker other than the reporting one. Then
    the report entries, each (parent_id, success, payload), are applied:
    successful entries set profiling, mark the task TASK_COMMITTED, record
    stats, spawn children and publish references. The first failed entry is
    investigated, schedule() is called, and the method returns without
    committing.

    If no entry failed, the update is committed, the graph is reduced for
    toplevel_task.expected_outputs, and schedule() is called.
    '''
    # NOTE(review): the layout (everything under the lock, bare `return` in
    # the failure branch) was reconstructed from a flattened source line.
    with self._lock:
        pending = TaskGraphUpdate()
        graph = self.task_graph

        root_task = graph.get_task(report[0][0])
        for assigned_worker in root_task.get_workers():
            if assigned_worker is worker:
                self.workers[worker].deassign_task(root_task)
                continue
            self.workers[assigned_worker].deassign_task(root_task)
            assigned_worker.worker_pool.abort_task_on_worker(root_task, assigned_worker)
            # XXX: Need to abort the task running on other workers.

        for (parent_id, succeeded, payload) in report:
            reporting_task = self.task_graph.get_task(parent_id)

            if not succeeded:
                # Only one failed task per-report, at the moment.
                self.investigate_task_failure(reporting_task, payload)
                self.schedule()
                return

            (spawned, published, profiling) = payload
            reporting_task.set_profiling(profiling)
            reporting_task.set_state(TASK_COMMITTED)
            self.record_task_stats(reporting_task, worker)

            for descriptor in spawned:
                child_task = build_taskpool_task_from_descriptor(descriptor, reporting_task)
                pending.spawn(child_task)
                reporting_task.children.append(child_task)

            for published_ref in published:
                pending.publish(published_ref, reporting_task)

        pending.commit(self.task_graph)
        self.task_graph.reduce_graph_for_references(toplevel_task.expected_outputs)

        # XXX: Need to remove assigned task from worker(s).
        self.schedule()
def load_other_tasks_for_job(self, job, journal_file):
    '''
    Process the task journal for a recovered job.

    Reads (header, JSON body) records from journal_file until the end of the
    file or a truncated record. 'R' records publish rec['ref'] on the job's
    task graph; 'T' records rebuild a task, attach it to the job and to its
    parent, and spawn it on the job's task graph. Unknown record types are
    logged and skipped.

    Errors are logged and do not propagate. Afterwards journal_file is closed
    and journalling is restarted for the job.
    '''
    header_size = RECORD_HEADER_STRUCT.size
    try:
        while True:
            record_header = journal_file.read(header_size)
            if not record_header:
                # Clean end of journal. The original reported this normal
                # case as a truncated entry.
                break
            if len(record_header) != header_size:
                ciel.log.error('Journal entry truncated for job %s' % job.id, 'RECOVERY', logging.WARNING, False)
                # XXX: Need to truncate the journal file.
                break

            record_type, record_length = RECORD_HEADER_STRUCT.unpack(record_header)
            record_string = journal_file.read(record_length)
            if len(record_string) != record_length:
                ciel.log.error('Journal entry truncated for job %s' % job.id, 'RECOVERY', logging.WARNING, False)
                # XXX: Need to truncate the journal file.
                break

            rec = simplejson.loads(record_string, object_hook=json_decode_object_hook)

            if record_type == 'R':
                job.task_graph.publish(rec['ref'])
            elif record_type == 'T':
                task_id = rec['task_id']
                parent_task = job.task_graph.get_task(rec['parent'])
                task = build_taskpool_task_from_descriptor(rec, parent_task)
                task.job = job
                task.parent.children.append(task)
                ciel.log.error('Recovered task %s for job %s' % (task_id, job.id), 'RECOVERY', logging.INFO, False)
                job.task_graph.spawn(task)
            else:
                ciel.log.error('Got invalid record type in job %s' % job.id, 'RECOVERY', logging.WARNING, False)
    except Exception:
        # Recovery stays best-effort, but KeyboardInterrupt/SystemExit are no
        # longer swallowed.
        ciel.log.error('Error recovering task_journal for job %s' % job.id, 'RECOVERY', logging.WARNING, True)
    finally:
        # NOTE(review): restart_journalling() and the job-state check are
        # assumed to belong to this finally block; the layout was
        # reconstructed from a flattened source line.
        journal_file.close()
        job.restart_journalling()
        if job.state == JOB_ACTIVE:
            ciel.log.error('Restarting recovered job %s' % job.id, 'RECOVERY', logging.INFO)
def create_job_for_task(self, task_descriptor, job_id=None):
    '''
    Create and register a job whose root task is described by task_descriptor.

    A job id is allocated when none is given; the root task id is
    'root:<job_id>'. The descriptor's 'expected_outputs' entry is (re)written:

    * when already present, each listed output is registered with the global
      name directory under the root task id;
    * else, when 'num_outputs' is usable, that many global ids are created;
    * else a single global id is created.

    Returns the new Job.
    '''
    if job_id is None:
        job_id = self.allocate_job_id()
    task_id = 'root:%s' % (job_id, )

    # TODO: Here is where we will set up the job journal, etc.
    job_dir = self.make_job_directory(job_id)

    # TODO: Remove the global name directory dependency.
    name_directory = self.global_name_directory
    try:
        # Guard only the lookup: a KeyError raised while registering outputs
        # must not be mistaken for a missing 'expected_outputs' key.
        expected_outputs = task_descriptor['expected_outputs']
    except KeyError:
        try:
            num_outputs = task_descriptor['num_outputs']
            expected_outputs = [
                name_directory.create_global_id(task_id)
                for _ in range(0, num_outputs)
            ]
        except Exception:
            # Best-effort fallback kept from the original, but no longer
            # swallowing KeyboardInterrupt/SystemExit.
            expected_outputs = [
                name_directory.create_global_id()
            ]
    else:
        for output in expected_outputs:
            name_directory.create_global_id(task_id, output)

    task_descriptor['expected_outputs'] = expected_outputs

    task = build_taskpool_task_from_descriptor(task_id, task_descriptor,
                                               self, None)
    job = Job(job_id, task, job_dir)
    task.job = job

    self.add_job(job)

    cherrypy.log('Added job: %s' % job.id, 'JOB_POOL', logging.INFO)

    return job
def recover_job_descriptors(self):
    '''
    Rebuild jobs from the per-job directories under the journal root.

    Does nothing when the job pool has no journal root. For each directory
    the optional 'result' file is loaded, the first 'task_journal' record
    (which must be a 'T' record) is decoded into the root task descriptor,
    and a JOB_RECOVERED job is built and added to the job pool. Without a
    result, the rest of the journal is handed to load_other_tasks_defer();
    with one, the job is completed and the journal closed.

    Any error while recovering a job is logged and the job id is registered
    as failed.
    '''
    root = self.job_pool.journal_root
    if root is None:
        return

    for job_id in os.listdir(root):
        # The journal file while this loop still owns it. It is reset to None
        # once the file is closed or handed to the deferred loader.
        journal_file = None
        try:
            job_dir = os.path.join(root, job_id)
            result_path = os.path.join(job_dir, 'result')
            if os.path.exists(result_path):
                with open(result_path, 'r') as result_file:
                    result = simplejson.load(
                        result_file, object_hook=json_decode_object_hook)
            else:
                result = None

            journal_path = os.path.join(job_dir, 'task_journal')
            journal_file = open(journal_path, 'rb')
            record_type, root_task_descriptor_length = RECORD_HEADER_STRUCT.unpack(
                journal_file.read(RECORD_HEADER_STRUCT.size))
            root_task_descriptor_string = journal_file.read(
                root_task_descriptor_length)

            # Explicit checks instead of assert, which is stripped under -O.
            if record_type != 'T':
                raise ValueError(
                    'First journal record for job %s is not a task record' % job_id)
            if len(root_task_descriptor_string) != root_task_descriptor_length:
                raise ValueError(
                    'Root task descriptor truncated for job %s' % job_id)

            root_task_descriptor = simplejson.loads(
                root_task_descriptor_string,
                object_hook=json_decode_object_hook)
            root_task = build_taskpool_task_from_descriptor(
                root_task_descriptor, None)

            # FIXME: Get the job pool to create this job, because it has access to the scheduler queue and task failure investigator.
            job = Job(job_id, root_task, job_dir, JOB_RECOVERED,
                      self.job_pool)
            root_task.job = job
            if result is not None:
                job.completed(result)
            self.job_pool.add_job(job)
            # Adding the job to the job pool should add the root task.
            #self.task_pool.add_task(root_task)

            if result is None:
                # Hand the file over before the call, so the error handler
                # never closes a file the deferred loader may be reading.
                deferred_file, journal_file = journal_file, None
                self.load_other_tasks_defer(job, deferred_file)
                ciel.log.error('Recovered job %s' % job_id, 'RECOVERY',
                               logging.INFO, False)
                # Take the id from the descriptor; the original subscripted
                # the task object.
                ciel.log.error(
                    'Recovered task %s for job %s' %
                    (root_task_descriptor['task_id'], job_id), 'RECOVERY',
                    logging.INFO, False)
            else:
                journal_file.close()
                journal_file = None
                ciel.log.error('Found information about job %s' % job_id,
                               'RECOVERY', logging.INFO, False)
        except Exception:
            # We have lost critical data for the job, so we must fail it.
            ciel.log.error('Error recovering job %s' % job_id, 'RECOVERY',
                           logging.ERROR, True)
            if journal_file is not None:
                journal_file.close()
            self.job_pool.add_failed_job(job_id)