def recover_job_descriptors(self):
    '''
    Rebuild jobs from the per-job directories under the journal root.

    Each entry of self.job_pool.journal_root is treated as a job directory
    named after its job id. For every such directory this method:

    * loads the optional 'result' file (JSON) if it exists;
    * reads the first record of 'task_journal', which must be a 'T' record
      holding the root task descriptor;
    * builds a Job in state JOB_RECOVERED and adds it to the job pool,
      completing it immediately when a result was found;
    * for jobs without a result, passes the still-open journal file to
      self.load_other_tasks_defer so the rest of the journal is processed.

    Any failure while recovering one job is logged and the job id is
    registered with self.job_pool.add_failed_job; other jobs are still
    processed. Returns None; does nothing when no journal root is set.
    '''
    root = self.job_pool.journal_root
    if root is None:
        return

    for job_id in os.listdir(root):
        # Non-None only while this method still owns the open journal file,
        # so that the finally clause knows whether it has to close it.
        journal_file = None
        try:
            job_dir = os.path.join(root, job_id)

            result_path = os.path.join(job_dir, 'result')
            if os.path.exists(result_path):
                with open(result_path, 'r') as result_file:
                    result = simplejson.load(result_file, object_hook=json_decode_object_hook)
            else:
                result = None

            journal_path = os.path.join(job_dir, 'task_journal')
            journal_file = open(journal_path, 'rb')
            record_type, root_task_descriptor_length = RECORD_HEADER_STRUCT.unpack(
                journal_file.read(RECORD_HEADER_STRUCT.size))
            root_task_descriptor_string = journal_file.read(root_task_descriptor_length)

            # Validate on-disk data with real exceptions: assert statements
            # are stripped when Python runs with -O.
            # NOTE(review): this compares against the str 'T'; if
            # RECORD_HEADER_STRUCT yields bytes (Python 3) it can never
            # match -- confirm the target interpreter.
            if record_type != 'T':
                raise ValueError('First journal record for job %s is not a task record' % job_id)
            if len(root_task_descriptor_string) != root_task_descriptor_length:
                raise ValueError('Root task descriptor truncated for job %s' % job_id)

            root_task_descriptor = simplejson.loads(root_task_descriptor_string,
                                                    object_hook=json_decode_object_hook)
            root_task = build_taskpool_task_from_descriptor(root_task_descriptor, None)

            # FIXME: Get the job pool to create this job, because it has access to the scheduler queue and task failure investigator.
            # FIXME: Store job options somewhere for recovered job.
            job = Job(job_id, root_task, job_dir, JOB_RECOVERED, self.job_pool, {}, journal=False)

            root_task.job = job
            if result is not None:
                with job._lock:
                    job.completed(result)
            self.job_pool.add_job(job)
            # Adding the job to the job pool should add the root task.
            #self.task_pool.add_task(root_task)

            if result is None:
                # Ownership of the open journal passes to the deferred loader
                # (presumably load_other_tasks_for_job, which closes the file
                # itself -- verify), so it must not be closed here.
                deferred_journal, journal_file = journal_file, None
                self.load_other_tasks_defer(job, deferred_journal)
                ciel.log.error('Recovered job %s' % job_id, 'RECOVERY', logging.INFO, False)
                ciel.log.error('Recovered task %s for job %s' % (root_task.task_id, job_id), 'RECOVERY', logging.INFO, False)
            else:
                # The job already has a result, so the rest of the journal is
                # not needed; the finally clause closes the file.
                ciel.log.error('Found information about job %s' % job_id, 'RECOVERY', logging.INFO, False)

        except Exception:
            # We have lost critical data for the job, so we must fail it.
            ciel.log.error('Error recovering job %s' % job_id, 'RECOVERY', logging.ERROR, True)
            self.job_pool.add_failed_job(job_id)
        finally:
            # Close the journal on every path where it was not handed off,
            # including the failure paths that previously leaked it.
            if journal_file is not None:
                journal_file.close()
def load_other_tasks_for_job(self, job, journal_file):
    '''
    Process the remaining records of the task journal for a recovered job.

    journal_file is an open binary file positioned at the next record
    header. Records are read one at a time until the end of the file:

    * 'R' records publish rec['ref'] to job.task_graph;
    * 'T' records rebuild a task from its descriptor, attach it to the job
      and to its parent's children, and spawn it in job.task_graph;
    * any other record type is logged as a warning and skipped.

    A partial header or partial record body is logged as a truncated entry
    and stops processing. Any exception raised while processing is logged
    and swallowed. In every case journal_file is closed and
    job.restart_journalling() is called before returning.
    '''
    try:
        while True:
            record_header = journal_file.read(RECORD_HEADER_STRUCT.size)
            if not record_header:
                # Clean end of journal: nothing left to read. This is the
                # normal way out of the loop and is not a truncation.
                break
            if len(record_header) != RECORD_HEADER_STRUCT.size:
                ciel.log.error('Journal entry truncated for job %s' % job.id, 'RECOVERY', logging.WARNING, False)
                # XXX: Need to truncate the journal file.
                break

            record_type, record_length = RECORD_HEADER_STRUCT.unpack(record_header)
            record_string = journal_file.read(record_length)
            if len(record_string) != record_length:
                ciel.log.error('Journal entry truncated for job %s' % job.id, 'RECOVERY', logging.WARNING, False)
                # XXX: Need to truncate the journal file.
                break

            rec = simplejson.loads(record_string, object_hook=json_decode_object_hook)
            if record_type == 'R':
                job.task_graph.publish(rec['ref'])
            elif record_type == 'T':
                task_id = rec['task_id']
                parent_task = job.task_graph.get_task(rec['parent'])
                task = build_taskpool_task_from_descriptor(rec, parent_task)
                task.job = job
                task.parent.children.append(task)
                ciel.log.error('Recovered task %s for job %s' % (task_id, job.id), 'RECOVERY', logging.INFO, False)
                job.task_graph.spawn(task)
            else:
                ciel.log.error('Got invalid record type in job %s' % job.id, 'RECOVERY', logging.WARNING, False)

    except Exception:
        # Best effort: keep whatever was recovered before the failure, but
        # let KeyboardInterrupt/SystemExit propagate.
        ciel.log.error('Error recovering task_journal for job %s' % job.id, 'RECOVERY', logging.WARNING, True)

    finally:
        journal_file.close()
        job.restart_journalling()
        if job.state == JOB_ACTIVE:
            ciel.log.error('Restarting recovered job %s' % job.id, 'RECOVERY', logging.INFO)