def recover_job_descriptors(self):
    root = self.job_pool.journal_root
    if root is None:
        return

    for job_id in os.listdir(root):

        try:
            job_dir = os.path.join(root, job_id)

            # Load the persisted result, if the job completed before the restart.
            result_path = os.path.join(job_dir, 'result')
            if os.path.exists(result_path):
                with open(result_path, 'r') as result_file:
                    result = simplejson.load(result_file, object_hook=json_decode_object_hook)
            else:
                result = None

            # The task journal begins with a 'T' record containing the root task descriptor.
            journal_path = os.path.join(job_dir, 'task_journal')
            journal_file = open(journal_path, 'rb')
            record_type, root_task_descriptor_length = RECORD_HEADER_STRUCT.unpack(journal_file.read(RECORD_HEADER_STRUCT.size))
            root_task_descriptor_string = journal_file.read(root_task_descriptor_length)
            assert record_type == 'T'
            assert len(root_task_descriptor_string) == root_task_descriptor_length
            root_task_descriptor = simplejson.loads(root_task_descriptor_string, object_hook=json_decode_object_hook)
            root_task = build_taskpool_task_from_descriptor(root_task_descriptor, None)

            # FIXME: Get the job pool to create this job, because it has access to the scheduler queue and task failure investigator.
            # FIXME: Store job options somewhere for recovered job.
            job = Job(job_id, root_task, job_dir, JOB_RECOVERED, self.job_pool, {})
            root_task.job = job
            if result is not None:
                with job._lock:
                    job.completed(result)
            self.job_pool.add_job(job)
            # Adding the job to the job pool should add the root task.
            #self.task_pool.add_task(root_task)

            if result is None:
                # The job did not finish, so load its remaining tasks from the journal.
                self.load_other_tasks_defer(job, journal_file)
                ciel.log.error('Recovered job %s' % job_id, 'RECOVERY', logging.INFO, False)
                ciel.log.error('Recovered task %s for job %s' % (root_task['task_id'], job_id), 'RECOVERY', logging.INFO, False)
            else:
                journal_file.close()
                ciel.log.error('Found information about job %s' % job_id, 'RECOVERY', logging.INFO, False)

        except:
            # We have lost critical data for the job, so we must fail it.
            ciel.log.error('Error recovering job %s' % job_id, 'RECOVERY', logging.ERROR, True)
            self.job_pool.add_failed_job(job_id)
def recover_job_descriptors(self):
    root = self.job_pool.journal_root
    if root is None:
        return

    for job_id in os.listdir(root):

        try:
            job_dir = os.path.join(root, job_id)
            result_path = os.path.join(job_dir, 'result')
            if os.path.exists(result_path):
                with open(result_path, 'r') as result_file:
                    result = simplejson.load(result_file, object_hook=json_decode_object_hook)
            else:
                result = None

            journal_path = os.path.join(job_dir, 'task_journal')
            journal_file = open(journal_path, 'rb')
            record_type, root_task_descriptor_length = RECORD_HEADER_STRUCT.unpack(journal_file.read(RECORD_HEADER_STRUCT.size))
            root_task_descriptor_string = journal_file.read(root_task_descriptor_length)
            assert record_type == 'T'
            assert len(root_task_descriptor_string) == root_task_descriptor_length
            root_task_descriptor = simplejson.loads(root_task_descriptor_string, object_hook=json_decode_object_hook)
            root_task_id = root_task_descriptor['task_id']
            root_task = build_taskpool_task_from_descriptor(root_task_id, root_task_descriptor, self.task_pool, None)
            job = Job(job_id, root_task, job_dir)
            root_task.job = job
            if result is not None:
                job.completed(result)
            self.job_pool.add_job(job)
            self.task_pool.add_task(root_task)

            if result is None:
                self.load_other_tasks_defer(job, journal_file)
                cherrypy.log.error('Recovered job %s' % job_id, 'RECOVERY', logging.INFO, False)
                cherrypy.log.error('Recovered task %s for job %s' % (root_task_id, job_id), 'RECOVERY', logging.INFO, False)
            else:
                journal_file.close()
                cherrypy.log.error('Found information about job %s' % job_id, 'RECOVERY', logging.INFO, False)

        except:
            # We have lost critical data for the job, so we must fail it.
            cherrypy.log.error('Error recovering job %s' % job_id, 'RECOVERY', logging.ERROR, True)
            self.job_pool.add_failed_job(job_id)
def recover_job_descriptors(self):
    root = self.job_pool.journal_root
    if root is None:
        return

    for job_id in os.listdir(root):

        try:
            job_dir = os.path.join(root, job_id)
            result_path = os.path.join(job_dir, 'result')
            if os.path.exists(result_path):
                with open(result_path, 'r') as result_file:
                    result = simplejson.load(result_file, object_hook=json_decode_object_hook)
            else:
                result = None

            journal_path = os.path.join(job_dir, 'task_journal')
            journal_file = open(journal_path, 'rb')
            record_type, root_task_descriptor_length = RECORD_HEADER_STRUCT.unpack(journal_file.read(RECORD_HEADER_STRUCT.size))
            root_task_descriptor_string = journal_file.read(root_task_descriptor_length)
            assert record_type == 'T'
            assert len(root_task_descriptor_string) == root_task_descriptor_length
            root_task_descriptor = simplejson.loads(root_task_descriptor_string, object_hook=json_decode_object_hook)
            root_task = build_taskpool_task_from_descriptor(root_task_descriptor, None)

            # FIXME: Get the job pool to create this job, because it has access to the scheduler queue and task failure investigator.
            job = Job(job_id, root_task, job_dir, JOB_RECOVERED, self.job_pool)
            root_task.job = job
            if result is not None:
                job.completed(result)
            self.job_pool.add_job(job)
            # Adding the job to the job pool should add the root task.
            #self.task_pool.add_task(root_task)

            if result is None:
                self.load_other_tasks_defer(job, journal_file)
                ciel.log.error('Recovered job %s' % job_id, 'RECOVERY', logging.INFO, False)
                ciel.log.error('Recovered task %s for job %s' % (root_task['task_id'], job_id), 'RECOVERY', logging.INFO, False)
            else:
                journal_file.close()
                ciel.log.error('Found information about job %s' % job_id, 'RECOVERY', logging.INFO, False)

        except:
            # We have lost critical data for the job, so we must fail it.
            ciel.log.error('Error recovering job %s' % job_id, 'RECOVERY', logging.ERROR, True)
            self.job_pool.add_failed_job(job_id)