Example #1
    def load_other_tasks_for_job(self, job, journal_file):
        '''
        Process the task journal for a recovered job.
        '''
        try:
            while True:
                record_header = journal_file.read(RECORD_HEADER_STRUCT.size)
                if len(record_header) != RECORD_HEADER_STRUCT.size:
                    ciel.log.error(
                        'Journal entry truncated for job %s' % job.id,
                        'RECOVERY', logging.WARNING, False)
                    break
                record_type, record_length = RECORD_HEADER_STRUCT.unpack(
                    record_header)
                record_string = journal_file.read(record_length)
                if len(record_string) != record_length:
                    ciel.log.error(
                        'Journal entry truncated for job %s' % job.id,
                        'RECOVERY', logging.WARNING, False)
                    break
                rec = simplejson.loads(record_string,
                                       object_hook=json_decode_object_hook)
                if record_type == 'R':
                    self.task_pool.publish_single_ref(rec['id'],
                                                      rec['ref'],
                                                      job,
                                                      should_journal=False)
                elif record_type == 'T':
                    task_id = rec['task_id']
                    parent_task = self.task_pool.get_task_by_id(rec['parent'])
                    task = build_taskpool_task_from_descriptor(
                        rec, parent_task)
                    task.job = job
                    task.parent.children.append(task)

                    ciel.log.error(
                        'Recovered task %s for job %s' % (task_id, job.id),
                        'RECOVERY', logging.INFO, False)
                    self.task_pool.add_task(task)
                else:
                    ciel.log.error(
                        'Got invalid record type in job %s' % job.id,
                        'RECOVERY', logging.WARNING, False)

        except:
            ciel.log.error('Error recovering task_journal for job %s' % job.id,
                           'RECOVERY', logging.WARNING, True)

        finally:
            journal_file.close()
            if job.state == JOB_ACTIVE:
                ciel.log.error('Restarting recovered job %s' % job.id,
                               'RECOVERY', logging.INFO)
Example #2
    def recover_job_descriptors(self):
        root = self.job_pool.journal_root
        if root is None:
            return
        
        for job_id in os.listdir(root):

            try:
                job_dir = os.path.join(root, job_id)
                result_path = os.path.join(job_dir, 'result')
                if os.path.exists(result_path):
                    with open(result_path, 'r') as result_file:
                        result = simplejson.load(result_file, object_hook=json_decode_object_hook)
                else:
                    result = None
                    
                journal_path = os.path.join(job_dir, 'task_journal')
                journal_file = open(journal_path, 'rb')
                record_type, root_task_descriptor_length = RECORD_HEADER_STRUCT.unpack(journal_file.read(RECORD_HEADER_STRUCT.size))
                root_task_descriptor_string = journal_file.read(root_task_descriptor_length)
                assert record_type == 'T'
                assert len(root_task_descriptor_string) == root_task_descriptor_length
                root_task_descriptor = simplejson.loads(root_task_descriptor_string, object_hook=json_decode_object_hook)
                root_task = build_taskpool_task_from_descriptor(root_task_descriptor, None)
                
                # FIXME: Get the job pool to create this job, because it has access to the scheduler queue and task failure investigator.
                # FIXME: Store job options somewhere for recovered job.
                job = Job(job_id, root_task, job_dir, JOB_RECOVERED, self.job_pool, {})
                
                root_task.job = job
                if result is not None:
                    with job._lock:
                        job.completed(result)
                self.job_pool.add_job(job)
                # Adding the job to the job pool should add the root task.
                #self.task_pool.add_task(root_task)
                
                if result is None:
                    self.load_other_tasks_defer(job, journal_file)
                    ciel.log.error('Recovered job %s' % job_id, 'RECOVERY', logging.INFO, False)
                    ciel.log.error('Recovered task %s for job %s' % (root_task_descriptor['task_id'], job_id), 'RECOVERY', logging.INFO, False)
                else:
                    journal_file.close()
                    ciel.log.error('Found information about job %s' % job_id, 'RECOVERY', logging.INFO, False)
                
            except:
                # We have lost critical data for the job, so we must fail it.
                ciel.log.error('Error recovering job %s' % job_id, 'RECOVERY', logging.ERROR, True)
                self.job_pool.add_failed_job(job_id)
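
Both the result file and every journal payload are decoded with json_decode_object_hook. A minimal sketch of what such a hook could look like, assuming references are round-tripped as dicts under a tag key; the '__ref__' tag and the RecoveredRef class are placeholders, not CIEL's real names:

class RecoveredRef(object):
    # Placeholder stand-in for CIEL's reference classes.
    def __init__(self, tuple_form):
        self.tuple_form = tuple_form

def json_decode_object_hook(dict_):
    # simplejson invokes this for every decoded JSON object: dicts
    # carrying the (assumed) '__ref__' tag are rebuilt into reference
    # objects, everything else passes through unchanged.
    if '__ref__' in dict_:
        return RecoveredRef(dict_['__ref__'])
    return dict_
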
Example #3
    def recover_job_descriptors(self):
        root = self.job_pool.journal_root
        if root is None:
            return
        
        for job_id in os.listdir(root):

            try:
                job_dir = os.path.join(root, job_id)
                result_path = os.path.join(job_dir, 'result')
                if os.path.exists(result_path):
                    with open(result_path, 'r') as result_file:
                        result = simplejson.load(result_file, object_hook=json_decode_object_hook)
                else:
                    result = None
                    
                journal_path = os.path.join(job_dir, 'task_journal')
                journal_file = open(journal_path, 'rb')
                record_type, root_task_descriptor_length = RECORD_HEADER_STRUCT.unpack(journal_file.read(RECORD_HEADER_STRUCT.size))
                root_task_descriptor_string = journal_file.read(root_task_descriptor_length)
                assert record_type == 'T'
                assert len(root_task_descriptor_string) == root_task_descriptor_length
                root_task_descriptor = simplejson.loads(root_task_descriptor_string, object_hook=json_decode_object_hook)
                root_task_id = root_task_descriptor['task_id']
                root_task = build_taskpool_task_from_descriptor(root_task_id, root_task_descriptor, self.task_pool, None)
                job = Job(job_id, root_task, job_dir)
                root_task.job = job
                if result is not None:
                    job.completed(result)
                self.job_pool.add_job(job)
                self.task_pool.add_task(root_task)
                
                if result is None:
                    self.load_other_tasks_defer(job, journal_file)
                    cherrypy.log.error('Recovered job %s' % job_id, 'RECOVERY', logging.INFO, False)
                    cherrypy.log.error('Recovered task %s for job %s' % (root_task_id, job_id), 'RECOVERY', logging.INFO, False)
                else:
                    journal_file.close()
                    cherrypy.log.error('Found information about job %s' % job_id, 'RECOVERY', logging.INFO, False)
                
            except:
                # We have lost critical data for the job, so we must fail it.
                cherrypy.log.error('Error recovering job %s' % job_id, 'RECOVERY', logging.ERROR, True)
                self.job_pool.add_failed_job(job_id)
Example #4
File: recovery.py Project: ms705/ciel
    def load_other_tasks_for_job(self, job, journal_file):
        '''
        Process the task journal for a recovered job.
        '''
        try:
            while True:
                record_header = journal_file.read(RECORD_HEADER_STRUCT.size)
                if len(record_header) != RECORD_HEADER_STRUCT.size:
                    ciel.log.error('Journal entry truncated for job %s' % job.id, 'RECOVERY', logging.WARNING, False)
                    # XXX: Need to truncate the journal file.
                    break
                record_type, record_length = RECORD_HEADER_STRUCT.unpack(record_header)
                record_string = journal_file.read(record_length)
                if len(record_string) != record_length:
                    ciel.log.error('Journal entry truncated for job %s' % job.id, 'RECOVERY', logging.WARNING, False)
                    # XXX: Need to truncate the journal file.
                    break
                rec = simplejson.loads(record_string, object_hook=json_decode_object_hook)
                if record_type == 'R':
                    job.task_graph.publish(rec['ref'])
                elif record_type == 'T':
                    task_id = rec['task_id']
                    parent_task = job.task_graph.get_task(rec['parent'])
                    task = build_taskpool_task_from_descriptor(rec, parent_task)
                    task.job = job
                    task.parent.children.append(task)
    
                    ciel.log.error('Recovered task %s for job %s' % (task_id, job.id), 'RECOVERY', logging.INFO, False)
                    job.task_graph.spawn(task)
                else:
                    ciel.log.error('Got invalid record type in job %s' % job.id, 'RECOVERY', logging.WARNING, False)
                
        except:
            ciel.log.error('Error recovering task_journal for job %s' % job.id, 'RECOVERY', logging.WARNING, True)

        finally:
            journal_file.close()
            job.restart_journalling()
            if job.state == JOB_ACTIVE:
                ciel.log.error('Restarting recovered job %s' % job.id, 'RECOVERY', logging.INFO)
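
This revision routes recovery through a per-job task graph rather than the shared task pool. The calls above only need a small surface; a stub sketch of the assumed interface, for illustration only:

class TaskGraphStub(object):
    '''Sketch of the per-job task graph surface assumed by the reader above.'''

    def publish(self, ref):
        '''Record that a reference has been produced (assumed behaviour).'''
        pass

    def get_task(self, task_id):
        '''Look up an already-recovered task by its id (assumed behaviour).'''
        pass

    def spawn(self, task):
        '''Register a recovered task with the graph (assumed behaviour).'''
        pass
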
Example #5
    def recover_job_descriptors(self):
        root = self.job_pool.journal_root
        if root is None:
            return

        for job_id in os.listdir(root):

            try:
                job_dir = os.path.join(root, job_id)
                result_path = os.path.join(job_dir, 'result')
                if os.path.exists(result_path):
                    with open(result_path, 'r') as result_file:
                        result = simplejson.load(
                            result_file, object_hook=json_decode_object_hook)
                else:
                    result = None

                journal_path = os.path.join(job_dir, 'task_journal')
                journal_file = open(journal_path, 'rb')
                record_type, root_task_descriptor_length = RECORD_HEADER_STRUCT.unpack(
                    journal_file.read(RECORD_HEADER_STRUCT.size))
                root_task_descriptor_string = journal_file.read(
                    root_task_descriptor_length)
                assert record_type == 'T'
                assert len(
                    root_task_descriptor_string) == root_task_descriptor_length
                root_task_descriptor = simplejson.loads(
                    root_task_descriptor_string,
                    object_hook=json_decode_object_hook)
                root_task = build_taskpool_task_from_descriptor(
                    root_task_descriptor, None)

                # FIXME: Get the job pool to create this job, because it has access to the scheduler queue and task failure investigator.
                job = Job(job_id, root_task, job_dir, JOB_RECOVERED,
                          self.job_pool)

                root_task.job = job
                if result is not None:
                    job.completed(result)
                self.job_pool.add_job(job)
                # Adding the job to the job pool should add the root task.
                #self.task_pool.add_task(root_task)

                if result is None:
                    self.load_other_tasks_defer(job, journal_file)
                    ciel.log.error('Recovered job %s' % job_id, 'RECOVERY',
                                   logging.INFO, False)
                    ciel.log.error(
                        'Recovered task %s for job %s' %
                        (root_task_descriptor['task_id'], job_id), 'RECOVERY',
                        logging.INFO, False)
                else:
                    journal_file.close()
                    ciel.log.error('Found information about job %s' % job_id,
                                   'RECOVERY', logging.INFO, False)

            except:
                # We have lost critical data for the job, so we must fail it.
                ciel.log.error('Error recovering job %s' % job_id, 'RECOVERY',
                               logging.ERROR, True)
                self.job_pool.add_failed_job(job_id)
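
Putting the framing together, a standalone sketch of reading just the root task descriptor back from a task_journal file; the RECORD_HEADER_STRUCT format is the same assumption as in the earlier writer sketch:

import struct
import simplejson

RECORD_HEADER_STRUCT = struct.Struct('!cI')  # assumed framing, as above

def read_root_task_descriptor(journal_path):
    # Mirrors the start of recover_job_descriptors(): the first journal
    # record must be a 'T' record holding the root task descriptor.
    journal_file = open(journal_path, 'rb')
    try:
        header = journal_file.read(RECORD_HEADER_STRUCT.size)
        record_type, descriptor_length = RECORD_HEADER_STRUCT.unpack(header)
        assert record_type == 'T'
        return simplejson.loads(journal_file.read(descriptor_length))
    finally:
        journal_file.close()
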