예제 #1
0
 def stop_job_monitoring(self, job_id):
     """Clear the 'monitor_job' flag of a tracked job.

     Looks the job up in JobTracker and, when an entry exists, sets its
     'monitor_job' value to False. A job id with no entry is ignored.

     :param job_id: id of the job whose monitoring should stop.
     """
     # One-element tuple on purpose: with a bare ``% (job_id)`` a job id
     # that happens to be a tuple would be unpacked as format arguments.
     self.logger.debug('Stop monitoring job id %s' % (job_id,))
     job_info = JobTracker.get_instance().get(job_id)
     if job_info is None:
         return
     job_info['monitor_job'] = False
예제 #2
0
    def analyze_job_status_for_host(self, job_id, host):
        """Sum the average cpu and memory usage of a job's tasks on a host.

        The job is looked up in JobTracker and its tasks are read from
        job_status['detailed_job_info']['djob_info']['element']. For every
        task whose 'JG_qhostname' equals ``host``, the integrated 'cpu' and
        'mem' usage values are divided by the time elapsed since the task
        started, and the per-task averages are added up.

        :param job_id: id of the job to analyze; None yields an empty result.
        :param host: host name compared against each task's 'JG_qhostname'.
        :returns: ``{'cpu_average': ..., 'mem_average': ...}`` holding the
            sums over all matching tasks, or ``{}`` when the job, its
            status or its tasks are missing, or when the status could not
            be analyzed (a warning is logged in that case).
        """
        self.logger.debug("Analyzing job status for job %s on host %s" %
                          (job_id, host))
        if job_id is None:
            self.logger.debug("no job id")
            return {}
        job_info = JobTracker.get_instance().get(job_id)
        if job_info is None:
            self.logger.debug("no job info")
            return {}
        job_status = job_info.get('job_status')
        if job_status is None:
            self.logger.debug("no job status")
            return {}

        # Bound before the try block so the warning in the handler can always
        # format it, even when the failure happens before tasks are extracted.
        tasks = None
        try:
            djob_info = job_status.get('detailed_job_info').get(
                'djob_info').get('element')
            self.logger.trace("djob_info=%s" % djob_info)
            is_array = djob_info.get('JB_is_array')
            if is_array == 'true':
                # Array jobs keep their tasks under 'ulong_sublist'
                tasks = djob_info.get('JB_ja_tasks').get('ulong_sublist')
                if tasks is None:
                    self.logger.debug("no tasks in array job")
                    return {}
            else:
                # Other jobs expose a single task under 'element'
                task = djob_info.get('JB_ja_tasks').get('element')
                if task is None:
                    self.logger.debug("no tasks")
                    return {}
                tasks = [task]

            self.logger.trace("tasks=%s" % tasks)
            now = time.time()
            cpu_average = 0
            mem_average = 0
            for t in tasks:
                if t is None:
                    self.logger.trace("Task is None, continue task loop")
                    continue
                # presumably JAT_start_time is in milliseconds, hence the
                # division by 1000 to compare with time.time() -- TODO confirm
                start_time = float(t.get('JAT_start_time')) / 1000.
                delta_t = now - start_time
                self.logger.trace("start_time=%s, now=%s, delta=%s" %
                                  (start_time, now, delta_t))
                scaled_usage_list = t.get('JAT_scaled_usage_list')
                if scaled_usage_list is None:
                    self.logger.trace(
                        "scaled_usage_list is None, continue task loop")
                    continue
                # The usage entries live under 'scaled' or, failing that,
                # under 'Events'
                usage_list = scaled_usage_list.get('scaled')
                if usage_list is None:
                    usage_list = scaled_usage_list.get('Events')
                if usage_list is None:
                    self.logger.trace(
                        "No scaled or Events in scaled_usage_list, continue task loop"
                    )
                    continue

                self.logger.trace("usage_list=%s" % usage_list)
                identifier_list = t.get(
                    'JAT_granted_destin_identifier_list').get('element')
                task_host = identifier_list.get('JG_qhostname')
                # Only analyze tasks on a given host
                if task_host != host:
                    self.logger.trace(
                        "task_host=%s (not %s), continue task loop" %
                        (task_host, host))
                    continue
                for u in usage_list:
                    u_name = u.get('UA_name')
                    if u_name == 'cpu':
                        u_value = float(u.get('UA_value'))
                        # value is integrated cpu seconds, so divide by time
                        # to get average cpu used
                        task_cpu_average = u_value / delta_t
                        cpu_average += task_cpu_average
                        self.logger.trace(
                            "u_value=%s, task_cpu_average=%s, cpu_average=%s" %
                            (u_value, task_cpu_average, cpu_average))
                    elif u_name == 'mem':
                        # value is integrated GB seconds, so divide by time
                        # to get average GB used; the factor 1024 presumably
                        # rescales the result to MB -- TODO confirm
                        u_value = float(u.get('UA_value')) * 1024.
                        task_mem_average = u_value / delta_t
                        mem_average += task_mem_average
                        self.logger.trace(
                            "u_value=%s, task_mem_average=%s, mem_average=%s" %
                            (u_value, task_mem_average, mem_average))
            self.logger.trace("cpu_average=%s, mem_average=%s" %
                              (cpu_average, mem_average))
            return {'cpu_average': cpu_average, 'mem_average': mem_average}
        except Exception as ex:
            # Best effort: a malformed or incomplete status is reported and
            # treated like "nothing to analyze" instead of propagating.
            self.logger.warn(
                'Could not analyze job %s on host %s (tasks=%s): %s' %
                (job_id, host, tasks, ex))
            return {}
예제 #3
0
 def start_job_monitoring(self, job_id, framework_id):
     """Add a job to JobTracker with monitoring switched on.

     The stored entry records the given framework id and sets
     'monitor_job' to True.

     :param job_id: id under which the entry is added to JobTracker.
     :param framework_id: framework id stored in the entry.
     """
     message = 'Start monitoring job id %s for framework id %s' % (
         job_id, framework_id)
     self.logger.debug(message)
     entry = dict(framework_id=framework_id, monitor_job=True)
     tracker = JobTracker.get_instance()
     tracker.add(job_id, entry)