def stop_job_monitoring(self, job_id):
    self.logger.debug('Stop monitoring job id %s' % (job_id))
    job_info = JobTracker.get_instance().get(job_id)
    if job_info is not None:
        job_info['monitor_job'] = False
def analyze_job_status_for_host(self, job_id, host):
    self.logger.debug("Analyzing job status for job %s on host %s" %
                      (job_id, host))
    if job_id is None:
        self.logger.debug("no job id")
        return {}
    job_info = JobTracker.get_instance().get(job_id)
    if job_info is None:
        self.logger.debug("no job info")
        return {}
    job_status = job_info.get('job_status')
    if job_status is None:
        self.logger.debug("no job status")
        return {}
    # Initialize tasks so the exception handler below can reference it
    # even if parsing fails before any tasks are extracted.
    tasks = None
    try:
        djob_info = job_status.get('detailed_job_info').get(
            'djob_info').get('element')
        self.logger.trace("djob_info=%s" % djob_info)
        is_array = djob_info.get('JB_is_array')
        if is_array == 'true':
            tasks = djob_info.get('JB_ja_tasks').get('ulong_sublist')
            if tasks is None:
                self.logger.debug("no tasks in array job")
                return {}
        else:
            task = djob_info.get('JB_ja_tasks').get('element')
            if task is None:
                self.logger.debug("no tasks")
                return {}
            tasks = [task]
        self.logger.trace("tasks=%s" % tasks)
        now = time.time()
        cpu_average = 0
        mem_average = 0
        for t in tasks:
            if t is None:
                self.logger.trace("Task is None, continue task loop")
                continue
            start_time = float(t.get('JAT_start_time')) / 1000.
            delta_t = now - start_time
            self.logger.trace("start_time=%s, now=%s, delta=%s" %
                              (start_time, now, delta_t))
            task_number = t.get('JAT_task_number')
            scaled_usage_list = t.get('JAT_scaled_usage_list')
            if scaled_usage_list is None:
                self.logger.trace(
                    "scaled_usage_list is None, continue task loop")
                continue
            usage_list = scaled_usage_list.get('scaled')
            if usage_list is None:
                usage_list = scaled_usage_list.get('Events')
            if usage_list is None:
                self.logger.trace(
                    "No scaled or Events in scaled_usage_list, "
                    "continue task loop")
                continue
            self.logger.trace("usage_list=%s" % usage_list)
            identifier_list = t.get(
                'JAT_granted_destin_identifier_list').get('element')
            task_host = identifier_list.get('JG_qhostname')
            # Only analyze tasks on the given host
            if task_host != host:
                self.logger.trace(
                    "task_host=%s (not %s), continue task loop" %
                    (task_host, host))
                continue
            for u in usage_list:
                u_name = u.get('UA_name')
                if u_name == 'cpu':
                    # Value is integrated CPU seconds, so divide by
                    # elapsed time to get average CPU used.
                    u_value = float(u.get('UA_value'))
                    task_cpu_average = u_value / delta_t
                    cpu_average += task_cpu_average
                    self.logger.trace(
                        "u_value=%s, task_cpu_average=%s, cpu_average=%s" %
                        (u_value, task_cpu_average, cpu_average))
                elif u_name == 'mem':
                    # Value is integrated GB seconds; convert to MB and
                    # divide by elapsed time to get average MB used.
                    u_value = float(u.get('UA_value')) * 1024.
                    task_mem_average = u_value / delta_t
                    mem_average += task_mem_average
                    self.logger.trace(
                        "u_value=%s, task_mem_average=%s, mem_average=%s" %
                        (u_value, task_mem_average, mem_average))
        self.logger.trace("cpu_average=%s, mem_average=%s" %
                          (cpu_average, mem_average))
        return {'cpu_average': cpu_average, 'mem_average': mem_average}
    except Exception as ex:
        self.logger.warn(
            'Could not analyze job %s on host %s (tasks=%s): %s' %
            (job_id, host, tasks, ex))
        # Return an empty dict on failure, consistent with the other
        # error paths above.
        return {}
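# Note: the parsing in analyze_job_status_for_host() assumes that the
# 'job_status' entry stored in the JobTracker holds a dictionary built from
# Grid Engine "qstat -j <job_id>" detailed job information (the code that
# populates it is not shown here, so this is an assumption). A minimal
# illustrative sketch of the expected shape for a non-array job, with
# placeholder values ('node001', the timestamps, the usage numbers):
#
#     job_status = {
#         'detailed_job_info': {
#             'djob_info': {
#                 'element': {
#                     'JB_is_array': 'false',
#                     'JB_ja_tasks': {
#                         'element': {
#                             'JAT_start_time': '1500000000000',  # apparently ms
#                             'JAT_task_number': '1',
#                             'JAT_granted_destin_identifier_list': {
#                                 'element': {'JG_qhostname': 'node001'},
#                             },
#                             'JAT_scaled_usage_list': {
#                                 'scaled': [
#                                     {'UA_name': 'cpu', 'UA_value': '120.0'},
#                                     {'UA_name': 'mem', 'UA_value': '0.5'},
#                                 ],
#                             },
#                         },
#                     },
#                 },
#             },
#         },
#     }
#
# For array jobs ('JB_is_array' == 'true'), 'JB_ja_tasks' is expected to
# carry a list of such task entries under 'ulong_sublist' instead.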
def start_job_monitoring(self, job_id, framework_id):
    self.logger.debug('Start monitoring job id %s for framework id %s' %
                      (job_id, framework_id))
    job_info = {'framework_id': framework_id, 'monitor_job': True}
    JobTracker.get_instance().add(job_id, job_info)
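# Illustrative usage sketch (not executed here; 'scheduler', the job id and
# the host name are hypothetical, and some other component is assumed to
# populate 'job_status' in the JobTracker entry between start and analyze):
#
#     scheduler.start_job_monitoring('1234', framework_id)
#     # ... later, once job status has been collected ...
#     usage = scheduler.analyze_job_status_for_host('1234', 'node001')
#     # usage is {} on error, or
#     # {'cpu_average': <cores>, 'mem_average': <MB>} on success
#     scheduler.stop_job_monitoring('1234')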