def getAllApps(rmAddress):
    """Return the apps started since ``getTime()`` plus every RUNNING app.

    Two queries are sent to the ResourceManager at ``rmAddress``: one limited
    to applications started at or after ``getTime()`` and one limited to
    applications in state RUNNING.

    Args:
        rmAddress: host name / address of the ResourceManager.

    Returns:
        The string ``"null"`` when the time-window query reports no apps
        (kept for backward compatibility with existing callers); otherwise a
        list of application dicts, recent apps first, followed by RUNNING
        apps that were not already part of the recent list.
    """
    xTime = getTime()
    rm = ResourceManager(address=rmAddress, port=rmPort, timeout=30)

    res = rm.cluster_applications(started_time_begin=str(xTime)).data
    if res.get("apps") is None:
        # Nothing started in the window: the result is "null" regardless of
        # what is running, so the second query is not needed.
        return "null"
    recentApps = res.get("apps").get("app")

    runningApps = rm.cluster_applications(state="RUNNING").data
    if runningApps.get("apps") is None:
        return recentApps

    # An app that started inside the window and is still running is returned
    # by both queries; keep only the first occurrence.
    seenIds = set(app.get("id") for app in recentApps)
    merged = list(recentApps)
    for app in runningApps.get("apps").get("app"):
        appId = app.get("id")
        if appId is None or appId not in seenIds:
            merged.append(app)
    return merged
def get24MinsApp(rmAddress):
    """Return the ``BatchOneDay`` apps started since ``get0ClockTime()``.

    Args:
        rmAddress: host name / address of the ResourceManager.

    Returns:
        The string ``"null"`` when the ResourceManager reports no apps for
        the window; otherwise the list of application dicts whose ``name`` is
        ``com.yunchen.batch.BatchOneDay`` (possibly empty).
    """
    since = get0ClockTime()
    client = ResourceManager(address=rmAddress, port=rmPort, timeout=30)
    payload = client.cluster_applications(started_time_begin=str(since)).data

    apps = payload.get("apps")
    if apps is None:
        return "null"
    return [
        entry for entry in apps.get("app")
        if entry.get("name") == "com.yunchen.batch.BatchOneDay"
    ]
def __init__(self, zaddr=ZABBIX_ADDR, zport=ZABBIX_PORT, iface=None):
    """Build the REST endpoint table and create the Zabbix / YARN clients.

    Args:
        zaddr: address handed to ``ZabbixSender``.
        zport: port handed to ``ZabbixSender``.
        iface: interface name passed to ``self._getLocalIP``.
    """
    # One row per API type, numbered from 1 in this order:
    # (API_ID, API_ADDRESS, KEY_PREFIX).
    endpoint_rows = (
        ('clusterInfo', 'http://RMADDRESS:8088/ws/v1/cluster/info', 'Info'),
        ('clusterMetrics', 'http://RMADDRESS:8088/ws/v1/cluster/metrics', 'Metrics'),
        ('scheduler', 'http://RMADDRESS:8088/ws/v1/cluster/scheduler', 'Scheduler'),
        ('apps', 'http://RMADDRESS:8088/ws/v1/cluster/apps', 'Apps'),
        ('appStatInfo', 'http://RMADDRESS:8088/ws/v1/cluster/appstatistics', 'AppStatInfo'),
        ('nodes', 'http://RMADDRESS:8088/ws/v1/cluster/nodes', 'Nodes'),
    )
    self._API_TYPE = {}
    for type_no, (api_id, api_address, key_prefix) in enumerate(endpoint_rows, 1):
        self._API_TYPE[type_no] = {
            'API_ID': api_id,
            'API_PREFIX': 'RM',
            'API_ADDRESS': api_address,
            'KEY_PREFIX': key_prefix,
        }

    # _get_activerm() reads both _API_TYPE and _type, so they are set first.
    self._type = 1
    self._activerm = self._get_activerm()

    self.zaddr, self.zport = zaddr, zport
    self.ret_result = []
    self.final_result_dict = {}
    self.zbserver = ZabbixSender(zaddr, zport)
    self._ip = self._getLocalIP(iface)
    self.rm = ResourceManager(address=self._activerm, timeout=10)
def configure(self):
    """Create the YARN clients and read the filter options from the config.

    Reads ``resource_manager_address`` and ``port`` from ``self.config``; the
    port is only forwarded to the clients when it is truthy. The application
    filter options are copied onto the instance and the status list is reset.
    """
    settings = self.config
    client_args = {'address': settings.get('resource_manager_address')}
    port = settings.get('port')
    if port:
        client_args['port'] = port

    self.resource_manager = ResourceManager(**client_args)
    self.app_master = ApplicationMaster(**client_args)

    for option in ('application_ids', 'application_status',
                   'application_tags', 'application_names'):
        setattr(self, option, settings.get(option))
    self.application_status_list = []
def main():
    """Run one monitoring pass over the applications known to YARN.

    For each enabled switch in the ``MonitorSwitch`` config section the
    applications in the matching state are handed to ``start_spark``;
    RUNNING applications selected by ``apps_running_duration`` are handed to
    ``kill_spark`` instead.
    """
    YarnLog.writeLog('init logging')
    RM = ResourceManager(address=RM_HOST, port=RM_PORT)
    AM = ApplicationMaster(address=AM_HOST, port=AM_PORT)
    appsDict = update_applicatioins_map(RM)

    def switch_is_on(switch_name):
        return YarnConfig.getConfig('MonitorSwitch', switch_name) == 'True'

    # (config switch, YARN state, log banner) for the states that are started.
    restart_rules = (
        ('FinishedSwitch', 'FINISHED',
         '############################ Filter FINISHED Application and Start it #################################'),
        ('FailedSwitch', 'FAILED',
         '############################ Filter FAILED Application and Start it #################################'),
        ('KilledSwitch', 'KILLED',
         '############################ Filter KILLED Application and Start it #################################'),
    )
    for switch_name, state, banner in restart_rules:
        if not switch_is_on(switch_name):
            continue
        logging.info(banner)
        start_spark(filter_apps_state(appsDict, state))

    if switch_is_on('RunningSwitch'):
        logging.info(
            '############################ Filter RUNNING Application and Start it #################################'
        )
        # Running applications picked out by apps_running_duration() are
        # killed here; a later pass starts them again.
        stillRunning = filter_apps_state(appsDict, 'RUNNING')
        kill_spark(apps_running_duration(stillRunning, AM))

    logging.info(
        '############################ Ending ###################################'
    )
def __init__(self, zaddr=ZABBIX_ADDR,zport=ZABBIX_PORT,iface=None):
    """Build the REST endpoint table and create the Zabbix / YARN clients.

    Args:
        zaddr: address handed to ``ZabbixSender``.
        zport: port handed to ``ZabbixSender``.
        iface: interface name passed to ``self._getLocalIP``.
    """
    # One row per API type, numbered from 1 in this order:
    # (API_ID, API_ADDRESS, KEY_PREFIX).
    endpoint_rows = (
        ('clusterInfo', 'http://RMADDRESS:8088/ws/v1/cluster/info', 'Info'),
        ('clusterMetrics', 'http://RMADDRESS:8088/ws/v1/cluster/metrics', 'Metrics'),
        ('scheduler', 'http://RMADDRESS:8088/ws/v1/cluster/scheduler', 'Scheduler'),
        ('apps', 'http://RMADDRESS:8088/ws/v1/cluster/apps', 'Apps'),
        ('appStatInfo', 'http://RMADDRESS:8088/ws/v1/cluster/appstatistics', 'AppStatInfo'),
        ('nodes', 'http://RMADDRESS:8088/ws/v1/cluster/nodes', 'Nodes'),
    )
    self._API_TYPE = {}
    for type_no, (api_id, api_address, key_prefix) in enumerate(endpoint_rows, 1):
        self._API_TYPE[type_no] = {
            'API_ID': api_id,
            'API_PREFIX': 'RM',
            'API_ADDRESS': api_address,
            'KEY_PREFIX': key_prefix,
        }

    # _get_activerm() reads both _API_TYPE and _type, so they are set first.
    self._type = 1
    self._activerm = self._get_activerm()

    self.zaddr, self.zport = zaddr, zport
    self.ret_result = []
    self.final_result_dict = {}
    self.zbserver = ZabbixSender(zaddr, zport)
    self._ip = self._getLocalIP(iface)
    self.rm = ResourceManager(address=self._activerm, timeout=10)
def rm_active_standby(active=None, back=None):
    """Return a client for the ResourceManager whose ``haState`` is ACTIVE.

    The primary (``RM_HOST``) is probed first; the backup (``RM_HOST_BAK``)
    is only contacted when the primary is not active or cannot be queried.
    A host that fails to answer is logged and skipped instead of aborting
    the lookup, so one unreachable ResourceManager no longer hides the other.

    Args:
        active: ignored; the client is always built from ``RM_HOST``.
        back: ignored; the client is always built from ``RM_HOST_BAK``.

    Returns:
        The ``ResourceManager`` client of the active host, or ``None`` when
        neither host reports ``ACTIVE``.
    """
    active = ResourceManager(address=RM_HOST, port=RM_PORT)
    back = ResourceManager(address=RM_HOST_BAK, port=RM_PORT)

    for candidate in (active, back):
        try:
            clusterInfo = candidate.cluster_information().data.get('clusterInfo')
        except Exception:
            # Failover boundary: whatever went wrong talking to this host,
            # the other one must still get its chance.
            logging.warning("ResourceManager host = " + candidate.address +
                            " could not be queried", exc_info=True)
            continue
        # state : ACTIVE, STANDBY
        if clusterInfo and clusterInfo.get('haState') == 'ACTIVE':
            logging.info("ResourceManager host = " + candidate.address +
                         " is active")
            return candidate

    logging.warning("No ResourceManager can be usered, Please check again")
    return None
def run(self):
    """Build and store the per-user YARN resource leader board for one day.

    The day is taken from ``self.jobs_year`` / ``jobs_month`` / ``jobs_day``.
    All applications started before the end of that day are fetched, dumped
    to ``app_lists/apps_<date>.csv``, weighted by the share of their run time
    attributed to the day, and aggregated per user. The resulting table is
    printed and written to ``daily_leader_boards/leader_board_<date>.csv``.

    An empty application list, zero totals and a zero ``elapsedTime`` are
    tolerated: percentages fall back to 0.0 (or the whole-app weight of 1.0)
    instead of raising ``ZeroDivisionError`` / ``TypeError``.
    """
    time_format = '%Y-%m-%d %H:%M'

    def format_ms(timestamp_ms):
        """Render an epoch-milliseconds value as a local date/time string."""
        return datetime.fromtimestamp(timestamp_ms / 1000.0).strftime(time_format)

    def percent_of(part, whole):
        """Percentage of ``whole`` covered by ``part``; 0.0 when ``whole`` is 0."""
        return round(100 * part / whole, 2) if whole else 0.0

    analysis_timestamp = str(datetime.now())
    day_label = (str(self.jobs_year) + '-' + str(self.jobs_month).zfill(2)
                 + '-' + str(self.jobs_day).zfill(2))
    output_path = os.path.join('daily_leader_boards',
                               'leader_board_' + day_label + '.csv')
    app_file = 'app_lists/apps_' + day_label + '.csv'

    rm = ResourceManager(configuration.yarn_resource_managers)
    cluster_metrics = rm.cluster_metrics().data['clusterMetrics']
    cluster_daily_vcore_seconds = int(
        cluster_metrics['totalVirtualCores'] * 60 * 60 * 24)
    cluster_daily_megabyte_memory_seconds = int(
        cluster_metrics['totalMB'] * 60 * 60 * 24)

    begin_date = datetime(int(str(self.jobs_year)),
                          int(str(self.jobs_month)),
                          int(str(self.jobs_day)))
    end_date = begin_date + timedelta(1)
    begin_ms_int = int(begin_date.timestamp() * 1000)
    end_ms_int = int(end_date.timestamp() * 1000)

    # filter out jobs that started after the end of the analyzed day
    apps = rm.cluster_applications(started_time_end=str(end_ms_int))
    # The "apps" entry is null when nothing matches the query.
    applist = (apps.data.get('apps') or {}).get('app') or []

    pd.DataFrame(applist).to_csv(app_file)

    total_vcore_seconds = 0
    total_mb_seconds = 0
    sum_elapsed_time_ms = 0
    overall_started_time_ms = 9999999999999
    overall_finished_time_ms = 0
    total_yarn_apps = 0
    users = {}

    for app in applist:
        started_time = app['startedTime']
        finished_time = app['finishedTime']
        elapsed_time = app['elapsedTime']

        # disregard apps that haven't ever or yet consumed any resources
        if app['state'] not in ('FINISHED', 'FAILED', 'KILLED', 'RUNNING'):
            continue
        # disregard apps that finished before the beginning of the analyzed day
        if 0 < finished_time < begin_ms_int:
            continue

        # finishedTime == 0 means the app has not finished yet.
        runs_past_day = finished_time == 0 or finished_time > end_ms_int

        # Default: the job began and ended within the same day. Also used
        # when elapsedTime is 0, where no ratio can be computed.
        percent_within_day = 1.0
        if elapsed_time:
            if started_time < begin_ms_int < finished_time < end_ms_int:
                # began before the day, ended within it
                percent_within_day = (finished_time - begin_ms_int) / elapsed_time
            elif started_time < begin_ms_int and runs_past_day:
                # began before the day and continued beyond its end
                percent_within_day = 86400000 / elapsed_time
            elif begin_ms_int < started_time < end_ms_int and runs_past_day:
                # began within the day and continued beyond its end
                percent_within_day = (end_ms_int - started_time) / elapsed_time

        weighted_app_vcore_seconds = int(app['vcoreSeconds'] * percent_within_day)
        weighted_app_memory_seconds = int(app['memorySeconds'] * percent_within_day)

        user = users.setdefault(
            app['user'], {
                'user_first_task_started_time_ms': 9999999999999,
                'last_task_finished_time_ms': 0,
                'total_vcore_seconds': 0,
                'total_MB_seconds': 0,
            })
        user['user_first_task_started_time_ms'] = min(
            user['user_first_task_started_time_ms'], started_time)
        user['last_task_finished_time_ms'] = max(
            user['last_task_finished_time_ms'], finished_time)
        user['total_vcore_seconds'] += weighted_app_vcore_seconds
        user['total_MB_seconds'] += weighted_app_memory_seconds

        total_vcore_seconds += weighted_app_vcore_seconds
        total_mb_seconds += weighted_app_memory_seconds
        overall_started_time_ms = min(overall_started_time_ms, started_time)
        overall_finished_time_ms = max(overall_finished_time_ms, finished_time)
        sum_elapsed_time_ms += elapsed_time
        total_yarn_apps += 1

    header = [
        'jobs_year', 'jobs_month', 'jobs_day',
        'cluster_daily_vcore_seconds',
        'cluster_daily_megabyte_memory_seconds', 'user',
        'used_vcore_seconds', 'percent_used_of_all_used_vcore_seconds',
        'percent_used_of_total_cluster_vcore_seconds', 'used_MB_seconds',
        'percent_used_of_all_used_MB_seconds',
        'percent_used_of_total_cluster_MB_seconds',
        'user_first_task_started_time', 'user_last_task_finished_time'
    ]

    table = []
    for user_name, stats in users.items():
        # A last-finished timestamp of 0 means no task of this user has
        # finished yet; leave the cell empty.
        if int(stats['last_task_finished_time_ms']) == 0:
            last_task_finished_time_string = ''
        else:
            last_task_finished_time_string = format_ms(
                stats['last_task_finished_time_ms'])
        table.append([
            self.jobs_year,
            self.jobs_month,
            self.jobs_day,
            cluster_daily_vcore_seconds,
            cluster_daily_megabyte_memory_seconds,
            user_name,
            round(stats['total_vcore_seconds'], 0),
            percent_of(stats['total_vcore_seconds'], total_vcore_seconds),
            percent_of(stats['total_vcore_seconds'],
                       cluster_daily_vcore_seconds),
            round(stats['total_MB_seconds'], 0),
            percent_of(stats['total_MB_seconds'], total_mb_seconds),
            percent_of(stats['total_MB_seconds'],
                       cluster_daily_megabyte_memory_seconds),
            format_ms(stats['user_first_task_started_time_ms']),
            last_task_finished_time_string,
        ])

    df = pd.DataFrame(table, columns=header)
    df = df.sort_values(by='used_MB_seconds', ascending=False)

    # The overall timestamps still hold their sentinel values when no app
    # was counted (or none has finished); print an empty value then.
    overall_started_string = (
        format_ms(overall_started_time_ms) if total_yarn_apps else '')
    overall_finished_string = (
        format_ms(overall_finished_time_ms) if overall_finished_time_ms else '')

    print()
    print('analysis timestamp: ' + analysis_timestamp)
    print('jobs date: ' + begin_date.strftime('%Y-%m-%d'))
    print('----------------------')
    print('count of yarn apps: ' + str(total_yarn_apps))
    print('overall daily jobs started time ', overall_started_string)
    print('overall daily jobs finished time', overall_finished_string)
    print()
    print(tabulate(df, headers='keys', showindex=False))
    df.to_csv(output_path, index=False)
import urllib2 reload(sys) import os from yarn_api_client import ResourceManager import datetime if __name__ == '__main__': Url = "http://172.16.11.225:9091/action/receiver.do" endtime = datetime.datetime.now() starttime = endtime-datetime.timedelta(days=7) print starttime,endtime starttimestamp = int(time.mktime(starttime.timetuple())*1000) endtimestamp = int(time.mktime(endtime.timetuple())*1000) print starttimestamp,endtimestamp try :#访问yarn取得任务信息 monitor=ResourceManager("172.16.11.209", 8088, 30) out = monitor.cluster_applications(None, None, None, None, None, None, None, str(starttimestamp), str(endtimestamp)) except : monitor=ResourceManager("172.16.11.208", 8088, 30) out = monitor.cluster_applications(None, None, None, None, None, None, None, str(starttimestamp), str(endtimestamp)) if out.data['apps'] is None : print 'error' exit() applicationlist = out.data['apps']['app'] excutorMap = {}#存储每个user的消耗资源总量 for applicationinfo in applicationlist: user = applicationinfo['user'] vcoreSeconds = int(applicationinfo['vcoreSeconds']) if excutorMap.has_key(user) : excutorMap[user] = excutorMap[user] + vcoreSeconds else :
def get_state(application_id):
    """Return the ``state`` field the ResourceManager reports for an app.

    Args:
        application_id: id passed to ``cluster_application``.
    """
    client = ResourceManager([config.yarn_url])
    return client.cluster_application(application_id).data['app']['state']
class HadoopJobPoller:
    """Polls a YARN ResourceManager for applications and their job details.

    The ``config`` mapping is read with ``.get`` for these keys:
    ``resource_manager_address``, ``port``, ``application_ids``,
    ``application_status``, ``application_tags`` and ``application_names``.
    A filter option that is ``None`` is not applied.
    """

    _NAME_ = "Mapr Hadoop Job Poller"

    def __init__(self, config):
        logger.info('Initialise {}'.format(self.get_name()))
        self.config = config
        self.configure()
        self.result = []

    def load_config(self, config):
        """Replace the configuration and rebuild the clients from it."""
        logger.debug('Loading config: {}'.format(config))
        self.config = config
        self.configure()

    def get_name(self):
        """Return the display name of this poller."""
        return HadoopJobPoller._NAME_

    def configure(self):
        """Create the YARN clients and read the filter options from config.

        ``port`` is only forwarded to the clients when it is truthy.
        """
        resource_manager_address = self.config.get('resource_manager_address')
        port = self.config.get('port')
        if port:
            self.resource_manager = ResourceManager(
                address=resource_manager_address, port=port)
            self.app_master = ApplicationMaster(
                address=resource_manager_address, port=port)
        else:
            self.resource_manager = ResourceManager(
                address=resource_manager_address)
            self.app_master = ApplicationMaster(
                address=resource_manager_address)
        self.application_ids = self.config.get('application_ids')
        self.application_status = self.config.get('application_status')
        self.application_tags = self.config.get('application_tags')
        self.application_names = self.config.get('application_names')
        self.application_status_list = []

    def __update_result(self, result=None):
        """Stamp ``result`` with the current time and append it to ``self.result``.

        A fresh dict is created per call when ``result`` is omitted; a shared
        default dict would be appended (and re-stamped) on every call.
        The passed dict is updated in place.
        """
        if result is None:
            result = {}
        result.update({'time': time.time()})
        self.result.append(result)

    def poll(self):
        """Collect application details once.

        Returns:
            A ``(result, status)`` tuple: ``result`` is ``self.result`` and
            ``status`` is a dict with ``status`` ``"COMPLETED"`` (including
            ``applications_status``) or ``"EXCEPTION"`` (with the error text
            in ``status_message``).
        """
        logger.info("Starting {} poll".format(self.get_name()))
        try:
            self.__application_details()
            success_status = {
                "status": "COMPLETED",
                "status_message": "Hadoop Job poll completed successfully",
                "applications_status": self.application_status_list
            }
            logger.info("Successfully completed {} poll".format(
                self.get_name()))
            return self.result, success_status
        except Exception as e:
            logger.error("Exception in {} poll :{}".format(
                self.get_name(), str(e)))
            exception_status = {
                "status": "EXCEPTION",
                "status_message": str(e)
            }
            return self.result, exception_status

    def __is_selected(self, app):
        """Return True when ``app`` passes every configured filter."""
        if (self.application_names is not None
                and app.get('name') not in self.application_names):
            return False
        if (self.application_ids is not None
                and app.get('id') not in self.application_ids):
            return False
        if (self.application_status is not None
                and app.get('state').lower() != self.application_status):
            return False
        if (self.application_tags is not None
                and app.get('applicationTags') not in self.application_tags):
            return False
        return True

    def __jobs_with_tasks(self, app_id):
        """Return the jobs of ``app_id``, each merged with its task data."""
        jobs_data = self.app_master.jobs(app_id).data.get('jobs') or {}
        detailed_jobs = []
        # A missing / null job list is treated as "no jobs".
        for job in jobs_data.get('job') or []:
            job.update(self.app_master.job_tasks(app_id, job.get('id')).data)
            detailed_jobs.append(job)
        return detailed_jobs

    def __application_details(self):
        """Record every application that passes the filters.

        Filters are applied before the per-job requests, so no job / task
        calls are made for applications that are discarded anyway.
        """
        self.cluster_id = self.resource_manager.cluster_information().data.get(
            'clusterInfo').get('id')
        app_list = self.resource_manager.cluster_applications().data.get(
            'apps')
        if not app_list:
            return
        for app in app_list.get('app'):
            if not self.__is_selected(app):
                continue
            # Job details are only requested for running applications.
            if app.get('state').lower() == 'running':
                app.update({"jobs": self.__jobs_with_tasks(app.get('id'))})
            self.application_status_list.append({
                'application_id': app.get('id'),
                'application_name': app.get('name'),
                'status': app.get('state')
            })
            self.__update_result(app)
class ZabbixHadoop:
    """Collects YARN ResourceManager metrics and sends them to Zabbix.

    Each ``collect_*`` method selects an API type (``self._type``), fills
    ``self.final_result_dict`` from the ResourceManager response and, when
    the dict is not empty, forwards it through ``_send_zabbix``.
    """

    def __init__(self, zaddr=ZABBIX_ADDR, zport=ZABBIX_PORT, iface=None):
        """Build the endpoint table and create the Zabbix / YARN clients.

        Args:
            zaddr: address handed to ``ZabbixSender``.
            zport: port handed to ``ZabbixSender``.
            iface: interface name whose address is used in the Zabbix host
                name (see ``_getLocalIP``).
        """
        self._API_TYPE = {
            1: {
                'API_ID': 'clusterInfo',
                'API_PREFIX': 'RM',
                'API_ADDRESS': 'http://RMADDRESS:8088/ws/v1/cluster/info',
                'KEY_PREFIX': 'Info'
            },
            2: {
                'API_ID': 'clusterMetrics',
                'API_PREFIX': 'RM',
                'API_ADDRESS': 'http://RMADDRESS:8088/ws/v1/cluster/metrics',
                'KEY_PREFIX': 'Metrics'
            },
            3: {
                'API_ID': 'scheduler',
                'API_PREFIX': 'RM',
                'API_ADDRESS': 'http://RMADDRESS:8088/ws/v1/cluster/scheduler',
                'KEY_PREFIX': 'Scheduler'
            },
            4: {
                'API_ID': 'apps',
                'API_PREFIX': 'RM',
                'API_ADDRESS': 'http://RMADDRESS:8088/ws/v1/cluster/apps',
                'KEY_PREFIX': 'Apps'
            },
            5: {
                'API_ID': 'appStatInfo',
                'API_PREFIX': 'RM',
                'API_ADDRESS': 'http://RMADDRESS:8088/ws/v1/cluster/appstatistics',
                'KEY_PREFIX': 'AppStatInfo'
            },
            6: {
                'API_ID': 'nodes',
                'API_PREFIX': 'RM',
                'API_ADDRESS': 'http://RMADDRESS:8088/ws/v1/cluster/nodes',
                'KEY_PREFIX': 'Nodes'
            },
        }
        # _get_activerm() reads _API_TYPE and _type, so both are set first.
        self._type = 1
        self._activerm = self._get_activerm()
        self.zaddr = zaddr
        self.zport = zport
        self.ret_result = []
        self.final_result_dict = {}
        self.zbserver = ZabbixSender(zaddr, zport)
        self._ip = self._getLocalIP(iface)
        self.rm = ResourceManager(address=self._activerm, timeout=10)

    def _getLocalIP(self, iface):
        """Return the first address of interface ``iface``, or None if absent."""
        for name in interfaces():
            if name == iface:
                # NOTE(review): key 2 is presumably the AF_INET family of
                # netifaces - confirm.
                return ifaddresses(name)[2][0]['addr']
        return None

    def collect_app_stats(self, state_list=None, type_list=None):
        """Send the application count per state.

        Args:
            state_list: forwarded as ``state_list``.
            type_list: forwarded as ``application_type_list`` (previously
                ``state_list`` was sent here by mistake).
        """
        self._type = 5
        self.final_result_dict = {}
        self.ret_result = self.rm.cluster_application_statistics(
            state_list=state_list,
            application_type_list=type_list).data[
                self._API_TYPE[self._type]['API_ID']]['statItem']
        for item in self.ret_result:
            self.final_result_dict[item['state']] = item['count']
        if self.final_result_dict:
            self._send_zabbix()

    def colletc_app_metric(self, state=None, final_status=None, user=None,
                           queue=None, limit=None, started_time_begin=None,
                           started_time_end=None, finished_time_begin=None,
                           finished_time_end=None):
        """Send a ``user:name:queue`` summary of FAILED and KILLED apps.

        All arguments are forwarded to ``cluster_applications`` under the
        same names (they used to be ignored). One comma-separated string is
        produced per final status.
        """
        self._type = 4
        self.final_result_dict = {}
        apps = self.rm.cluster_applications(
            state=state, final_status=final_status, user=user, queue=queue,
            limit=limit, started_time_begin=started_time_begin,
            started_time_end=started_time_end,
            finished_time_begin=finished_time_begin,
            finished_time_end=finished_time_end).data[
                self._API_TYPE[self._type]['API_ID']]
        # The "apps" entry is null when no application matches.
        self.ret_result = apps['app'] if apps else []
        summaries = {}
        for app in self.ret_result:
            status = app['finalStatus']
            if status == u'FAILED' or status == u'KILLED':
                summaries.setdefault(status, []).append(
                    '%s:%s:%s' % (app['user'], app['name'], app['queue']))
        for status, entries in summaries.items():
            self.final_result_dict[status] = ', '.join(entries)
        if self.final_result_dict:
            self._send_zabbix()

    # Correctly spelled alias; the original name stays for existing callers.
    collect_app_metric = colletc_app_metric

    def collect_cluster_metrics(self):
        """Send memory / vcore usage ratios and the unhealthy node count."""
        self._type = 2
        self.final_result_dict = {}
        self.ret_result = self.rm.cluster_metrics().data[
            self._API_TYPE[self._type]['API_ID']]

        def usage(used, total):
            # float() keeps the ratio fractional under Python 2, where
            # int / int truncates; a zero total yields 0.0.
            return float(used) / total if total else 0.0

        self.final_result_dict['mem_usage'] = usage(
            self.ret_result['allocatedMB'], self.ret_result['totalMB'])
        self.final_result_dict['vcore_usage'] = usage(
            self.ret_result['allocatedVirtualCores'],
            self.ret_result['totalVirtualCores'])
        self.final_result_dict['unhealthyNodes'] = self.ret_result[
            'unhealthyNodes']
        if self.final_result_dict:
            self._send_zabbix()

    def collect_scheduler_metrics(self):
        """Send the used capacity of the root queue and of each child queue."""
        self._type = 3
        self.final_result_dict = {}
        self.ret_result = self.rm.cluster_scheduler().data[
            self._API_TYPE[self._type]['API_ID']]['schedulerInfo']
        self.final_result_dict['root_used_capacity'] = self.ret_result[
            'usedCapacity']
        for index, queue in enumerate(self.ret_result['queues']['queue']):
            self.final_result_dict['queue' + str(index) + '_load'] = queue[
                'usedCapacity']
        if self.final_result_dict:
            self._send_zabbix()

    def _send_zabbix(self):
        """Send ``self.final_result_dict`` to Zabbix and return 0.

        The packet used to be built and then dropped because the method
        returned before ``send`` was reached.
        """
        api = self._API_TYPE[self._type]
        packet = ZabbixPacket()
        for key, value in self.final_result_dict.items():
            packet.add(api['API_PREFIX'] + '_' + self._ip,
                       api['KEY_PREFIX'] + '[' + key + ']', value)
        self.zbserver.send(packet)
        print(self.zbserver.status)
        return 0

    def _get_activerm(self):
        """Return the first address in ``RM_ADDR`` that is ACTIVE and STARTED.

        Hosts that cannot be reached or do not answer with HTTP 200 are
        skipped. Returns None when no host qualifies.
        """
        api = self._API_TYPE[self._type]
        for addr in RM_ADDR:
            try:
                ret_val = requests.get(
                    api['API_ADDRESS'].replace('RMADDRESS', addr), timeout=10)
            except requests.RequestException:
                # An unreachable host must not stop the search.
                continue
            if ret_val.status_code != 200:
                continue
            json_val = ret_val.json()[api['API_ID']]
            if json_val['haState'] == 'ACTIVE' and json_val['state'] == 'STARTED':
                return addr
        return None
class ZabbixHadoop:
    """Collects YARN ResourceManager metrics and sends them to Zabbix.

    Each ``collect_*`` method selects an API type (``self._type``), fills
    ``self.final_result_dict`` from the ResourceManager response and, when
    the dict is not empty, forwards it through ``_send_zabbix``.
    """

    def __init__(self, zaddr=ZABBIX_ADDR, zport=ZABBIX_PORT, iface=None):
        """Build the endpoint table and create the Zabbix / YARN clients.

        Args:
            zaddr: address handed to ``ZabbixSender``.
            zport: port handed to ``ZabbixSender``.
            iface: interface name whose address is used in the Zabbix host
                name (see ``_getLocalIP``).
        """
        self._API_TYPE = {
            1: {
                'API_ID': 'clusterInfo',
                'API_PREFIX': 'RM',
                'API_ADDRESS': 'http://RMADDRESS:8088/ws/v1/cluster/info',
                'KEY_PREFIX': 'Info'
            },
            2: {
                'API_ID': 'clusterMetrics',
                'API_PREFIX': 'RM',
                'API_ADDRESS': 'http://RMADDRESS:8088/ws/v1/cluster/metrics',
                'KEY_PREFIX': 'Metrics'
            },
            3: {
                'API_ID': 'scheduler',
                'API_PREFIX': 'RM',
                'API_ADDRESS': 'http://RMADDRESS:8088/ws/v1/cluster/scheduler',
                'KEY_PREFIX': 'Scheduler'
            },
            4: {
                'API_ID': 'apps',
                'API_PREFIX': 'RM',
                'API_ADDRESS': 'http://RMADDRESS:8088/ws/v1/cluster/apps',
                'KEY_PREFIX': 'Apps'
            },
            5: {
                'API_ID': 'appStatInfo',
                'API_PREFIX': 'RM',
                'API_ADDRESS': 'http://RMADDRESS:8088/ws/v1/cluster/appstatistics',
                'KEY_PREFIX': 'AppStatInfo'
            },
            6: {
                'API_ID': 'nodes',
                'API_PREFIX': 'RM',
                'API_ADDRESS': 'http://RMADDRESS:8088/ws/v1/cluster/nodes',
                'KEY_PREFIX': 'Nodes'
            },
        }
        # _get_activerm() reads _API_TYPE and _type, so both are set first.
        self._type = 1
        self._activerm = self._get_activerm()
        self.zaddr = zaddr
        self.zport = zport
        self.ret_result = []
        self.final_result_dict = {}
        self.zbserver = ZabbixSender(zaddr, zport)
        self._ip = self._getLocalIP(iface)
        self.rm = ResourceManager(address=self._activerm, timeout=10)

    def _getLocalIP(self, iface):
        """Return the first address of interface ``iface``, or None if absent."""
        for name in interfaces():
            if name == iface:
                # NOTE(review): key 2 is presumably the AF_INET family of
                # netifaces - confirm.
                return ifaddresses(name)[2][0]['addr']
        return None

    def collect_app_stats(self, state_list=None, type_list=None):
        """Send the application count per state.

        Args:
            state_list: forwarded as ``state_list``.
            type_list: forwarded as ``application_type_list`` (previously
                ``state_list`` was sent here by mistake).
        """
        self._type = 5
        self.final_result_dict = {}
        self.ret_result = self.rm.cluster_application_statistics(
            state_list=state_list,
            application_type_list=type_list).data[
                self._API_TYPE[self._type]['API_ID']]['statItem']
        for item in self.ret_result:
            self.final_result_dict[item['state']] = item['count']
        if self.final_result_dict:
            self._send_zabbix()

    def colletc_app_metric(self, state=None, final_status=None, user=None,
                           queue=None, limit=None, started_time_begin=None,
                           started_time_end=None, finished_time_begin=None,
                           finished_time_end=None):
        """Send a ``user:name:queue`` summary of FAILED and KILLED apps.

        All arguments are forwarded to ``cluster_applications`` under the
        same names (they used to be ignored). One comma-separated string is
        produced per final status.
        """
        self._type = 4
        self.final_result_dict = {}
        apps = self.rm.cluster_applications(
            state=state, final_status=final_status, user=user, queue=queue,
            limit=limit, started_time_begin=started_time_begin,
            started_time_end=started_time_end,
            finished_time_begin=finished_time_begin,
            finished_time_end=finished_time_end).data[
                self._API_TYPE[self._type]['API_ID']]
        # The "apps" entry is null when no application matches.
        self.ret_result = apps['app'] if apps else []
        summaries = {}
        for app in self.ret_result:
            status = app['finalStatus']
            if status == u'FAILED' or status == u'KILLED':
                summaries.setdefault(status, []).append(
                    '%s:%s:%s' % (app['user'], app['name'], app['queue']))
        for status, entries in summaries.items():
            self.final_result_dict[status] = ', '.join(entries)
        if self.final_result_dict:
            self._send_zabbix()

    # Correctly spelled alias; the original name stays for existing callers.
    collect_app_metric = colletc_app_metric

    def collect_cluster_metrics(self):
        """Send memory / vcore usage ratios and the unhealthy node count."""
        self._type = 2
        self.final_result_dict = {}
        self.ret_result = self.rm.cluster_metrics().data[
            self._API_TYPE[self._type]['API_ID']]

        def usage(used, total):
            # float() keeps the ratio fractional under Python 2, where
            # int / int truncates; a zero total yields 0.0.
            return float(used) / total if total else 0.0

        self.final_result_dict['mem_usage'] = usage(
            self.ret_result['allocatedMB'], self.ret_result['totalMB'])
        self.final_result_dict['vcore_usage'] = usage(
            self.ret_result['allocatedVirtualCores'],
            self.ret_result['totalVirtualCores'])
        self.final_result_dict['unhealthyNodes'] = self.ret_result[
            'unhealthyNodes']
        if self.final_result_dict:
            self._send_zabbix()

    def collect_scheduler_metrics(self):
        """Send the used capacity of the root queue and of each child queue."""
        self._type = 3
        self.final_result_dict = {}
        self.ret_result = self.rm.cluster_scheduler().data[
            self._API_TYPE[self._type]['API_ID']]['schedulerInfo']
        self.final_result_dict['root_used_capacity'] = self.ret_result[
            'usedCapacity']
        for index, queue in enumerate(self.ret_result['queues']['queue']):
            self.final_result_dict['queue' + str(index) + '_load'] = queue[
                'usedCapacity']
        if self.final_result_dict:
            self._send_zabbix()

    def _send_zabbix(self):
        """Send ``self.final_result_dict`` to Zabbix and return 0.

        The packet used to be built and then dropped because the method
        returned before ``send`` was reached.
        """
        api = self._API_TYPE[self._type]
        packet = ZabbixPacket()
        for key, value in self.final_result_dict.items():
            packet.add(api['API_PREFIX'] + '_' + self._ip,
                       api['KEY_PREFIX'] + '[' + key + ']', value)
        self.zbserver.send(packet)
        print(self.zbserver.status)
        return 0

    def _get_activerm(self):
        """Return the first address in ``RM_ADDR`` that is ACTIVE and STARTED.

        Hosts that cannot be reached or do not answer with HTTP 200 are
        skipped. Returns None when no host qualifies.
        """
        api = self._API_TYPE[self._type]
        for addr in RM_ADDR:
            try:
                ret_val = requests.get(
                    api['API_ADDRESS'].replace('RMADDRESS', addr), timeout=10)
            except requests.RequestException:
                # An unreachable host must not stop the search.
                continue
            if ret_val.status_code != 200:
                continue
            json_val = ret_val.json()[api['API_ID']]
            if json_val['haState'] == 'ACTIVE' and json_val['state'] == 'STARTED':
                return addr
        return None
# File name: cpu_monitor.py
# Description:
#   1. Monitor Huawei YARN resources
#   2. Monitor disk space on hosts 119 and 120
# Description:
# Description:
# Input table:
# Output table:
# Author: hyn
# Created: 20191023
# Change log:
# Change date:
# ***************************************************************************
# Invocation: python cpu_monitor.py
# ***************************************************************************
import os
import sys
import time
import json
import datetime

from yarn_api_client import ApplicationMaster, HistoryServer, NodeManager, ResourceManager

result_list = []

# service_endpoints takes a list of full ResourceManager URLs (scheme
# included). A bare "host:port" string is iterated character by character,
# so no endpoint is ever found active.
rm = ResourceManager(service_endpoints=['http://10.93.171.97:8088'])
print(rm.cluster_applications().data)
# 创 建 者:hyn # 创建日期:20200917 # 修改日志: # 修改日期: # *************************************************************************** # 程序调用格式:python yarn_monitor.py # *************************************************************************** import os import sys import json import time from yarn_api_client import ApplicationMaster, HistoryServer, NodeManager, ResourceManager rm = ResourceManager(service_endpoints=['http://172.19.168.100:8088', 'http://172.19.168.4:8088']) # print rm.cluster_information().data # # ApplicationMaster() # # NodeManager. # 过滤重要任务 ats = 'ats' thritf = 'Thrift' dis='dis' # 3个小时之前 # run_time = 10800 # 24小时之前
def get_info(application_id):
    """Return the ``app`` record the ResourceManager reports for an app.

    Args:
        application_id: id passed to ``cluster_application``.
    """
    client = ResourceManager([config.yarn_url])
    return client.cluster_application(application_id).data['app']