class GearJobs(object):
    """Send STATS jobs to nodes through Gearman and classify the replies."""

    def __init__(self, logger, args):
        """Store ``logger`` and build a JSONGearmanClient from ``args.server``."""
        self.logger = logger
        self.gm_client = JSONGearmanClient(args.server)

    def _submit_stats(self, node_list):
        """Submit one STATS job per node and return the job requests.

        Each node is converted with ``str()`` and used as the task name.
        """
        job_data = {"hpcs_action": "STATS"}
        list_of_jobs = [
            dict(task=str(node), data=job_data) for node in node_list
        ]
        return self.gm_client.submit_multiple_jobs(
            list_of_jobs, background=False, wait_until_complete=True,
            poll_timeout=5.0
        )

    def send_pings(self, node_list):
        """Ping every node and return the task names that did not answer OK.

        A task is reported when its job timed out or its result carries
        ``hpcs_response == 'FAIL'``.  Jobs whose state is ``'UNKNOWN'`` are
        only logged; they are not part of the returned list.
        """
        failed_list = []
        for ping in self._submit_stats(node_list):
            if ping.state == 'UNKNOWN':
                # TODO: Gearman server failed, ignoring for now
                self.logger.error('Gearman Job server fail')
                continue
            # ``timed_out`` is checked first, so the result of a timed-out
            # job is never read.
            if ping.timed_out or ping.result['hpcs_response'] == 'FAIL':
                failed_list.append(ping.job.task)
        return failed_list

    def send_repair(self, node_list):
        """Ping every node and return the task names that answered OK.

        This is the complement of ``send_pings`` except for jobs in the
        ``'UNKNOWN'`` state, which are logged and appear in neither list.
        """
        repaired_list = []
        for ping in self._submit_stats(node_list):
            if ping.state == 'UNKNOWN':
                # TODO: Gearman server failed, ignoring for now
                self.logger.error('Gearman Job server fail')
            elif ping.timed_out:
                # Ping timeout
                continue
            elif ping.result['hpcs_response'] == 'FAIL':
                # Error returned by Gearman
                continue
            else:
                repaired_list.append(ping.job.task)
        return repaired_list
class GearmanWork(object):
    """Send pool-manager jobs through Gearman and record outcomes in the DB."""

    def __init__(self):
        """Build the Gearman client from the ``gearman`` section of cfg.CONF.

        Every entry of ``servers`` must look like ``host:port``; the SSL and
        keepalive settings are copied onto each server entry.
        """
        gearman_conf = cfg.CONF['gearman']
        server_list = []
        for server in gearman_conf['servers']:
            host, port = server.split(':')
            server_list.append({
                'host': host,
                'port': int(port),
                'keyfile': gearman_conf['ssl_key'],
                'certfile': gearman_conf['ssl_cert'],
                'ca_certs': gearman_conf['ssl_ca'],
                'keepalive': gearman_conf['keepalive'],
                'keepcnt': gearman_conf['keepcnt'],
                'keepidle': gearman_conf['keepidle'],
                'keepintvl': gearman_conf['keepintvl'],
            })
        self.gearman_client = JSONGearmanClient(server_list)

    def send_delete_message(self, message):
        """Submit delete jobs and remove each answered device from the DB.

        ``message`` is the list of jobs handed to ``submit_multiple_jobs``.
        Jobs in the JOB_UNKNOWN state or that timed out are logged and
        skipped.  A ``FAIL`` response is logged but the device row is still
        deleted.
        """
        LOG.info("Sending %d gearman messages", len(message))
        job_status = self.gearman_client.submit_multiple_jobs(
            message, background=False, wait_until_complete=True,
            max_retries=10, poll_timeout=30.0
        )
        delete_count = 0
        for status in job_status:
            if status.state == JOB_UNKNOWN:
                LOG.error('Gearman Job server fail')
                continue
            if status.timed_out:
                LOG.error('Gearman timeout whilst deleting device')
                continue
            if status.result['response'] == 'FAIL':
                # No ``continue`` here: the row is dropped regardless.
                LOG.error(
                    'Pool manager failed to delete a device, removing from DB'
                )

            delete_count += 1
            with db_session() as session:
                session.query(Device).filter(
                    Device.name == status.result['name']
                ).delete()
                session.commit()

        LOG.info('%d freed devices delete from pool', delete_count)

    def send_vips_message(self, message):
        """Submit vip build jobs and store every successfully built vip.

        Unknown-state, timed-out and ``FAIL`` jobs are logged and skipped.
        A DB error while storing one vip is logged with its traceback and
        does not stop the remaining results from being processed.
        """
        # TODO: make this gearman part more async, not wait for all builds
        LOG.info("Sending %d gearman messages", len(message))
        job_status = self.gearman_client.submit_multiple_jobs(
            message, background=False, wait_until_complete=True,
            max_retries=10, poll_timeout=3600.0
        )
        built_count = 0
        for status in job_status:
            if status.state == JOB_UNKNOWN:
                LOG.error('Gearman Job server fail')
                continue
            if status.timed_out:
                LOG.error('Gearman timeout whilst building vip')
                continue
            if status.result['response'] == 'FAIL':
                LOG.error('Pool manager failed to build a vip')
                continue

            built_count += 1
            try:
                self._add_vip(status.result)
            except Exception:
                # Narrowed from a bare ``except`` so KeyboardInterrupt and
                # SystemExit are no longer swallowed.
                LOG.exception(
                    'Could not add vip to DB, node data: {0}'
                    .format(status.result)
                )
        LOG.info(
            '{vips} vips built and added to pool'.format(vips=built_count)
        )

    def send_create_message(self, message):
        """Submit device build jobs and store every built device.

        A ``FAIL`` response that still carries a ``name`` is stored with
        status ``DELETED``.  Successful builds are stored with status
        ``OFFLINE``; a DB error for one device is logged and processing
        continues.
        """
        # TODO: make this gearman part more async, not wait for all builds
        LOG.info("Sending {0} gearman messages".format(len(message)))
        job_status = self.gearman_client.submit_multiple_jobs(
            message, background=False, wait_until_complete=True,
            max_retries=10, poll_timeout=3600.0
        )
        built_count = 0
        for status in job_status:
            if status.state == JOB_UNKNOWN:
                LOG.error('Gearman Job server fail')
                continue
            if status.timed_out:
                LOG.error('Gearman timeout whilst building device')
                continue
            if status.result['response'] == 'FAIL':
                LOG.error('Pool manager failed to build a device')
                if 'name' in status.result:
                    self._add_bad_node(status.result)
                continue

            built_count += 1
            try:
                self._add_node(status.result)
            except Exception:
                # Narrowed from a bare ``except`` so KeyboardInterrupt and
                # SystemExit are no longer swallowed.
                LOG.exception(
                    'Could not add node to DB, node data: {0}'
                    .format(status.result)
                )
        LOG.info(
            '{nodes} devices built and added to pool'.format(nodes=built_count)
        )

    def _add_vip(self, data):
        """Store ``data['ip']`` as a Vip row, encoded as an IPv4 integer."""
        LOG.info('Adding vip {0} to DB'.format(data['ip']))
        vip = Vip()
        vip.ip = int(ipaddress.IPv4Address(unicode(data['ip'])))
        with db_session() as session:
            session.add(vip)
            session.commit()

    def _store_device(self, data, status):
        """Insert a Device row built from ``data`` with the given status.

        Reads the ``name``, ``addr``, ``az`` and ``type`` keys of ``data``.
        """
        device = Device()
        device.name = data['name']
        device.publicIpAddr = data['addr']
        # TODO: kill this field, make things use publicIpAddr instead
        device.floatingIpAddr = data['addr']
        device.az = data['az']
        device.type = data['type']
        device.pingCount = 0
        device.status = status
        device.created = None
        with db_session() as session:
            session.add(device)
            session.commit()

    def _add_node(self, data):
        """Store a freshly built device with status ``OFFLINE``."""
        LOG.info('Adding device {0} to DB'.format(data['name']))
        self._store_device(data, 'OFFLINE')

    def _add_bad_node(self, data):
        """Store a failed build with status ``DELETED``."""
        LOG.info(
            'Adding bad device {0} to DB to be deleted'.format(data['name'])
        )
        self._store_device(data, 'DELETED')
class GearJobs(object):
    """Run ping, diagnostic and metrics jobs against nodes through Gearman.

    Every public method takes an iterable of node identifiers; each one is
    converted with ``str()`` and used as the Gearman task name.
    """

    def __init__(self):
        """Read poll timeouts and the Gearman server list from cfg.CONF.

        Every entry of ``servers`` must look like ``host:port``.
        """
        admin_conf = cfg.CONF['admin_api']
        self.poll_timeout = admin_conf['stats_poll_timeout']
        self.poll_retry = admin_conf['stats_poll_timeout_retry']

        gearman_conf = cfg.CONF['gearman']
        server_list = []
        for server in gearman_conf['servers']:
            host, port = server.split(':')
            server_list.append({
                'host': host,
                'port': int(port),
                'keyfile': gearman_conf['ssl_key'],
                'certfile': gearman_conf['ssl_cert'],
                'ca_certs': gearman_conf['ssl_ca'],
                'keepalive': gearman_conf['keepalive'],
                'keepcnt': gearman_conf['keepcnt'],
                'keepidle': gearman_conf['keepidle'],
                'keepintvl': gearman_conf['keepintvl'],
            })
        self.gm_client = JSONGearmanClient(server_list)

    def _submit(self, nodes, action, timeout):
        """Submit one ``action`` job per node and return the job requests."""
        job_data = {"hpcs_action": action}
        list_of_jobs = [dict(task=str(node), data=job_data) for node in nodes]
        return self.gm_client.submit_multiple_jobs(
            list_of_jobs, background=False, wait_until_complete=True,
            poll_timeout=timeout)

    def _sort_pings(self, pings, timed_out_list, failed_list, node_status):
        """File every ping reply into the caller's collections.

        Timed-out tasks are appended to ``timed_out_list`` so the caller
        decides whether a timeout means "retry" or "failed".
        """
        for ping in pings:
            if ping.state == JOB_UNKNOWN:
                # TODO: Gearman server failed, ignoring for now
                LOG.error('Gearman Job server fail')
                continue
            if ping.timed_out:
                # Ping timeout
                timed_out_list.append(ping.job.task)
                continue
            if ping.result['hpcs_response'] == 'FAIL':
                if ('status' in ping.result and
                        ping.result['status'] == 'DELETED'):
                    # A FAIL reply with status DELETED is not reported.
                    continue
                # Error returned by Gearman
                failed_list.append(ping.job.task)
            elif 'nodes' in ping.result:
                node_status[ping.job.task] = ping.result['nodes']

    def send_pings(self, node_list):
        """Ping all nodes, retrying timeouts once.

        Returns ``(failed_list, node_status)``: the task names that failed
        or timed out twice, and a dict mapping task name to the ``nodes``
        entry of each successful reply that has one.
        """
        failed_list = []
        node_status = {}
        retry_list = []
        # The message name is STATS for historical reasons. Real
        # data statistics are gathered with METRICS messages.
        first_pass = self._submit(node_list, "STATS", self.poll_timeout)
        self._sort_pings(first_pass, retry_list, failed_list, node_status)

        if retry_list:
            LOG.info("{0} pings timed out, retrying".format(len(retry_list)))
            second_pass = self._submit(retry_list, "STATS", self.poll_retry)
            # On the retry a timeout counts as a failure.
            self._sort_pings(
                second_pass, failed_list, failed_list, node_status)

        return failed_list, node_status

    def offline_check(self, node_list):
        """Run DIAGNOSTICS on the nodes and return the tasks that fail it.

        A task fails when its job timed out, its ``network`` test is
        ``FAIL``, or more than a third of its ``gearman`` tests are
        ``FAIL``.  Jobs in the JOB_UNKNOWN state are only logged.
        """
        failed_list = []
        replies = self._submit(node_list, "DIAGNOSTICS", self.poll_timeout)
        for ping in replies:
            if ping.state == JOB_UNKNOWN:
                LOG.error(
                    "Gearman Job server failed during OFFLINE check of {0}".
                    format(ping.job.task))
            elif ping.timed_out:
                failed_list.append(ping.job.task)
            elif ping.result['network'] == 'FAIL':
                failed_list.append(ping.job.task)
            else:
                gearman_tests = list(ping.result['gearman'])
                gearman_fail = sum(
                    1 for gearman_test in gearman_tests
                    if gearman_test['status'] == 'FAIL')
                # Need 2/3rds gearman up
                max_fail_count = len(gearman_tests) // 3
                if gearman_fail > max_fail_count:
                    failed_list.append(ping.job.task)
        return failed_list

    def get_stats(self, node_list):
        """Collect METRICS from all nodes, retrying unanswered ones once.

        Returns ``(failed_list, results)``: the task names that could not
        be collected, and a dict mapping task name to the full job result.
        """
        failed_list = []
        retry_list = []
        results = {}

        for stats in self._submit(node_list, "METRICS", self.poll_timeout):
            if stats.state == JOB_UNKNOWN or stats.timed_out:
                # Server failure or timeout: give the node a second chance.
                retry_list.append(stats.job.task)
            elif stats.result['hpcs_response'] == 'FAIL':
                # Error returned by Gearman
                failed_list.append(stats.job.task)
            else:
                results[stats.job.task] = stats.result

        if retry_list:
            LOG.info("{0} Statistics gathering timed out, retrying".format(
                len(retry_list)))
            for stats in self._submit(retry_list, "METRICS", self.poll_retry):
                if stats.state == JOB_UNKNOWN:
                    LOG.error("Gearman Job server failed gathering statistics "
                              "on {0}".format(stats.job.task))
                    failed_list.append(stats.job.task)
                elif (stats.timed_out or
                        stats.result['hpcs_response'] == 'FAIL'):
                    # Second timeout or an error returned by Gearman
                    failed_list.append(stats.job.task)
                else:
                    results[stats.job.task] = stats.result

        return failed_list, results
class GearJobs(object):
    """Run ping, diagnostic and metrics jobs against nodes through Gearman.

    Every public method takes an iterable of node identifiers; each one is
    converted with ``str()`` and used as the Gearman task name.
    """

    def __init__(self):
        """Read poll timeouts and the Gearman server list from cfg.CONF.

        Every entry of ``servers`` must look like ``host:port``.
        """
        admin_conf = cfg.CONF['admin_api']
        self.poll_timeout = admin_conf['stats_poll_timeout']
        self.poll_retry = admin_conf['stats_poll_timeout_retry']

        gearman_conf = cfg.CONF['gearman']
        server_list = []
        for server in gearman_conf['servers']:
            host, port = server.split(':')
            server_list.append({
                'host': host,
                'port': int(port),
                'keyfile': gearman_conf['ssl_key'],
                'certfile': gearman_conf['ssl_cert'],
                'ca_certs': gearman_conf['ssl_ca'],
                'keepalive': gearman_conf['keepalive'],
                'keepcnt': gearman_conf['keepcnt'],
                'keepidle': gearman_conf['keepidle'],
                'keepintvl': gearman_conf['keepintvl'],
            })
        self.gm_client = JSONGearmanClient(server_list)

    def _submit(self, nodes, action, timeout):
        """Submit one ``action`` job per node and return the job requests."""
        job_data = {"hpcs_action": action}
        list_of_jobs = [dict(task=str(node), data=job_data) for node in nodes]
        return self.gm_client.submit_multiple_jobs(
            list_of_jobs, background=False, wait_until_complete=True,
            poll_timeout=timeout
        )

    def _sort_pings(self, pings, timed_out_list, failed_list, node_status):
        """File every ping reply into the caller's collections.

        Timed-out tasks are appended to ``timed_out_list`` so the caller
        decides whether a timeout means "retry" or "failed".
        """
        for ping in pings:
            if ping.state == JOB_UNKNOWN:
                # TODO: Gearman server failed, ignoring for now
                LOG.error('Gearman Job server fail')
                continue
            if ping.timed_out:
                # Ping timeout
                timed_out_list.append(ping.job.task)
                continue
            if ping.result['hpcs_response'] == 'FAIL':
                if (
                    'status' in ping.result and
                    ping.result['status'] == 'DELETED'
                ):
                    # A FAIL reply with status DELETED is not reported.
                    continue
                # Error returned by Gearman
                failed_list.append(ping.job.task)
            elif 'nodes' in ping.result:
                node_status[ping.job.task] = ping.result['nodes']

    def send_pings(self, node_list):
        """Ping all nodes, retrying timeouts once.

        Returns ``(failed_list, node_status)``: the task names that failed
        or timed out twice, and a dict mapping task name to the ``nodes``
        entry of each successful reply that has one.
        """
        failed_list = []
        node_status = {}
        retry_list = []
        # The message name is STATS for historical reasons. Real
        # data statistics are gathered with METRICS messages.
        first_pass = self._submit(node_list, "STATS", self.poll_timeout)
        self._sort_pings(first_pass, retry_list, failed_list, node_status)

        if retry_list:
            LOG.info(
                "{0} pings timed out, retrying".format(len(retry_list))
            )
            second_pass = self._submit(retry_list, "STATS", self.poll_retry)
            # On the retry a timeout counts as a failure.
            self._sort_pings(
                second_pass, failed_list, failed_list, node_status
            )

        return failed_list, node_status

    def offline_check(self, node_list):
        """Run DIAGNOSTICS on the nodes and return the tasks that fail it.

        A task fails when its job timed out, its ``network`` test is
        ``FAIL``, or more than a third of its ``gearman`` tests are
        ``FAIL``.  Jobs in the JOB_UNKNOWN state are only logged.
        """
        failed_list = []
        replies = self._submit(node_list, "DIAGNOSTICS", self.poll_timeout)
        for ping in replies:
            if ping.state == JOB_UNKNOWN:
                LOG.error(
                    "Gearman Job server failed during OFFLINE check of {0}".
                    format(ping.job.task)
                )
            elif ping.timed_out:
                failed_list.append(ping.job.task)
            elif ping.result['network'] == 'FAIL':
                failed_list.append(ping.job.task)
            else:
                gearman_tests = list(ping.result['gearman'])
                gearman_fail = sum(
                    1 for gearman_test in gearman_tests
                    if gearman_test['status'] == 'FAIL'
                )
                # Need 2/3rds gearman up
                max_fail_count = len(gearman_tests) // 3
                if gearman_fail > max_fail_count:
                    failed_list.append(ping.job.task)
        return failed_list

    def get_stats(self, node_list):
        """Collect METRICS from all nodes, retrying unanswered ones once.

        Returns ``(failed_list, results)``: the task names that could not
        be collected, and a dict mapping task name to the full job result.
        """
        failed_list = []
        retry_list = []
        results = {}

        for stats in self._submit(node_list, "METRICS", self.poll_timeout):
            if stats.state == JOB_UNKNOWN or stats.timed_out:
                # Server failure or timeout: give the node a second chance.
                retry_list.append(stats.job.task)
            elif stats.result['hpcs_response'] == 'FAIL':
                # Error returned by Gearman
                failed_list.append(stats.job.task)
            else:
                results[stats.job.task] = stats.result

        if retry_list:
            LOG.info(
                "{0} Statistics gathering timed out, retrying".
                format(len(retry_list))
            )
            for stats in self._submit(retry_list, "METRICS", self.poll_retry):
                if stats.state == JOB_UNKNOWN:
                    LOG.error(
                        "Gearman Job server failed gathering statistics "
                        "on {0}".format(stats.job.task)
                    )
                    failed_list.append(stats.job.task)
                elif (
                    stats.timed_out or
                    stats.result['hpcs_response'] == 'FAIL'
                ):
                    # Second timeout or an error returned by Gearman
                    failed_list.append(stats.job.task)
                else:
                    results[stats.job.task] = stats.result

        return failed_list, results
class GearmanWork(object):
    """Send pool-manager jobs through Gearman and record outcomes in the DB."""

    def __init__(self):
        """Build the Gearman client from the ``gearman`` section of cfg.CONF.

        Every entry of ``servers`` must look like ``host:port``; the SSL and
        keepalive settings are copied onto each server entry.
        """
        gearman_conf = cfg.CONF['gearman']
        server_list = []
        for server in gearman_conf['servers']:
            host, port = server.split(':')
            server_list.append({
                'host': host,
                'port': int(port),
                'keyfile': gearman_conf['ssl_key'],
                'certfile': gearman_conf['ssl_cert'],
                'ca_certs': gearman_conf['ssl_ca'],
                'keepalive': gearman_conf['keepalive'],
                'keepcnt': gearman_conf['keepcnt'],
                'keepidle': gearman_conf['keepidle'],
                'keepintvl': gearman_conf['keepintvl'],
            })
        self.gearman_client = JSONGearmanClient(server_list)

    def send_delete_message(self, message):
        """Submit delete jobs and remove each answered device from the DB.

        ``message`` is the list of jobs handed to ``submit_multiple_jobs``.
        Jobs in the JOB_UNKNOWN state or that timed out are logged and
        skipped.  A ``FAIL`` response is logged but the device row is still
        deleted.
        """
        LOG.info("Sending %d gearman messages", len(message))
        job_status = self.gearman_client.submit_multiple_jobs(
            message, background=False, wait_until_complete=True,
            max_retries=10, poll_timeout=30.0)
        delete_count = 0
        for status in job_status:
            if status.state == JOB_UNKNOWN:
                LOG.error('Gearman Job server fail')
                continue
            if status.timed_out:
                LOG.error('Gearman timeout whilst deleting device')
                continue
            if status.result['response'] == 'FAIL':
                # No ``continue`` here: the row is dropped regardless.
                LOG.error(
                    'Pool manager failed to delete a device, removing from DB')

            delete_count += 1
            with db_session() as session:
                session.query(Device).filter(
                    Device.name == status.result['name']).delete()
                session.commit()

        LOG.info('%d freed devices delete from pool', delete_count)

    def send_vips_message(self, message):
        """Submit vip build jobs and store every successfully built vip.

        Unknown-state, timed-out and ``FAIL`` jobs are logged and skipped.
        A DB error while storing one vip is logged with its traceback and
        does not stop the remaining results from being processed.
        """
        # TODO: make this gearman part more async, not wait for all builds
        LOG.info("Sending %d gearman messages", len(message))
        job_status = self.gearman_client.submit_multiple_jobs(
            message, background=False, wait_until_complete=True,
            max_retries=10, poll_timeout=3600.0)
        built_count = 0
        for status in job_status:
            if status.state == JOB_UNKNOWN:
                LOG.error('Gearman Job server fail')
                continue
            if status.timed_out:
                LOG.error('Gearman timeout whilst building vip')
                continue
            if status.result['response'] == 'FAIL':
                LOG.error('Pool manager failed to build a vip')
                continue

            built_count += 1
            try:
                self._add_vip(status.result)
            except Exception:
                # Narrowed from a bare ``except`` so KeyboardInterrupt and
                # SystemExit are no longer swallowed.
                LOG.exception(
                    'Could not add vip to DB, node data: {0}'.format(
                        status.result))
        LOG.info(
            '{vips} vips built and added to pool'.format(vips=built_count))

    def send_create_message(self, message):
        """Submit device build jobs and store every built device.

        A ``FAIL`` response that still carries a ``name`` is stored with
        status ``DELETED``.  Successful builds are stored with status
        ``OFFLINE``; a DB error for one device is logged and processing
        continues.
        """
        # TODO: make this gearman part more async, not wait for all builds
        LOG.info("Sending {0} gearman messages".format(len(message)))
        job_status = self.gearman_client.submit_multiple_jobs(
            message, background=False, wait_until_complete=True,
            max_retries=10, poll_timeout=3600.0)
        built_count = 0
        for status in job_status:
            if status.state == JOB_UNKNOWN:
                LOG.error('Gearman Job server fail')
                continue
            if status.timed_out:
                LOG.error('Gearman timeout whilst building device')
                continue
            if status.result['response'] == 'FAIL':
                LOG.error('Pool manager failed to build a device')
                if 'name' in status.result:
                    self._add_bad_node(status.result)
                continue

            built_count += 1
            try:
                self._add_node(status.result)
            except Exception:
                # Narrowed from a bare ``except`` so KeyboardInterrupt and
                # SystemExit are no longer swallowed.
                LOG.exception(
                    'Could not add node to DB, node data: {0}'.format(
                        status.result))
        LOG.info('{nodes} devices built and added to pool'.format(
            nodes=built_count))

    def _add_vip(self, data):
        """Store ``data['ip']`` as a Vip row, encoded as an IPv4 integer."""
        LOG.info('Adding vip {0} to DB'.format(data['ip']))
        vip = Vip()
        vip.ip = int(ipaddress.IPv4Address(unicode(data['ip'])))
        with db_session() as session:
            session.add(vip)
            session.commit()

    def _store_device(self, data, status):
        """Insert a Device row built from ``data`` with the given status.

        Reads the ``name``, ``addr``, ``az`` and ``type`` keys of ``data``.
        """
        device = Device()
        device.name = data['name']
        device.publicIpAddr = data['addr']
        # TODO: kill this field, make things use publicIpAddr instead
        device.floatingIpAddr = data['addr']
        device.az = data['az']
        device.type = data['type']
        device.pingCount = 0
        device.status = status
        device.created = None
        with db_session() as session:
            session.add(device)
            session.commit()

    def _add_node(self, data):
        """Store a freshly built device with status ``OFFLINE``."""
        LOG.info('Adding device {0} to DB'.format(data['name']))
        self._store_device(data, 'OFFLINE')

    def _add_bad_node(self, data):
        """Store a failed build with status ``DELETED``."""
        LOG.info('Adding bad device {0} to DB to be deleted'.format(
            data['name']))
        self._store_device(data, 'DELETED')