Exemplo n.º 1
0
class GearJobs(object):
    """Send STATS jobs to nodes through Gearman and sort the replies.

    The string form of each node is used as the Gearman task name.
    """

    def __init__(self, logger, args):
        self.logger = logger
        self.gm_client = JSONGearmanClient(args.server)

    def _submit_stats(self, node_list):
        """Submit one blocking STATS job per node; return the job requests."""
        payload = {"hpcs_action": "STATS"}
        jobs = [dict(task=str(node), data=payload) for node in node_list]
        return self.gm_client.submit_multiple_jobs(
            jobs, background=False, wait_until_complete=True,
            poll_timeout=5.0
        )

    def send_pings(self, node_list):
        """Ping every node and return the tasks that did not answer well.

        A task is reported when its job timed out or its reply carries
        ``hpcs_response == 'FAIL'``.  Jobs in the 'UNKNOWN' state are only
        logged, not reported.
        """
        unhealthy = []
        for reply in self._submit_stats(node_list):
            if reply.state == 'UNKNOWN':
                # TODO: Gearman server failed, ignoring for now
                self.logger.error('Gearman Job server fail')
            elif reply.timed_out or reply.result['hpcs_response'] == 'FAIL':
                # Either no answer in time, or an error returned by Gearman
                unhealthy.append(reply.job.task)
        return unhealthy

    def send_repair(self, node_list):
        """Ping every node and return the tasks that answered successfully.

        Jobs that are in the 'UNKNOWN' state, timed out, or replied with
        ``hpcs_response == 'FAIL'`` are left out of the result.
        """
        healthy = []
        for reply in self._submit_stats(node_list):
            if reply.state == 'UNKNOWN':
                # TODO: Gearman server failed, ignoring for now
                self.logger.error('Gearman Job server fail')
                continue
            if reply.timed_out:
                continue
            if reply.result['hpcs_response'] == 'FAIL':
                continue
            healthy.append(reply.job.task)
        return healthy
Exemplo n.º 2
0
class GearmanWork(object):
    """Submit pool-manager jobs through Gearman and record results in the DB.

    Every ``send_*`` method submits its jobs in the foreground, waits until
    they complete or time out, then walks the returned job statuses.
    """

    def __init__(self):
        # Every configured server shares the same SSL and keepalive settings.
        gearman_conf = cfg.CONF['gearman']
        server_list = []
        for server in gearman_conf['servers']:
            host, port = server.split(':')
            server_list.append({'host': host,
                                'port': int(port),
                                'keyfile': gearman_conf['ssl_key'],
                                'certfile': gearman_conf['ssl_cert'],
                                'ca_certs': gearman_conf['ssl_ca'],
                                'keepalive': gearman_conf['keepalive'],
                                'keepcnt': gearman_conf['keepcnt'],
                                'keepidle': gearman_conf['keepidle'],
                                'keepintvl': gearman_conf['keepintvl']
                                })
        self.gearman_client = JSONGearmanClient(server_list)

    def send_delete_message(self, message):
        """Submit delete jobs and remove the answered devices from the DB.

        :param message: job descriptions, handed unchanged to
            ``submit_multiple_jobs``

        A device is removed from the DB even when the reply is FAIL.  Jobs
        in the unknown state or that timed out leave the DB untouched.
        """
        LOG.info("Sending %d gearman messages", len(message))
        job_status = self.gearman_client.submit_multiple_jobs(
            message, background=False, wait_until_complete=True,
            max_retries=10, poll_timeout=30.0
        )
        delete_count = 0
        for status in job_status:
            if status.state == JOB_UNKNOWN:
                LOG.error('Gearman Job server fail')
                continue
            if status.timed_out:
                LOG.error('Gearman timeout whilst deleting device')
                continue
            if status.result['response'] == 'FAIL':
                LOG.error(
                    'Pool manager failed to delete a device, removing from DB'
                )
            if 'name' not in status.result:
                # send_create_message guards the same lookup on FAIL replies;
                # without a name there is no row to match, so skip this reply
                # instead of aborting the whole batch with a KeyError.
                LOG.error(
                    'Gearman delete reply has no device name, '
                    'nothing removed from DB'
                )
                continue

            delete_count += 1
            with db_session() as session:
                query = session.query(Device)
                query.filter(Device.name == status.result['name']).delete()
                session.commit()

        LOG.info('%d freed devices delete from pool', delete_count)

    def send_vips_message(self, message):
        """Submit vip-build jobs and store every successfully built vip.

        :param message: job descriptions, handed unchanged to
            ``submit_multiple_jobs``
        """
        # TODO: make this gearman part more async, not wait for all builds
        LOG.info("Sending %d gearman messages", len(message))
        job_status = self.gearman_client.submit_multiple_jobs(
            message, background=False, wait_until_complete=True,
            max_retries=10, poll_timeout=3600.0
        )
        built_count = 0
        for status in job_status:
            if status.state == JOB_UNKNOWN:
                LOG.error('Gearman Job server fail')
                continue
            if status.timed_out:
                LOG.error('Gearman timeout whilst building vip')
                continue
            if status.result['response'] == 'FAIL':
                LOG.error('Pool manager failed to build a vip')
                continue

            try:
                self._add_vip(status.result)
            except Exception:
                # Best effort: one bad row must not stop the rest, but do not
                # swallow KeyboardInterrupt/SystemExit like a bare except.
                LOG.exception(
                    'Could not add vip to DB, node data: %s', status.result
                )
            else:
                # Count only vips that really reached the DB.
                built_count += 1
        LOG.info('%d vips built and added to pool', built_count)

    def send_create_message(self, message):
        """Submit device-build jobs and store the resulting devices.

        :param message: job descriptions, handed unchanged to
            ``submit_multiple_jobs``

        A FAIL reply that still names a device is stored with status
        'DELETED'; successful builds are stored with status 'OFFLINE'.
        """
        # TODO: make this gearman part more async, not wait for all builds
        LOG.info("Sending %d gearman messages", len(message))
        job_status = self.gearman_client.submit_multiple_jobs(
            message, background=False, wait_until_complete=True,
            max_retries=10, poll_timeout=3600.0
        )
        built_count = 0
        for status in job_status:
            if status.state == JOB_UNKNOWN:
                LOG.error('Gearman Job server fail')
                continue
            if status.timed_out:
                LOG.error('Gearman timeout whilst building device')
                continue
            if status.result['response'] == 'FAIL':
                LOG.error('Pool manager failed to build a device')
                if 'name' in status.result:
                    self._add_bad_node(status.result)
                continue

            try:
                self._add_node(status.result)
            except Exception:
                # Best effort: one bad row must not stop the rest, but do not
                # swallow KeyboardInterrupt/SystemExit like a bare except.
                LOG.exception(
                    'Could not add node to DB, node data: %s', status.result
                )
            else:
                # Count only devices that really reached the DB.
                built_count += 1
        LOG.info('%d devices built and added to pool', built_count)

    def _add_vip(self, data):
        """Store the vip whose dotted address is ``data['ip']`` as an int."""
        LOG.info('Adding vip %s to DB', data['ip'])
        vip = Vip()
        # ipaddress wants text input, hence the unicode() conversion.
        vip.ip = int(ipaddress.IPv4Address(unicode(data['ip'])))
        with db_session() as session:
            session.add(vip)
            session.commit()

    def _add_node(self, data):
        """Store a freshly built device with status 'OFFLINE'."""
        LOG.info('Adding device %s to DB', data['name'])
        self._save_device(data, 'OFFLINE')

    def _add_bad_node(self, data):
        """Store a failed device with status 'DELETED'."""
        LOG.info('Adding bad device %s to DB to be deleted', data['name'])
        self._save_device(data, 'DELETED')

    def _save_device(self, data, status):
        """Insert a Device row built from reply *data* with *status*."""
        device = Device()
        device.name = data['name']
        device.publicIpAddr = data['addr']
        # TODO: kill this field, make things use publicIpAddr instead
        device.floatingIpAddr = data['addr']
        device.az = data['az']
        device.type = data['type']
        device.pingCount = 0
        device.status = status
        device.created = None
        with db_session() as session:
            session.add(device)
            session.commit()
Exemplo n.º 3
0
class GearJobs(object):
    """Run STATS, DIAGNOSTICS and METRICS jobs against nodes via Gearman.

    Poll timeouts are read from the ``admin_api`` config section:
    ``stats_poll_timeout`` for the first round and
    ``stats_poll_timeout_retry`` for the single retry round used by
    ``send_pings`` and ``get_stats``.
    """

    def __init__(self):
        self.poll_timeout = cfg.CONF['admin_api']['stats_poll_timeout']
        self.poll_retry = cfg.CONF['admin_api']['stats_poll_timeout_retry']

        # Every configured server shares the same SSL and keepalive settings.
        gearman_conf = cfg.CONF['gearman']
        server_list = []
        for server in gearman_conf['servers']:
            host, port = server.split(':')
            server_list.append({
                'host': host,
                'port': int(port),
                'keyfile': gearman_conf['ssl_key'],
                'certfile': gearman_conf['ssl_cert'],
                'ca_certs': gearman_conf['ssl_ca'],
                'keepalive': gearman_conf['keepalive'],
                'keepcnt': gearman_conf['keepcnt'],
                'keepidle': gearman_conf['keepidle'],
                'keepintvl': gearman_conf['keepintvl']
            })
        self.gm_client = JSONGearmanClient(server_list)

    def _submit(self, node_list, job_data, poll_timeout):
        """Submit one blocking job per node and return the job requests."""
        list_of_jobs = [
            dict(task=str(node), data=job_data) for node in node_list]
        return self.gm_client.submit_multiple_jobs(
            list_of_jobs,
            background=False,
            wait_until_complete=True,
            poll_timeout=poll_timeout)

    def _sort_pings(self, submitted_pings, timed_out_list, failed_list,
                    node_status):
        """Classify ping replies into the three given collections."""
        for ping in submitted_pings:
            if ping.state == JOB_UNKNOWN:
                # TODO: Gearman server failed, ignoring for now
                LOG.error('Gearman Job server fail')
            elif ping.timed_out:
                timed_out_list.append(ping.job.task)
            elif ping.result['hpcs_response'] == 'FAIL':
                # A FAIL reply with status DELETED is not counted as failed.
                if not ('status' in ping.result
                        and ping.result['status'] == 'DELETED'):
                    # Error returned by Gearman
                    failed_list.append(ping.job.task)
            elif 'nodes' in ping.result:
                node_status[ping.job.task] = ping.result['nodes']

    def send_pings(self, node_list):
        """Ping every node, retrying timed-out pings once.

        :param node_list: nodes to ping; ``str(node)`` is the task name
        :returns: ``(failed_list, node_status)`` where ``failed_list`` holds
            the tasks that replied FAIL (unless the reply status is DELETED)
            or timed out twice, and ``node_status`` maps a task to the
            ``nodes`` entry of its successful reply
        """
        failed_list = []
        node_status = dict()
        retry_list = []
        # The message name is STATS for historical reasons. Real
        # data statistics are gathered with METRICS messages.
        job_data = {"hpcs_action": "STATS"}
        self._sort_pings(
            self._submit(node_list, job_data, self.poll_timeout),
            retry_list, failed_list, node_status)

        if retry_list:
            LOG.info("{0} pings timed out, retrying".format(len(retry_list)))
            # On the retry round a timeout is final, so it lands directly
            # in failed_list.
            self._sort_pings(
                self._submit(retry_list, job_data, self.poll_retry),
                failed_list, failed_list, node_status)

        return failed_list, node_status

    def offline_check(self, node_list):
        """Run DIAGNOSTICS on every node and return the tasks that failed.

        A task fails when its job timed out, its ``network`` test is FAIL,
        or more than a third of its ``gearman`` tests are FAIL.  Jobs in
        the unknown state are only logged.
        """
        failed_list = []
        job_data = {"hpcs_action": "DIAGNOSTICS"}
        for ping in self._submit(node_list, job_data, self.poll_timeout):
            if ping.state == JOB_UNKNOWN:
                LOG.error(
                    "Gearman Job server failed during OFFLINE check of {0}".
                    format(ping.job.task))
            elif ping.timed_out:
                failed_list.append(ping.job.task)
            elif ping.result['network'] == 'FAIL':
                failed_list.append(ping.job.task)
            else:
                statuses = [
                    test['status'] for test in ping.result['gearman']]
                # Need 2/3rds gearman up.  Compared as integers so the
                # outcome does not depend on how ``/`` divides.
                if statuses.count('FAIL') * 3 > len(statuses):
                    failed_list.append(ping.job.task)
        return failed_list

    def _sort_stats(self, submitted_stats, unanswered_list, failed_list,
                    results, log_unknown=False):
        """Classify METRICS replies into the given collections.

        Unknown-state and timed-out jobs go to *unanswered_list*; unknown
        jobs are also logged when *log_unknown* is true.
        """
        for stats in submitted_stats:
            if stats.state == JOB_UNKNOWN:
                if log_unknown:
                    LOG.error("Gearman Job server failed gathering statistics "
                              "on {0}".format(stats.job.task))
                unanswered_list.append(stats.job.task)
            elif stats.timed_out:
                unanswered_list.append(stats.job.task)
            elif stats.result['hpcs_response'] == 'FAIL':
                # Error returned by Gearman
                failed_list.append(stats.job.task)
            else:
                results[stats.job.task] = stats.result

    def get_stats(self, node_list):
        """Gather METRICS from every node, retrying unanswered jobs once.

        :param node_list: nodes to query; ``str(node)`` is the task name
        :returns: ``(failed_list, results)`` where ``failed_list`` holds the
            tasks that replied FAIL or stayed unanswered after the retry,
            and ``results`` maps a task to its successful reply
        """
        failed_list = []
        retry_list = []
        results = {}
        job_data = {"hpcs_action": "METRICS"}
        self._sort_stats(
            self._submit(node_list, job_data, self.poll_timeout),
            retry_list, failed_list, results)

        if retry_list:
            LOG.info("{0} Statistics gathering timed out, retrying".format(
                len(retry_list)))
            # On the retry round an unanswered job is final, so it lands
            # directly in failed_list.
            self._sort_stats(
                self._submit(retry_list, job_data, self.poll_retry),
                failed_list, failed_list, results, log_unknown=True)

        return failed_list, results
Exemplo n.º 4
0
class GearJobs(object):
    """Run STATS, DIAGNOSTICS and METRICS jobs against nodes via Gearman.

    Poll timeouts are read from the ``admin_api`` config section:
    ``stats_poll_timeout`` for the first round and
    ``stats_poll_timeout_retry`` for the single retry round used by
    ``send_pings`` and ``get_stats``.
    """

    def __init__(self):
        self.poll_timeout = cfg.CONF['admin_api']['stats_poll_timeout']
        self.poll_retry = cfg.CONF['admin_api']['stats_poll_timeout_retry']

        # Every configured server shares the same SSL and keepalive settings.
        gearman_conf = cfg.CONF['gearman']
        server_list = []
        for server in gearman_conf['servers']:
            host, port = server.split(':')
            server_list.append({'host': host,
                                'port': int(port),
                                'keyfile': gearman_conf['ssl_key'],
                                'certfile': gearman_conf['ssl_cert'],
                                'ca_certs': gearman_conf['ssl_ca'],
                                'keepalive': gearman_conf['keepalive'],
                                'keepcnt': gearman_conf['keepcnt'],
                                'keepidle': gearman_conf['keepidle'],
                                'keepintvl': gearman_conf['keepintvl']
                                })
        self.gm_client = JSONGearmanClient(server_list)

    def _submit(self, node_list, job_data, poll_timeout):
        """Submit one blocking job per node and return the job requests."""
        list_of_jobs = [
            dict(task=str(node), data=job_data) for node in node_list
        ]
        return self.gm_client.submit_multiple_jobs(
            list_of_jobs, background=False, wait_until_complete=True,
            poll_timeout=poll_timeout
        )

    def _sort_pings(self, submitted_pings, timed_out_list, failed_list,
                    node_status):
        """Classify ping replies into the three given collections."""
        for ping in submitted_pings:
            if ping.state == JOB_UNKNOWN:
                # TODO: Gearman server failed, ignoring for now
                LOG.error('Gearman Job server fail')
            elif ping.timed_out:
                timed_out_list.append(ping.job.task)
            elif ping.result['hpcs_response'] == 'FAIL':
                deleted = (
                    'status' in ping.result and
                    ping.result['status'] == 'DELETED'
                )
                # A FAIL reply with status DELETED is not counted as failed.
                if not deleted:
                    # Error returned by Gearman
                    failed_list.append(ping.job.task)
            elif 'nodes' in ping.result:
                node_status[ping.job.task] = ping.result['nodes']

    def send_pings(self, node_list):
        """Ping every node, retrying timed-out pings once.

        :param node_list: nodes to ping; ``str(node)`` is the task name
        :returns: ``(failed_list, node_status)`` where ``failed_list`` holds
            the tasks that replied FAIL (unless the reply status is DELETED)
            or timed out twice, and ``node_status`` maps a task to the
            ``nodes`` entry of its successful reply
        """
        failed_list = []
        node_status = dict()
        retry_list = []
        # The message name is STATS for historical reasons. Real
        # data statistics are gathered with METRICS messages.
        job_data = {"hpcs_action": "STATS"}
        first_round = self._submit(node_list, job_data, self.poll_timeout)
        self._sort_pings(first_round, retry_list, failed_list, node_status)

        if retry_list:
            LOG.info(
                "{0} pings timed out, retrying".format(len(retry_list))
            )
            retry_round = self._submit(retry_list, job_data, self.poll_retry)
            # On the retry round a timeout is final, so it lands directly
            # in failed_list.
            self._sort_pings(
                retry_round, failed_list, failed_list, node_status
            )

        return failed_list, node_status

    def offline_check(self, node_list):
        """Run DIAGNOSTICS on every node and return the tasks that failed.

        A task fails when its job timed out, its ``network`` test is FAIL,
        or more than a third of its ``gearman`` tests are FAIL.  Jobs in
        the unknown state are only logged.
        """
        failed_list = []
        job_data = {"hpcs_action": "DIAGNOSTICS"}
        for ping in self._submit(node_list, job_data, self.poll_timeout):
            if ping.state == JOB_UNKNOWN:
                LOG.error(
                    "Gearman Job server failed during OFFLINE check of {0}".
                    format(ping.job.task)
                )
            elif ping.timed_out:
                failed_list.append(ping.job.task)
            elif ping.result['network'] == 'FAIL':
                failed_list.append(ping.job.task)
            else:
                statuses = [
                    test['status'] for test in ping.result['gearman']
                ]
                # Need 2/3rds gearman up.  Compared as integers so the
                # outcome does not depend on how ``/`` divides.
                if statuses.count('FAIL') * 3 > len(statuses):
                    failed_list.append(ping.job.task)
        return failed_list

    def _sort_stats(self, submitted_stats, unanswered_list, failed_list,
                    results, log_unknown=False):
        """Classify METRICS replies into the given collections.

        Unknown-state and timed-out jobs go to *unanswered_list*; unknown
        jobs are also logged when *log_unknown* is true.
        """
        for stats in submitted_stats:
            if stats.state == JOB_UNKNOWN:
                if log_unknown:
                    LOG.error(
                        "Gearman Job server failed gathering statistics "
                        "on {0}".format(stats.job.task)
                    )
                unanswered_list.append(stats.job.task)
            elif stats.timed_out:
                unanswered_list.append(stats.job.task)
            elif stats.result['hpcs_response'] == 'FAIL':
                # Error returned by Gearman
                failed_list.append(stats.job.task)
            else:
                results[stats.job.task] = stats.result

    def get_stats(self, node_list):
        """Gather METRICS from every node, retrying unanswered jobs once.

        :param node_list: nodes to query; ``str(node)`` is the task name
        :returns: ``(failed_list, results)`` where ``failed_list`` holds the
            tasks that replied FAIL or stayed unanswered after the retry,
            and ``results`` maps a task to its successful reply
        """
        failed_list = []
        retry_list = []
        results = {}
        job_data = {"hpcs_action": "METRICS"}
        first_round = self._submit(node_list, job_data, self.poll_timeout)
        self._sort_stats(first_round, retry_list, failed_list, results)

        if retry_list:
            LOG.info(
                "{0} Statistics gathering timed out, retrying".
                format(len(retry_list))
            )
            retry_round = self._submit(retry_list, job_data, self.poll_retry)
            # On the retry round an unanswered job is final, so it lands
            # directly in failed_list.
            self._sort_stats(
                retry_round, failed_list, failed_list, results,
                log_unknown=True
            )

        return failed_list, results
Exemplo n.º 5
0
class GearmanWork(object):
    """Submit pool-manager jobs through Gearman and record results in the DB.

    Every ``send_*`` method submits its jobs in the foreground, waits until
    they complete or time out, then walks the returned job statuses.
    """

    def __init__(self):
        # Every configured server shares the same SSL and keepalive settings.
        gearman_conf = cfg.CONF['gearman']
        server_list = []
        for server in gearman_conf['servers']:
            host, port = server.split(':')
            server_list.append({
                'host': host,
                'port': int(port),
                'keyfile': gearman_conf['ssl_key'],
                'certfile': gearman_conf['ssl_cert'],
                'ca_certs': gearman_conf['ssl_ca'],
                'keepalive': gearman_conf['keepalive'],
                'keepcnt': gearman_conf['keepcnt'],
                'keepidle': gearman_conf['keepidle'],
                'keepintvl': gearman_conf['keepintvl']
            })
        self.gearman_client = JSONGearmanClient(server_list)

    def send_delete_message(self, message):
        """Submit delete jobs and remove the answered devices from the DB.

        :param message: job descriptions, handed unchanged to
            ``submit_multiple_jobs``

        A device is removed from the DB even when the reply is FAIL.  Jobs
        in the unknown state or that timed out leave the DB untouched.
        """
        LOG.info("Sending %d gearman messages", len(message))
        job_status = self.gearman_client.submit_multiple_jobs(
            message,
            background=False,
            wait_until_complete=True,
            max_retries=10,
            poll_timeout=30.0)
        delete_count = 0
        for status in job_status:
            if status.state == JOB_UNKNOWN:
                LOG.error('Gearman Job server fail')
                continue
            if status.timed_out:
                LOG.error('Gearman timeout whilst deleting device')
                continue
            if status.result['response'] == 'FAIL':
                LOG.error(
                    'Pool manager failed to delete a device, removing from DB')
            if 'name' not in status.result:
                # send_create_message guards the same lookup on FAIL replies;
                # without a name there is no row to match, so skip this reply
                # instead of aborting the whole batch with a KeyError.
                LOG.error('Gearman delete reply has no device name, '
                          'nothing removed from DB')
                continue

            delete_count += 1
            with db_session() as session:
                matching = session.query(Device).filter(
                    Device.name == status.result['name'])
                matching.delete()
                session.commit()

        LOG.info('%d freed devices delete from pool', delete_count)

    def send_vips_message(self, message):
        """Submit vip-build jobs and store every successfully built vip.

        :param message: job descriptions, handed unchanged to
            ``submit_multiple_jobs``
        """
        # TODO: make this gearman part more async, not wait for all builds
        LOG.info("Sending %d gearman messages", len(message))
        job_status = self.gearman_client.submit_multiple_jobs(
            message,
            background=False,
            wait_until_complete=True,
            max_retries=10,
            poll_timeout=3600.0)
        built_count = 0
        for status in job_status:
            if status.state == JOB_UNKNOWN:
                LOG.error('Gearman Job server fail')
                continue
            if status.timed_out:
                LOG.error('Gearman timeout whilst building vip')
                continue
            if status.result['response'] == 'FAIL':
                LOG.error('Pool manager failed to build a vip')
                continue

            try:
                self._add_vip(status.result)
            except Exception:
                # Best effort: one bad row must not stop the rest, but do not
                # swallow KeyboardInterrupt/SystemExit like a bare except.
                LOG.exception('Could not add vip to DB, node data: %s',
                              status.result)
            else:
                # Count only vips that really reached the DB.
                built_count += 1
        LOG.info('%d vips built and added to pool', built_count)

    def send_create_message(self, message):
        """Submit device-build jobs and store the resulting devices.

        :param message: job descriptions, handed unchanged to
            ``submit_multiple_jobs``

        A FAIL reply that still names a device is stored with status
        'DELETED'; successful builds are stored with status 'OFFLINE'.
        """
        # TODO: make this gearman part more async, not wait for all builds
        LOG.info("Sending %d gearman messages", len(message))
        job_status = self.gearman_client.submit_multiple_jobs(
            message,
            background=False,
            wait_until_complete=True,
            max_retries=10,
            poll_timeout=3600.0)
        built_count = 0
        for status in job_status:
            if status.state == JOB_UNKNOWN:
                LOG.error('Gearman Job server fail')
                continue
            if status.timed_out:
                LOG.error('Gearman timeout whilst building device')
                continue
            if status.result['response'] == 'FAIL':
                LOG.error('Pool manager failed to build a device')
                if 'name' in status.result:
                    self._add_bad_node(status.result)
                continue

            try:
                self._add_node(status.result)
            except Exception:
                # Best effort: one bad row must not stop the rest, but do not
                # swallow KeyboardInterrupt/SystemExit like a bare except.
                LOG.exception('Could not add node to DB, node data: %s',
                              status.result)
            else:
                # Count only devices that really reached the DB.
                built_count += 1
        LOG.info('%d devices built and added to pool', built_count)

    def _add_vip(self, data):
        """Store the vip whose dotted address is ``data['ip']`` as an int."""
        LOG.info('Adding vip %s to DB', data['ip'])
        vip = Vip()
        # ipaddress wants text input, hence the unicode() conversion.
        vip.ip = int(ipaddress.IPv4Address(unicode(data['ip'])))
        with db_session() as session:
            session.add(vip)
            session.commit()

    def _add_node(self, data):
        """Store a freshly built device with status 'OFFLINE'."""
        LOG.info('Adding device %s to DB', data['name'])
        self._save_device(data, 'OFFLINE')

    def _add_bad_node(self, data):
        """Store a failed device with status 'DELETED'."""
        LOG.info('Adding bad device %s to DB to be deleted', data['name'])
        self._save_device(data, 'DELETED')

    def _save_device(self, data, status):
        """Insert a Device row built from reply *data* with *status*."""
        device = Device()
        device.name = data['name']
        device.publicIpAddr = data['addr']
        # TODO: kill this field, make things use publicIpAddr instead
        device.floatingIpAddr = data['addr']
        device.az = data['az']
        device.type = data['type']
        device.pingCount = 0
        device.status = status
        device.created = None
        with db_session() as session:
            session.add(device)
            session.commit()