Example #1
    def parse(self):
        global CONFIG_DEFAULTS

        node_name = socket.getfqdn()
        try:
            node_ip = socket.gethostbyname(node_name)
        except Exception as e:
            # Only for localhost development environments
            node_ip = '127.0.0.1'
            util.ignore_exception('config parser', e)

        CONFIG_DEFAULTS['NODE_NAME'] = node_name
        CONFIG_DEFAULTS['NODE_IP'] = node_ip
        CONFIG_DEFAULTS['NETWORK_NODE_IP'] = node_ip

        self.config = copy.copy(CONFIG_DEFAULTS)
        for var in os.environ:
            if var.startswith('SHAKENFIST_'):
                flag = var.replace('SHAKENFIST_', '')
                value = os.environ[var]

                # We use the type of the default value to determine
                # what type to coerce the value provided by the
                # environment variable into.
                if isinstance(CONFIG_DEFAULTS[flag], int):
                    value = int(value)
                elif isinstance(CONFIG_DEFAULTS[flag], float):
                    value = float(value)
                elif not isinstance(CONFIG_DEFAULTS[flag], str):
                    raise exceptions.FlagException(
                        'Flag %s has unknown type.' % flag)

                self.config[flag] = value
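
Every example on this page routes unexpected failures through util.ignore_exception(context, exception), whose definition is not shown here. A minimal sketch of such a log-and-continue helper, assuming it only records the exception and its traceback (the logger setup and message format below are illustrative, not the project's actual code):

import logging
import traceback

LOG = logging.getLogger(__name__)


def ignore_exception(processname, e):
    # Log the exception and its traceback, then carry on, so that a single
    # failed iteration does not kill a long-running daemon loop.
    LOG.error('[%s] Ignoring exception: %s' % (processname, e))
    LOG.error(traceback.format_exc())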
Example #2
    def delete(self):
        with util.RecordedOperation('delete domain', self):
            try:
                self.power_off()

                instance = self._get_domain()
                if instance:
                    instance.undefine()
            except Exception as e:
                util.ignore_exception('instance delete', e)

        with util.RecordedOperation('delete disks', self):
            try:
                if os.path.exists(self.instance_path):
                    shutil.rmtree(self.instance_path)
            except Exception as e:
                util.ignore_exception('instance delete', e)

        with util.RecordedOperation('release network addresses', self):
            for ni in db.get_instance_interfaces(self.db_entry['uuid']):
                db.update_network_interface_state(ni['uuid'], 'deleted')
                with db.get_lock('ipmanager',
                                 None,
                                 ni['network_uuid'],
                                 ttl=120,
                                 op='Instance delete'):
                    ipm = db.get_ipmanager(ni['network_uuid'])
                    ipm.release(ni['ipv4'])
                    db.persist_ipmanager(ni['network_uuid'], ipm.save())

        db.free_console_port(self.db_entry['console_port'])
        db.free_console_port(self.db_entry['vdi_port'])
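
Example #2 wraps each phase of the teardown in util.RecordedOperation, used as a context manager around a named operation on an object. Its implementation is also not shown on this page; a minimal sketch with the same shape, assuming it simply logs and times the operation (the log format is an assumption):

import logging
import time

LOG = logging.getLogger(__name__)


class RecordedOperation(object):
    # Context manager that logs the start, finish and duration of a named
    # operation, optionally tied to an object such as an instance.
    def __init__(self, operation, obj):
        self.operation = operation
        self.obj = obj

    def __enter__(self):
        self.start_time = time.time()
        LOG.info('%s: start %s' % (self.obj, self.operation))
        return self

    def __exit__(self, *args):
        LOG.info('%s: finish %s after %.02f seconds'
                 % (self.obj, self.operation, time.time() - self.start_time))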
Example #3
    def run(self):
        workers = []
        LOG.info('Starting Queues')

        libvirt = util.get_libvirt()
        conn = libvirt.open(None)
        present_cpus, _, _ = conn.getCPUMap()

        while True:
            try:
                for w in copy.copy(workers):
                    if not w.is_alive():
                        w.join(1)
                        workers.remove(w)

                if len(workers) < present_cpus / 2:
                    jobname, workitem = db.dequeue(config.NODE_NAME)
                else:
                    workitem = None

                if not workitem:
                    time.sleep(0.2)
                    continue

                p = multiprocessing.Process(
                    target=handle, args=(jobname, workitem,),
                    name='%s-worker' % daemon.process_name('queues'))
                p.start()
                workers.append(p)

            except Exception as e:
                util.ignore_exception(daemon.process_name('queues'), e)
Example #4
    def _compact_etcd(self):
        try:
            # We need to determine what revision to compact to, so we keep a
            # key which stores when we last compacted, and we use its latest
            # revision number as the revision to compact to. Note that we use
            # a different library for compaction as our primary library does
            # not support it.
            c = etcd3.client()
            c.put('/sf/compact', json.dumps({'compacted_at': time.time()}))
            _, kv = c.get('/sf/compact')
            c.compact(kv.mod_revision, physical=True)
            c.defragment()
            LOG.info('Compacted etcd')

        except Exception as e:
            util.ignore_exception('etcd compaction', e)
Example #5
def restore_instances():
    # Ensure all instances for this node are defined
    networks = []
    instances = []
    for inst in list(
            db.get_instances(only_node=config.parsed.get('NODE_NAME'))):
        for iface in db.get_instance_interfaces(inst['uuid']):
            if not iface['network_uuid'] in networks:
                networks.append(iface['network_uuid'])
        instances.append(inst['uuid'])

    with util.RecordedOperation('restore networks', None):
        for network in networks:
            try:
                n = net.from_db(network)
                LOG.withObj(n).info('Restoring network')
                n.create()
                n.ensure_mesh()
                n.update_dhcp()
            except Exception as e:
                util.ignore_exception('restore network %s' % network, e)

    with util.RecordedOperation('restore instances', None):
        for instance in instances:
            try:
                with db.get_lock('instance',
                                 None,
                                 instance,
                                 ttl=120,
                                 timeout=120,
                                 op='Instance restore'):
                    i = virt.from_db(instance)
                    if not i:
                        continue
                    started = ['on', 'transition-to-on', 'initial', 'unknown']
                    if i.db_entry.get('power_state', 'unknown') not in started:
                        continue

                    LOG.withObj(i).info('Restoring instance')
                    i.create()
            except Exception as e:
                util.ignore_exception('restore instance %s' % instance, e)
                db.enqueue_instance_error(
                    instance,
                    'exception while restoring instance on daemon restart')
Example #6
    def run(self):
        logutil.info(None, 'Starting')
        gauges = {
            'updated_at': Gauge('updated_at', 'The last time metrics were updated')
        }

        last_metrics = 0

        def update_metrics():
            # last_metrics is defined in the enclosing run() scope, not at
            # module level, so nonlocal is the correct declaration here.
            nonlocal last_metrics

            stats = _get_stats()
            for metric in stats:
                if metric not in gauges:
                    gauges[metric] = Gauge(metric, '')
                gauges[metric].set(stats[metric])

            db.update_metrics_bulk(stats)
            logutil.info(None, 'Updated metrics')
            gauges['updated_at'].set_to_current_time()

        while True:
            try:
                jobname, _ = db.dequeue(
                    '%s-metrics' % config.parsed.get('NODE_NAME'))
                if jobname:
                    if time.time() - last_metrics > 2:
                        update_metrics()
                        last_metrics = time.time()
                    db.resolve('%s-metrics' % config.parsed.get('NODE_NAME'),
                               jobname)
                else:
                    time.sleep(0.2)

                if time.time() - last_metrics > config.parsed.get('SCHEDULER_CACHE_TIMEOUT'):
                    update_metrics()
                    last_metrics = time.time()

            except Exception as e:
                util.ignore_exception('resource statistics', e)
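
Example #6 fills prometheus_client Gauges but does not show where they are exposed; an HTTP scrape endpoint has to be started before the loop runs. A minimal sketch using the standard prometheus_client exporter (the port number is an arbitrary choice for illustration):

from prometheus_client import Gauge, start_http_server

# Serve all registered metrics at http://<node>:<port>/metrics so Prometheus
# can scrape them, then update a gauge the same way the daemon does.
start_http_server(13002)

updated_at = Gauge('updated_at', 'The last time metrics were updated')
updated_at.set_to_current_time()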
Example #7
def _get_stats():
    libvirt = util.get_libvirt()
    retval = {}
    conn = libvirt.open(None)

    # CPU info
    present_cpus, _, available_cpus = conn.getCPUMap()
    retval.update({
        'cpu_max': present_cpus,
        'cpu_available': available_cpus,
    })

    retval['cpu_max_per_instance'] = conn.getMaxVcpus(None)

    # This is disabled because we don't currently use this data.
    # for i in range(present_cpus):
    #    per_cpu_stats = conn.getCPUStats(i)
    #    for key in per_cpu_stats:
    #        retval['cpu_core%d_%s' % (i, key)] = per_cpu_stats[key]

    try:
        load_1, load_5, load_15 = psutil.getloadavg()
        retval.update({
            'cpu_load_1': load_1,
            'cpu_load_5': load_5,
            'cpu_load_15': load_15,
        })
    except Exception as e:
        util.ignore_exception('load average', e)

    # System memory info, converting bytes to MB
    stats = psutil.virtual_memory()
    retval.update({
        'memory_max': stats.total // 1024 // 1024,
        'memory_available': stats.available // 1024 // 1024
    })

    # libvirt memory info, converting KB to MB
    memory_status = conn.getMemoryStats(
        libvirt.VIR_NODE_MEMORY_STATS_ALL_CELLS)
    retval.update({
        'memory_max_libvirt': memory_status['total'] // 1024,
        'memory_available_libvirt': memory_status['free'] // 1024,
    })

    # Kernel Shared Memory (KSM) information
    ksm_details = {}
    for ent in os.listdir('/sys/kernel/mm/ksm'):
        with open('/sys/kernel/mm/ksm/%s' % ent) as f:
            ksm_details['memory_ksm_%s' % ent] = int(f.read().rstrip())
    retval.update(ksm_details)

    # Disk info
    s = os.statvfs(config.get('STORAGE_PATH'))
    disk_counters = psutil.disk_io_counters()
    retval.update({
        'disk_total': s.f_frsize * s.f_blocks,
        'disk_free': s.f_frsize * s.f_bavail,
        'disk_used': s.f_frsize * (s.f_blocks - s.f_bfree),
        'disk_read_bytes': disk_counters.read_bytes,
        'disk_write_bytes': disk_counters.write_bytes,
    })

    # Network info
    net_counters = psutil.net_io_counters()
    retval.update({
        'network_read_bytes': net_counters.bytes_recv,
        'network_write_bytes': net_counters.bytes_sent,
    })

    # Virtual machine consumption info
    total_instances = 0
    total_active_instances = 0
    total_instance_max_memory = 0
    total_instance_actual_memory = 0
    total_instance_vcpus = 0
    total_instance_cpu_time = 0

    for guest in conn.listAllDomains():
        try:
            active = guest.isActive() == 1
            if active:
                _, maxmem, mem, cpus, cpu_time = guest.info()

        except libvirt.libvirtError as e:
            LOG.debug('During resource calc ignored libvirt error: %s' % e)
            active = False

        if active:
            total_instances += 1
            total_active_instances += 1
            total_instance_max_memory += maxmem
            total_instance_actual_memory += mem
            total_instance_vcpus += cpus
            total_instance_cpu_time += cpu_time

    # Queue health statistics
    node_queue_processing, node_queue_waiting = db.get_queue_length(
        config.NODE_NAME)

    retval.update({
        'cpu_total_instance_vcpus': total_instance_vcpus,
        'cpu_total_instance_cpu_time': total_instance_cpu_time,
        'memory_total_instance_max': total_instance_max_memory // 1024,
        'memory_total_instance_actual': total_instance_actual_memory // 1024,
        'instances_total': total_instances,
        'instances_active': total_active_instances,
        'node_queue_processing': node_queue_processing,
        'node_queue_waiting': node_queue_waiting,
    })

    if util.is_network_node():
        network_queue_processing, network_queue_waiting = db.get_queue_length(
            'networknode')

        retval.update({
            'network_queue_processing': network_queue_processing,
            'network_queue_waiting': network_queue_waiting,
        })

    return retval
Example #8
def handle(jobname, workitem):
    log = LOG.withField('workitem', jobname)
    log.info('Processing workitem')

    setproctitle.setproctitle('%s-%s' %
                              (daemon.process_name('queues'), jobname))

    instance_uuid = None
    task = None
    try:
        for task in workitem.get('tasks', []):
            if not QueueTask.__subclasscheck__(type(task)):
                raise exceptions.UnknownTaskException(
                    'Task was not decoded: %s' % task)

            if (InstanceTask.__subclasscheck__(type(task))
                    or isinstance(task, FetchImageTask)):
                instance_uuid = task.instance_uuid()

            if instance_uuid:
                log_i = log.withInstance(instance_uuid)
            else:
                log_i = log

            log_i.withField('task_name', task.name()).info('Starting task')

            # TODO(andy) Should network events also come through here eventually?
            # Then this can be generalised to record events on networks/instances

            # TODO(andy) This event should be recorded when it is recorded as
            # dequeued in the DB. Currently it's reporting action on the item
            # and calling it 'dequeue'.

            if instance_uuid:
                # TODO(andy) move to QueueTask
                db.add_event('instance', instance_uuid,
                             task.pretty_task_name(), 'dequeued', None,
                             'Work item %s' % jobname)

            if isinstance(task, FetchImageTask):
                image_fetch(task.url(), instance_uuid)

            elif isinstance(task, PreflightInstanceTask):
                redirect_to = instance_preflight(instance_uuid, task.network())
                if redirect_to:
                    log_i.info('Redirecting instance start to %s' %
                               redirect_to)
                    db.place_instance(instance_uuid, redirect_to)
                    db.enqueue(redirect_to, workitem)
                    return

            elif isinstance(task, StartInstanceTask):
                instance_start(instance_uuid, task.network())
                db.update_instance_state(instance_uuid, 'created')
                db.enqueue('%s-metrics' % config.NODE_NAME, {})

            elif isinstance(task, DeleteInstanceTask):
                try:
                    instance_delete(instance_uuid)
                    db.update_instance_state(instance_uuid, 'deleted')
                except Exception as e:
                    util.ignore_exception(daemon.process_name('queues'), e)

            elif isinstance(task, ErrorInstanceTask):
                try:
                    instance_delete(instance_uuid)
                    db.update_instance_state(instance_uuid, 'error')

                    if task.error_msg():
                        db.update_instance_error_message(
                            instance_uuid, task.error_msg())
                    db.enqueue('%s-metrics' % config.NODE_NAME, {})
                except Exception as e:
                    util.ignore_exception(daemon.process_name('queues'), e)

            else:
                log_i.withField('task', task).error('Unhandled task - dropped')

            log_i.info('Task complete')

    except exceptions.ImageFetchTaskFailedException as e:
        # Usually caused by external issue and not an application error
        log.info('Fetch Image Error: %s', e)
        if instance_uuid:
            db.enqueue_instance_error(instance_uuid,
                                      'failed queue task: %s' % e)

    except Exception as e:
        util.ignore_exception(daemon.process_name('queues'), e)
        if instance_uuid:
            db.enqueue_instance_error(instance_uuid,
                                      'failed queue task: %s' % e)

    finally:
        db.resolve(config.NODE_NAME, jobname)
        if instance_uuid:
            db.add_event('instance', instance_uuid, 'tasks complete',
                         'dequeued', None, 'Work item %s' % jobname)
        log.info('Completed workitem')
Example #9
def handle(jobname, workitem):
    j = JobName(jobname)
    logutil.info([j], 'Processing workitem')
    setproctitle.setproctitle('%s-%s' %
                              (daemon.process_name('queues'), jobname))

    instance_uuid = None
    task = None
    try:
        for task in workitem.get('tasks', []):
            ro = [j]
            instance_uuid = task.get('instance_uuid')
            if instance_uuid:
                i = virt.from_db(instance_uuid)
                ro.append(i)

            if task.get('type').startswith('instance_') and not instance_uuid:
                logutil.error(ro, 'Instance task lacks instance uuid')
                return

            if instance_uuid:
                db.add_event('instance', instance_uuid,
                             task.get('type').replace('_', ' '), 'dequeued',
                             None, 'Work item %s' % jobname)

            logutil.info(
                ro,
                'Executing task %s: %s' % (task.get('type', 'unknown'), task))
            if task.get('type') == 'image_fetch':
                image_fetch(task.get('url'), instance_uuid)

            if task.get('type') == 'instance_preflight':
                redirect_to = instance_preflight(instance_uuid,
                                                 task.get('network'))
                if redirect_to:
                    logutil.info(
                        ro, 'Redirecting instance start to %s' % redirect_to)
                    db.place_instance(instance_uuid, redirect_to)
                    db.enqueue(redirect_to, workitem)
                    return

            if task.get('type') == 'instance_start':
                instance_start(instance_uuid, task.get('network'))
                db.update_instance_state(instance_uuid, 'created')
                db.enqueue('%s-metrics' % config.parsed.get('NODE_NAME'), {})

            if task.get('type') == 'instance_delete':
                try:
                    instance_delete(instance_uuid)
                    db.update_instance_state(instance_uuid,
                                             task.get('next_state', 'unknown'))
                    if task.get('next_state_message'):
                        db.update_instance_error_message(
                            instance_uuid, task.get('next_state_message'))
                    db.enqueue('%s-metrics' % config.parsed.get('NODE_NAME'),
                               {})
                except Exception as e:
                    util.ignore_exception(daemon.process_name('queues'), e)

    except Exception as e:
        if instance_uuid:
            util.ignore_exception(daemon.process_name('queues'), e)
            db.enqueue_instance_delete(config.parsed.get('NODE_NAME'),
                                       instance_uuid, 'error',
                                       'failed queue task: %s' % e)

    finally:
        db.resolve(config.parsed.get('NODE_NAME'), jobname)
        if instance_uuid:
            db.add_event('instance', instance_uuid, 'tasks complete',
                         'dequeued', None, 'Work item %s' % jobname)
        logutil.info([j], 'Completed workitem')