Example no. 1
    def is_powered_on(self):
        inst = self._get_domain()
        if not inst:
            return 'off'

        libvirt = util_libvirt.get_libvirt()
        return util_libvirt.extract_power_state(libvirt, inst)
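Every example here obtains the libvirt bindings through util_libvirt.get_libvirt(), and several convert a domain's state with util_libvirt.extract_power_state(). Neither helper is shown in these snippets. The sketch below is an assumption inferred only from how the examples use them: a deferred import of the libvirt bindings, and a mapping from libvirt domain state constants onto the strings the examples compare against ('on', 'off', 'paused', 'crashed').

def get_libvirt():
    # Assumed behaviour: defer importing the libvirt bindings until needed,
    # so modules import cleanly on nodes without libvirt-python installed.
    import libvirt
    return libvirt


def extract_power_state(libvirt, domain):
    # Assumed mapping; the real helper may differ in detail.
    state, _ = domain.state()
    if state in (libvirt.VIR_DOMAIN_PAUSED, libvirt.VIR_DOMAIN_PMSUSPENDED):
        return 'paused'
    if state == libvirt.VIR_DOMAIN_CRASHED:
        return 'crashed'
    if state in (libvirt.VIR_DOMAIN_RUNNING, libvirt.VIR_DOMAIN_BLOCKED):
        return 'on'
    return 'off'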
Example no. 2
    def unpause(self):
        libvirt = util_libvirt.get_libvirt()
        inst = self._get_domain()
        inst.resume()
        self.update_power_state(util_libvirt.extract_power_state(
            libvirt, inst))
        self.add_event('unpause', 'complete')
Example no. 3
    def __init__(self, name):
        super(WorkerPoolDaemon, self).__init__(name)
        self.workers = []

        libvirt = util_libvirt.get_libvirt()
        conn = libvirt.open('qemu:///system')
        # getCPUMap() returns (present CPUs, per-CPU online map, online count);
        # only the number of present CPUs is needed here.
        self.present_cpus, _, _ = conn.getCPUMap()
Example no. 4
    def _get_domain(self):
        libvirt = util_libvirt.get_libvirt()
        conn = libvirt.open('qemu:///system')
        try:
            return conn.lookupByName('sf:' + self.uuid)

        except libvirt.libvirtError:
            return None
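The 'sf:' prefix in Example no. 4 is the naming convention these examples use to mark their own libvirt domains; Example no. 9 relies on the same prefix when scanning all domains on the hypervisor. As a hypothetical illustration (this helper does not appear in the examples), the convention makes it easy to enumerate only the managed instances:

def list_managed_instance_uuids(conn):
    # Hypothetical helper: return the UUID portion of every 'sf:<uuid>'
    # domain known to this libvirt connection.
    uuids = []
    for domain in conn.listAllDomains():
        name = domain.name()
        if name.startswith('sf:'):
            uuids.append(name.split(':', 1)[1])
    return uuids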
Example no. 5
    def reboot(self, hard=False):
        libvirt = util_libvirt.get_libvirt()
        inst = self._get_domain()
        if not hard:
            inst.reboot(flags=libvirt.VIR_DOMAIN_REBOOT_ACPI_POWER_BTN)
        else:
            inst.reset()
        self.add_event('reboot', 'complete')
Example no. 6
    def power_off(self):
        libvirt = util_libvirt.get_libvirt()
        inst = self._get_domain()
        if not inst:
            return

        try:
            inst.destroy()
        except libvirt.libvirtError as e:
            if not str(e).startswith('Requested operation is not valid: '
                                     'domain is not running'):
                self.log.error('Failed to delete domain: %s', e)

        self.update_power_state('off')
        self.add_event('poweroff', 'complete')
Example no. 7
    def run(self):
        LOG.info('Starting')

        libvirt = util_libvirt.get_libvirt()
        conn = libvirt.open('qemu:///system')
        present_cpus, _, _ = conn.getCPUMap()

        os.makedirs('/var/run/sf', exist_ok=True)
        util_process.execute(None, (config.API_COMMAND_LINE % {
            'port': config.API_PORT,
            'timeout': config.API_TIMEOUT,
            'name': daemon.process_name('api'),
            'workers': present_cpus * 4
        }),
                             env_variables=os.environ,
                             check_exit_code=[0, 1, -15])
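config.API_COMMAND_LINE is not shown in these examples; it is used as an old-style %-format template keyed on port, timeout, name and workers, with the worker count scaled to four per present CPU. A hypothetical template with the same keys expands like this (the command below is an illustration, not the real configuration value):

api_command_line = (
    'gunicorn --workers %(workers)d --bind 0.0.0.0:%(port)d '
    '--timeout %(timeout)d --name %(name)s api.wsgi:app')

print(api_command_line % {
    'port': 13000,
    'timeout': 300,
    'name': 'sf-api',
    'workers': 4 * 4,  # four workers per present CPU, as in Example no. 7
})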
Example no. 8
    def power_on(self):
        libvirt = util_libvirt.get_libvirt()
        inst = self._get_domain()
        if not inst:
            conn = libvirt.open('qemu:///system')
            inst = conn.defineXML(self._create_domain_xml())
            if not inst:
                self.enqueue_delete_due_error(
                    'power on failed to create domain')
                raise exceptions.NoDomainException()

        try:
            inst.create()
        except libvirt.libvirtError as e:
            if str(e).startswith('Requested operation is not valid: '
                                 'domain is already running'):
                pass
            elif str(e).find('Failed to find an available port: '
                             'Address already in use') != -1:
                self.log.warning('Instance ports clash: %s', e)

                # Free those ports and pick some new ones
                ports = self.ports
                self._free_console_port(ports['console_port'])
                self._free_console_port(ports['vdi_port'])

                # We need to delete the nvram file before we can undefine
                # the domain. This will be recreated by libvirt on the next
                # attempt.
                nvram_path = os.path.join(self.instance_path, 'nvram')
                if os.path.exists(nvram_path):
                    os.unlink(nvram_path)

                inst.undefine()

                self.ports = None
                self.allocate_instance_ports()
                return False
            else:
                self.log.warning('Instance start error: %s', e)
                return False

        inst.setAutostart(1)
        self.update_power_state(util_libvirt.extract_power_state(
            libvirt, inst))
        self.add_event('poweron', 'complete')
        return True
Example no. 9
    def _update_power_states(self):
        libvirt = util_libvirt.get_libvirt()
        conn = libvirt.open('qemu:///system')
        try:
            seen = []

            # Active VMs have an ID. Active means running in libvirt
            # land.
            for domain_id in conn.listDomainsID():
                domain = conn.lookupByID(domain_id)
                if not domain.name().startswith('sf:'):
                    continue

                instance_uuid = domain.name().split(':')[1]
                log_ctx = LOG.with_instance(instance_uuid)

                inst = instance.Instance.from_db(instance_uuid)
                if not inst:
                    # Instance is SF but not in database. Kill to reduce load.
                    log_ctx.warning('Destroying unknown instance')
                    self._delete_instance_files(instance_uuid)
                    util_process.execute(
                        None, 'virsh destroy "sf:%s"' % instance_uuid)
                    util_process.execute(
                        None, 'virsh undefine --nvram "sf:%s"' % instance_uuid)
                    continue

                inst.place_instance(config.NODE_NAME)
                seen.append(domain.name())

                db_state = inst.state
                if db_state.value == dbo.STATE_DELETED:
                    # NOTE(mikal): a delete might be in-flight in the queue.
                    # We only worry about instances which should have gone
                    # away five minutes ago.
                    if time.time() - db_state.update_time < 300:
                        continue

                    inst.enforced_deletes_increment()
                    attempts = inst._db_get_attribute(
                        'enforced_deletes')['count']

                    if attempts > 5:
                        # Sometimes we just can't delete the VM. Try the big
                        # hammer instead.
                        log_ctx.warning(
                            'Attempting alternate delete method for instance')
                        self._delete_instance_files(instance_uuid)
                        util_process.execute(
                            None,
                            'virsh undefine --nvram "sf:%s"' % instance_uuid)
                        inst.add_event('enforced delete', 'complete')
                    else:
                        inst.delete()

                    log_ctx.with_field(
                        'attempt', attempts).warning('Deleting stray instance')
                    continue

                state = util_libvirt.extract_power_state(libvirt, domain)
                inst.update_power_state(state)
                if state == 'crashed':
                    if inst.state.value in [
                            dbo.STATE_DELETE_WAIT, dbo.STATE_DELETED
                    ]:
                        util_process.execute(
                            None,
                            'virsh undefine --nvram "sf:%s"' % instance_uuid)
                        inst.state.value = dbo.STATE_DELETED
                    else:
                        inst.state = inst.state.value + '-error'

            # Inactive VMs just have a name, and are powered off
            # in our state system.
            for domain_name in conn.listDefinedDomains():
                if not domain_name.startswith('sf:'):
                    continue

                if domain_name not in seen:
                    instance_uuid = domain_name.split(':')[1]
                    log_ctx = LOG.with_instance(instance_uuid)
                    inst = instance.Instance.from_db(instance_uuid)

                    if not inst:
                        # Instance is SF but not in database. Kill because
                        # unknown.
                        log_ctx.warning('Removing unknown inactive instance')
                        self._delete_instance_files(instance_uuid)
                        try:
                            domain = conn.lookupByName(domain_name)
                            # TODO(mikal): work out if we can pass
                            # VIR_DOMAIN_UNDEFINE_NVRAM with virDomainUndefineFlags()
                            domain.undefine()
                        except libvirt.libvirtError:
                            util_process.execute(
                                None, 'virsh undefine --nvram "sf:%s"' %
                                instance_uuid)
                        continue

                    db_state = inst.state
                    if db_state.value in [
                            dbo.STATE_DELETE_WAIT, dbo.STATE_DELETED
                    ]:
                        # NOTE(mikal): a delete might be in-flight in the queue.
                        # We only worry about instances which should have gone
                        # away five minutes ago.
                        if time.time() - db_state.update_time < 300:
                            continue

                        log_ctx.info('Detected stray instance')
                        self._delete_instance_files(instance_uuid)
                        try:
                            domain = conn.lookupByName(domain_name)
                            # TODO(mikal): work out if we can pass
                            # VIR_DOMAIN_UNDEFINE_NVRAM with virDomainUndefineFlags()
                            domain.undefine()
                        except libvirt.libvirtError:
                            util_process.execute(
                                None, 'virsh undefine --nvram "sf:%s"' %
                                instance_uuid)

                        inst.add_event('deleted stray', 'complete')
                        if db_state.value != dbo.STATE_DELETED:
                            inst.state.value = dbo.STATE_DELETED
                        continue

                    inst.place_instance(config.NODE_NAME)

                    db_power = inst.power_state
                    if not os.path.exists(inst.instance_path):
                        # If we're inactive and our files aren't on disk,
                        # we have a problem.
                        log_ctx.info('Detected error state for instance')
                        if inst.state.value in [
                                dbo.STATE_DELETE_WAIT, dbo.STATE_DELETED
                        ]:
                            inst.state.value = dbo.STATE_DELETED
                        else:
                            inst.state = inst.state.value + '-error'

                    elif not db_power or db_power['power_state'] != 'off':
                        log_ctx.info('Detected power off for instance')
                        inst.update_power_state('off')
                        inst.add_event('detected poweroff', 'complete')

        except libvirt.libvirtError as e:
            LOG.debug('Failed to lookup all domains: %s' % e)
Example no. 10
def handle(jobname, workitem):
    libvirt = util_libvirt.get_libvirt()

    log = LOG.with_field('workitem', jobname)
    log.info('Processing workitem')

    setproctitle.setproctitle('%s-%s' %
                              (daemon.process_name('queues'), jobname))

    inst = None
    task = None
    try:
        for task in workitem.get('tasks', []):
            if not QueueTask.__subclasscheck__(type(task)):
                raise exceptions.UnknownTaskException(
                    'Task was not decoded: %s' % task)

            if InstanceTask.__subclasscheck__(type(task)):
                inst = instance.Instance.from_db(task.instance_uuid())
                if not inst:
                    raise exceptions.InstanceNotInDBException(
                        task.instance_uuid())

            if isinstance(task, FetchImageTask):
                inst = instance.Instance.from_db(task.instance_uuid())

            if isinstance(task, SnapshotTask):
                inst = instance.Instance.from_db(task.instance_uuid())

            if inst:
                log_i = log.with_instance(inst)
            else:
                log_i = log

            log_i.with_field('task_name', task.name()).info('Starting task')

            # TODO(andy) Should network events also come through here eventually?
            # Then this can be generalised to record events on networks/instances

            # TODO(andy) This event should be recorded when it is recorded as
            # dequeued in the DB. Currently it's reporting action on the item
            # and calling it 'dequeue'.

            if inst:
                # TODO(andy) move to QueueTask
                db.add_event('instance', inst.uuid, task.pretty_task_name(),
                             'dequeued', None, 'Work item %s' % jobname)

            if isinstance(task, FetchImageTask):
                image_fetch(task.url(), inst)

            elif isinstance(task, PreflightInstanceTask):
                if (inst.state.value == dbo.STATE_DELETED
                        or inst.state.value.endswith('-error')):
                    log_i.warning(
                        'You cannot preflight an instance in state %s, skipping task'
                        % inst.state.value)
                    continue

                redirect_to = instance_preflight(inst, task.network())
                if redirect_to:
                    log_i.info('Redirecting instance start to %s' %
                               redirect_to)
                    etcd.enqueue(redirect_to, workitem)
                    return

            elif isinstance(task, StartInstanceTask):
                if (inst.state.value == dbo.STATE_DELETED
                        or inst.state.value.endswith('-error')):
                    log_i.warning(
                        'You cannot start an instance in state %s, skipping task'
                        % inst.state.value)
                    continue

                instance_start(inst, task.network())
                etcd.enqueue('%s-metrics' % config.NODE_NAME, {})

            elif isinstance(task, DeleteInstanceTask):
                try:
                    instance_delete(inst)
                    etcd.enqueue('%s-metrics' % config.NODE_NAME, {})
                except Exception as e:
                    util_general.ignore_exception(
                        'instance %s delete task' % inst, e)

            elif isinstance(task, FloatNetworkInterfaceTask):
                # Just punt it to the network node now that the interface is ready
                etcd.enqueue('networknode', task)

            elif isinstance(task, SnapshotTask):
                snapshot(inst, task.disk(), task.artifact_uuid(),
                         task.blob_uuid())

            elif isinstance(task, DeleteNetworkWhenClean):
                # Check if any interfaces remain on network
                task_network = net.Network.from_db(task.network_uuid())
                ifaces = networkinterface.interfaces_for_network(task_network)
                cur_interfaces = {i.uuid: i for i in ifaces}

                if cur_interfaces:
                    LOG.with_network(task_network).error(
                        'During DeleteNetworkWhenClean new interfaces have '
                        'connected to network: %s', cur_interfaces)

                # Only check those present at delete task initiation time.
                remain_interfaces = list(
                    set(task.wait_interfaces()) & set(cur_interfaces))
                if remain_interfaces:
                    # Queue task on a node with a remaining instance
                    first_iface = cur_interfaces[remain_interfaces[0]]
                    inst = instance.Instance.from_db(first_iface.instance_uuid)
                    etcd.enqueue(inst.placement['node'], {
                        'tasks': [
                            DeleteNetworkWhenClean(task.network_uuid(),
                                                   remain_interfaces)
                        ]
                    },
                                 delay=60)

                else:
                    # All original instances deleted, safe to delete network
                    etcd.enqueue('networknode',
                                 DestroyNetworkTask(task.network_uuid()))

            elif isinstance(task, HypervisorDestroyNetworkTask):
                n = net.Network.from_db(task.network_uuid())
                n.delete_on_hypervisor()

            elif isinstance(task, FetchBlobTask):
                metrics = etcd.get('metrics', config.NODE_NAME, None)
                if metrics:
                    metrics = metrics.get('metrics', {})
                else:
                    metrics = {}

                b = blob.Blob.from_db(task.blob_uuid())
                if not b:
                    log.with_fields({
                        'blob': task.blob_uuid()
                    }).info('Cannot replicate blob, not found')

                elif (int(metrics.get('disk_free_blobs', 0)) - int(b.size) <
                      config.MINIMUM_FREE_DISK):
                    log.with_fields({
                        'blob': task.blob_uuid()
                    }).info('Cannot replicate blob, insufficient space')

                else:
                    log.with_object(b).info('Replicating blob')
                    size = b.ensure_local([])
                    log.with_object(b).with_fields({
                        'transferred': size,
                        'expected': b.size
                    }).info('Replicating blob complete')

            else:
                log_i.with_field('task',
                                 task).error('Unhandled task - dropped')

            log_i.info('Task complete')

    except exceptions.ImageFetchTaskFailedException as e:
        # Usually caused by external issue and not an application error
        log.info('Fetch Image Error: %s', e)
        if inst:
            inst.enqueue_delete_due_error('Image fetch failed: %s' % e)

    except exceptions.ImagesCannotShrinkException as e:
        log.info('Fetch Resize Error: %s', e)
        if inst:
            inst.enqueue_delete_due_error('Image resize failed: %s' % e)

    except libvirt.libvirtError as e:
        log.info('Libvirt Error: %s', e)
        if inst:
            inst.enqueue_delete_due_error('Instance task failed: %s' % e)

    except exceptions.InstanceException as e:
        log.info('Instance Error: %s', e)
        if inst:
            inst.enqueue_delete_due_error('Instance task failed: %s' % e)

    except Exception as e:
        # Logging ignored exception - this should be investigated
        util_general.ignore_exception('queue worker', e)
        if inst:
            inst.enqueue_delete_due_error('Failed queue task: %s' % e)

    finally:
        etcd.resolve(config.NODE_NAME, jobname)
        if inst:
            inst.add_event('tasks complete',
                           'dequeued',
                           msg='Work item %s' % jobname)
        log.info('Completed workitem')
Example no. 11
def _get_stats():
    libvirt = util_libvirt.get_libvirt()
    conn = libvirt.open('qemu:///system')

    # What's special about this node?
    retval = {
        'is_etcd_master': config.NODE_IS_ETCD_MASTER,
        'is_hypervisor': config.NODE_IS_HYPERVISOR,
        'is_network_node': config.NODE_IS_NETWORK_NODE,
    }

    # CPU info
    present_cpus, _, available_cpus = conn.getCPUMap()
    retval.update({
        'cpu_max': present_cpus,
        'cpu_available': available_cpus,
    })

    retval['cpu_max_per_instance'] = conn.getMaxVcpus(None)

    # This is disabled because we don't currently use this data.
    # for i in range(present_cpus):
    #    per_cpu_stats = conn.getCPUStats(i)
    #    for key in per_cpu_stats:
    #        retval['cpu_core%d_%s' % (i, key)] = per_cpu_stats[key]

    try:
        load_1, load_5, load_15 = psutil.getloadavg()
        retval.update({
            'cpu_load_1': load_1,
            'cpu_load_5': load_5,
            'cpu_load_15': load_15,
        })
    except Exception as e:
        util_general.ignore_exception('load average', e)

    # System memory info, converting bytes to mb
    stats = psutil.virtual_memory()
    retval.update({
        'memory_max': stats.total // 1024 // 1024,
        'memory_available': stats.available // 1024 // 1024
    })

    # libvirt memory info, converting kb to mb
    memory_status = conn.getMemoryStats(
        libvirt.VIR_NODE_MEMORY_STATS_ALL_CELLS)
    retval.update({
        'memory_max_libvirt': memory_status['total'] // 1024,
        'memory_available_libvirt': memory_status['free'] // 1024,
    })

    # Kernel Shared Memory (KSM) information
    ksm_details = {}
    for ent in os.listdir('/sys/kernel/mm/ksm'):
        with open('/sys/kernel/mm/ksm/%s' % ent) as f:
            ksm_details['memory_ksm_%s' % ent] = int(f.read().rstrip())
    retval.update(ksm_details)

    # Disk info. There could be more than one filesystem here, so we track
    # all of the paths we're fond of.
    fsids = []
    minimum = -1
    total = 0
    used = 0

    for path in ['', 'blobs', 'image_cache', 'instances', 'uploads']:
        # We need to create the paths we check if they don't exist, otherwise
        # they won't be included in the metrics and things get confused.
        fullpath = os.path.join(config.STORAGE_PATH, path)
        os.makedirs(fullpath, exist_ok=True)
        s = os.statvfs(fullpath)
        free = s.f_frsize * s.f_bavail

        if s.f_fsid not in fsids:
            # Only count each underlying filesystem once.
            fsids.append(s.f_fsid)
            total += s.f_frsize * s.f_blocks
            used += s.f_frsize * (s.f_blocks - s.f_bfree)
            if minimum == -1 or free < minimum:
                minimum = free

        if path == '':
            path = 'sfroot'
        retval['disk_free_%s' % path] = free

    retval.update({
        'disk_total': total,
        'disk_free': minimum,
        'disk_used': used
    })

    disk_counters = psutil.disk_io_counters()
    retval.update({
        'disk_read_bytes': disk_counters.read_bytes,
        'disk_write_bytes': disk_counters.write_bytes,
    })

    # Network info
    net_counters = psutil.net_io_counters()
    retval.update({
        'network_read_bytes': net_counters.bytes_recv,
        'network_write_bytes': net_counters.bytes_sent,
    })

    # Virtual machine consumption info
    total_instances = 0
    total_active_instances = 0
    total_instance_max_memory = 0
    total_instance_actual_memory = 0
    total_instance_vcpus = 0
    total_instance_cpu_time = 0

    for guest in conn.listAllDomains():
        try:
            active = guest.isActive() == 1
            if active:
                _, maxmem, mem, cpus, cpu_time = guest.info()

        except libvirt.libvirtError as e:
            LOG.debug('During resource calc ignored libvirt error: %s' % e)
            active = False

        if active:
            total_instances += 1
            total_active_instances += 1
            total_instance_max_memory += maxmem
            total_instance_actual_memory += mem
            total_instance_vcpus += cpus
            total_instance_cpu_time += cpu_time

    # Queue health statistics
    node_queue_processing, node_queue_waiting = etcd.get_queue_length(
        config.NODE_NAME)

    retval.update({
        'cpu_total_instance_vcpus':
        total_instance_vcpus,
        'cpu_total_instance_cpu_time':
        total_instance_cpu_time,
        'memory_total_instance_max':
        total_instance_max_memory // 1024,
        'memory_total_instance_actual':
        total_instance_actual_memory // 1024,
        'instances_total':
        total_instances,
        'instances_active':
        total_active_instances,
        'node_queue_processing':
        node_queue_processing,
        'node_queue_waiting':
        node_queue_waiting,
    })

    if config.NODE_IS_NETWORK_NODE:
        network_queue_processing, network_queue_waiting = etcd.get_queue_length(
            'networknode')

        retval.update({
            'network_queue_processing': network_queue_processing,
            'network_queue_waiting': network_queue_waiting,
        })

    return retval