Example #1
File: virt.py Project: mcarden/shakenfist
    def reboot(self, hard=False):
        libvirt = util.get_libvirt()
        instance = self._get_domain()
        if not hard:
            instance.reboot(flags=libvirt.VIR_DOMAIN_REBOOT_ACPI_POWER_BTN)
        else:
            instance.reset()
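All of these examples fetch the libvirt module through util.get_libvirt() instead of importing it at module level. As a minimal sketch of that deferred-import pattern (assumed behaviour only; the project's real helper may differ), it could look like this:

_LIBVIRT = None


def get_libvirt():
    # Cache the import so repeated calls are cheap, and so the calling module
    # can still be imported on machines without the libvirt Python bindings.
    global _LIBVIRT
    if _LIBVIRT is None:
        import libvirt
        _LIBVIRT = libvirt
    return _LIBVIRT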
Example #2
    def power_on(self):
        if not os.path.exists(self.xml_file):
            db.enqueue_instance_error(self.db_entry['uuid'],
                                      'missing domain file in power on')
            return False

        libvirt = util.get_libvirt()
        with open(self.xml_file) as f:
            xml = f.read()

        instance = self._get_domain()
        if not instance:
            conn = libvirt.open(None)
            instance = conn.defineXML(xml)
            if not instance:
                db.enqueue_instance_error(self.db_entry['uuid'],
                                          'power on failed to create domain')
                raise exceptions.NoDomainException()

        try:
            instance.create()
        except libvirt.libvirtError as e:
            if not str(e).startswith(
                    'Requested operation is not valid: domain is already running'
            ):
                LOG.withObj(self).warning('Instance start error: %s' % e)
                return False

        instance.setAutostart(1)
        db.update_instance_power_state(
            self.db_entry['uuid'], util.extract_power_state(libvirt, instance))
        db.add_event('instance', self.db_entry['uuid'], 'poweron', 'complete',
                     None, None)
        return True
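Example #2 treats "domain is already running" as success by matching the start of the error message. As an illustrative alternative only (not the project's code), the same decision can be made from libvirt's error code, which is less fragile across libvirt versions. The helper name start_domain and its parameters are hypothetical:

def start_domain(libvirt, instance, log):
    # Hypothetical helper: return True if the domain is running after the
    # call, treating "already running" as success by checking the libvirt
    # error code rather than the message text.
    try:
        instance.create()
    except libvirt.libvirtError as e:
        if e.get_error_code() != libvirt.VIR_ERR_OPERATION_INVALID:
            log.warning('Instance start error: %s' % e)
            return False
    return True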
Example #3
    def is_powered_on(self):
        instance = self._get_domain()
        if not instance:
            return 'off'

        libvirt = util.get_libvirt()
        return util.extract_power_state(libvirt, instance)
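Example #3 returns whatever util.extract_power_state() maps the domain state to. A plausible sketch of such a mapping, assuming it wraps domain.state() and the standard libvirt state constants (the exact strings and grouping are assumptions, not the project's code):

def extract_power_state(libvirt, domain):
    # Map libvirt's numeric domain state to a simple string.
    state, _reason = domain.state()
    if state == libvirt.VIR_DOMAIN_SHUTOFF:
        return 'off'
    if state == libvirt.VIR_DOMAIN_CRASHED:
        return 'crashed'
    if state in (libvirt.VIR_DOMAIN_PAUSED, libvirt.VIR_DOMAIN_PMSUSPENDED):
        return 'paused'
    # Everything else (running, blocked, shutting down) is treated as on.
    return 'on'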
Example #4
    def run(self):
        workers = []
        LOG.info('Starting Queues')

        libvirt = util.get_libvirt()
        conn = libvirt.open(None)
        present_cpus, _, _ = conn.getCPUMap()

        while True:
            try:
                for w in copy.copy(workers):
                    if not w.is_alive():
                        w.join(1)
                        workers.remove(w)

                if len(workers) < present_cpus / 2:
                    jobname, workitem = db.dequeue(config.NODE_NAME)
                else:
                    workitem = None

                if not workitem:
                    time.sleep(0.2)
                    continue

                p = multiprocessing.Process(
                    target=handle, args=(jobname, workitem,),
                    name='%s-worker' % daemon.process_name('queues'))
                p.start()
                workers.append(p)

            except Exception as e:
                util.ignore_exception(daemon.process_name('queues'), e)
Example #5
    def _get_domain(self):
        libvirt = util.get_libvirt()
        conn = libvirt.open(None)
        try:
            return conn.lookupByName('sf:' + self.db_entry['uuid'])

        except libvirt.libvirtError:
            return None
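Example #5 shows the naming convention used throughout: Shaken Fist prefixes its libvirt domain names with 'sf:' followed by the instance UUID. A short usage sketch (illustrative only, assuming the same util helpers as the other examples) that lists every such guest on a hypervisor:

    # Illustrative sketch: enumerate Shaken Fist guests by the 'sf:' name
    # prefix used in _get_domain().
    libvirt = util.get_libvirt()
    conn = libvirt.open(None)
    for domain in conn.listAllDomains():
        if not domain.name().startswith('sf:'):
            continue
        instance_uuid = domain.name().split(':')[1]
        print(instance_uuid, util.extract_power_state(libvirt, domain))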
Example #6
def instance_start(instance_uuid, network):
    log = LOG.withField('instance', instance_uuid)

    with db.get_lock('instance',
                     None,
                     instance_uuid,
                     ttl=900,
                     timeout=120,
                     op='Instance start') as lock:
        instance = virt.from_db(instance_uuid)

        # Collect the networks
        nets = {}
        for netdesc in network:
            if netdesc['network_uuid'] not in nets:
                n = net.from_db(netdesc['network_uuid'])
                if not n:
                    db.enqueue_instance_error(instance_uuid, 'missing network')
                    return

                nets[netdesc['network_uuid']] = n

        # Create the networks
        with util.RecordedOperation('ensure networks exist', instance):
            for network_uuid in nets:
                n = nets[network_uuid]
                try:
                    n.create()
                    n.ensure_mesh()
                    n.update_dhcp()
                except exceptions.DeadNetwork as e:
                    log.withField(
                        'network',
                        n).warning('Instance tried to use dead network')
                    db.enqueue_instance_error(
                        instance_uuid, 'tried to use dead network: %s' % e)
                    return

        # Allocate console and VDI ports
        instance.allocate_instance_ports()

        # Now we can start the instance
        libvirt = util.get_libvirt()
        try:
            with util.RecordedOperation('instance creation', instance):
                instance.create(lock=lock)

        except libvirt.libvirtError as e:
            code = e.get_error_code()
            if code in (libvirt.VIR_ERR_CONFIG_UNSUPPORTED,
                        libvirt.VIR_ERR_XML_ERROR):
                db.enqueue_instance_error(instance_uuid,
                                          'instance failed to start: %s' % e)
                return

        for iface in db.get_instance_interfaces(instance_uuid):
            db.update_network_interface_state(iface['uuid'], 'created')
Example #7
    def unpause(self):
        libvirt = util.get_libvirt()
        instance = self._get_domain()
        instance.resume()
        db.update_instance_power_state(
            self.db_entry['uuid'], util.extract_power_state(libvirt, instance))

        db.add_event('instance', self.db_entry['uuid'], 'unpause', 'complete',
                     None, None)
Example #8
    def reboot(self, hard=False):
        libvirt = util.get_libvirt()
        instance = self._get_domain()
        if not hard:
            instance.reboot(flags=libvirt.VIR_DOMAIN_REBOOT_ACPI_POWER_BTN)
        else:
            instance.reset()

        db.add_event('instance', self.db_entry['uuid'], 'reboot', 'complete',
                     None, None)
Example #9
    def power_off(self):
        libvirt = util.get_libvirt()
        instance = self._get_domain()
        if not instance:
            return

        try:
            instance.destroy()
        except libvirt.libvirtError as e:
            LOG.withObj(self).error('Failed to delete domain: %s' % e)

        self.update_power_state('off')
        self.add_event('poweroff', 'complete')
Example #10
    def power_off(self):
        libvirt = util.get_libvirt()
        instance = self._get_domain()
        if not instance:
            return

        try:
            instance.destroy()
        except libvirt.libvirtError as e:
            logutil.error([self], 'Failed to delete domain: %s' % e)

        db.add_event(
            'instance', self.db_entry['uuid'], 'poweroff', 'complete', None, None)
Example #11
    def power_off(self):
        libvirt = util.get_libvirt()
        instance = self._get_domain()
        if not instance:
            return

        try:
            instance.destroy()
        except libvirt.libvirtError as e:
            LOG.withObj(self).error('Failed to delete domain: %s' % e)

        db.update_instance_power_state(self.db_entry['uuid'], 'off')
        db.add_event('instance', self.db_entry['uuid'], 'poweroff', 'complete',
                     None, None)
Example #12
File: virt.py Project: mcarden/shakenfist
    def power_off(self):
        libvirt = util.get_libvirt()
        with open(self.xml_file) as f:
            xml = f.read()

        instance = self._get_domain()
        if not instance:
            conn = libvirt.open(None)
            instance = conn.defineXML(xml)
            if not instance:
                LOG.error('%s: Failed to create libvirt domain' % self)
                return

        try:
            instance.destroy()
        except libvirt.libvirtError as e:
            LOG.error('%s: Failed to delete domain: %s' % (self, e))
Example #13
File: virt.py Project: mcarden/shakenfist
    def power_on(self):
        libvirt = util.get_libvirt()
        with open(self.xml_file) as f:
            xml = f.read()

        instance = self._get_domain()
        if not instance:
            conn = libvirt.open(None)
            instance = conn.defineXML(xml)
            if not instance:
                LOG.error('%s: Failed to create libvirt domain' % self)
                return

        try:
            instance.create()
        except libvirt.libvirtError:
            pass

        instance.setAutostart(1)
Example #14
def instance_start(instance_uuid, network):
    with db.get_lock('instance', None, instance_uuid, ttl=900) as lock:
        instance = virt.from_db(instance_uuid)

        # Collect the networks
        nets = {}
        for netdesc in network:
            if netdesc['network_uuid'] not in nets:
                n = net.from_db(netdesc['network_uuid'])
                if not n:
                    db.enqueue_instance_delete(config.parsed.get('NODE_NAME'),
                                               instance_uuid, 'error',
                                               'missing network')
                    return

                nets[netdesc['network_uuid']] = n

        # Create the networks
        with util.RecordedOperation('ensure networks exist', instance):
            for network_uuid in nets:
                n = nets[network_uuid]
                n.create()
                n.ensure_mesh()
                n.update_dhcp()

        # Now we can start the instance
        libvirt = util.get_libvirt()
        try:
            with util.RecordedOperation('instance creation', instance):
                instance.create(lock=lock)

        except libvirt.libvirtError as e:
            code = e.get_error_code()
            if code in (libvirt.VIR_ERR_CONFIG_UNSUPPORTED,
                        libvirt.VIR_ERR_XML_ERROR):
                db.enqueue_instance_delete(config.parsed.get('NODE_NAME'),
                                           instance_uuid, 'error',
                                           'instance failed to start')
                return

        for iface in db.get_instance_interfaces(instance_uuid):
            db.update_network_interface_state(iface['uuid'], 'created')
Example #15
    def _update_power_states(self):
        libvirt = util.get_libvirt()
        conn = libvirt.open(None)
        try:
            seen = []

            # Active VMs have an ID. Active means running in libvirt
            # land.
            for domain_id in conn.listDomainsID():
                domain = conn.lookupByID(domain_id)
                if not domain.name().startswith('sf:'):
                    continue

                instance_uuid = domain.name().split(':')[1]
                instance = db.get_instance(instance_uuid)
                if not instance:
                    # Instance is SF but not in database. Kill to reduce load.
                    logutil.warning([virt.ThinInstance(instance_uuid)],
                                    'Destroying unknown instance')
                    util.execute(None, 'virsh destroy "sf:%s"' % instance_uuid)
                    continue

                db.place_instance(instance_uuid,
                                  config.parsed.get('NODE_NAME'))
                seen.append(domain.name())

                if instance.get('state') == 'deleted':
                    # NOTE(mikal): a delete might be in-flight in the queue.
                    # We only worry about instances which should have gone
                    # away five minutes ago.
                    if time.time() - instance['state_updated'] < 300:
                        continue

                    db.instance_enforced_deletes_increment(instance_uuid)
                    attempts = instance.get('enforced_deletes', 0)

                    if attempts > 5:
                        # Sometimes we just can't delete the VM. Try the big hammer instead.
                        logutil.warning(
                            [virt.ThinInstance(instance_uuid)],
                            'Attempting alternate delete method for instance')
                        util.execute(None,
                                     'virsh destroy "sf:%s"' % instance_uuid)

                        db.add_event('instance', instance_uuid,
                                     'enforced delete', 'complete', None, None)
                    else:
                        i = virt.from_db(instance_uuid)
                        i.delete()
                        i.update_instance_state('deleted')

                    logutil.warning([virt.ThinInstance(instance_uuid)],
                                    'Deleting stray instance (attempt %d)' %
                                    attempts)

                    continue

                state = util.extract_power_state(libvirt, domain)
                db.update_instance_power_state(instance_uuid, state)
                if state == 'crashed':
                    db.update_instance_state(instance_uuid, 'error')

            # Inactive VMs just have a name, and are powered off
            # in our state system.
            for domain_name in conn.listDefinedDomains():
                if not domain_name.startswith('sf:'):
                    continue

                if domain_name not in seen:
                    instance_uuid = domain_name.split(':')[1]
                    instance = db.get_instance(instance_uuid)

                    if instance.get('state') == 'deleted':
                        # NOTE(mikal): a delete might be in-flight in the queue.
                        # We only worry about instances which should have gone
                        # away five minutes ago.
                        if time.time() - instance['state_updated'] < 300:
                            continue

                        domain = conn.lookupByName(domain_name)
                        domain.undefine()
                        logutil.info([virt.ThinInstance(instance_uuid)],
                                     'Detected stray instance')
                        db.add_event('instance', instance_uuid,
                                     'deleted stray', 'complete', None, None)
                        continue

                    db.place_instance(instance_uuid,
                                      config.parsed.get('NODE_NAME'))
                    instance_path = os.path.join(
                        config.parsed.get('STORAGE_PATH'), 'instances',
                        instance_uuid)

                    if not os.path.exists(instance_path):
                        # If we're inactive and our files aren't on disk,
                        # we have a problem.
                        logutil.info([virt.ThinInstance(instance_uuid)],
                                     'Detected error state for instance')
                        db.update_instance_state(instance_uuid, 'error')
                    elif instance.get('power_state') != 'off':
                        logutil.info([virt.ThinInstance(instance_uuid)],
                                     'Detected power off for instance')
                        db.update_instance_power_state(instance_uuid, 'off')
                        db.add_event('instance', instance_uuid,
                                     'detected poweroff', 'complete', None,
                                     None)

        except libvirt.libvirtError as e:
            logutil.error(None, 'Failed to lookup all domains: %s' % e)
Example #16
def _get_stats():
    libvirt = util.get_libvirt()
    retval = {}
    conn = libvirt.open(None)

    # CPU info
    present_cpus, _, available_cpus = conn.getCPUMap()
    retval.update({
        'cpu_max': present_cpus,
        'cpu_available': available_cpus,
    })

    retval['cpu_max_per_instance'] = conn.getMaxVcpus(None)

    # This is disabled as it collects data we don't currently use
    # for i in range(present_cpus):
    #    per_cpu_stats = conn.getCPUStats(i)
    #    for key in per_cpu_stats:
    #        retval['cpu_core%d_%s' % (i, key)] = per_cpu_stats[key]

    try:
        load_1, load_5, load_15 = psutil.getloadavg()
        retval.update({
            'cpu_load_1': load_1,
            'cpu_load_5': load_5,
            'cpu_load_15': load_15,
        })
    except Exception:
        pass

    # Memory info - libvirt returns memory in KiB
    memory_status = conn.getMemoryStats(
        libvirt.VIR_NODE_MEMORY_STATS_ALL_CELLS)
    retval.update({
        'memory_max': memory_status['total'] // 1024,
        'memory_available': memory_status['free'] // 1024,
    })

    # Disk info
    s = os.statvfs(config.parsed.get('STORAGE_PATH'))
    disk_counters = psutil.disk_io_counters()
    retval.update({
        'disk_total': s.f_frsize * s.f_blocks,
        'disk_free': s.f_frsize * s.f_bavail,
        'disk_used': s.f_frsize * (s.f_blocks - s.f_bfree),
        'disk_read_bytes': disk_counters.read_bytes,
        'disk_write_bytes': disk_counters.write_bytes,
    })

    # Network info
    net_counters = psutil.net_io_counters()
    retval.update({
        'network_read_bytes': net_counters.bytes_recv,
        'network_write_bytes': net_counters.bytes_sent,
    })

    # Virtual machine consumption info
    total_instances = 0
    total_active_instances = 0
    total_instance_max_memory = 0
    total_instance_actual_memory = 0
    total_instance_vcpus = 0
    total_instance_cpu_time = 0

    for guest in conn.listAllDomains():
        active = guest.isActive() == 1
        _, maxmem, mem, cpus, cpu_time = guest.info()

        total_instances += 1
        if active:
            total_active_instances += 1

        total_instance_max_memory += maxmem
        total_instance_actual_memory += mem
        total_instance_vcpus += cpus
        total_instance_cpu_time += cpu_time

    retval.update({
        'cpu_total_instance_vcpus':
        total_instance_vcpus,
        'cpu_total_instance_cpu_time':
        total_instance_cpu_time,
        'memory_total_instance_max_memory':
        total_instance_max_memory // 1024,
        'memory_total_instance_actual_memory':
        total_instance_actual_memory // 1024,
        'instances_total':
        total_instances,
        'instances_active':
        total_active_instances,
    })

    return retval
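The statistics in Example #16 (and the later revision in Example #19) lean on a few libvirt calls whose return shapes are easy to forget. A short illustrative sketch of those shapes, with assumed variable names:

    import libvirt

    conn = libvirt.open(None)

    # getCPUMap(): (CPUs present, per-CPU online booleans, CPUs online).
    present_cpus, cpu_map, online_cpus = conn.getCPUMap()

    # getMemoryStats(): a dict of KiB values such as 'total' and 'free'.
    memory = conn.getMemoryStats(libvirt.VIR_NODE_MEMORY_STATS_ALL_CELLS)

    # domain.info(): (state, max memory KiB, memory KiB, vCPUs, CPU time ns).
    for guest in conn.listAllDomains():
        state, maxmem, mem, cpus, cpu_time = guest.info()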
Example #17
    def unpause(self):
        libvirt = util.get_libvirt()
        instance = self._get_domain()
        instance.resume()
        self.update_power_state(util.extract_power_state(libvirt, instance))
        self.add_event('unpause', 'complete')
Example #18
    def _update_power_states(self):
        libvirt = util.get_libvirt()
        conn = libvirt.open(None)
        try:
            seen = []

            # Active VMs have an ID. Active means running in libvirt
            # land.
            for domain_id in conn.listDomainsID():
                domain = conn.lookupByID(domain_id)
                if not domain.name().startswith('sf:'):
                    continue

                instance_uuid = domain.name().split(':')[1]
                log_ctx = LOG.withInstance(instance_uuid)

                instance = virt.Instance.from_db(instance_uuid)
                if not instance:
                    # Instance is SF but not in database. Kill to reduce load.
                    log_ctx.warning('Destroying unknown instance')
                    util.execute(None,
                                 'virsh destroy "sf:%s"' % instance_uuid)
                    continue

                instance.place_instance(config.NODE_NAME)
                seen.append(domain.name())

                if instance.state == 'deleted':
                    # NOTE(mikal): a delete might be in-flight in the queue.
                    # We only worry about instances which should have gone
                    # away five minutes ago.
                    if time.time() - instance.state_updated < 300:
                        continue

                    instance.enforced_deletes_increment()
                    attempts = instance.enforced_deletes

                    if attempts > 5:
                        # Sometimes we just can't delete the VM. Try the big
                        # hammer instead.
                        log_ctx.warning(
                            'Attempting alternate delete method for instance')
                        util.execute(None,
                                     'virsh destroy "sf:%s"' % instance_uuid)

                        instance.add_event('enforced delete', 'complete')
                    else:
                        instance.delete()

                    log_ctx.withField('attempt', attempts).warning(
                        'Deleting stray instance')

                    continue

                state = util.extract_power_state(libvirt, domain)
                instance.update_power_state(state)
                if state == 'crashed':
                    instance.update_instance_state('error')

            # Inactive VMs just have a name, and are powered off
            # in our state system.
            for domain_name in conn.listDefinedDomains():
                if not domain_name.startswith('sf:'):
                    continue

                if domain_name not in seen:
                    instance_uuid = domain_name.split(':')[1]
                    log_ctx = LOG.withInstance(instance_uuid)
                    instance = virt.Instance.from_db(instance_uuid)

                    if not instance:
                        # Instance is SF but not in database. Kill because
                        # unknown.
                        log_ctx.warning('Removing unknown inactive instance')
                        domain = conn.lookupByName(domain_name)
                        domain.undefine()
                        continue

                    if instance.state == 'deleted':
                        # NOTE(mikal): a delete might be in-flight in the queue.
                        # We only worry about instances which should have gone
                        # away five minutes ago.
                        if time.time() - instance.state_updated < 300:
                            continue

                        domain = conn.lookupByName(domain_name)
                        domain.undefine()
                        log_ctx.info('Detected stray instance')
                        instance.add_event('deleted stray', 'complete')
                        continue

                    instance.place_instance(config.NODE_NAME)

                    if not os.path.exists(instance.instance_path()):
                        # If we're inactive and our files aren't on disk,
                        # we have a problem.
                        log_ctx.info('Detected error state for instance')
                        instance.update_instance_state('error')

                    elif instance.power_state != 'off':
                        log_ctx.info('Detected power off for instance')
                        instance.update_power_state('off')
                        instance.add_event('detected poweroff', 'complete')

        except libvirt.libvirtError as e:
            LOG.error('Failed to lookup all domains: %s' % e)
Example #19
def _get_stats():
    libvirt = util.get_libvirt()
    retval = {}
    conn = libvirt.open(None)

    # CPU info
    present_cpus, _, available_cpus = conn.getCPUMap()
    retval.update({
        'cpu_max': present_cpus,
        'cpu_available': available_cpus,
    })

    retval['cpu_max_per_instance'] = conn.getMaxVcpus(None)

    # This is disabled as it collects data we don't currently use
    # for i in range(present_cpus):
    #    per_cpu_stats = conn.getCPUStats(i)
    #    for key in per_cpu_stats:
    #        retval['cpu_core%d_%s' % (i, key)] = per_cpu_stats[key]

    try:
        load_1, load_5, load_15 = psutil.getloadavg()
        retval.update({
            'cpu_load_1': load_1,
            'cpu_load_5': load_5,
            'cpu_load_15': load_15,
        })
    except Exception as e:
        util.ignore_exception('load average', e)

    # System memory info, converting bytes to MiB
    stats = psutil.virtual_memory()
    retval.update({
        'memory_max': stats.total // 1024 // 1024,
        'memory_available': stats.available // 1024 // 1024
    })

    # libvirt memory info, converting KiB to MiB
    memory_status = conn.getMemoryStats(
        libvirt.VIR_NODE_MEMORY_STATS_ALL_CELLS)
    retval.update({
        'memory_max_libvirt': memory_status['total'] // 1024,
        'memory_available_libvirt': memory_status['free'] // 1024,
    })

    # Kernel Shared Memory (KSM) information
    ksm_details = {}
    for ent in os.listdir('/sys/kernel/mm/ksm'):
        with open('/sys/kernel/mm/ksm/%s' % ent) as f:
            ksm_details['memory_ksm_%s' % ent] = int(f.read().rstrip())
    retval.update(ksm_details)

    # Disk info
    s = os.statvfs(config.get('STORAGE_PATH'))
    disk_counters = psutil.disk_io_counters()
    retval.update({
        'disk_total': s.f_frsize * s.f_blocks,
        'disk_free': s.f_frsize * s.f_bavail,
        'disk_used': s.f_frsize * (s.f_blocks - s.f_bfree),
        'disk_read_bytes': disk_counters.read_bytes,
        'disk_write_bytes': disk_counters.write_bytes,
    })

    # Network info
    net_counters = psutil.net_io_counters()
    retval.update({
        'network_read_bytes': net_counters.bytes_recv,
        'network_write_bytes': net_counters.bytes_sent,
    })

    # Virtual machine consumption info
    total_instances = 0
    total_active_instances = 0
    total_instance_max_memory = 0
    total_instance_actual_memory = 0
    total_instance_vcpus = 0
    total_instance_cpu_time = 0

    for guest in conn.listAllDomains():
        try:
            active = guest.isActive() == 1
            if active:
                _, maxmem, mem, cpus, cpu_time = guest.info()

        except libvirt.libvirtError as e:
            LOG.debug('During resource calc ignored libvirt error: %s' % e)
            active = False

        if active:
            total_instances += 1
            total_active_instances += 1
            total_instance_max_memory += maxmem
            total_instance_actual_memory += mem
            total_instance_vcpus += cpus
            total_instance_cpu_time += cpu_time

    # Queue health statistics
    node_queue_processing, node_queue_waiting = db.get_queue_length(
        config.NODE_NAME)

    retval.update({
        'cpu_total_instance_vcpus':
        total_instance_vcpus,
        'cpu_total_instance_cpu_time':
        total_instance_cpu_time,
        'memory_total_instance_max':
        total_instance_max_memory // 1024,
        'memory_total_instance_actual':
        total_instance_actual_memory // 1024,
        'instances_total':
        total_instances,
        'instances_active':
        total_active_instances,
        'node_queue_processing':
        node_queue_processing,
        'node_queue_waiting':
        node_queue_waiting,
    })

    if util.is_network_node():
        network_queue_processing, network_queue_waiting = db.get_queue_length(
            'networknode')

        retval.update({
            'network_queue_processing': network_queue_processing,
            'network_queue_waiting': network_queue_waiting,
        })

    return retval