Example #1
def allocate_console_port(instance_uuid):
    node = config.NODE_NAME
    consumed = {value['port'] for value in etcd.get_all('console', node)}
    while True:
        port = random.randint(30000, 50000)
        # avoid hitting etcd if it's probably in use
        if port in consumed:
            continue
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            # We hold this port open until it's in etcd to prevent
            # anyone else needing to hit etcd to find out they can't
            # use it as well as to verify we can use it
            s.bind(('0.0.0.0', port))
            allocatedPort = etcd.create('console', node, port, {
                'instance_uuid': instance_uuid,
                'port': port,
            })
            if allocatedPort:
                return port
        except socket.error as e:
            LOG.withField('instance', instance_uuid).info(
                "Exception during port allocation: %s" % e)
        finally:
            s.close()
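The examples in this list come from different revisions of the same codebase, so they assume slightly different shapes for etcd.get_all: older call sites iterate the decoded values directly, while newer ones unpack (key, value) tuples. Below is a minimal sketch of the newer interface these call sites rely on, assuming the python-etcd3 client and a /sf/<type>/<owner> key layout; both are assumptions for illustration, not the project's actual implementation.

import json

import etcd3  # assumption: any etcd v3 client with a prefix range read works


def get_all(objecttype, owner, prefix='', sort_order=None):
    # Yield (key, value) pairs for every object stored under a path like
    # /sf/<objecttype>/<owner>/<prefix>..., decoding each JSON payload.
    client = etcd3.client()
    path = '/sf/%s/%s/%s' % (objecttype, owner, prefix)
    for data, metadata in client.get_prefix(path, sort_order=sort_order):
        yield metadata.key.decode('utf-8'), json.loads(data)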
Example #2
    def __iter__(self):
        for _, n in etcd.get_all('network', None):
            if n['uuid'] == 'floating':
                continue
            out = self.apply_filters(Network(n))
            if out:
                yield out
Example #3
def get_network_interfaces(network_uuid):
    see_this_node()
    for ni in etcd.get_all('networkinterface', None):
        if ni['state'] == 'deleted':
            continue
        if ni['network_uuid'] == network_uuid:
            yield ni
Example #4
    def _allocate_console_port(self):
        node = config.NODE_NAME
        consumed = [
            value['port'] for _, value in etcd.get_all('console', node)
        ]
        while True:
            port = random.randint(30000, 50000)
            # avoid hitting etcd if it's probably in use
            if port in consumed:
                continue
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            try:
                # We hold this port open until it's in etcd to prevent
                # anyone else needing to hit etcd to find out they can't
                # use it as well as to verify we can use it
                s.bind(('0.0.0.0', port))
                allocatedPort = etcd.create('console', node, port, {
                    'instance_uuid': self.uuid,
                    'port': port,
                })
                if allocatedPort:
                    return port
            except socket.error:
                LOG.with_field('instance', self.uuid).info(
                    'Collided with in use port %d, selecting another' % port)
                consumed.append(port)
            finally:
                s.close()
Example #5
    def __iter__(self):
        for _, a in etcd.get_all('artifact', None):
            a = Artifact.from_db(a['uuid'])
            if not a:
                continue

            out = self.apply_filters(a)
            if out:
                yield out
Example #6
    def __iter__(self):
        for _, b in etcd.get_all('blob', None):
            b = Blob.from_db(b['uuid'])
            if not b:
                continue

            out = self.apply_filters(b)
            if out:
                yield out
Example #7
    def _db_get_attributes(self, attribute_prefix):
        if self.__in_memory_only:
            for key in self.__in_memory_values.keys():
                if key.startswith(attribute_prefix):
                    yield key, json.loads(self.__in_memory_values[key])
        else:
            for key, data in etcd.get_all('attribute/%s' % self.object_type,
                                          self.__uuid,
                                          prefix=attribute_prefix):
                yield key, data
Example #8
def get_networks(all=False, namespace=None):
    for n in etcd.get_all('network', None):
        if n['uuid'] == 'floating':
            continue
        if not all:
            if n['state'] in ['deleted', 'error']:
                continue
        if namespace:
            if namespace not in [n['namespace'], 'system']:
                continue
        yield n
Example #9
def get_instances(only_node=None, all=False, namespace=None):
    see_this_node()
    for i in etcd.get_all('instance', None):
        if only_node and i['node'] != only_node:
            continue
        if not all:
            if i['state'] in ['deleted', 'error']:
                continue
        if namespace:
            if namespace not in [i['namespace'], 'system']:
                continue
        yield i
Example #10
def allocate_console_port(instance_uuid):
    node = config.parsed.get('NODE_NAME')
    with etcd.get_lock('console', None, node):
        consumed = []
        for value in etcd.get_all('console', node):
            consumed.append(value['port'])

        port = random.randint(30000, 50000)
        while port in consumed or not _port_free(port):
            port = random.randint(30000, 50000)

        etcd.put('console', node, port, {
            'instance_uuid': instance_uuid,
            'port': port,
        })
        return port
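Example #10 above calls a _port_free() helper that is not shown. A plausible sketch, assuming the helper just checks whether the port can currently be bound on all interfaces (the body here is an assumption, not the original code):

import socket


def _port_free(port):
    # Assumed helper: True only if nothing on this host currently holds the port.
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.bind(('0.0.0.0', port))
        return True
    except socket.error:
        return False
    finally:
        s.close()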
Example #11
def get_instances(only_node=None, all=False, namespace=None):
    for i in etcd.get_all('instance', None):
        if only_node and i['node'] != only_node:
            continue
        if not all:
            if i['state'] in ['deleted', 'error']:
                continue
        if namespace:
            if namespace not in [i['namespace'], 'system']:
                continue

        if 'video' not in i:
            i['video'] = {'model': 'cirrus', 'memory': 16384}
        if 'error_message' not in i:
            i['error_message'] = None

        yield i
Example #12
def allocate_console_port(instance_uuid):
    see_this_node()
    node = config.parsed.get('NODE_NAME')
    with etcd.get_lock('sf/console/%s' % node) as _:
        consumed = []
        for value in etcd.get_all('console', node):
            consumed.append(value['port'])

        port = random.randint(30000, 50000)
        while port in consumed:
            port = random.randint(30000, 50000)

        etcd.put('console', node, port, {
            'instance_uuid': instance_uuid,
            'port': port,
        })
        return port
Example #13
def get_nodes():
    see_this_node()
    return etcd.get_all('node', None)
Example #14
    def __iter__(self):
        for _, i in etcd.get_all('instance', None):
            out = self.apply_filters(Instance(i))
            if out:
                yield out
Example #15
def get_nodes():
    return etcd.get_all('node', None)
Example #16
def get_node_ips():
    for value in etcd.get_all('node', None):
        yield value['ip']
Example #17
def list_namespaces():
    return etcd.get_all('namespace', None)
Example #18
def get_node_ips():
    see_this_node()
    for value in etcd.get_all('node', None):
        yield value['ip']
Example #19
def get_events(object_type, object_uuid):
    for m in etcd.get_all('event/%s' % object_type,
                          object_uuid,
                          sort_order='ascend'):
        yield m
Example #20
def get_instance_snapshots(instance_uuid):
    for m in etcd.get_all('snapshot', instance_uuid, sort_order='ascend'):
        yield m
Example #21
def get_instance_interfaces(instance_uuid):
    for ni in etcd.get_all('networkinterface', None):
        if ni['state'] == 'deleted':
            continue
        if ni['instance_uuid'] == instance_uuid:
            yield ni
Example #22
    def filter(cls, filters):
        for _, o in etcd.get_all(cls.object_type, None):
            obj = cls(o)
            if all([f(obj) for f in filters]):
                yield obj
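Example #22 is the generic form the __iter__ examples build on: each filter is a predicate over the wrapped object, and only objects that satisfy every predicate are yielded. A hypothetical usage sketch, assuming an Instance class that inherits filter() and exposes the placement and state attributes seen in example #23:

# Hypothetical predicates; the names and attribute access are assumptions.
def placed_on(node_name):
    return lambda inst: inst.placement.get('node') == node_name


def not_deleted(inst):
    return inst.state.value != 'deleted'


for inst in Instance.filter([placed_on('sf-2'), not_deleted]):
    print(inst.uuid)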
Example #23
    def _cluster_wide_cleanup(self, last_loop_run):
        LOG.info('Running cluster maintenance')

        # Cleanup soft deleted objects
        for objtype in OBJECT_NAMES:
            for _, objdata in etcd.get_all(objtype, None):
                try:
                    obj = OBJECT_NAMES_TO_CLASSES[objtype].from_db(
                        objdata['uuid'])
                    if (obj.state.value == dbo.STATE_DELETED and
                            time.time() - obj.state.update_time > config.CLEANER_DELAY):
                        LOG.with_object(obj).info('Hard deleting')
                        obj.hard_delete()
                except exceptions.BadObjectVersion:
                    LOG.with_fields({
                        objtype: objdata['uuid']
                    }).warning('Could not load object for hard delete, bad version')

        # Prune artifacts which might have too many versions
        for a in artifact.Artifacts([]):
            a.delete_old_versions()

        # Inspect current state of blobs, the actual changes are done below outside
        # the read only cache. We define being low on disk as having less than three
        # times the minimum amount of disk. This is so we start to rearrange blobs
        # before scheduling starts to fail.
        overreplicated = {}
        underreplicated = []
        low_disk_nodes = nodes_by_free_disk_descending(
            minimum=0, maximum=(config.MINIMUM_FREE_DISK * 3),
            intention='blobs')

        absent_nodes = []
        for n in Nodes([node_inactive_states_filter]):
            LOG.with_fields({
                'node': n.fqdn}).info('Node is absent for blob replication')
            absent_nodes.append(n.fqdn)
        LOG.info('Found %d inactive nodes' % len(absent_nodes))

        current_fetches = defaultdict(list)
        for workname, workitem in etcd.get_outstanding_jobs():
            # A workname looks like: /sf/queue/sf-3/jobname
            _, _, phase, node, _ = workname.split('/')
            if node == 'networknode':
                continue

            for task in workitem:
                if isinstance(task, FetchBlobTask):
                    if node in absent_nodes:
                        LOG.with_fields({
                            'blob': task.blob_uuid,
                            'node': node,
                            'phase': phase
                        }).warning('Node is absent, ignoring fetch')
                    else:
                        LOG.with_fields({
                            'blob': task.blob_uuid,
                            'node': node,
                            'phase': phase
                        }).info('Node is fetching blob')
                        current_fetches[task.blob_uuid].append(node)

        with etcd.ThreadLocalReadOnlyCache():
            for b in blob.Blobs([active_states_filter]):
                # If there is current work for a blob, we ignore it until that
                # work completes
                if b.uuid in current_fetches:
                    LOG.with_fields({
                        'blob': b.uuid
                    }).info('Blob has current fetches, ignoring')
                    continue

                locations = b.locations
                ignored_locations = []
                for n in absent_nodes:
                    if n in locations:
                        locations.remove(n)
                        ignored_locations.append(n)

                if ignored_locations:
                    LOG.with_fields({
                        'blob': b,
                        'ignored_locations': ignored_locations
                    }).info('Ignored some blob locations as nodes are absent')

                delta = len(locations) - config.BLOB_REPLICATION_FACTOR
                if delta > 0:
                    # So... The blob replication factor is a target not a limit.
                    # Specifically, if there are more locations than the target
                    # but we aren't low on disk, we don't clean them up. That's
                    # because it's hard for us to predict which machine will run
                    # out of disk first, and copying a blob back to a machine if
                    # it's needed there is slow and annoying.

                    # Work out where the blob is in active use.
                    excess_locations = b.locations
                    in_use_locations = []

                    for instance_uuid in b.instances:
                        i = instance.Instance.from_db(instance_uuid)
                        node = i.placement.get('node')
                        if node in excess_locations:
                            excess_locations.remove(node)
                            in_use_locations.append(node)

                    # Only remove excess copies from nodes which are running
                    # low on disk. Do not end up with too few replicas.
                    overreplicated[b.uuid] = []
                    target = (config.BLOB_REPLICATION_FACTOR -
                              len(in_use_locations))
                    for n in low_disk_nodes:
                        if n in excess_locations:
                            overreplicated[b.uuid].append(n)
                        if len(overreplicated[b.uuid]) == target:
                            break

                elif delta < 0:
                    # The tuple is blob UUID, and how much to over replicate by.
                    underreplicated.append((b.uuid, 0))

                else:
                    # We have exactly the right number of copies, but what if
                    # the blob is on a really full node?
                    for n in low_disk_nodes:
                        if n in b.locations:
                            # We have at least one space constrained node with
                            # this blob. Request an extra temporary copy of the
                            # blob elsewhere so we can hopefully clean up one of
                            # these next pass. The tuple is blob UUID, and how
                            # much to over replicate by.
                            underreplicated.append((b.uuid, 1))
                            break

        # Prune over replicated blobs
        for blob_uuid in overreplicated:
            b = blob.Blob.from_db(blob_uuid)
            for node in overreplicated[blob_uuid]:
                LOG.with_fields({
                    'blob': b,
                    'node': node
                }).info('Blob over replicated, removing from node with no users')
                b.drop_node_location(node)

        # Replicate under replicated blobs, but only if we don't have heaps of
        # queued replications already
        for blob_uuid, excess in underreplicated:
            LOG.with_fields({
                'current': len(current_fetches),
                'maximum': config.MAX_CONCURRENT_BLOB_TRANSFERS
            }).info('Concurrent blob transfers')
            if len(current_fetches) > config.MAX_CONCURRENT_BLOB_TRANSFERS:
                LOG.info(
                    'Too many concurrent blob transfers queued, not queueing more')
                break

            b = blob.Blob.from_db(blob_uuid)
            LOG.with_fields({
                'blob': b
            }).info('Blob under replicated, attempting to correct')
            b.request_replication(allow_excess=excess)
            current_fetches[blob_uuid].append('unknown')

        # Node management
        for n in Nodes([node_inactive_states_filter]):
            age = time.time() - n.last_seen

            # Find nodes which have returned from being missing
            if age < config.NODE_CHECKIN_MAXIMUM:
                n.state = Node.STATE_CREATED
                LOG.with_object(n).info('Node returned from being missing')

            # Find nodes which have been offline for a long time, unless
            # this machine has been asleep for a long time (think developer
            # laptop).
            if (time.time() - last_loop_run < config.NODE_CHECKIN_MAXIMUM
                    and age > config.NODE_CHECKIN_MAXIMUM * 10):
                n.state = Node.STATE_ERROR
                for i in instance.healthy_instances_on_node(n):
                    LOG.with_object(i).with_object(n).info(
                        'Node in error state, erroring instance')
                    # Note, this queue job is just in case the node comes
                    # back.
                    i.enqueue_delete_due_error('Node in error state')

        # Find nodes which haven't checked in recently
        for n in Nodes([node_active_states_filter]):
            age = time.time() - n.last_seen
            if age > config.NODE_CHECKIN_MAXIMUM:
                n.state = Node.STATE_MISSING

        # And we're done
        LOG.info('Cluster maintenance loop complete')
Example #24
def list_namespaces():
    for _, value in etcd.get_all('namespace', None):
        yield value
Example #25
    def __iter__(self):
        for _, ni in etcd.get_all('networkinterface', None):
            out = self.apply_filters(NetworkInterface(ni))
            if out:
                yield out
Example #26
def get_stale_instances(delay):
    for i in etcd.get_all('instance', None):
        if i['state'] in ['deleted', 'error']:
            if time.time() - i['state_updated'] > delay:
                yield i
Example #27
def get_stale_network_interfaces(delay):
    for n in etcd.get_all('networkinterface', None):
        if n['state'] in ['deleted', 'error']:
            if time.time() - n['state_updated'] > delay:
                yield n
Example #28
def get_stale_networks(delay):
    see_this_node()
    for n in etcd.get_all('network', None):
        if n['state'] in ['deleted', 'error']:
            if time.time() - n['state_updated'] > delay:
                yield n