Exemplo n.º 1
0
class ScaleManager(object):

    def __init__(self, zk_servers, names=[]):

        self.names      = names
        self.zk_servers = zk_servers
        self.zk_conn    = None
        self.running    = False
        self.config     = ManagerConfig("")
        self.cond       = threading.Condition()
        self.uuid       = str(uuid.uuid4()) # Manager uuid (generated).

        self.endpoints = {}        # Endpoint map (name -> endpoint)
        self.key_to_endpoints = {} # Endpoint map (key() -> [names...])
        self.confirmed = {}        # Endpoint map (name -> confirmed IPs)

        self.managers = {}        # Forward map of manager keys.
        self.manager_ips = []     # List of all manager IPs.
        self.registered_ips = []  # List of registered IPs.
        self.manager_keys = []    # Our local manager keys.
        self.key_to_manager = {}  # Reverse map for manager keys.
        self.key_to_owned = {}    # Endpoint to ownership.

        self.load_balancer = None # Load balancer connections.

        # Setup logging.
        self.log = self.setup_logging()

        # The Windows domain connection.
        self.windows = windows.WindowsConnection()

        # The reactor domain.
        self.domain = ""

    @locked
    def __connect_to_zookeeper(self):
        # Create a Zookeeper connection.
        if self.zk_conn:
            self.zk_conn.close()
        self.zk_conn = ZookeeperConnection(self.zk_servers)

    @locked
    def setup_logging(self):
        """ Add an in-memory log that can be polled remotely. """
        log_buffer = StringIO()
        logger = logging.getLogger()
        formatter = logging.Formatter('%(asctime)s [%(thread)d] %(levelname)s %(name)s: %(message)s')
        handler = logging.StreamHandler(log_buffer)
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        return log_buffer

    def serve(self):
        self.__connect_to_zookeeper()

        # Load our configuration and register ourselves.
        self.manager_register()

        # Watch all managers and endpoints.
        self.manager_change(self.zk_conn.watch_children(paths.managers(), self.manager_change))
        self.endpoint_change(self.zk_conn.watch_children(paths.endpoints(), self.endpoint_change))

        # Watch all IPs.
        self.register_ip(self.zk_conn.watch_children(paths.new_ips(), self.register_ip))
        self.unregister_ip(self.zk_conn.watch_children(paths.drop_ips(), self.unregister_ip))

    @locked
    def manager_select(self, endpoint):
        # Remember whether this was previous managed.
        managed = self.endpoint_owned(endpoint)

        # Find the closest key.
        keys = self.key_to_manager.keys()
        if len(keys) == 0:
            logging.error("No scale manager available!")
            manager_key = None
        else:
            keys.sort()
            index = bisect.bisect(keys, endpoint.key())
            key = keys[index % len(self.key_to_manager)]
            manager_key = self.key_to_manager[key]

        # Check if this is us.
        self.key_to_owned[endpoint.key()] = (manager_key == self.uuid)

        logging.info("Endpoint %s owned by %s (%s)." % \
            (endpoint.name, manager_key, \
            self.endpoint_owned(endpoint) and "That's me!" or "Not me!"))

        # Check if it is one of our own.
        # Start the endpoint if necessary (now owned).
        if self.endpoint_owned(endpoint):
            self.zk_conn.write(paths.endpoint_manager(endpoint.name), self.uuid, ephemeral=True)
            if not(managed):
                self.start_endpoint(endpoint)

    @locked
    def manager_remove(self, endpoint):
        if self.key_to_owned.has_key(endpoint.key()):
            del self.key_to_owned[endpoint.key()]

    @locked
    def endpoint_owned(self, endpoint):
        return self.key_to_owned.get(endpoint.key(), False)

    def endpoint_change(self, endpoints):
        logging.info("Endpoints have changed: new=%s, existing=%s" %
                     (endpoints, self.endpoints.keys()))

        for endpoint_name in endpoints:
            if endpoint_name not in self.endpoints:
                self.create_endpoint(endpoint_name)

        known_endpoint_names = self.endpoints.keys()
        for endpoint_name in known_endpoint_names:
            if endpoint_name not in endpoints:
                self.remove_endpoint(endpoint_name, unmanage=True)

    def update_config(self, config_str):
        self.manager_register(config_str)
        self.reload_loadbalancer()

    def __configure(self, config_str):
        """
        This setups up the base manager configuration by combining
        the global configuration and config_str into a single configuration.
        """
        self.zk_conn.clear_watch_fn(self.update_config)
        global_config = self.zk_conn.watch_contents(paths.config(), self.update_config)
        # Load our given configuration.
        base_config = ManagerConfig(config_str)

        if global_config:
            base_config.reload(global_config)

        # NOTE: We may have multiple global IPs (especially in the case of
        # provisioning a cluster that could have floating IPs that move around.
        # We read in each of the configuration blocks in turn, and hope that
        # they are not somehow mutually incompatible.
        configured_ips = None
        if self.names:

            def load_ip_config(ip):
                # Reload our local config.
                local_config = self.zk_conn.watch_contents(
                                    paths.manager_config(ip),
                                    self.update_config)
                if local_config:
                    base_config.reload(local_config)

            # Read all configured IPs.
            for ip in self.names:
                load_ip_config(ip)
            configured_ips = base_config.ips()
            for ip in configured_ips:
                load_ip_config(ip)

        return base_config

    @locked
    def __register_manager_ips(self, configured_ips):
        """
        Register all of the manager's configured ips.
        """
        # We remove all existing registered IPs.
        for ip in self.registered_ips:
            if self.zk_conn.read(paths.manager_ip(ip)) == self.uuid:
                logging.info("Clearing IP '%s'." % ip)
                self.zk_conn.delete(paths.manager_ip(ip))
        self.registered_ips = []

        if self.names:
            def register_ip(name):
                try:
                    # Register our IP.
                    ip = socket.gethostbyname(name)
                    self.zk_conn.write(paths.manager_ip(ip), self.uuid, ephemeral=True)
                    logging.info("Registered IP '%s'." % ip)
                    self.registered_ips.append(ip)
                except socket.error:
                    logging.error("Skipping registration of '%s'." % name)

            # Register configured IPs if available.
            if configured_ips:
                for ip in configured_ips:
                    register_ip(ip)
            else:
                for name in self.names:
                    register_ip(name)

    @locked
    def __determine_manager_keys(self, key_num):
        # Generate keys.
        while len(self.manager_keys) < key_num:
            # Generate a random hash key to associate with this manager.
            self.manager_keys.append(hashlib.md5(str(uuid.uuid4())).hexdigest())
        while len(self.manager_keys) > key_num:
            # Drop keys while we're too high.
            del self.manager_keys[len(self.manager_keys) - 1]

        # Write out our associated hash keys as an ephemeral node.
        key_string = ",".join(self.manager_keys)
        self.zk_conn.write(paths.manager_keys(self.uuid), key_string, ephemeral=True)
        logging.info("Generated %d keys." % len(self.manager_keys))

    @locked
    def __select_endpoints(self):
        # If we're not doing initial setup, refresh endpoints.
        for endpoint in self.endpoints.values():
            self.manager_select(endpoint)

    @locked
    def __setup_loadbalancer_connections(self, loadbalancer_names):
        # Create the loadbalancer connections.
        # NOTE: Any old loadbalancer object should be cleaned
        # up automatically (i.e. the objects should implement
        # fairly sensible __del__ methods when necessary).
        self.load_balancer = lb_connection.LoadBalancers()
        for name in self.config.loadbalancer_names():
            self.load_balancer.append(\
                lb_connection.get_connection(\
                    name, self.config.loadbalancer_config(name), self))

    @locked
    def __set_config(self, config):
        self.config = config

    def manager_register(self, config_str=''):
        # Figure out our global IPs.
        logging.info("Manager %s has key %s." % (str(self.names), self.uuid))

        manager_config = self.__configure(config_str)
        self.__register_manager_ips(manager_config.ips())
        self.__determine_manager_keys(manager_config.keys())

        self.__set_config(manager_config)
        self.__select_endpoints()

        self.__setup_loadbalancer_connections(manager_config.loadbalancer_names())

        # Reload the domain.
        self.reload_domain(self.zk_conn.watch_contents(\
                                paths.domain(),
                                self.reload_domain,
                                default_value=self.domain))

    @locked
    def reload_domain(self, domain):
        self.domain = domain or ""
        self.reload_loadbalancer()

    @locked
    def manager_change(self, managers):
        for manager in managers:
            if manager not in self.managers:
                # Read the key and update all mappings.
                keys = self.zk_conn.read(paths.manager_keys(manager)).split(",")
                logging.info("Found manager %s with %d keys." % (manager, len(keys)))
                self.managers[manager] = keys
                for key in keys:
                    self.key_to_manager[key] = manager

        managers_to_remove = []
        for manager in self.managers:
            if manager not in managers:
                # Remove all mappings.
                keys = self.managers[manager]
                logging.info("Removing manager %s with %d keys." % (manager, len(keys)))
                for key in keys:
                    if key in self.key_to_manager:
                        del self.key_to_manager[key]
                managers_to_remove.append(manager)
        for manager in managers_to_remove:
            if manager in self.managers:
                del self.managers[manager]

        # Recompute all endpoint owners.
        for endpoint in self.endpoints.values():
            self.manager_select(endpoint)

        # Reload all managers IPs.
        self.manager_ips = \
            map(lambda x: BackendIP(x),
                self.zk_conn.list_children(paths.manager_ips()))

        # Kick the loadbalancer.
        self.reload_loadbalancer()

    def create_endpoint(self, endpoint_name):
        logging.info("New endpoint %s found to be managed." % endpoint_name)

        # Create the object.
        # NOTE: We create all endpoints on this manager with the current
        # manager config. This means that all manager keys will be inherited
        # and you can set some sensible defaults either in the local manager
        # configuration or in the global configuration. 
        # This does mean however, that the ManagerConfig and EndpointConfig
        # should have disjoint sections for the most part.
        endpoint = Endpoint(endpoint_name, str(self.config), self)
        self.add_endpoint(endpoint)

    @locked
    def __add_endpoint(self, endpoint):
        self.endpoints[endpoint.name] = endpoint
        endpoint_key = endpoint.key()

        if not(self.key_to_endpoints.has_key(endpoint_key)):
                self.key_to_endpoints[endpoint_key] = []

        if not(endpoint.name in self.key_to_endpoints[endpoint_key]):
            self.key_to_endpoints[endpoint_key].append(endpoint.name)

    def add_endpoint(self, endpoint):

        self.__add_endpoint(endpoint)

        def local_lock(fn):
            def wrapped_fn(*args, **kwargs):
                try:
                    self.cond.acquire()
                    return fn(*args, **kwargs)
                finally:
                    self.cond.release()
            return wrapped_fn

        @local_lock
        def update_state(value):
            endpoint.update_state(value)
            if self.endpoint_owned(endpoint):
                endpoint.update()

        @local_lock
        def update_config(value):
            endpoint.update_config(value)
            if self.endpoint_owned(endpoint):
                endpoint.update()

        @local_lock
        def update_confirmed(ips):
            if ips:
                self.confirmed[endpoint.name] = ips
            elif endpoint.name in self.confirmed:
                del self.confirmed[endpoint.name]

            # Kick off a loadbalancer update.
            self.update_loadbalancer(endpoint)

        # Watch the config for this endpoint.
        logging.info("Watching endpoint %s." % (endpoint.name))
        update_state(
            self.zk_conn.watch_contents(paths.endpoint_state(endpoint.name),
                                        update_state, '',
                                        clean=True))
        update_config(
            self.zk_conn.watch_contents(paths.endpoint(endpoint.name),
                                        update_config, '',
                                        clean=True))

        # Select the manager for this endpoint.
        self.manager_select(endpoint)

        # Update the loadbalancer for this endpoint.
        update_confirmed(
            self.zk_conn.watch_children(paths.confirmed_ips(endpoint.name),
                                        update_confirmed,
                                        clean=True))

    @locked
    def start_endpoint(self, endpoint):
        # This endpoint is now being managed by us.
        endpoint.manage()
        endpoint.update()

    @locked
    def __remove_endpoint(self, endpoint, unmanage):

        endpoint_name = endpoint.name
        logging.info("Unmanaging endpoint %s" % (endpoint_name))
        endpoint_names = self.key_to_endpoints.get(endpoint.key(), [])
        if endpoint_name in endpoint_names:
            endpoint_names.remove(endpoint_name)
            if len(endpoint_names) == 0:
                del self.key_to_endpoints[endpoint.key()]

        # Perform a full unmanage if this is required.
        if unmanage and self.endpoint_owned(endpoint):
            endpoint.unmanage()

        del self.endpoints[endpoint_name]

    def remove_endpoint(self, endpoint_name, unmanage=False):
        """
        This removes / unmanages the endpoint.
        """
        logging.info("Removing endpoint %s from manager %s" % (endpoint_name, self.uuid))
        endpoint = self.endpoints.get(endpoint_name, None)

        if endpoint:
            self.zk_conn.clear_watch_path(paths.endpoint_state(endpoint.name))
            self.zk_conn.clear_watch_path(paths.endpoint(endpoint.name))
            self.zk_conn.clear_watch_path(paths.confirmed_ips(endpoint.name))

            # Update the loadbalancer for this endpoint.
            self.update_loadbalancer(endpoint, remove=True)
            self.__remove_endpoint(endpoint, unmanage)
            self.manager_remove(endpoint)

    @locked
    def confirmed_ips(self, endpoint_name):
        """
        Returns a list of all the confirmed ips for the endpoint.
        """
        return self.confirmed.get(endpoint_name, [])

    @locked
    def active_ips(self, endpoint_name):
        """
        Returns all confirmed and static ips for the endpoint.
        """
        ips = self.confirmed.get(endpoint_name, [])
        if endpoint_name in self.endpoints:
            ips += self.endpoints[endpoint_name].static_addresses()

        # Make sure that we return a unique set.
        return list(set(ips))

    @locked
    def drop_ip(self, endpoint_name, ip):
        logging.info("Dropping endpoint %s IP %s" % (endpoint_name, ip))
        self.zk_conn.delete(paths.endpoint_ip_metrics(endpoint_name, ip))
        self.zk_conn.delete(paths.confirmed_ip(endpoint_name, ip))
        self.zk_conn.delete(paths.ip_address(ip))
        for name in self.config.loadbalancer_names():
            self.zk_conn.delete(paths.loadbalancer_ip(name, ip))

    @locked
    def confirm_ip(self, endpoint_name, ip):
        logging.info("Adding endpoint %s IP %s" % (endpoint_name, ip))
        self.zk_conn.write(paths.confirmed_ip(endpoint_name, ip), "")
        self.zk_conn.write(paths.ip_address(ip), endpoint_name)

    @locked
    def update_ips(self, ips, add=True):
        if len(ips) == 0:
            return

        ip_map = {}

        for endpoint in self.endpoints.values():
            endpoint_ips = endpoint.addresses()
            endpoint_ips.extend(endpoint.static_addresses())
            for ip in endpoint_ips:
                ip_map[ip] = endpoint

        for ip in ips:
            endpoint = ip_map.get(ip, None)

            if not(endpoint):
                continue

            if add:
                self.confirm_ip(endpoint.name, ip)
            else:
                self.drop_ip(endpoint.name, ip)

    @locked
    def register_ip(self, ips):
        self.update_ips(ips, add=True)
        for ip in ips:
            self.zk_conn.delete(paths.new_ip(ip))

    @locked
    def unregister_ip(self, ips):
        self.update_ips(ips, add=False)
        for ip in ips:
            self.zk_conn.delete(paths.drop_ip(ip))

    @locked
    def collect_endpoint(self, endpoint, public_ips, private_ips, redirects):
        # Collect all availble IPs.
        for ip in self.active_ips(endpoint.name):
            ip = BackendIP(ip, endpoint.port(), endpoint.weight())
            if endpoint.public():
                public_ips.append(ip)
            else:
                private_ips.append(ip)

        # Collect all available redirects.
        redirect = endpoint.redirect()
        if redirect:
            redirects.append(redirect)

    @locked
    def collect_update_loadbalancer(self, url, names,
                                    public_ips, private_ips, redirects):

        if len(public_ips) > 0 or \
           len(private_ips) > 0 or \
           len(redirects) == 0:
            self.load_balancer.change(url,
                                      names,
                                      public_ips,
                                      self.manager_ips,
                                      private_ips)
        else:
            self.load_balancer.redirect(url,
                                        names,
                                        redirects[0],
                                        self.manager_ips)

    @locked
    def update_loadbalancer(self, endpoint, remove=False):
        public_ips = []
        private_ips = []
        redirects = []
        names = []

        # Go through all endpoints with the same keys.
        for endpoint_name in self.key_to_endpoints.get(endpoint.key(), []):
            if remove and (self.endpoints[endpoint_name] == endpoint):
                continue
            else:
                names.append(endpoint_name)
                self.collect_endpoint(
                    self.endpoints[endpoint_name],
                    public_ips, private_ips, redirects)

        self.collect_update_loadbalancer(endpoint.url(), names,
                                         public_ips, private_ips, redirects)
        self.load_balancer.save()

    @locked
    def reload_loadbalancer(self):
        self.load_balancer.clear()

        for (key, endpoint_names) in self.key_to_endpoints.items():
            public_ips = []
            private_ips = []
            names = []
            redirects = []

            for endpoint in map(lambda x: self.endpoints[x], endpoint_names):
                names.append(endpoint.name)
                self.collect_endpoint(endpoint, public_ips, private_ips, redirects)
            self.collect_update_loadbalancer(endpoint.url(), names,
                                             public_ips, private_ips, redirects)

        self.load_balancer.save()

    @locked
    def start_params(self, endpoint=None):
        params = {}
        # If a Windows connection is available, get start params for this service.
        # This will generally create the appropriate accounts on the Windows domain,
        # and give them back to the VMs for the agent to use in configuration.
        if endpoint and self.windows:
            params.update(self.windows.start_params(ConfigView(endpoint.config, "windows")))
        return params

    @locked
    def cleanup_start_params(self, endpoint, start_params):
        # We've failed to launch a machine, so clean up any work we've done
        if self.windows:
            self.windows.cleanup_start_params(ConfigView(endpoint.config, "windows"),
                                              start_params)

    @locked
    def marked_instances(self, endpoint_name):
        """ Return a list of all the marked instances. """
        marked_instances = self.zk_conn.list_children(paths.marked_instances(endpoint_name))
        if marked_instances == None:
            marked_instances = []
        return marked_instances

    @locked
    def mark_instance(self, endpoint_name, instance_id, label):
        # Increment the mark counter.
        remove_instance = False
        mark_counters = \
                self.zk_conn.read(paths.marked_instance(endpoint_name, instance_id), '{}')
        mark_counters = json.loads(mark_counters)
        mark_counter = mark_counters.get(label, 0)
        mark_counter += 1

        if mark_counter >= self.config.mark_maximum(label):
            # This instance has been marked too many times. There is likely
            # something really wrong with it, so we'll clean it up.
            logging.warning("Instance %s for endpoint %s has been marked too many times and"
                         " will be removed. (count=%s)" % (instance_id, endpoint_name, mark_counter))
            remove_instance = True
            self.zk_conn.delete(paths.marked_instance(endpoint_name, instance_id))

        else:
            # Just save the mark counter.
            logging.info("Instance %s for endpoint %s has been marked (count=%s)" %
                         (instance_id, endpoint_name, mark_counter))
            mark_counters[label] = mark_counter
            self.zk_conn.write(paths.marked_instance(endpoint_name, instance_id),
                               json.dumps(mark_counters))

        return remove_instance

    @locked
    def drop_marked_instance(self, endpoint_name, instance_id):
        """ Delete the marked instance data. """
        self.zk_conn.delete(paths.marked_instance(endpoint_name, instance_id))

    @locked
    def decommission_instance(self, endpoint_name, instance_id, ip_addresses):
        """ Mark the instance id as being decommissioned. """
        self.zk_conn.write(paths.decommissioned_instance(endpoint_name, instance_id),
                           json.dumps(ip_addresses))

    @locked
    def recommission_instance(self, endpoint_name, instance_id):
        """ Mark the instance id as being recommissioned. """
        # Delete decommissioned instance path and marked data
        self.zk_conn.delete(paths.decommissioned_instance(endpoint_name,
                            instance_id))
        self.drop_marked_instance(endpoint_name, instance_id)

    @locked
    def decommissioned_instances(self, endpoint_name):
        """ Return a list of all the decommissioned instances. """
        decommissioned_instances = self.zk_conn.list_children(\
            paths.decommissioned_instances(endpoint_name))
        if decommissioned_instances == None:
            decommissioned_instances = []
        return decommissioned_instances

    @locked
    def decommissioned_instance_ip_addresses(self, endpoint_name, instance_id):
        """ Return the ip address of a decommissioned instance. """
        ip_addresses = self.zk_conn.read(paths.decommissioned_instance(endpoint_name, instance_id))
        if ip_addresses != None:
            ip_addresses = json.loads(ip_addresses)
            if type(ip_addresses) == str:
                ip_addresses = [ip_addresses]
        else:
            ip_addresses = []
        return ip_addresses

    @locked
    def drop_decommissioned_instance(self, endpoint_name, instance_id,
                                     instance_name=None):
        """ Delete the decommissioned instance """
        ip_addresses = self.decommissioned_instance_ip_addresses(endpoint_name, instance_id)
        for ip_address in ip_addresses:
            self.drop_ip(endpoint_name, ip_address)
        self.zk_conn.delete(paths.decommissioned_instance(endpoint_name, instance_id))
        # For windows machines we also do some cleanup
        if instance_name and self.windows and endpoint_name in self.endpoints:
            endpoint = self.endpoints[endpoint_name]
            self.windows.cleanup(ConfigView(endpoint.config, "windows"),
                                 instance_name)

    def metric_indicates_active(self, metrics):
        """ Returns true if the metrics indicate that there are active connections. """
        active_metrics = metrics.get("active", (0, 0))
        try:
            return active_metrics[1] > 0
        except:
            # The active metric is defined but as a bad form.
            logging.warning("Malformed metrics found: %s" % (active_metrics))
            return False

    @locked
    def update_metrics(self):
        """ 
        Collects the metrics from the loadbalancer, updates zookeeper and then collects
        the metrics posted by other managers.
        
        returns a tuple (metrics, active_connections) both of which are dictionaries. Metrics
        is indexed by the endpoint key and active connections is indexed by endpoint name
        """

        # Update all the endpoint metrics from the loadbalancer.
        metrics = self.load_balancer.metrics()
        logging.debug("Load balancer returned metrics: %s" % metrics)

        # The metrics_by_key dictionary maps to a tuple (active, metrics).
        # The active value is a list of all IPs used to generate the metrics.
        # That is to say, if one or more of the value was used in to generate
        # the aggregated metrics that corresponds to that IP then it will be
        # present in the active set.
        metrics_by_key = {}
        active_connections = {}

        for ip in metrics:
            for endpoint in self.endpoints.values():

                if not(endpoint.key() in metrics_by_key):
                    metrics_by_key[endpoint.key()] = ([], [])
                if not(endpoint.name in active_connections):
                    active_connections[endpoint.name] = []

                endpoint_ips = self.active_ips(endpoint.name)
                if ip in endpoint_ips:
                    metrics_by_key[endpoint.key()][0].append(ip)
                    metrics_by_key[endpoint.key()][1].append(metrics[ip])

                    if self.metric_indicates_active(metrics[ip]):
                        active_connections[endpoint.name].append(ip)

        # Stuff all the metrics into Zookeeper.
        self.zk_conn.write(paths.manager_metrics(self.uuid), \
                           json.dumps(metrics_by_key), \
                           ephemeral=True)
        self.zk_conn.write(paths.manager_active_connections(self.uuid), \
                           json.dumps(active_connections), \
                           ephemeral=True)

        # Load all metrics (from other managers).
        all_metrics = {}

        # A listing of all the active connections.
        all_active_connections = {}

        # Read the keys for all other managers.
        for manager in self.managers:
            # Skip re-reading the local metrics.
            if manager == self.uuid:
                manager_metrics = metrics_by_key
                manager_active_connections = active_connections
            else:
                manager_metrics = self.zk_conn.read(paths.manager_metrics(manager), "{}")
                manager_metrics = json.loads(manager_metrics)
                manager_active_connections = \
                        self.zk_conn.read(paths.manager_active_connections(manager), "{}")
                manager_active_connections = json.loads(manager_active_connections)

            # Merge into the all_metrics dictionary.
            for key in manager_metrics:
                if not(key in all_metrics):
                    all_metrics[key] = ([], [])

                all_metrics[key][0].extend(manager_metrics[key][0])
                all_metrics[key][1].extend(manager_metrics[key][1])

            # Merge all the active connection counts.
            for endpoint_name in manager_active_connections:
                if not(endpoint_name in all_active_connections):
                    all_active_connections[endpoint_name] = []

                all_active_connections[endpoint_name].extend(\
                        manager_active_connections[endpoint_name])

        # Return all available global metrics.
        return (all_metrics, all_active_connections)

    @locked
    def load_metrics(self, endpoint, endpoint_metrics={}):
        """ 
        Load the particular metrics for a endpoint and return
        a tuple (metrics, active_connections) where metrics
        are the metrics to use for the endpoint and active_connections
        is a list of ip addresses with active connections.
        """

        # Read any default metrics. We can override the source endpoint for
        # metrics here (so, for example, a backend database server can inheret
        # a set of metrics given for the front server).  This, like many other
        # things, is specified here by the name of the endpoint we are
        # inheriting metrics for. If not given, we default to the current
        # endpoint.
        source_key = endpoint.source_key()
        if source_key:
            (metric_ips, metrics) = endpoint_metrics.get(source_key, ([], []))
        else:
            (metric_ips, metrics) = endpoint_metrics.get(endpoint.key(), ([], []))

        default_metrics = self.zk_conn.read(paths.endpoint_custom_metrics(endpoint.name))
        if default_metrics:
            try:
                # This should be a dictionary { "name" : (weight, value) }
                metrics.append(json.loads(default_metrics))
            except ValueError:
                logging.warn("Invalid custom metrics for %s." % (endpoint.name))

        # Read other metrics for given hosts.
        active_connections = []
        for ip_address in self.active_ips(endpoint.name):
            ip_metrics = self.zk_conn.read(paths.endpoint_ip_metrics(endpoint.name, ip_address))
            if ip_metrics:
                try:
                    # This should be a dictionary { "name" : (weight, value) }
                    ip_metrics = json.loads(ip_metrics)
                    metrics.append(ip_metrics)
                    if not ip_address in metric_ips:
                        metric_ips.append(ip_address)
                    if self.metric_indicates_active(ip_metrics):
                        active_connections.append(ip_address)
                except ValueError:
                    logging.warn("Invalid instance metrics for %s:%s." % \
                                 (endpoint.name, ip_address))

        for instance_id in self.decommissioned_instances(endpoint.name):
            # Also check the metrics of decommissioned instances looking for any active counts.
            for ip_address in self.decommissioned_instance_ip_addresses(endpoint.name, instance_id):
                if ip_address:
                    ip_metrics = self.zk_conn.read(paths.endpoint_ip_metrics(endpoint.name, ip_address))
                    if ip_metrics:
                        try:
                            # As above, this should be a dictionary.
                            ip_metrics = json.loads(ip_metrics)
                            metrics.append(ip_metrics)
                            if not ip_address in metric_ips:
                                metric_ips.append(ip_address)
                            if self.metric_indicates_active(ip_metrics):
                                active_connections.append(ip_address)
                        except ValueError:
                            logging.warn("Invalid instance metrics for %s:%s." % \
                                         (endpoint.name, ip_address))

        # Return the metrics.
        return metrics, list(set(metric_ips)), active_connections

    @locked
    def health_check(self):
        # Save and load the current metrics.
        endpoint_metrics, active_connections = self.update_metrics()

        # Does a health check on all the endpoints that are being managed.
        for endpoint in self.endpoints.values():
            # Do not kick the endpoint if it is not currently owned by us.
            if not(self.endpoint_owned(endpoint)):
                continue

            try:
                metrics, metric_ips, endpoint_connections = \
                    self.load_metrics(endpoint, endpoint_metrics)

                # Compute the active set (including custom metrics, etc.).
                active = active_connections.get(endpoint.name, [])
                active.extend(endpoint_connections)
                active = list(set(active))

                # Compute the globally weighted averages.
                metrics = calculate_weighted_averages(metrics)

                # Update the live metrics and connections.
                logging.debug("Metrics for endpoint %s from %s: %s" % \
                              (endpoint.name, str(metric_ips), metrics))
                self.zk_conn.write(paths.endpoint_live_metrics(endpoint.name), \
                                   json.dumps(metrics), \
                                   ephemeral=True)
                self.zk_conn.write(paths.endpoint_live_active(endpoint.name), \
                                   json.dumps(active), \
                                   ephemeral=True)

                # Run a health check on this endpoint.
                endpoint.health_check(active)

                # Do the endpoint update.
                endpoint.update(reconfigure=False, metrics=metrics,
                                metric_instances=len(metric_ips),
                                active_ips=active)
            except:
                error = traceback.format_exc()
                logging.error("Error updating endpoint %s: %s" % (endpoint.name, error))

        try:
            # Try updating our logs.
            self.zk_conn.write(paths.manager_log(self.uuid), self.log.getvalue(), ephemeral=True)
        except:
            error = traceback.format_exc()
            logging.error("Error saving logs: %s" % error)

        # Reset the buffer.
        self.log.truncate(0)

    def run(self):
        # Note that we are running.
        self.running = True

        while self.running:
            try:
                # Reconnect to the Zookeeper servers.
                self.serve()

                # Perform continuous health checks.
                while self.running:
                    self.health_check()
                    if self.running:
                        time.sleep(self.config.health_check())

            except ZookeeperException:
                # Sleep on ZooKeeper exception and retry.
                error = traceback.format_exc()
                logging.debug("Received ZooKeeper exception, retrying: %s" % (error))
                if self.running:
                    time.sleep(self.config.health_check())

    def clean_stop(self):
        self.running = False
Exemplo n.º 2
0
class ReactorClient(object):

    def __init__(self, zk_servers):
        self.zk_conn = ZookeeperConnection(zk_servers)

    def __del__(self):
        self.close()

    def close(self):
        self.zk_conn.close()

    def list_managed_endpoints(self):
        return self.zk_conn.list_children(paths.endpoints())

    def get_managers_active(self, full=False):
        ips = self.zk_conn.list_children(paths.manager_ips())
        if full:
            managers = {}
            for ip in ips:
                managers[ip] = self.get_manager_key(ip)
            return managers
        else:
            return ips

    def get_manager_key(self, manager):
        return self.zk_conn.read(paths.manager_ip(manager))

    def get_manager_log(self, manager):
        return self.zk_conn.read(paths.manager_log(manager))

    def list_managers_configured(self):
        return self.zk_conn.list_children(paths.manager_configs())

    def manage_endpoint(self, endpoint_name, config):
        self.zk_conn.write(paths.endpoint(endpoint_name), config)

    def unmanage_endpoint(self, endpoint_name):
        self.zk_conn.delete(paths.endpoint(endpoint_name))

    def update_endpoint(self, endpoint_name, config):
        self.zk_conn.write(paths.endpoint(endpoint_name), config)

    def set_endpoint_metrics(self, endpoint_name, metrics, endpoint_ip=None):
        if endpoint_ip:
            self.zk_conn.write(
                paths.endpoint_ip_metrics(endpoint_name, endpoint_ip),
                json.dumps(metrics))
        else:
            self.zk_conn.write(
                paths.endpoint_custom_metrics(endpoint_name),
                json.dumps(metrics))

    def get_endpoint_metrics(self, endpoint_name):
        blob = self.zk_conn.read(paths.endpoint_live_metrics(endpoint_name))
        if blob:
            return json.loads(blob)
        else:
            blob = self.zk_conn.read(paths.endpoint_custom_metrics(endpoint_name))
            if blob:
                return json.loads(blob)
            else:
                return blob

    def get_endpoint_active(self, endpoint_name):
        blob = self.zk_conn.read(paths.endpoint_live_active(endpoint_name))
        if blob:
            return json.loads(blob)
        else:
            return blob

    def set_endpoint_state(self, endpoint_name, state):
        self.zk_conn.write(paths.endpoint_state(endpoint_name), state)

    def get_endpoint_state(self, endpoint_name):
        return self.zk_conn.read(paths.endpoint_state(endpoint_name))

    def get_endpoint_manager(self, endpoint_name):
        return self.zk_conn.read(paths.endpoint_manager(endpoint_name))

    def get_endpoint_config(self, endpoint_name):
        return self.zk_conn.read(paths.endpoint(endpoint_name))

    def update_config(self, config):
        self.zk_conn.write(paths.config(), config)

    def update_manager_config(self, manager, config):
        self.zk_conn.write(paths.manager_config(manager), config)

    def get_config(self):
        return self.zk_conn.read(paths.config())

    def get_manager_config(self, manager):
        return self.zk_conn.read(paths.manager_config(manager))

    def remove_manager_config(self, manager):
        return self.zk_conn.delete(paths.manager_config(manager))

    def get_endpoint_ip_addresses(self, endpoint_name):
        """
        Returns all the IP addresses (confirmed or explicitly configured)
        associated with the endpoint.
        """
        ip_addresses = []
        confirmed_ips = self.zk_conn.list_children(\
            paths.confirmed_ips(endpoint_name))
        if confirmed_ips != None:
            ip_addresses += confirmed_ips

        configured_ips = EndpointConfig(\
            self.get_endpoint_config(endpoint_name)).static_ips()
        if configured_ips != None:
            ip_addresses += configured_ips

        return ip_addresses

    def record_new_ip_address(self, ip_address):
        self.zk_conn.delete(paths.new_ip(ip_address))
        self.zk_conn.write(paths.new_ip(ip_address), "")

    def drop_ip_address(self, ip_address):
        self.zk_conn.write(paths.drop_ip(ip_address), "")

    def get_ip_address_endpoint(self, ip_address):
        """
        Returns the endpoint name associated with this ip address.
        """
        return self.zk_conn.read(paths.ip_address(ip_address))

    def auth_hash(self):
        return self.zk_conn.read(paths.auth_hash())

    def set_auth_hash(self, auth_hash):
        if auth_hash:
            self.zk_conn.write(paths.auth_hash(), auth_hash)
        else:
            self.zk_conn.delete(paths.auth_hash())

    def domain(self):
        return self.zk_conn.read(paths.domain())

    def set_domain(self, domain):
        self.zk_conn.write(paths.domain(), domain)