def converge(desired_state, servers_with_cheese, load_balancer_contents, now,
             timeout=3600):
    """
    Plan how to move a group from its observed state to its desired state.

    The result is a :obj:`Convergence` whose steps create missing servers
    and delete servers that are surplus, in error, or have been building
    for at least ``timeout`` seconds.

    :param DesiredGroupState desired_state: The desired group state.
    :param list servers_with_cheese: a list of :obj:`NovaServer` instances.
        This must only contain servers that are being managed for the
        specified group.
    :param dict load_balancer_contents: a dictionary mapping load balancer
        IDs to lists of 2-tuples of (IP address, loadbalancer node ID).
        The body of this function does not currently read it.
    :param float now: number of seconds since the POSIX epoch indicating the
        time at which the convergence was requested.
    :param float timeout: Number of seconds after which we will delete a
        server in BUILD.
    :rtype: obj:`Convergence`
    """
    def deletions(doomed):
        # One DeleteServer step per server, in the order given.
        return [DeleteServer(server_id=server.id) for server in doomed]

    # Newest first, so that slicing the tail below removes the oldest.
    by_age = sorted(servers_with_cheese,
                    key=lambda s: s.created, reverse=True)
    errored, active, building = partition_groups(
        lambda s: s.state, by_age, [ERROR, ACTIVE, BUILD])
    stuck_building, still_building = partition_bool(
        lambda server: now - server.created >= timeout, building)

    # Servers that count towards capacity: active ones first, then those
    # still within their build window.
    counted = active + still_building

    # A non-positive shortfall yields an empty list of create steps.
    shortfall = desired_state.desired - len(counted)
    create_steps = (
        [CreateServer(launch_config=desired_state.launch_config)] * shortfall)

    # Everything past the desired count is over capacity: building servers
    # go before active ones, and older before newer.
    over_capacity_steps = deletions(counted[desired_state.desired:])
    error_steps = deletions(errored)
    build_timeout_steps = deletions(stuck_building)

    return Convergence(
        steps=pbag(create_steps +
                   over_capacity_steps +
                   error_steps +
                   build_timeout_steps))
def _remove_from_lb_with_draining(timeout, nodes, now):
    """
    Build the steps that will eventually take every given node off its
    load balancer, draining connections first when a timeout is set.

    Each node in ``nodes`` is handled as follows:

    1. With a positive timeout, a drainable node that is active but not yet
       draining is switched to draining.

    2. With a positive timeout, a node that is already draining is left
       alone while ``is_done_draining(now, timeout)`` is false; a single
       :obj:`ConvergeLater` step is emitted so convergence is retried.
       Once it is done draining, it is removed.

    3. Every other node is removed straight away. This covers a timeout
       of zero or less, nodes that are not drainable, and nodes that are
       not active.

    :param float timeout: the time the node should remain in draining until
        removed
    :param list nodes: `list` of :obj:`CLBNode` that should be
        drained, then removed
    :param float now: number of seconds since the POSIX epoch indicating the
        time at which the convergence was requested.

    :rtype: `list` of :class:`IStep`
    """
    needs_draining = ()
    still_draining = ()

    # Draining only makes sense when a timeout is specified.
    if timeout > 0:
        candidates = [node for node in nodes
                      if IDrainable.providedBy(node) and node.is_active()]
        already_draining, needs_draining = partition_bool(
            lambda node: node.currently_draining(), candidates)

        # These are left untouched: they have not finished draining yet.
        still_draining = [node for node in already_draining
                          if not node.is_done_draining(now, timeout)]

    removable = set(nodes) - set(needs_draining) - set(still_draining)

    steps = [remove_node_from_lb(node=node) for node in removable]
    steps.extend(drain_lb_node(node=node) for node in needs_draining)
    if still_draining:
        steps.append(
            ConvergeLater(reasons=[ErrorReason.String('draining servers')]))
    return steps
def unchanged_divergent_groups(clock, current, timeout, group_metrics):
    """
    Update the divergent-group tracking table and report the groups that
    have stayed divergent, with unchanged metrics, for longer than
    ``timeout`` seconds.

    A group counts as converged when ``actual + pending == desired``.
    Converged groups and groups absent from ``group_metrics`` are dropped
    from tracking. A divergent group that is not tracked yet starts being
    tracked from now. A tracked group whose (desired, actual, pending)
    values changed is dropped from tracking for this round.

    :param IReactorTime clock: Twisted time used to track
    :param dict current: Currently tracked divergent groups, keyed by
        (tenant_id, group_id) with (start time, hash of metrics) values.
        It is copied, not modified.
    :param float timeout: Timeout in seconds. It is used as a modulus, so
        it must be non-zero.
    :param list group_metrics: List of group metrics

    :return: (updated current, List of (group, divergent_time) tuples)
    """
    converged, diverged = partition_bool(
        lambda gm: gm.actual + gm.pending == gm.desired, group_metrics)

    # Forget every group that has converged or no longer exists.
    deleted = set(current.keys()) - metrics_set(group_metrics)
    tracked = current.copy()
    for key in metrics_set(converged) | deleted:
        tracked.pop(key, None)

    now = clock.seconds()
    to_log = []
    newly_tracked = {}
    for gm in diverged:
        key = (gm.tenant_id, gm.group_id)
        fingerprint = hash((gm.desired, gm.actual, gm.pending))

        if key not in tracked:
            # First sighting of this divergence: start the clock.
            newly_tracked[key] = (now, fingerprint)
            continue

        since, known_fingerprint = tracked[key]
        if known_fingerprint != fingerprint:
            # The group is still divergent but its numbers moved, so it is
            # not "unchanged"; stop tracking it.
            del tracked[key]
            continue

        elapsed = now - since
        # Log on intervals of timeout. For example, if timeout is 1 hr then
        # log every hour it remains diverged (within a 60 second window).
        if elapsed > timeout and elapsed % timeout <= 60:
            to_log.append((gm, elapsed))

    return merge(tracked, newly_tracked), to_log
def converge_launch_server(desired_state, servers_with_cheese,
                           load_balancer_nodes, load_balancers,
                           now, timeout=3600):
    """
    Work out the steps that move a server group from the observed state to
    the :obj:`DesiredServerGroupState` described by ``desired_state``.

    Servers are bucketed by their destiny (see ``get_destiny``). The plan
    then creates missing servers, drains and deletes surplus ones, deletes
    errored or timed-out ones, cleans up their load balancer nodes, and
    converges the load balancer state of the active servers that remain.
    If draining is unavailable, the plan is a single failure step.

    :param DesiredServerGroupState desired_state: The desired group state.
    :param servers_with_cheese: an iterable of :obj:`NovaServer` instances.
        This must only contain servers that are being managed for the
        specified group.
    :param load_balancer_nodes: a set of :obj:`ILBNode` providers. This
        must contain all the load balancer mappings for all the load
        balancers (of all types) on the tenant.
    :param dict load_balancers: Collection of load balancer objects accessed
        based on its ID. The object is opaque and is not used by planner
        directly. It is intended to contain extra info for specific LB
        provider
    :param float now: number of seconds since the POSIX epoch indicating the
        time at which the convergence was requested.
    :param float timeout: Number of seconds after which we will delete a
        server in BUILD.

    :rtype: :obj:`pbag` of `IStep`
    """
    def nodes_of(server):
        # All load balancer nodes that belong to the given server.
        return [node for node in load_balancer_nodes if node.matches(server)]

    # Newest first, so that the tail of any bucket holds the oldest servers.
    by_age = sorted(servers_with_cheese,
                    key=lambda s: s.created, reverse=True)
    by_destiny = defaultdict(list, groupby(get_destiny, by_age))

    active = by_destiny[Destiny.CONSIDER_AVAILABLE]
    stuck_building, still_building = partition_bool(
        lambda server: now - server.created >= timeout,
        by_destiny[Destiny.WAIT_WITH_TIMEOUT])

    # Servers that count towards capacity, most worth keeping first. Slicing
    # off the tail therefore scales down starting with building, then WAIT,
    # then AVOID_REPLACING, then active, preferring older servers.
    keep_order = (active +
                  by_destiny[Destiny.AVOID_REPLACING] +
                  by_destiny[Destiny.WAIT] +
                  still_building)
    surplus = keep_order[desired_state.capacity:]

    def survives(server):
        return server not in surplus

    # Create whatever is missing; a non-positive shortfall yields no steps.
    shortfall = desired_state.capacity - len(keep_order)
    create_steps = (
        [CreateServer(server_config=desired_state.server_config)] * shortfall)

    # Delete any servers that have been building for too long.
    build_timeout_steps = [DeleteServer(server_id=server.id)
                           for server in stuck_building]

    # Drain and delete the surplus, and finish off servers that are already
    # in draining state.
    try:
        scale_down_steps = [
            step
            for server in surplus + by_destiny[Destiny.DRAIN]
            for step in _drain_and_delete(
                server, desired_state.draining_timeout,
                nodes_of(server), now)
        ]
    except DrainingUnavailable as de:
        return pbag([fail_convergence(de)])

    # Delete all servers in error - draining does not need to be handled
    # because servers in error presumably are not serving traffic anyway.
    doomed = by_destiny[Destiny.DELETE]
    error_steps = [DeleteServer(server_id=server.id) for server in doomed]

    # Clean up all the load balancers from deleted and errored servers.
    cleanup_steps = [
        remove_node_from_lb(lb_node)
        for server in doomed + by_destiny[Destiny.CLEANUP]
        for lb_node in nodes_of(server)
    ]

    # Converge all the servers that remain to their desired load balancer
    # state.
    remaining_active = [server for server in active if survives(server)]
    try:
        lb_steps = [
            step
            for server in remaining_active
            for step in _converge_lb_state(
                server,
                nodes_of(server),
                load_balancers,
                now,
                # Temporarily using build timeout as node offline timeout.
                # See https://github.com/rackerlabs/otter/issues/1905
                timeout)
        ]
    except DrainingUnavailable as de:
        return pbag([fail_convergence(de)])

    # Converge again if we expect state transitions on any servers.
    later_steps = []
    if any(survives(server) for server in still_building):
        later_steps = [
            ConvergeLater(reasons=[ErrorReason.String('waiting for servers')])]

    waiting_message = ('Waiting for server {server_id} to transition to '
                       'ACTIVE from {status}')
    waiting_reasons = [
        ErrorReason.UserMessage(
            waiting_message.format(server_id=s.id, status=s.state.name))
        for s in by_destiny[Destiny.WAIT]
        if survives(s)
    ]
    if waiting_reasons:
        later_steps.append(
            ConvergeLater(limited=True, reasons=waiting_reasons))

    return pbag(create_steps +
                scale_down_steps +
                error_steps +
                cleanup_steps +
                build_timeout_steps +
                lb_steps +
                later_steps)