def verify(_):
    def check_status():
        check_d = treq.head(
            append_segments(server_endpoint, 'servers', server_id),
            headers=headers(auth_token))
        check_d.addCallback(check_success, [404])
        return check_d

    start_time = clock.seconds()

    # this is treating all errors as transient, so the only error that can
    # occur is a CancelledError from timing out
    verify_d = retry_and_timeout(
        check_status, timeout,
        next_interval=repeating_interval(interval),
        clock=clock)

    def on_success(_):
        time_delete = clock.seconds() - start_time
        del_log.msg('Server deleted successfully: {time_delete} seconds.',
                    time_delete=time_delete)

    verify_d.addCallback(on_success)

    def on_timeout(_):
        time_delete = clock.seconds() - start_time
        del_log.err(None, timeout=timeout, time_delete=time_delete,
                    why=('Server {instance_id} failed to be deleted within '
                         'a {timeout} second timeout (it has been '
                         '{time_delete} seconds).'))

    verify_d.addErrback(on_timeout)
def wait_for_stack_list(self, expected_states, timeout=180, period=10):
    def check(content):
        states = pbag([s['stack_status'] for s in content['stacks']])
        if not (states == expected_states):
            msg("Waiting for group {} to reach desired group state.\n"
                "{} (actual) {} (expected)".format(
                    self.group.group_id, states, expected_states))
            raise TransientRetryError(
                "Group states of {} did not match expected {}".format(
                    states, expected_states))

        msg("Success: desired group state reached:\n{}".format(
            expected_states))
        return self.rcs

    def poll():
        return self.get_stack_list().addCallback(check)

    expected_states = pbag(expected_states)

    return retry_and_timeout(
        poll, timeout,
        can_retry=terminal_errors_except(TransientRetryError),
        next_interval=repeating_interval(period),
        clock=reactor,
        deferred_description=(
            "Waiting for group {} to reach state {}".format(
                self.group.group_id, str(expected_states))))
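# Hedged usage sketch for wait_for_stack_list above (not part of the original
# code): `group_helper` is a placeholder for whatever object defines the
# method.  expected_states can be any iterable of Heat stack statuses; it is
# converted to a pbag, so only the multiset of statuses has to match, not
# their order.
def example_wait_for_stacks(group_helper):
    return group_helper.wait_for_stack_list(
        ['CREATE_COMPLETE', 'CREATE_COMPLETE', 'UPDATE_COMPLETE'],
        timeout=180, period=10)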
def wait_for_state(self, rcs, matcher, timeout=600, period=10, clock=None):
    """
    Wait for the state of the scaling group to match the provided matcher.

    :param rcs: a :class:`otter.integration.lib.resources.TestResources`
        instance
    :param matcher: A :mod:`testtools.matchers` matcher, as specified in
        http://testtools.readthedocs.org/en/latest/api.html.
    :param timeout: The amount of time to wait until this step is considered
        failed.
    :param period: How long to wait before polling again.
    :param clock: a :class:`twisted.internet.interfaces.IReactorTime` provider

    :return: None, if the state is reached
    :raises: :class:`TimedOutError` if the state is never reached within the
        requisite amount of time.

    Example usage:

    ```
    matcher = MatchesAll(
        IncludesServers(included_server_ids),
        ExcludesServers(exclude_server_ids),
        ContainsDict({
            'pending': Equals(0),
            'desired': Equals(5),
            'status': Equals('ACTIVE')
        })
    )

    ..wait_for_state(rcs, matcher, timeout=60)
    ```
    """
    def check(result):
        response, group_state = result
        mismatch = matcher.match(group_state['group'])
        if mismatch:
            msg("Waiting for group {} to reach desired group state.\n"
                "Mismatch: {}".format(self.group_id, mismatch.describe()))
            raise TransientRetryError(mismatch.describe())
        msg("Success: desired group state reached:\n{}\nmatches:\n{}".format(
            group_state['group'], matcher))
        return rcs

    def poll():
        return self.get_scaling_group_state(rcs, [200]).addCallback(check)

    return retry_and_timeout(
        poll, timeout,
        can_retry=terminal_errors_except(TransientRetryError),
        next_interval=repeating_interval(period),
        clock=clock or reactor,
        deferred_description=(
            "Waiting for group {} to reach state {}".format(
                self.group_id, str(matcher))))
def verify(_):
    def check_status():
        check_d = treq.head(
            append_segments(server_endpoint, 'servers', server_id),
            headers=headers(auth_token))
        check_d.addCallback(check_success, [404])
        return check_d

    start_time = clock.seconds()

    timeout_description = (
        "Waiting for Nova to actually delete server {0}".format(server_id))

    verify_d = retry_and_timeout(
        check_status, timeout,
        next_interval=repeating_interval(interval),
        clock=clock,
        deferred_description=timeout_description)

    def on_success(_):
        time_delete = clock.seconds() - start_time
        del_log.msg('Server deleted successfully: {time_delete} seconds.',
                    time_delete=time_delete)

    verify_d.addCallback(on_success)
    verify_d.addErrback(del_log.err)
def retrier(*args, **kwargs):
    return retry_and_timeout(
        partial(f, *args, **kwargs), timeout,
        can_retry=terminal_errors_except(TransientRetryError),
        next_interval=repeating_interval(period),
        clock=clock,
        deferred_description=reason)
def test_repeating_interval_always_returns_interval(self):
    """
    ``repeating_interval`` returns the same interval no matter what the
    failure is.
    """
    next_interval = repeating_interval(3)
    for exception in (DummyException(), NotImplementedError()):
        self.assertEqual(next_interval(Failure(exception)), 3)
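# Hedged sketch (an assumption, not the actual otter.util.retry source):
# repeating_interval presumably just closes over a constant, which is why the
# test above expects the same value back regardless of which Failure it gets.
def repeating_interval_sketch(interval):
    """Return a next_interval function that ignores the failure."""
    def next_interval(failure):
        return interval
    return next_interval

# Example: 3 seconds between retries, whatever the error was.
assert repeating_interval_sketch(3)(None) == 3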
def authenticate_tenant(self, tenant_id, log=None):
    """
    see :meth:`IAuthenticator.authenticate_tenant`
    """
    return retry(
        partial(self._authenticator.authenticate_tenant, tenant_id, log=log),
        can_retry=retry_times(self._max_retries),
        next_interval=repeating_interval(self._retry_interval),
        clock=self._reactor)
def verified_delete(log, server_endpoint, auth_token, server_id,
                    interval=10, timeout=3660, clock=None):
    """
    Attempt to delete a server from the server endpoint, and ensure that it
    is deleted by trying again until deleting/getting the server results in a
    404 or until ``OS-EXT-STS:task_state`` in server details is 'deleting',
    indicating that Nova has acknowledged that the server is to be deleted
    as soon as possible.

    Time out attempting to verify deletes after a period of time and log an
    error.

    :param log: A bound logger.
    :param str server_endpoint: Server endpoint URI.
    :param str auth_token: Keystone Auth token.
    :param str server_id: Opaque nova server id.
    :param int interval: Deletion interval in seconds - how long until
        verifying a delete is retried. Default: 10.
    :param int timeout: Seconds after which the deletion will be logged as a
        failure, if Nova fails to return a 404. Default is 3660, because if
        the server is building, the delete will not happen until immediately
        after it has finished building.

    :return: Deferred that fires when the expected status has been seen.
    """
    serv_log = log.bind(server_id=server_id)
    serv_log.msg('Deleting server')

    if clock is None:  # pragma: no cover
        from twisted.internet import reactor
        clock = reactor

    timeout_description = (
        "Waiting for Nova to actually delete server {0} "
        "(or acknowledge delete)".format(server_id))

    d = retry_and_timeout(
        partial(delete_and_verify, serv_log, server_endpoint, auth_token,
                server_id),
        timeout,
        next_interval=repeating_interval(interval),
        clock=clock,
        deferred_description=timeout_description)

    d.addCallback(log_with_time, clock, serv_log, clock.seconds(),
                  ('Server deleted successfully (or acknowledged by Nova as '
                   'to-be-deleted) : {time_delete} seconds.'), 'time_delete')
    d.addErrback(serv_log.err)
    return d
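# Hedged usage sketch for verified_delete above (names and endpoint are
# placeholders, not from the original code): drive the retry loop with a fake
# clock so the 10-second polling interval does not require real waiting.
from twisted.internet.task import Clock

def example_verified_delete(bound_log):
    clock = Clock()
    d = verified_delete(bound_log, 'http://nova.example.com/v2/123456',
                        'auth-token', 'server-abc',
                        interval=10, timeout=3660, clock=clock)
    # Each advance() triggers another delete_and_verify attempt; the Deferred
    # fires once Nova returns a 404 or acknowledges the delete, and errs out
    # after `timeout` seconds of fake time.
    clock.advance(10)
    return d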
def add_to_load_balancer(log, endpoint, auth_token, lb_config, ip_address,
                         undo, clock=None):
    """
    Add an IP address to a load balancer based on the lb_config.

    TODO: Handle load balancer node metadata.

    :param log: A bound logger
    :param str endpoint: Load balancer endpoint URI.
    :param str auth_token: Keystone Auth Token.
    :param dict lb_config: An lb_config dictionary.
    :param str ip_address: The IP Address of the node to add to the load
        balancer.
    :param IUndoStack undo: An IUndoStack to push any reversible operations
        onto.

    :return: Deferred that fires with the Add Node to load balancer response
        as a dict.
    """
    lb_id = lb_config['loadBalancerId']
    port = lb_config['port']
    path = append_segments(endpoint, 'loadbalancers', str(lb_id), 'nodes')
    lb_log = log.bind(loadbalancer_id=lb_id)

    def add():
        d = treq.post(path,
                      headers=headers(auth_token),
                      data=json.dumps({"nodes": [{"address": ip_address,
                                                  "port": port,
                                                  "condition": "ENABLED",
                                                  "type": "PRIMARY"}]}),
                      log=lb_log)
        d.addCallback(check_success, [200, 202])
        d.addErrback(log_lb_unexpected_errors, path, lb_log, 'add_node')
        return d

    d = retry(
        add,
        can_retry=retry_times(
            config_value('worker.lb_max_retries') or LB_MAX_RETRIES),
        next_interval=repeating_interval(
            config_value('worker.lb_retry_interval') or LB_RETRY_INTERVAL),
        clock=clock)

    def when_done(result):
        lb_log.msg('Added to load balancer')
        undo.push(remove_from_load_balancer, lb_log, endpoint, auth_token,
                  lb_id, result['nodes'][0]['id'])
        return result

    return d.addCallback(treq.json_content).addCallback(when_done)
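# Hedged usage sketch for add_to_load_balancer above; the endpoint, token,
# node IP and load balancer id are placeholders, and `undo_stack` is assumed
# to be an IUndoStack implementation from this codebase.
example_lb_config = {'loadBalancerId': 12345, 'port': 80}

def example_add_node(bound_log, undo_stack):
    return add_to_load_balancer(
        bound_log, 'https://lb.example.com/v1.0/123456', 'auth-token',
        example_lb_config, '10.0.0.4', undo_stack)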
def wait_for_servers(rcs, pool, matcher, group=None, timeout=600, period=10,
                     clock=None, _treq=treq):
    """
    Wait until Nova reaches a particular state (as described by the given
    matcher) - if a group is provided, then match only the servers for the
    given group.

    :param rcs: an instance of
        :class:`otter.integration.lib.resources.TestResources`
    :param pool: a :class:`twisted.web.client.HTTPConnectionPool`
    :param matcher: a :mod:`testtools.matchers` matcher that describes the
        desired state of the servers belonging to the autoscaling group.
    :param group: a :class:`otter.integration.lib.autoscale.ScalingGroup`
        that specifies which autoscaling group's servers we are looking at.
        This group should already exist, and have a `group_id` attribute.
        If not provided, the matcher will apply to all servers.
    """
    message = "Waiting for {0} Nova servers".format(
        "all" if group is None else "group {0}'s".format(group.group_id))

    @inlineCallbacks
    def do_work():
        servers = yield list_servers(rcs, pool, _treq=_treq)
        servers = servers['servers']
        if group is not None:
            servers = [
                server for server in servers
                if (group.group_id == server['metadata'].get(
                    "rax:autoscale:group:id", None))
            ]
        mismatch = matcher.match(servers)
        if mismatch:
            msg("{0}.\nMismatch: {1}".format(message, mismatch.describe()))
            raise TransientRetryError(mismatch.describe())
        returnValue(servers)

    return retry_and_timeout(
        do_work, timeout,
        can_retry=terminal_errors_except(TransientRetryError),
        next_interval=repeating_interval(period),
        clock=clock or reactor,
        deferred_description=("{0} to reach state {1}".format(
            message, str(matcher))))
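# Hedged usage sketch for wait_for_servers above, assuming the standard
# testtools matchers; the group size of 3 and the ACTIVE status check are
# illustrative only.
from testtools.matchers import (
    AfterPreprocessing, AllMatch, Equals, HasLength, MatchesAll)

def example_wait_for_group_servers(rcs, pool, group):
    matcher = MatchesAll(
        HasLength(3),
        AllMatch(AfterPreprocessing(lambda s: s['status'], Equals('ACTIVE'))))
    return wait_for_servers(rcs, pool, matcher, group=group,
                            timeout=600, period=10)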
def test_retry_sequence_fails_if_mismatch_sequence(self):
    """
    Fail if the wrong number of performers are given.
    """
    r = Retry(
        effect=Effect(1),
        should_retry=ShouldDelayAndRetry(
            can_retry=retry_times(5),
            next_interval=repeating_interval(10)))
    seq = [
        retry_sequence(r, [lambda _: raise_(Exception()),
                           lambda _: raise_(Exception())])
    ]
    self.assertRaises(AssertionError, perform_sequence, seq, Effect(r))
def test_retry_sequence_retries_without_delays(self):
    """
    Perform the wrapped effect with the performers given, without any delay
    even if the original intent had a delay.
    """
    r = Retry(
        effect=Effect(1),
        should_retry=ShouldDelayAndRetry(
            can_retry=retry_times(5),
            next_interval=repeating_interval(10)))
    seq = [
        retry_sequence(r, [lambda _: raise_(Exception()),
                           lambda _: raise_(Exception()),
                           lambda _: "yay done"])
    ]
    self.assertEqual(perform_sequence(seq, Effect(r)), "yay done")
def test_do_not_have_to_expect_an_exact_can_retry(self):
    """
    The expected retry intent does not actually have to specify the exact
    ``can_retry`` function, since it might just be a lambda, which is hard
    to compare or hash.
    """
    expected = Retry(effect=Effect(1), should_retry=ANY)
    actual = Retry(effect=Effect(1),
                   should_retry=ShouldDelayAndRetry(
                       can_retry=lambda _: False,
                       next_interval=repeating_interval(10)))
    seq = [
        retry_sequence(expected, [lambda _: raise_(Exception())])
    ]
    self.assertRaises(Exception, perform_sequence, seq, Effect(actual))
def test_fallback(self):
    """
    Accept a ``fallback`` dispatcher that will be used if a performer
    returns an effect for an intent that is not covered by the base
    dispatcher.
    """
    def dispatch_2(intent):
        if intent == 2:
            return sync_performer(lambda d, i: "yay done")

    r = Retry(
        effect=Effect(1),
        should_retry=ShouldDelayAndRetry(
            can_retry=retry_times(5),
            next_interval=repeating_interval(10)))
    seq = [
        retry_sequence(r, [lambda _: Effect(2)],
                       fallback_dispatcher=ComposedDispatcher(
                           [dispatch_2, base_dispatcher]))
    ]
    self.assertEqual(perform_sequence(seq, Effect(r)), "yay done")
def delete(self, rcs):
    """
    Delete the server.

    :param rcs: an instance of
        :class:`otter.integration.lib.resources.TestResources`
    """
    def try_delete():
        d = self.treq.delete(
            "{}/servers/{}".format(rcs.endpoints["nova"], self.id),
            headers=headers(str(rcs.token)),
            pool=self.pool)
        d.addCallback(check_success, [404], _treq=self.treq)
        d.addCallback(self.treq.content)
        return d

    return retry_and_timeout(
        try_delete, 120,
        can_retry=terminal_errors_except(APIError),
        next_interval=repeating_interval(5),
        clock=self.clock,
        deferred_description=(
            "Waiting for server {} to get deleted".format(self.id)))
def create_server(server_endpoint, auth_token, server_config, log=None,
                  clock=None, retries=3, create_failure_delay=5, _treq=None):
    """
    Create a new server.  If there is an error from Nova from this call,
    checks to see if the server was created anyway.  If not, will retry the
    create ``retries`` times (checking each time if a server was created).

    If the error from Nova is a 400, does not retry, because that implies
    that retrying will just result in another 400 (bad args).

    If checking to see if the server is created also results in a failure,
    does not retry because there might just be something wrong with Nova.

    :param str server_endpoint: Server endpoint URI.
    :param str auth_token: Keystone Auth Token.
    :param dict server_config: Nova server config.
    :param int retries: Number of tries to retry the create.
    :param int create_failure_delay: how much time in seconds to wait after
        a create server failure before checking Nova to see if a server
        was created

    :param log: logger
    :type log: :class:`otter.log.bound.BoundLog`

    :param _treq: To be used for testing - what treq object to use
    :type _treq: something with the same api as :obj:`treq`

    :return: Deferred that fires with the CreateServer response as a dict.
    """
    path = append_segments(server_endpoint, 'servers')

    if _treq is None:  # pragma: no cover
        _treq = treq
    if clock is None:  # pragma: no cover
        from twisted.internet import reactor
        clock = reactor

    def _check_results(result, propagated_f):
        """
        Return the original failure, if checking a server resulted in a
        failure too.  Returns a wrapped propagated failure, if there were no
        servers created, so that the retry utility knows that server creation
        can be retried.
        """
        if isinstance(result, Failure):
            log.msg(
                "Attempt to find a created server in nova resulted in "
                "{failure}. Propagating the original create error instead.",
                failure=result)
            return propagated_f

        if result is None:
            raise _NoCreatedServerFound(propagated_f)

        return result

    def _check_server_created(f):
        """
        If creating a server failed with anything other than a 400, see if
        Nova created a server anyway (a 400 means that the server creation
        args were bad, and there is no point in retrying).

        If Nova created a server, just return it and pretend that the error
        never happened.  If it didn't, or if checking resulted in another
        failure response, return a failure of some type.
        """
        f.trap(APIError)

        if f.value.code == 400:
            return f

        d = deferLater(clock, create_failure_delay, find_server,
                       server_endpoint, auth_token, server_config, log=log)
        d.addBoth(_check_results, f)

        return d

    def _create_with_delay(to_delay):
        d = _treq.post(path, headers=headers(auth_token),
                       data=json.dumps({'server': server_config}), log=log)
        if to_delay:
            # Add 1 second delay to space 1 second between server creations
            d.addCallback(delay, clock, 1)
        return d

    def _create_server():
        """
        Attempt to create a server, handling spurious non-400 errors from
        Nova by seeing if Nova created a server anyway in spite of the error.
        If so then create server succeeded.

        If not, and if no further errors occur, server creation can be
        retried.
        """
        sem = get_sempahore("create_server", "worker.create_server_limit")
        if sem is not None:
            d = sem.run(_create_with_delay, True)
        else:
            d = _create_with_delay(False)
        d.addCallback(check_success, [202], _treq=_treq)
        d.addCallback(_treq.json_content)
        d.addErrback(_check_server_created)
        return d

    def _unwrap_NoCreatedServerFound(f):
        """
        The original failure was wrapped in a :class:`_NoCreatedServerFound`
        for ease of retry, but that should not be the final error propagated
        up by :func:`create_server`.

        This errback unwraps the :class:`_NoCreatedServerFound` error and
        returns the original failure.
        """
        f.trap(_NoCreatedServerFound)
        return f.value.original

    d = retry(_create_server,
              can_retry=compose_retries(
                  retry_times(retries),
                  terminal_errors_except(_NoCreatedServerFound)),
              next_interval=repeating_interval(15),
              clock=clock)

    d.addErrback(_unwrap_NoCreatedServerFound)
    d.addErrback(wrap_request_error, path, 'server_create')

    return d
def wait_for_active(log, server_endpoint, auth_token, server_id,
                    interval=20, timeout=7200, clock=None):
    """
    Wait until the server specified by server_id's status is 'ACTIVE'

    :param log: A bound logger.
    :param str server_endpoint: Server endpoint URI.
    :param str auth_token: Keystone Auth token.
    :param str server_id: Opaque nova server id.
    :param int interval: Polling interval in seconds. Default: 20.
    :param int timeout: timeout to poll for the server status in seconds.
        Default 7200 (2 hours).

    :return: Deferred that fires when the expected status has been seen.
    """
    log.msg("Checking instance status every {interval} seconds",
            interval=interval)

    if clock is None:  # pragma: no cover
        from twisted.internet import reactor
        clock = reactor

    start_time = clock.seconds()

    def poll():
        def check_status(server):
            status = server['server']['status']
            time_building = clock.seconds() - start_time

            if status == 'ACTIVE':
                log.msg(("Server changed from 'BUILD' to 'ACTIVE' within "
                         "{time_building} seconds"),
                        time_building=time_building)
                return server

            elif status != 'BUILD':
                log.msg("Server changed to '{status}' in {time_building} "
                        "seconds", time_building=time_building,
                        status=status)
                raise UnexpectedServerStatus(server_id, status, 'ACTIVE')

            else:
                raise TransientRetryError()  # just poll again

        sd = server_details(server_endpoint, auth_token, server_id, log=log)
        sd.addCallback(check_status)
        return sd

    timeout_description = ("Waiting for server <{0}> to change from BUILD "
                           "state to ACTIVE state").format(server_id)

    return retry_and_timeout(
        poll, timeout,
        can_retry=transient_errors_except(UnexpectedServerStatus,
                                          ServerDeleted),
        next_interval=repeating_interval(interval),
        clock=clock,
        deferred_description=timeout_description)
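# Hedged usage sketch for wait_for_active above; endpoint, token and server id
# are placeholders.  The Deferred fires with the server details once Nova
# reports ACTIVE, fails with UnexpectedServerStatus for any other terminal
# state, and times out after two hours of polling every 20 seconds.
def example_wait_for_active(bound_log):
    return wait_for_active(
        bound_log, 'http://nova.example.com/v2/123456', 'auth-token',
        'server-abc', interval=20, timeout=7200)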
def launch_server(log, request_bag, scaling_group, launch_config, undo,
                  clock=None):
    """
    Launch a new server given the launch config auth tokens and service
    catalog. Possibly adding the newly launched server to a load balancer.

    :param BoundLog log: A bound logger.
    :param request_bag: An object with a bunch of useful data on it,
        including a callable to re-auth and get a new token.
    :param IScalingGroup scaling_group: The scaling group to add the launched
        server to.
    :param dict launch_config: A launch_config args structure as defined for
        the launch_server_v1 type.
    :param IUndoStack undo: The stack that will be rewound if undo fails.

    :return: Deferred that fires with a 2-tuple of server details and the
        list of load balancer responses from add_to_load_balancers.
    """
    launch_config = prepare_launch_config(scaling_group.uuid, launch_config)

    cloudServersOpenStack = config_value('cloudServersOpenStack')
    server_endpoint = public_endpoint_url(request_bag.service_catalog,
                                          cloudServersOpenStack,
                                          request_bag.region)

    lb_config = launch_config.get('loadBalancers', [])
    server_config = launch_config['server']

    log = log.bind(server_name=server_config['name'])
    ilog = [None]

    def check_metadata(server):
        # sanity check to make sure the metadata didn't change - can probably
        # be removed after a while if we do not see any log messages from
        # this function
        expected = launch_config['server']['metadata']
        result = server['server'].get('metadata')
        if result != expected:
            ilog[0].msg('Server metadata has changed.',
                        sanity_check=True,
                        expected_metadata=expected,
                        nova_metadata=result)
        return server

    def wait_for_server(server, new_request_bag):
        server_id = server['server']['id']

        # NOTE: If server create is retried, each server delete will be
        # pushed to undo stack even after it will be deleted in check_error
        # which is fine since verified_delete succeeds on deleted server
        undo.push(
            verified_delete, log, server_endpoint, new_request_bag, server_id)

        ilog[0] = log.bind(server_id=server_id)
        return wait_for_active(
            ilog[0],
            server_endpoint,
            new_request_bag.auth_token,
            server_id).addCallback(check_metadata)

    def add_lb(server, new_request_bag):
        if lb_config:
            lbd = add_to_load_balancers(
                ilog[0], new_request_bag, lb_config, server, undo)
            lbd.addCallback(lambda lb_response: (server, lb_response))
            return lbd

        return (server, [])

    def _real_create_server(new_request_bag):
        auth_token = new_request_bag.auth_token
        d = create_server(server_endpoint, auth_token, server_config, log=log)
        d.addCallback(wait_for_server, new_request_bag)
        d.addCallback(add_lb, new_request_bag)
        return d

    def _create_server():
        return request_bag.re_auth().addCallback(_real_create_server)

    def check_error(f):
        f.trap(UnexpectedServerStatus)
        if f.value.status == 'ERROR':
            log.msg('{server_id} errored, deleting and creating new '
                    'server instead', server_id=f.value.server_id)
            # trigger server delete and return True to allow retry
            verified_delete(log, server_endpoint, request_bag,
                            f.value.server_id)
            return True
        else:
            return False

    d = retry(_create_server,
              can_retry=compose_retries(retry_times(3), check_error),
              next_interval=repeating_interval(15),
              clock=clock)

    return d
def launch_server(log, region, scaling_group, service_catalog, auth_token,
                  launch_config, undo, clock=None):
    """
    Launch a new server given the launch config auth tokens and service
    catalog. Possibly adding the newly launched server to a load balancer.

    :param BoundLog log: A bound logger.
    :param str region: A rackspace region as found in the service catalog.
    :param IScalingGroup scaling_group: The scaling group to add the launched
        server to.
    :param list service_catalog: A list of services as returned by the auth
        apis.
    :param str auth_token: The user's auth token.
    :param dict launch_config: A launch_config args structure as defined for
        the launch_server_v1 type.
    :param IUndoStack undo: The stack that will be rewound if undo fails.

    :return: Deferred that fires with a 2-tuple of server details and the
        list of load balancer responses from add_to_load_balancers.
    """
    launch_config = prepare_launch_config(scaling_group.uuid, launch_config)

    lb_region = config_value('regionOverrides.cloudLoadBalancers') or region
    cloudLoadBalancers = config_value('cloudLoadBalancers')
    cloudServersOpenStack = config_value('cloudServersOpenStack')

    lb_endpoint = public_endpoint_url(service_catalog,
                                      cloudLoadBalancers,
                                      lb_region)

    server_endpoint = public_endpoint_url(service_catalog,
                                          cloudServersOpenStack,
                                          region)

    lb_config = launch_config.get('loadBalancers', [])

    server_config = launch_config['server']

    log = log.bind(server_name=server_config['name'])
    ilog = [None]

    def wait_for_server(server):
        server_id = server['server']['id']

        # NOTE: If server create is retried, each server delete will be
        # pushed to undo stack even after it will be deleted in check_error
        # which is fine since verified_delete succeeds on deleted server
        undo.push(
            verified_delete, log, server_endpoint, auth_token, server_id)

        ilog[0] = log.bind(server_id=server_id)
        return wait_for_active(
            ilog[0],
            server_endpoint,
            auth_token,
            server_id)

    def add_lb(server):
        ip_address = private_ip_addresses(server)[0]
        lbd = add_to_load_balancers(
            ilog[0], lb_endpoint, auth_token, lb_config, ip_address, undo)
        lbd.addCallback(lambda lb_response: (server, lb_response))
        return lbd

    def _create_server():
        d = create_server(server_endpoint, auth_token, server_config, log=log)
        d.addCallback(wait_for_server)
        d.addCallback(add_lb)
        return d

    def check_error(f):
        f.trap(UnexpectedServerStatus)
        if f.value.status == 'ERROR':
            log.msg('{server_id} errored, deleting and creating new server '
                    'instead', server_id=f.value.server_id)
            # trigger server delete and return True to allow retry
            verified_delete(log, server_endpoint, auth_token,
                            f.value.server_id)
            return True
        else:
            return False

    d = retry(_create_server,
              can_retry=compose_retries(retry_times(3), check_error),
              next_interval=repeating_interval(15),
              clock=clock)

    return d
def verified_delete(log, server_endpoint, auth_token, server_id,
                    interval=10, timeout=3660, clock=None):
    """
    Attempt to delete a server from the server endpoint, and ensure that it
    is deleted by trying again until deleting the server results in a 404.

    Time out attempting to verify deletes after a period of time and log an
    error.

    :param log: A bound logger.
    :param str server_endpoint: Server endpoint URI.
    :param str auth_token: Keystone Auth token.
    :param str server_id: Opaque nova server id.
    :param int interval: Deletion interval in seconds - how long until
        verifying a delete is retried. Default: 10.
    :param int timeout: Seconds after which the deletion will be logged as a
        failure, if Nova fails to return a 404. Default is 3660, because if
        the server is building, the delete will not happen until immediately
        after it has finished building.

    :return: Deferred that fires when the expected status has been seen.
    """
    serv_log = log.bind(server_id=server_id)
    serv_log.msg('Deleting server')
    path = append_segments(server_endpoint, 'servers', server_id)

    if clock is None:  # pragma: no cover
        from twisted.internet import reactor
        clock = reactor

    # just delete over and over until a 404 is received
    def delete():
        del_d = treq.delete(path, headers=headers(auth_token), log=serv_log)
        del_d.addCallback(check_success, [404])
        del_d.addCallback(treq.content)
        return del_d

    start_time = clock.seconds()

    timeout_description = (
        "Waiting for Nova to actually delete server {0}".format(server_id))

    d = retry_and_timeout(delete, timeout,
                          next_interval=repeating_interval(interval),
                          clock=clock,
                          deferred_description=timeout_description)

    def on_success(_):
        time_delete = clock.seconds() - start_time
        serv_log.msg('Server deleted successfully: {time_delete} seconds.',
                     time_delete=time_delete)

    d.addCallback(on_success)
    d.addErrback(serv_log.err)
    return d
def wait_for_active(log, server_endpoint, auth_token, server_id,
                    interval=5, timeout=3600, clock=None):
    """
    Wait until the server specified by server_id's status is 'ACTIVE'

    :param log: A bound logger.
    :param str server_endpoint: Server endpoint URI.
    :param str auth_token: Keystone Auth token.
    :param str server_id: Opaque nova server id.
    :param int interval: Polling interval in seconds. Default: 5.
    :param int timeout: timeout to poll for the server status in seconds.
        Default 3600 (1 hour)

    :return: Deferred that fires when the expected status has been seen.
    """
    log.msg("Checking instance status every {interval} seconds",
            interval=interval)

    if clock is None:  # pragma: no cover
        from twisted.internet import reactor
        clock = reactor

    start_time = clock.seconds()

    def poll():
        def check_status(server):
            status = server['server']['status']
            if status == 'ACTIVE':
                time_building = clock.seconds() - start_time
                log.msg(("Server changed from 'BUILD' to 'ACTIVE' within "
                         "{time_building} seconds"),
                        time_building=time_building)
                return server

            elif status != 'BUILD':
                raise UnexpectedServerStatus(server_id, status, 'ACTIVE')

            else:
                raise TransientRetryError()  # just poll again

        sd = server_details(server_endpoint, auth_token, server_id)
        sd.addCallback(check_status)
        return sd

    d = retry_and_timeout(
        poll, timeout,
        can_retry=transient_errors_except(UnexpectedServerStatus),
        next_interval=repeating_interval(interval),
        clock=clock)

    def on_error(f):
        if f.check(CancelledError):
            time_building = clock.seconds() - start_time
            log.msg(('Server {instance_id} failed to change from BUILD state '
                     'to ACTIVE within a {timeout} second timeout (it has '
                     'been {time_building} seconds).'),
                    timeout=timeout, time_building=time_building)
        return f

    d.addErrback(on_error)

    return d