def node_feed_req(lb_id, node_id, response):
    """
    Return an (intent, performer) sequence for getting a CLB node's feed,
    wrapped in a retry intent.

    :param lb_id: Load balancer ID
    :param node_id: LB node ID
    :param response: The response returned when getting the CLB node feed.
        It is either a string containing the feed or an Exception object
        that will be raised when getting the feed
    :return: (intent, performer) tuple
    """
    if isinstance(response, Exception):
        def handler(i):
            raise response
    else:
        def handler(i):
            return response
    return (
        Retry(
            effect=mock.ANY,
            should_retry=ShouldDelayAndRetry(
                can_retry=retry_times(5),
                next_interval=exponential_backoff_interval(2))),
        nested_sequence([(("gcnf", lb_id, node_id), handler)]))
def lb_req(url, json_response, response):
    """
    Return a SequenceDispatcher two-tuple that matches a service request to a
    particular load balancer endpoint (using GET), and returns the given
    ``response`` as the content in an HTTP 200 ``StubResponse``.
    """
    if isinstance(response, Exception):
        def handler(i):
            raise response
        log_seq = []
    else:
        def handler(i):
            return (StubResponse(200, {}), response)
        log_seq = [(Log(mock.ANY, mock.ANY), lambda i: None)]
    return (
        Retry(
            effect=mock.ANY,
            should_retry=ShouldDelayAndRetry(
                can_retry=retry_times(5),
                next_interval=exponential_backoff_interval(2))),
        nested_sequence([
            (service_request(
                ServiceType.CLOUD_LOAD_BALANCERS, 'GET', url,
                json_response=json_response).intent,
             handler)
        ] + log_seq))
def test_retry_times(self):
    """
    `retry_times` returns a function that will retry the given number of
    times, regardless of the type of failure.
    """
    can_retry = retry_times(3)
    for exception in (DummyException(), NotImplementedError(), ValueError()):
        self.assertTrue(can_retry(Failure(exception)))
    self.assertFalse(can_retry(Failure(DummyException())))
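The test above pins down the contract of `retry_times` without depending on its internals: the returned predicate ignores the failure type and only counts attempts. A minimal sketch of a predicate with that contract (an assumption; the real `otter.util.retry` implementation may track state differently):

def retry_times(max_retries):
    """Hypothetical sketch: return a ``can_retry`` predicate that allows
    ``max_retries`` retries, ignoring what kind of failure occurred."""
    counter = [0]  # mutable cell so the closure can update it

    def can_retry(failure):
        counter[0] += 1
        return counter[0] <= max_retries

    return can_retry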
def authenticate_tenant(self, tenant_id, log=None):
    """
    see :meth:`IAuthenticator.authenticate_tenant`
    """
    return retry(
        partial(self._authenticator.authenticate_tenant, tenant_id, log=log),
        can_retry=retry_times(self._max_retries),
        next_interval=repeating_interval(self._retry_interval),
        clock=self._reactor)
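`repeating_interval`, used above, only has to produce a constant delay. A minimal sketch, assuming the interval function receives the failure and returns seconds to wait:

def repeating_interval(interval):
    """Hypothetical sketch: every retry waits the same fixed delay."""
    def next_interval(failure):
        return interval  # seconds until the next attempt
    return next_interval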
def convergence_remove_server_from_group(
        log, transaction_id, server_id, replace, purge, group, state):
    """
    Remove a specific server from the group, optionally decrementing the
    desired capacity.

    The server may just be scheduled for deletion, or it may be evicted from
    the group by removing otter-specific metadata from the server.

    :param log: A bound logger
    :param bytes transaction_id: The transaction id for this operation.
    :param bytes server_id: The id of the server to be removed.
    :param bool replace: Should the server be replaced?
    :param bool purge: Should the server be deleted from Nova?
    :param group: The scaling group to remove a server from.
    :type group: :class:`~otter.models.interface.IScalingGroup`
    :param state: The current state of the group.
    :type state: :class:`~otter.models.interface.GroupState`

    :return: The updated state.
    :rtype: Effect of :class:`~otter.models.interface.GroupState`

    :raise: :class:`CannotDeleteServerBelowMinError` if the server cannot be
        deleted without replacement, and :class:`ServerNotFoundError` if
        there is no such server to be deleted.
    """
    effects = [_is_server_in_group(group, server_id)]
    if not replace:
        effects.append(_can_scale_down(group, server_id))

    # the (possibly) two checks can happen in parallel, but we want
    # ServerNotFoundError to take precedence over
    # CannotDeleteServerBelowMinError
    both_checks = yield parallel_all_errors(effects)
    for is_error, result in both_checks:
        if is_error:
            reraise(*result)

    # Remove the server
    if purge:
        eff = set_nova_metadata_item(server_id, *DRAINING_METADATA)
    else:
        eff = Effect(
            EvictServerFromScalingGroup(log=log,
                                        transaction_id=transaction_id,
                                        scaling_group=group,
                                        server_id=server_id))
    yield Effect(
        TenantScope(
            retry_effect(eff, retry_times(3),
                         exponential_backoff_interval(2)),
            group.tenant_id))

    if not replace:
        yield do_return(assoc_obj(state, desired=state.desired - 1))
    else:
        yield do_return(state)
def remove_from_load_balancer(log, endpoint, auth_token, loadbalancer_id,
                              node_id, clock=None):
    """
    Remove a node from a load balancer.

    :param str endpoint: Load balancer endpoint URI.
    :param str auth_token: Keystone Auth Token.
    :param str loadbalancer_id: The ID for a cloud load balancer.
    :param str node_id: The ID for a node in that cloud load balancer.

    :returns: A Deferred that fires with None if the operation completed
        successfully, or errbacks with a RequestError.
    """
    lb_log = log.bind(loadbalancer_id=loadbalancer_id, node_id=node_id)
    # TODO: Will remove this once LB ERROR state is fixed and it is working
    # fine
    lb_log.msg('Removing from load balancer')
    path = append_segments(endpoint, 'loadbalancers', str(loadbalancer_id),
                           'nodes', str(node_id))

    def check_422_deleted(failure):
        # A LB being deleted sometimes results in a 422. This function
        # unfortunately has to parse the body of the message to see if this
        # is an acceptable 422 (if the LB has been deleted or the node has
        # already been removed, then 'removing from load balancer' as a task
        # should be successful - if the LB is in ERROR, then nothing more can
        # be done to it except resetting it - may as well remove the server.)
        failure.trap(APIError)
        error = failure.value
        if error.code == 422:
            message = json.loads(error.body)['message']
            if ('load balancer is deleted' not in message and
                    'PENDING_DELETE' not in message):
                return failure
            lb_log.msg(message)
        else:
            return failure

    def remove():
        d = treq.delete(path, headers=headers(auth_token), log=lb_log)
        # Success is 200/202.  An LB not being found is a 404.  A node not
        # being found is a 404.  But a deleted LB sometimes results in a 422.
        d.addCallback(log_on_response_code, lb_log,
                      'Node to delete does not exist', 404)
        d.addCallback(check_success, [200, 202, 404])
        # To avoid https://twistedmatrix.com/trac/ticket/6751
        d.addCallback(treq.content)
        d.addErrback(check_422_deleted)
        d.addErrback(log_lb_unexpected_errors, path, lb_log, 'remove_node')
        return d

    d = retry(
        remove,
        can_retry=retry_times(
            config_value('worker.lb_max_retries') or LB_MAX_RETRIES),
        next_interval=random_interval(
            *(config_value('worker.lb_retry_interval_range') or
              LB_RETRY_INTERVAL_RANGE)),
        clock=clock)
    d.addCallback(lambda _: lb_log.msg('Removed from load balancer'))
    return d
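The `random_interval` used above (instead of a fixed delay) keeps many workers that fail at the same moment from retrying in lockstep against the load balancer. A plausible sketch, assuming a uniform distribution over the configured range:

import random

def random_interval(minimum, maximum):
    """Hypothetical sketch: wait a uniformly random delay between
    ``minimum`` and ``maximum`` seconds before each retry, which spreads
    retry storms out across workers."""
    def next_interval(failure):
        return random.uniform(minimum, maximum)
    return next_interval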
def add_to_load_balancer(log, endpoint, auth_token, lb_config, ip_address,
                         undo, clock=None):
    """
    Add an IP address to a load balancer based on the lb_config.

    TODO: Handle load balancer node metadata.

    :param log: A bound logger
    :param str endpoint: Load balancer endpoint URI.
    :param str auth_token: Keystone Auth Token.
    :param dict lb_config: An lb_config dictionary.
    :param str ip_address: The IP address of the node to add to the load
        balancer.
    :param IUndoStack undo: An IUndoStack to push any reversible operations
        onto.

    :return: Deferred that fires with the Add Node to load balancer response
        as a dict.
    """
    lb_id = lb_config['loadBalancerId']
    port = lb_config['port']
    path = append_segments(endpoint, 'loadbalancers', str(lb_id), 'nodes')
    lb_log = log.bind(loadbalancer_id=lb_id, ip_address=ip_address)

    def add():
        d = treq.post(path,
                      headers=headers(auth_token),
                      data=json.dumps({"nodes": [{"address": ip_address,
                                                  "port": port,
                                                  "condition": "ENABLED",
                                                  "type": "PRIMARY"}]}),
                      log=lb_log)
        d.addCallback(check_success, [200, 202])
        d.addErrback(log_lb_unexpected_errors, lb_log, 'add_node')
        d.addErrback(wrap_request_error, path, 'add_node')
        d.addErrback(check_deleted_clb, lb_id)
        return d

    d = retry(
        add,
        can_retry=compose_retries(
            transient_errors_except(CLBOrNodeDeleted),
            retry_times(config_value('worker.lb_max_retries') or
                        LB_MAX_RETRIES)),
        next_interval=random_interval(
            *(config_value('worker.lb_retry_interval_range') or
              LB_RETRY_INTERVAL_RANGE)),
        clock=clock)

    def when_done(result):
        lb_log.msg('Added to load balancer',
                   node_id=result['nodes'][0]['id'])
        undo.push(remove_from_load_balancer, lb_log, endpoint, auth_token,
                  lb_id, result['nodes'][0]['id'])
        return result

    return d.addCallback(treq.json_content).addCallback(when_done)
def as_effect(self):
    """Produce a :obj:`Effect` to delete a server."""
    eff = retry_effect(
        delete_and_verify(self.server_id), can_retry=retry_times(3),
        next_interval=exponential_backoff_interval(2))

    def report_success(result):
        return StepResult.RETRY, [
            ErrorReason.String(
                'must re-gather after deletion in order to update the '
                'active cache')]

    return eff.on(success=report_success)
def _remove_from_clb(log, endpoint, auth_token, loadbalancer_id, node_id,
                     clock=None):
    """
    Remove a node from a CLB load balancer.

    :param str endpoint: Load balancer endpoint URI.
    :param str auth_token: Keystone authentication token.
    :param str loadbalancer_id: The ID for a Cloud Load Balancer.
    :param str node_id: The ID for a node in that Cloud Load Balancer.

    :returns: A Deferred that fires with None if the operation completed
        successfully, or errbacks with a RequestError.
    """
    lb_log = log.bind(loadbalancer_id=loadbalancer_id, node_id=node_id)
    # TODO: Will remove this once LB ERROR state is fixed and it is working
    # fine
    lb_log.msg('Removing from load balancer')
    path = append_segments(endpoint, 'loadbalancers', str(loadbalancer_id),
                           'nodes', str(node_id))

    def remove():
        d = treq.delete(path, headers=headers(auth_token), log=lb_log)
        d.addCallback(check_success, [200, 202])
        # To avoid https://twistedmatrix.com/trac/ticket/6751
        d.addCallback(treq.content)
        d.addErrback(log_lb_unexpected_errors, lb_log, 'remove_node')
        d.addErrback(wrap_request_error, path, 'remove_node')
        d.addErrback(check_deleted_clb, loadbalancer_id, node_id)
        return d

    d = retry(
        remove,
        can_retry=compose_retries(
            transient_errors_except(CLBOrNodeDeleted),
            retry_times(config_value('worker.lb_max_retries') or
                        LB_MAX_RETRIES)),
        next_interval=random_interval(
            *(config_value('worker.lb_retry_interval_range') or
              LB_RETRY_INTERVAL_RANGE)),
        clock=clock)

    # A node or CLB deleted is considered successful removal
    d.addErrback(
        lambda f: f.trap(CLBOrNodeDeleted) and lb_log.msg(f.value.message))
    d.addCallback(lambda _: lb_log.msg('Removed from load balancer'))
    return d
def test_retry_sequence_fails_if_mismatch_sequence(self):
    """
    Fail if the wrong number of performers are given.
    """
    r = Retry(
        effect=Effect(1),
        should_retry=ShouldDelayAndRetry(
            can_retry=retry_times(5),
            next_interval=repeating_interval(10)))
    seq = [
        retry_sequence(r, [lambda _: raise_(Exception()),
                           lambda _: raise_(Exception())])
    ]
    self.assertRaises(AssertionError, perform_sequence, seq, Effect(r))
def test_retry_sequence_retries_without_delays(self):
    """
    Perform the wrapped effect with the performers given, without any delay
    even if the original intent had a delay.
    """
    r = Retry(
        effect=Effect(1),
        should_retry=ShouldDelayAndRetry(
            can_retry=retry_times(5),
            next_interval=repeating_interval(10)))
    seq = [
        retry_sequence(r, [lambda _: raise_(Exception()),
                           lambda _: raise_(Exception()),
                           lambda _: "yay done"])
    ]
    self.assertEqual(perform_sequence(seq, Effect(r)), "yay done")
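The `raise_` helper used in these tests is presumably just a shim that lets a lambda raise, since plain ``raise`` is a statement and cannot appear in a lambda. Something like:

def raise_(exception):
    """Hypothetical helper: raise ``exception`` from expression context."""
    raise exception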
def add_event(event, admin_tenant_id, region, log):
    """
    Add event to cloud feeds
    """
    event, error, timestamp, event_tenant_id, event_id = sanitize_event(event)
    req = prepare_request(request_format, event, error, timestamp, region,
                          event_tenant_id, event_id)
    eff = retry_effect(
        publish_to_cloudfeeds(req, log=log),
        compose_retries(
            lambda f: (not f.check(APIError) or
                       f.value.code < 400 or
                       f.value.code >= 500),
            retry_times(5)),
        exponential_backoff_interval(2))
    return Effect(TenantScope(tenant_id=admin_tenant_id, effect=eff))
def add_event(event, admin_tenant_id, region, log):
    """
    Add event to cloud feeds
    """
    event, error, timestamp, event_tenant_id, event_id = sanitize_event(event)
    req = prepare_request(request_format, event, error, timestamp, region,
                          event_tenant_id, event_id)
    eff = retry_effect(
        publish_autoscale_event(req, log=log),
        compose_retries(
            lambda f: (not f.check(APIError) or
                       f.value.code < 400 or
                       f.value.code >= 500),
            retry_times(5)),
        exponential_backoff_interval(2))
    return Effect(TenantScope(tenant_id=admin_tenant_id, effect=eff))
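Both versions of `add_event` combine a status-code predicate with `retry_times(5)` via `compose_retries`, so a 4xx `APIError` stops the retries immediately while anything else is retried up to five times. A minimal sketch of that composition, assuming `compose_retries` requires every predicate to approve the retry (logical AND):

def compose_retries(*can_retry_fns):
    """Hypothetical sketch: retry only if *all* predicates agree.

    ``all`` short-circuits, so once one predicate refuses, later ones
    are not consulted.
    """
    def can_retry(failure):
        return all(fn(failure) for fn in can_retry_fns)
    return can_retry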
def verified_delete(log,
                    server_endpoint,
                    request_bag,
                    server_id,
                    exp_start=2,
                    max_retries=10,
                    clock=None):
    """
    Attempt to delete a server from the server endpoint, and ensure that it
    is deleted by trying again until deleting/getting the server results in
    a 404 or until ``OS-EXT-STS:task_state`` in server details is 'deleting',
    indicating that Nova has acknowledged that the server is to be deleted
    as soon as possible.

    Time out attempting to verify deletes after a period of time and log an
    error.

    :param log: A bound logger.
    :param str server_endpoint: Server endpoint URI.
    :param request_bag: An object with a bunch of useful data on it,
        including a callable to re-auth and get a new token.
    :param str server_id: Opaque nova server id.
    :param int exp_start: Exponential backoff interval start seconds.
        Default 2
    :param int max_retries: Maximum number of retry attempts

    :return: Deferred that fires when the expected status has been seen.
    """
    serv_log = log.bind(server_id=server_id)
    serv_log.msg('Deleting server')

    if clock is None:  # pragma: no cover
        from twisted.internet import reactor
        clock = reactor

    d = retry(
        partial(delete_and_verify, serv_log, server_endpoint, request_bag,
                server_id, clock),
        can_retry=retry_times(max_retries),
        next_interval=exponential_backoff_interval(exp_start),
        clock=clock)

    d.addCallback(log_with_time, clock, serv_log, clock.seconds(),
                  ('Server deleted successfully (or acknowledged by Nova as '
                   'to-be-deleted) : {time_delete} seconds.'), 'time_delete')
    return d
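`exponential_backoff_interval(exp_start)` above produces delays of `exp_start`, `2 * exp_start`, `4 * exp_start`, and so on between attempts. A minimal sketch of an interval function with that shape (an assumption; the real implementation may count attempts differently):

def exponential_backoff_interval(start):
    """Hypothetical sketch: double the delay after each failed attempt."""
    attempts = [0]  # mutable cell shared by the closure

    def next_interval(failure):
        attempts[0] += 1
        return start * (2 ** (attempts[0] - 1))  # start, 2*start, 4*start...

    return next_interval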
def _is_server_in_group(group, server_id):
    """
    Given a group and server ID, determines if the server is a member of
    the group.  If it isn't, it raises a :class:`ServerNotFoundError`.
    """
    try:
        response, server_info = yield Effect(
            TenantScope(
                retry_effect(get_server_details(server_id),
                             retry_times(3),
                             exponential_backoff_interval(2)),
                group.tenant_id))
    except NoSuchServerError:
        raise ServerNotFoundError(group.tenant_id, group.uuid, server_id)

    group_id = group_id_from_metadata(
        get_in(('server', 'metadata'), server_info, {}))

    if group_id != group.uuid:
        raise ServerNotFoundError(group.tenant_id, group.uuid, server_id)
def test_delete_server(self, mock_dav):
    """
    :obj:`DeleteServer.as_effect` calls `delete_and_verify` with retries.
    It returns SUCCESS on completion and RETRY on failure.
    """
    mock_dav.side_effect = lambda sid: Effect(sid)
    eff = DeleteServer(server_id='abc123').as_effect()
    self.assertIsInstance(eff.intent, Retry)
    self.assertEqual(
        eff.intent.should_retry,
        ShouldDelayAndRetry(can_retry=retry_times(3),
                            next_interval=exponential_backoff_interval(2)))
    self.assertEqual(eff.intent.effect.intent, 'abc123')
    self.assertEqual(
        resolve_effect(eff, (None, {})),
        (StepResult.RETRY,
         [ErrorReason.String('must re-gather after deletion in order to '
                             'update the active cache')]))
def remove_from_load_balancer(log, endpoint, auth_token, loadbalancer_id,
                              node_id, clock=None):
    """
    Remove a node from a load balancer.

    :param str endpoint: Load balancer endpoint URI.
    :param str auth_token: Keystone Auth Token.
    :param str loadbalancer_id: The ID for a cloud load balancer.
    :param str node_id: The ID for a node in that cloud load balancer.

    :returns: A Deferred that fires with None if the operation completed
        successfully, or errbacks with a RequestError.
    """
    lb_log = log.bind(loadbalancer_id=loadbalancer_id, node_id=node_id)
    # TODO: Will remove this once LB ERROR state is fixed and it is working
    # fine
    lb_log.msg('Removing from load balancer')
    path = append_segments(endpoint, 'loadbalancers', str(loadbalancer_id),
                           'nodes', str(node_id))

    def remove():
        d = treq.delete(path, headers=headers(auth_token), log=lb_log)
        d.addCallback(check_success, [200, 202])
        # To avoid https://twistedmatrix.com/trac/ticket/6751
        d.addCallback(treq.content)
        d.addErrback(log_lb_unexpected_errors, lb_log, 'remove_node')
        d.addErrback(wrap_request_error, path, 'remove_node')
        d.addErrback(check_deleted_clb, loadbalancer_id, node_id)
        return d

    d = retry(
        remove,
        can_retry=compose_retries(
            transient_errors_except(CLBOrNodeDeleted),
            retry_times(config_value('worker.lb_max_retries') or
                        LB_MAX_RETRIES)),
        next_interval=random_interval(
            *(config_value('worker.lb_retry_interval_range') or
              LB_RETRY_INTERVAL_RANGE)),
        clock=clock)

    # A node or CLB deleted is considered successful removal
    d.addErrback(
        lambda f: f.trap(CLBOrNodeDeleted) and lb_log.msg(f.value.message))
    d.addCallback(lambda _: lb_log.msg('Removed from load balancer'))
    return d
def test_fallback(self):
    """
    Accept a ``fallback`` dispatcher that will be used if a performer
    returns an effect for an intent that is not covered by the base
    dispatcher.
    """
    def dispatch_2(intent):
        if intent == 2:
            return sync_performer(lambda d, i: "yay done")

    r = Retry(
        effect=Effect(1),
        should_retry=ShouldDelayAndRetry(
            can_retry=retry_times(5),
            next_interval=repeating_interval(10)))
    seq = [
        retry_sequence(r, [lambda _: Effect(2)],
                       fallback_dispatcher=ComposedDispatcher(
                           [dispatch_2, base_dispatcher]))
    ]
    self.assertEqual(perform_sequence(seq, Effect(r)), "yay done")
def get_all_server_details(tenant_id, authenticator, service_name, region,
                           limit=100, clock=None, _treq=None):
    """
    Return all servers of a tenant.

    TODO: service_name is possibly internal to this function but I don't
    want to pass config here?
    NOTE: This really screams to be an independent txcloud-type API
    """
    token, catalog = yield authenticator.authenticate_tenant(
        tenant_id, log=default_log)
    endpoint = public_endpoint_url(catalog, service_name, region)
    url = append_segments(endpoint, 'servers', 'detail')
    query = {'limit': limit}
    all_servers = []

    if clock is None:  # pragma: no cover
        from twisted.internet import reactor as clock
    if _treq is None:  # pragma: no cover
        _treq = treq

    def fetch(url, headers):
        d = _treq.get(url, headers=headers)
        d.addCallback(check_success, [200], _treq=_treq)
        d.addCallback(_treq.json_content)
        return d

    while True:
        # sort based on query name to make the tests predictable
        urlparams = sorted(query.items(), key=lambda e: e[0])
        d = retry(partial(fetch,
                          '{}?{}'.format(url, urlencode(urlparams)),
                          headers(token)),
                  can_retry=retry_times(5),
                  next_interval=exponential_backoff_interval(2),
                  clock=clock)
        servers = (yield d)['servers']
        all_servers.extend(servers)
        if len(servers) < limit:
            break
        query.update({'marker': servers[-1]['id']})

    defer.returnValue(all_servers)
def launch_server(log, request_bag, scaling_group, launch_config, undo,
                  clock=None):
    """
    Launch a new server given the launch config, auth tokens and service
    catalog.  Possibly adding the newly launched server to a load balancer.

    :param BoundLog log: A bound logger.
    :param request_bag: An object with a bunch of useful data on it,
        including a callable to re-auth and get a new token.
    :param IScalingGroup scaling_group: The scaling group to add the
        launched server to.
    :param dict launch_config: A launch_config args structure as defined for
        the launch_server_v1 type.
    :param IUndoStack undo: The stack that will be rewound if undo fails.

    :return: Deferred that fires with a 2-tuple of server details and the
        list of load balancer responses from add_to_load_balancers.
    """
    launch_config = prepare_launch_config(scaling_group.uuid, launch_config)

    cloudServersOpenStack = config_value('cloudServersOpenStack')
    server_endpoint = public_endpoint_url(request_bag.service_catalog,
                                          cloudServersOpenStack,
                                          request_bag.region)

    lb_config = launch_config.get('loadBalancers', [])
    server_config = launch_config['server']

    log = log.bind(server_name=server_config['name'])
    ilog = [None]

    def check_metadata(server):
        # sanity check to make sure the metadata didn't change - can
        # probably be removed after a while if we do not see any log
        # messages from this function
        expected = launch_config['server']['metadata']
        result = server['server'].get('metadata')
        if result != expected:
            ilog[0].msg('Server metadata has changed.',
                        sanity_check=True,
                        expected_metadata=expected,
                        nova_metadata=result)
        return server

    def wait_for_server(server, new_request_bag):
        server_id = server['server']['id']

        # NOTE: If server create is retried, each server delete will be
        # pushed to undo stack even after it will be deleted in check_error
        # which is fine since verified_delete succeeds on deleted server
        undo.push(verified_delete, log, server_endpoint, new_request_bag,
                  server_id)

        ilog[0] = log.bind(server_id=server_id)
        return wait_for_active(ilog[0], server_endpoint,
                               new_request_bag.auth_token,
                               server_id).addCallback(check_metadata)

    def add_lb(server, new_request_bag):
        if lb_config:
            lbd = add_to_load_balancers(ilog[0], new_request_bag, lb_config,
                                        server, undo)
            lbd.addCallback(lambda lb_response: (server, lb_response))
            return lbd
        return (server, [])

    def _real_create_server(new_request_bag):
        auth_token = new_request_bag.auth_token
        d = create_server(server_endpoint, auth_token, server_config,
                          log=log)
        d.addCallback(wait_for_server, new_request_bag)
        d.addCallback(add_lb, new_request_bag)
        return d

    def _create_server():
        return request_bag.re_auth().addCallback(_real_create_server)

    def check_error(f):
        f.trap(UnexpectedServerStatus)
        if f.value.status == 'ERROR':
            log.msg('{server_id} errored, deleting and creating new '
                    'server instead', server_id=f.value.server_id)
            # trigger server delete and return True to allow retry
            verified_delete(log, server_endpoint, request_bag,
                            f.value.server_id)
            return True
        else:
            return False

    d = retry(_create_server,
              can_retry=compose_retries(retry_times(3), check_error),
              next_interval=repeating_interval(15),
              clock=clock)

    return d
def add_to_clb(log, endpoint, auth_token, lb_config, ip_address, undo,
               clock=None):
    """
    Add an IP address to a Cloud Load Balancer based on the ``lb_config``.

    TODO: Handle load balancer node metadata.

    :param log: A bound logger
    :param str endpoint: Load balancer endpoint URI.
    :param str auth_token: Keystone auth token.
    :param dict lb_config: An ``lb_config`` dictionary.
    :param str ip_address: The IP address of the node to add to the load
        balancer.
    :param IUndoStack undo: An IUndoStack to push any reversible operations
        onto.

    :return: Deferred that fires with the load balancer response.
    """
    lb_id = lb_config['loadBalancerId']
    port = lb_config['port']
    path = append_segments(endpoint, 'loadbalancers', str(lb_id), 'nodes')
    lb_log = log.bind(loadbalancer_id=lb_id, ip_address=ip_address)

    def add():
        d = treq.post(path,
                      headers=headers(auth_token),
                      data=json.dumps({"nodes": [{"address": ip_address,
                                                  "port": port,
                                                  "condition": "ENABLED",
                                                  "type": "PRIMARY"}]}),
                      log=lb_log)
        d.addCallback(check_success, [200, 202])
        d.addErrback(log_lb_unexpected_errors, lb_log, 'add_node')
        d.addErrback(wrap_request_error, path, 'add_node')
        d.addErrback(check_deleted_clb, lb_id)
        return d

    d = retry(
        add,
        can_retry=compose_retries(
            transient_errors_except(CLBOrNodeDeleted),
            retry_times(config_value('worker.lb_max_retries') or
                        LB_MAX_RETRIES)),
        next_interval=random_interval(
            *(config_value('worker.lb_retry_interval_range') or
              LB_RETRY_INTERVAL_RANGE)),
        clock=clock)

    def when_done(result):
        node_id = result['nodes'][0]['id']
        lb_log.msg('Added to load balancer', node_id=node_id)
        undo.push(_remove_from_clb, lb_log, endpoint, auth_token, lb_id,
                  node_id)
        return result

    return d.addCallback(treq.json_content).addCallback(when_done)
def create_server(server_endpoint, auth_token, server_config, log=None,
                  clock=None, retries=3, create_failure_delay=5, _treq=None):
    """
    Create a new server.  If there is an error from Nova from this call,
    checks to see if the server was created anyway.  If not, will retry the
    create ``retries`` times (checking each time if a server was created).

    If the error from Nova is a 400, does not retry, because that implies
    that retrying will just result in another 400 (bad args).

    If checking to see if the server is created also results in a failure,
    does not retry because there might just be something wrong with Nova.

    :param str server_endpoint: Server endpoint URI.
    :param str auth_token: Keystone Auth Token.
    :param dict server_config: Nova server config.
    :param int retries: Number of times to retry the create.
    :param int create_failure_delay: how much time in seconds to wait after
        a create server failure before checking Nova to see if a server
        was created

    :param log: logger
    :type log: :class:`otter.log.bound.BoundLog`

    :param _treq: To be used for testing - what treq object to use
    :type _treq: something with the same api as :obj:`treq`

    :return: Deferred that fires with the CreateServer response as a dict.
    """
    path = append_segments(server_endpoint, 'servers')

    if _treq is None:  # pragma: no cover
        _treq = treq
    if clock is None:  # pragma: no cover
        from twisted.internet import reactor
        clock = reactor

    def _check_results(result, propagated_f):
        """
        Return the original failure, if checking a server resulted in a
        failure too.  Returns a wrapped propagated failure, if there were
        no servers created, so that the retry utility knows that server
        creation can be retried.
        """
        if isinstance(result, Failure):
            log.msg("Attempt to find a created server in nova resulted in "
                    "{failure}. Propagating the original create error "
                    "instead.",
                    failure=result)
            return propagated_f

        if result is None:
            raise _NoCreatedServerFound(propagated_f)

        return result

    def _check_server_created(f):
        """
        If creating a server failed with anything other than a 400, see if
        Nova created a server anyway (a 400 means that the server creation
        args were bad, and there is no point in retrying).

        If Nova created a server, just return it and pretend that the error
        never happened.  If it didn't, or if checking resulted in another
        failure response, return a failure of some type.
        """
        f.trap(APIError)

        if f.value.code == 400:
            return f

        d = deferLater(clock, create_failure_delay, find_server,
                       server_endpoint, auth_token, server_config, log=log)
        d.addBoth(_check_results, f)
        return d

    def _create_with_delay(to_delay):
        d = _treq.post(path, headers=headers(auth_token),
                       data=json.dumps({'server': server_config}), log=log)
        if to_delay:
            # Add 1 second delay to space 1 second between server creations
            d.addCallback(delay, clock, 1)
        return d

    def _create_server():
        """
        Attempt to create a server, handling spurious non-400 errors from
        Nova by seeing if Nova created a server anyway in spite of the
        error.  If so then create server succeeded.

        If not, and if no further errors occur, server creation can be
        retried.
        """
        sem = get_sempahore("create_server", "worker.create_server_limit")
        if sem is not None:
            d = sem.run(_create_with_delay, True)
        else:
            d = _create_with_delay(False)
        d.addCallback(check_success, [202], _treq=_treq)
        d.addCallback(_treq.json_content)
        d.addErrback(_check_server_created)
        return d

    def _unwrap_NoCreatedServerFound(f):
        """
        The original failure was wrapped in a :class:`_NoCreatedServerFound`
        for ease of retry, but that should not be the final error propagated
        up by :func:`create_server`.

        This errback unwraps the :class:`_NoCreatedServerFound` error and
        returns the original failure.
        """
        f.trap(_NoCreatedServerFound)
        return f.value.original

    d = retry(_create_server,
              can_retry=compose_retries(
                  retry_times(retries),
                  terminal_errors_except(_NoCreatedServerFound)),
              next_interval=repeating_interval(15),
              clock=clock)
    d.addErrback(_unwrap_NoCreatedServerFound)
    d.addErrback(wrap_request_error, path, 'server_create')
    return d
def _retry(eff):
    """Retry an effect with a common policy."""
    return retry_effect(
        eff, retry_times(5), exponential_backoff_interval(2))
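Hypothetical usage of the common policy above: any effect can be wrapped so it is performed up to five times, with delays of 2, 4, 8, ... seconds between attempts.

# Hypothetical usage -- wrap an arbitrary effect in the common retry policy.
eff = _retry(get_server_details('abc123'))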
def launch_server(log, region, scaling_group, service_catalog, auth_token,
                  launch_config, undo, clock=None):
    """
    Launch a new server given the launch config, auth tokens and service
    catalog.  Possibly adding the newly launched server to a load balancer.

    :param BoundLog log: A bound logger.
    :param str region: A rackspace region as found in the service catalog.
    :param IScalingGroup scaling_group: The scaling group to add the
        launched server to.
    :param list service_catalog: A list of services as returned by the auth
        apis.
    :param str auth_token: The user's auth token.
    :param dict launch_config: A launch_config args structure as defined for
        the launch_server_v1 type.
    :param IUndoStack undo: The stack that will be rewound if undo fails.

    :return: Deferred that fires with a 2-tuple of server details and the
        list of load balancer responses from add_to_load_balancers.
    """
    launch_config = prepare_launch_config(scaling_group.uuid, launch_config)

    lb_region = config_value('regionOverrides.cloudLoadBalancers') or region
    cloudLoadBalancers = config_value('cloudLoadBalancers')
    cloudServersOpenStack = config_value('cloudServersOpenStack')

    lb_endpoint = public_endpoint_url(service_catalog,
                                      cloudLoadBalancers,
                                      lb_region)
    server_endpoint = public_endpoint_url(service_catalog,
                                          cloudServersOpenStack,
                                          region)

    lb_config = launch_config.get('loadBalancers', [])
    server_config = launch_config['server']

    log = log.bind(server_name=server_config['name'])
    ilog = [None]

    def wait_for_server(server):
        server_id = server['server']['id']

        # NOTE: If server create is retried, each server delete will be
        # pushed to undo stack even after it will be deleted in check_error
        # which is fine since verified_delete succeeds on deleted server
        undo.push(
            verified_delete, log, server_endpoint, auth_token, server_id)

        ilog[0] = log.bind(server_id=server_id)
        return wait_for_active(
            ilog[0],
            server_endpoint,
            auth_token,
            server_id)

    def add_lb(server):
        ip_address = private_ip_addresses(server)[0]
        lbd = add_to_load_balancers(
            ilog[0], lb_endpoint, auth_token, lb_config, ip_address, undo)
        lbd.addCallback(lambda lb_response: (server, lb_response))
        return lbd

    def _create_server():
        d = create_server(server_endpoint, auth_token, server_config,
                          log=log)
        d.addCallback(wait_for_server)
        d.addCallback(add_lb)
        return d

    def check_error(f):
        f.trap(UnexpectedServerStatus)
        if f.value.status == 'ERROR':
            log.msg('{server_id} errored, deleting and creating new server '
                    'instead', server_id=f.value.server_id)
            # trigger server delete and return True to allow retry
            verified_delete(log, server_endpoint, auth_token,
                            f.value.server_id)
            return True
        else:
            return False

    d = retry(_create_server,
              can_retry=compose_retries(retry_times(3), check_error),
              next_interval=repeating_interval(15),
              clock=clock)

    return d