def _instance_wait_safe(self, instance_method, *args, **kwargs): """ Wrapper around AWS instance waiters that is safer to use. Since AWS adopts an eventual consistency model, sometimes the method wait_until_running will raise a botocore.exceptions.WaiterError saying the instance does not exist. AWS API guide [1] recommends that the procedure is retried using an exponencial backoff algorithm [2]. :see: [1] http://docs.aws.amazon.com/AWSEC2/latest/APIReference/query-api-troubleshooting.html#eventual-consistency :see: [2] http://docs.aws.amazon.com/general/latest/gr/api-retries.html """ threshold = 300 ok = False retries = 0 max_retries = 9 while not ok and retries <= max_retries: try: instance_method(*args, **kwargs) ok = True except WaiterError: time.sleep(min((2**retries) * 2, threshold)) retries += 1 if not ok: raise cluster.NodeError('AWS instance %s waiter error after ' 'exponencial backoff wait' % self._instance.id)
class GCENode(cluster.BaseNode):

    """
    Wraps GCE instances, so that we can also control the instance through SSH.
    """

    def __init__(self, gce_instance, gce_service, credentials,
                 node_prefix='node', node_index=1, gce_image_username='******',
                 base_logdir=None, dc_idx=0):
        """
        Bind the node to a GCE instance and initialise the base node.

        :param gce_instance: GCE instance object wrapped by this node.
        :param gce_service: GCE service/driver used for tag and metadata
            calls (``ex_set_node_tags`` / ``ex_set_node_metadata``).
        :param credentials: object exposing ``key_file`` for SSH login.
        :param node_prefix: prefix used to build the node name.
        :param node_index: index of the node, used in the node name.
        :param gce_image_username: SSH user name for the image.
        :param base_logdir: forwarded to the base class constructor.
        :param dc_idx: data center index, used in the node name.
        """
        name = '%s-%s-%s' % (node_prefix, dc_idx, node_index)
        self._instance = gce_instance
        self._gce_service = gce_service
        self._wait_public_ip()
        ssh_login_info = {'hostname': None,
                          'user': gce_image_username,
                          'key_file': credentials.key_file,
                          'extra_ssh_options': '-tt'}
        super(GCENode, self).__init__(name=name,
                                      ssh_login_info=ssh_login_info,
                                      base_logdir=base_logdir,
                                      node_prefix=node_prefix,
                                      dc_idx=dc_idx)
        if cluster.TEST_DURATION >= 24 * 60 or cluster.Setup.KEEP_ALIVE:
            self.log.info('Test duration set to %s. '
                          'Keep cluster on failure %s. '
                          'Tagging node with "keep-alive"',
                          cluster.TEST_DURATION, cluster.Setup.KEEP_ALIVE)
            self._instance_wait_safe(self._gce_service.ex_set_node_tags,
                                     self._instance, ['keep-alive'])
        # NOTE(review): metadata is assumed to be set for every node, not only
        # keep-alive ones -- confirm against the intended indentation.
        self._instance_wait_safe(self._gce_service.ex_set_node_metadata,
                                 self._instance,
                                 {'workspace': cluster.WORKSPACE,
                                  'uname': ' | '.join(os.uname())})

    def _instance_wait_safe(self, instance_method, *args, **kwargs):
        """
        Wrapper around GCE instance methods that is safer to use.

        Let's try a method, and if it fails, let's retry using an exponential
        backoff algorithm, similar to what Amazon recommends for its own
        service [1].

        The call is attempted up to 10 times (1 initial try + 9 retries).
        After failed attempt number ``n`` (0-based) the wrapper sleeps
        ``min(2 ** n * 2, 300)`` seconds before trying again. Every failure
        is logged at error level.

        :param instance_method: callable to invoke.
        :param args: positional arguments forwarded to ``instance_method``.
        :param kwargs: keyword arguments forwarded to ``instance_method``.
        :return: whatever ``instance_method`` returns on its first success.
        :raises cluster.NodeError: if every attempt raised an exception.

        :see: [1] http://docs.aws.amazon.com/general/latest/gr/api-retries.html
        """
        threshold = 300  # upper bound (seconds) for a single backoff sleep
        max_retries = 9  # retries allowed after the initial attempt

        for retries in range(max_retries + 1):
            try:
                return instance_method(*args, **kwargs)
            except Exception as details:
                # Broad catch is deliberate: any failure of the wrapped call
                # is treated as retryable.
                self.log.error('Call to method %s (retries: %s) failed: %s',
                               instance_method, retries, details)
                # Back off only when another attempt will follow; sleeping
                # after the final failure would just delay the error.
                if retries < max_retries:
                    time.sleep(min((2 ** retries) * 2, threshold))

        raise cluster.NodeError('GCE instance %s method call error after '
                                'exponential backoff wait' % self._instance.id)
def _instance_wait_safe(self, instance_method, *args, **kwargs): """ Wrapper around AWS instance waiters that is safer to use. Since AWS adopts an eventual consistency model, sometimes the method wait_until_running will raise a botocore.exceptions.WaiterError saying the instance does not exist. AWS API guide [1] recommends that the procedure is retried using an exponencial backoff algorithm [2]. :see: [1] http://docs.aws.amazon.com/AWSEC2/latest/APIReference/query-api-troubleshooting.html#eventual-consistency :see: [2] http://docs.aws.amazon.com/general/latest/gr/api-retries.html """ threshold = 300 ok = False retries = 0 max_retries = 9 while not ok and retries <= max_retries: try: instance_method(*args, **kwargs) ok = True except WaiterError: time.sleep(min((2**retries) * 2, threshold)) retries += 1 if not ok: try: self._instance.reload() except Exception as e: self.log.exception( "Error while reloading instance metadata: %s", e) finally: method_name = instance_method.__name__ instance_id = self._instance.id self.log.debug(self._instance.meta.data) msg = "Timeout while running '{method_name}' method on AWS instance '{instance_id}'".format( **locals()) raise cluster.NodeError(msg)