def wait(self, environment):
    """
    Poll the check url until the response body matches the expected version

    The url is the resolved endpoint joined to ``self.check_url`` with a single
    slash. The expected value is ``self.expect`` formatted with ``environment``
    and is compared against the response body as an ``fnmatch`` pattern.

    Returns None as soon as a response matches.

    Raises BadStack when ``hp.until`` stops yielding without a match.
    """
    # Trailing slashes are removed before trailing dots, so "host./" becomes "host"
    endpoint = self.endpoint().resolve().rstrip("/").rstrip(".")

    # The normalised check_url is still stored back on self, as it was before
    if self.check_url.startswith("/"):
        self.check_url = self.check_url.lstrip("/")

    url = endpoint + '/' + self.check_url
    expected = self.expect.format(**environment)

    log.info("Asking server for version till we match %s", expected)
    for _ in hp.until(self.timeout_after, step=15):
        log.info("Asking %s", url)
        try:
            # Without a timeout, a server that accepts the connection but never
            # answers blocks this call forever and the loop never gets to give up.
            # NOTE(review): assumes self.timeout_after is a number of seconds (or None)
            result = requests.get(url, timeout=self.timeout_after).text
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as error:
            log.warning("Failed to ask server\terror=%s", error)
        else:
            log.info("\tgot back %s", result)
            if fnmatch.fnmatch(result, expected):
                log.info("Deployment successful!")
                return

    raise BadStack("Timedout waiting for the app to give back the correct version")
def get_all_deployment_messages(self, sqs_url, timeout=60, sleep=2):
    """
    Collect the deployment messages currently on the queue at ``sqs_url``

    Each received message is deleted from the queue once its "Message" field
    has been read. Messages that fail to decode are logged and dropped.

    The queue is drained whilst its count is greater than zero. We stop as
    soon as a drain has produced at least one decoded message, or when
    ``until(timeout, step=sleep)`` stops yielding, and return what we have
    (possibly an empty list).
    """
    sqs_queue = self.conn.get_queue(sqs_url)
    decoded = []

    for _ in until(timeout, step=sleep):
        while sqs_queue.count() > 0:
            for raw in self.conn.receive_message(sqs_queue, number_messages=1):
                payload = json.loads(raw.get_body())["Message"]

                # The message is removed before decoding, so bad ones are not seen again
                sqs_queue.delete_message(raw)

                try:
                    message = Message.decode(payload)
                except BadSQSMessage as error:
                    log.error("Failed to parse a message: %s", error)
                else:
                    decoded.append(message)

        if decoded:
            break

    return decoded
def wait(self, timeout=1200, rollback_is_failure=False, may_not_exist=True):
    """
    Wait for the stack to become complete, logging stack events as they appear

    Returns the status straight away if the stack doesn't exist and
    ``may_not_exist`` is true. Otherwise the status is polled via
    ``hp.until(timeout, step=15)`` and every event newer than the last one
    we reported is logged.

    Raises BadStack if the stack starts in a failed state, or if the final
    status is failed, is a rollback (when ``rollback_is_failure``) or is not
    complete. Returns the final status otherwise.
    """
    status = self.status
    if not status.exists and may_not_exist:
        return status

    # Only events newer than this moment get logged
    last = datetime.datetime.utcnow()
    if status.failed:
        raise BadStack("Stack is in a failed state, it must be deleted first", name=self.stack_name, status=status)

    for _ in hp.until(timeout, step=15):
        if status.exists and status.complete:
            break

        log.info("Waiting for %s - %s", self.stack_name, status.name)
        if not status.exists or status.complete:
            break
        status = self.status

        events = self.description().describe_events()

        # NOTE(review): the first event is treated as the most recent one
        newest = events[0].timestamp
        for fresh in [event for event in events if event.timestamp > last]:
            log.info("%s - %s %s (%s) %s"
                , self.stack_name
                , fresh.resource_type
                , fresh.logical_resource_id
                , fresh.resource_status
                , fresh.resource_status_reason or ""
                )
        last = newest

    status = self.status
    unfinished = status.failed or (rollback_is_failure and status.is_rollback) or not status.complete
    if unfinished:
        raise BadStack("Stack failed to complete", final_status=status)

    return status
def get_all_deployment_messages(self, sqs_url, timeout=60, sleep=2):
    """
    Return the decoded deployment messages found on the queue at ``sqs_url``

    Whilst the queue reports a count greater than zero we receive one
    message at a time, read its "Message" field, delete it from the queue
    and try to decode it. Messages that don't decode are logged and dropped.

    We return once a pass has given us at least one message, or once
    ``until(timeout, step=sleep)`` gives up, in which case the result may
    be empty.
    """
    found = []
    source = self.conn.get_queue(sqs_url)

    def drain():
        """Empty the queue, keeping every message that decodes"""
        while source.count() > 0:
            for received in self.conn.receive_message(source, number_messages=1):
                body = json.loads(received.get_body())
                encoded = body['Message']

                # Deleted regardless of whether it decodes below
                source.delete_message(received)

                try:
                    found.append(Message.decode(encoded))
                except BadSQSMessage as error:
                    log.error("Failed to parse a message: %s", error)

    for _ in until(timeout, step=sleep):
        drain()
        if found:
            break

    return found
def wait(self, environment):
    """
    Poll the check url until the response body matches the expected version

    The url is the resolved endpoint joined to ``self.check_url`` with a single
    slash. The expected value is ``self.expect`` formatted with ``environment``
    and is compared against the response body as an ``fnmatch`` pattern. The
    status code of every response is logged alongside the body.

    Returns None as soon as a response matches.

    Raises BadStack when ``hp.until`` stops yielding without a match.
    """
    # Trailing slashes are removed before trailing dots, so "host./" becomes "host"
    endpoint = self.endpoint().resolve().rstrip("/").rstrip(".")

    # The normalised check_url is still stored back on self, as it was before
    if self.check_url.startswith("/"):
        self.check_url = self.check_url.lstrip("/")

    url = endpoint + '/' + self.check_url
    expected = self.expect.format(**environment)

    log.info("Asking server for version till we match %s", expected)
    for _ in hp.until(self.timeout_after, step=15):
        log.info("Asking %s", url)
        try:
            # Without a timeout, a server that accepts the connection but never
            # answers blocks this call forever and the loop never gets to give up.
            # NOTE(review): assumes self.timeout_after is a number of seconds (or None)
            res = requests.get(url, timeout=self.timeout_after)
            result = res.text
            status = res.status_code
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as error:
            log.warning("Failed to ask server\terror=%s", error)
        else:
            log.info("\tgot back (%s) '%s'", status, result)
            if fnmatch.fnmatch(result, expected):
                log.info("Deployment successful!")
                return

    raise BadStack("Timedout waiting for the app to give back the correct version")
def wait(self, instances, environment, sqs):
    """
    Wait for every instance to report its deployment on the sqs queue

    ``self.version_message`` and ``self.deployment_queue`` are formatted with
    ``environment``. Messages are fetched with
    ``sqs.get_all_deployment_messages`` until every instance in ``instances``
    has reported, or until ``hp.until`` stops yielding. A message whose output
    matches the version message (as an ``fnmatch`` pattern) counts as a
    success; any other message from one of our instances counts as a failure.

    Raises BadDeployment if any instance failed, if no messages were
    received at all, or if some instances never reported before we gave up.
    """
    version_message = self.version_message.format(**environment)
    deployment_queue = self.deployment_queue.format(**environment)

    failed = []
    success = []
    attempt = 0

    log.info("Checking sqs for %s", version_message)
    log.info("Checking for message for instances [%s]", ",".join(instances))
    for _ in hp.until(timeout=self.timeout, step=5, action="Checking for valid deployment actions"):
        messages = sqs.get_all_deployment_messages(deployment_queue)

        # Look for success and failure in the messages
        for message in messages:
            log.info("Message received for instance %s with content [%s]", message.instance_id, message.output)

            # Ignore the messages for instances outside this deployment
            if message.instance_id in instances:
                if fnmatch.fnmatch(message.output, version_message):
                    log.info("Deployed instance %s", message.instance_id)
                    success.append(message.instance_id)
                else:
                    log.info("Failed to deploy instance %s", message.instance_id)
                    log.info("Failure Message: %s", message.output)
                    failed.append(message.instance_id)

        # Stop trying if we have all the instances
        if set(failed + success) == set(instances):
            break

        # Record the iteration of checking for a valid deployment
        attempt += 1
        log.info("Completed attempt %s of checking for a valid deployment state", attempt)

    if success:
        log.info("Succeeded to deploy %s", success)
    if failed:
        log.error("Failed to deploy %s", failed)
        raise BadDeployment(failed=failed)

    if not success and not failed:
        log.error("Failed to receive any messages")
        raise BadDeployment("Failed to receive any messages")

    # We may have given up waiting with only some of the instances reporting.
    # Those that stayed silent are not confirmed, so don't claim they are.
    reported = set(success)
    missing = [instance for instance in instances if instance not in reported]
    if missing:
        log.error("Failed to receive messages for %s", missing)
        raise BadDeployment("Failed to receive messages for some instances", missing=missing)

    log.info("All instances have been confirmed to be deployed with version_message [%s]!", version_message)
def wait(self, timeout=1200, rollback_is_failure=False, may_not_exist=True):
    """
    Wait for the stack to become complete, logging stack events as they appear

    Returns the status straight away if the stack doesn't exist and
    ``may_not_exist`` is true. Otherwise the status is polled via
    ``hp.until(timeout, step=15)`` and every event with a timestamp newer
    than the last one we reported is logged. Fetching the events is retried
    after a one second sleep for as long as Throttled is raised.

    Raises BadStack if the stack starts in a failed state, or if the final
    status is failed, is a rollback (when ``rollback_is_failure``) or is not
    complete. Returns the final status otherwise.
    """
    status = self.status
    if not status.exists and may_not_exist:
        return status

    # Only events newer than this moment get logged
    last = datetime.datetime.now(pytz.utc)
    if status.failed:
        raise BadStack("Stack is in a failed state, it must be deleted first", name=self.stack_name, status=status)

    for _ in hp.until(timeout, step=15):
        if status.exists and status.complete:
            break

        log.info("Waiting for %s - %s", self.stack_name, status.name)
        if not status.exists or status.complete:
            break
        status = self.status

        # NOTE(review): the result is not used below, the call itself is kept as it was
        description = self.description()

        events = []
        fetched = False
        while not fetched:
            try:
                with self.ignore_throttling_error():
                    response = self.conn.describe_stack_events(StackName=self.stack_name)
                    events = response['StackEvents']
                    fetched = True
            except Throttled:
                log.info("Was throttled, waiting a bit")
                time.sleep(1)

        # NOTE(review): the first event is treated as the most recent one
        newest = events[0]['Timestamp']
        for fresh in [event for event in events if event['Timestamp'] > last]:
            log.info("%s - %s %s (%s) %s"
                , self.stack_name
                , fresh['ResourceType']
                , fresh['LogicalResourceId']
                , fresh['ResourceStatus']
                , fresh.get('ResourceStatusReason', '')
                )
        last = newest

    status = self.status
    unfinished = status.failed or (rollback_is_failure and status.is_rollback) or not status.complete
    if unfinished:
        raise BadStack("Stack failed to complete", final_status=status)

    return status
def wait_for_dns_switch(collector, stack, artifact, site=NotSpecified, **kwargs):
    """Periodically check dns until all our sites point to where they should be pointing to for specified environment"""
    if stack.dns is NotSpecified:
        raise BespinError("No dns options are specified!")

    # The artifact doubles as the site name when no site was given
    if site is NotSpecified and artifact not in ("", None, NotSpecified):
        site = artifact

    if site is NotSpecified or not site:
        site = None

    all_sites = stack.dns.sites()
    available = list(all_sites.keys())

    # Either the one site that was asked for, or everything we know about
    if not site:
        wanted = available
    elif site in available:
        wanted = [site]
    else:
        raise BespinError("Have no dns options for specified site", available=available, wanted=site)
    sites = [all_sites[name] for name in wanted]

    environment = collector.configuration["bespin"].environment

    # Collect every precheck problem so they can all be reported at once
    errors = []
    for candidate in sorted(sites):
        if environment not in candidate.environments:
            errors.append(BespinError("Site doesn't have specified environment"
                , site=candidate.name
                , wanted=environment
                , available=list(stack.environments.keys())
                ))

        try:
            rtype, rdata = candidate.current_value
            if rtype != candidate.record_type:
                errors.append(BespinError("Site is a different record type!", recorded_as=rtype, wanted=candidate.record_type))
            log.info("%s is currently %s (%s)", candidate.domain, rdata, rtype)
        except BespinError as error:
            errors.append(error)

    if errors:
        raise BespinError("Prechecks failed", _errors=errors)

    domains = [switching.domain for switching in sites]
    log.info("Waiting for traffic to switch to %s\tsites=%s", environment, domains)

    # NOTE(review): nothing is raised if the sites never switch within the timeout
    for _ in hp.until(timeout=600, step=5):
        if all(switching.switched_to(environment) for switching in sites):
            log.info("Finished switching!")
            break
        log.info("Waiting for sites to switch")
def wait(self, instances, environment, sqs):
    """
    Wait for every instance to report its deployment on the sqs queue

    ``self.version_message`` and ``self.deployment_queue`` are formatted with
    ``environment``. Messages are fetched with
    ``sqs.get_all_deployment_messages`` until every instance in ``instances``
    has reported, or until ``hp.until`` stops yielding. A message whose output
    matches the version message (as an ``fnmatch`` pattern) counts as a
    success; any other message from one of our instances counts as a failure.

    Raises BadDeployment if any instance failed, if no messages were
    received at all, or if some instances never reported before we gave up.
    """
    version_message = self.version_message.format(**environment)
    deployment_queue = self.deployment_queue.format(**environment)

    failed = []
    success = []
    attempt = 0

    log.info("Checking sqs for %s", version_message)
    log.info("Checking for message for instances [%s]", ",".join(instances))
    for _ in hp.until(timeout=self.timeout, step=5, action="Checking for valid deployment actions"):
        messages = sqs.get_all_deployment_messages(deployment_queue)

        # Look for success and failure in the messages
        for message in messages:
            log.info("Message received for instance %s with content [%s]", message.instance_id, message.output)

            # Ignore the messages for instances outside this deployment
            if message.instance_id in instances:
                if fnmatch.fnmatch(message.output, version_message):
                    log.info("Deployed instance %s", message.instance_id)
                    success.append(message.instance_id)
                else:
                    log.info("Failed to deploy instance %s", message.instance_id)
                    log.info("Failure Message: %s", message.output)
                    failed.append(message.instance_id)

        # Stop trying if we have all the instances
        if set(failed + success) == set(instances):
            break

        # Record the iteration of checking for a valid deployment
        attempt += 1
        log.info("Completed attempt %s of checking for a valid deployment state", attempt)

    if success:
        log.info("Succeeded to deploy %s", success)
    if failed:
        log.error("Failed to deploy %s", failed)
        raise BadDeployment(failed=failed)

    if not success and not failed:
        log.error("Failed to receive any messages")
        raise BadDeployment("Failed to receive any messages")

    # We may have given up waiting with only some of the instances reporting.
    # Those that stayed silent are not confirmed, so don't claim they are.
    reported = set(success)
    missing = [instance for instance in instances if instance not in reported]
    if missing:
        log.error("Failed to receive messages for %s", missing)
        raise BadDeployment("Failed to receive messages for some instances", missing=missing)

    log.info("All instances have been confirmed to be deployed with version_message [%s]!", version_message)
def wait_for(self, bucket, key, timeout, start=None):
    """
    Wait for ``key`` in ``bucket`` to have a last modified time after ``start``

    ``start`` defaults to the current utc time. A ``key`` of '/' means we
    only care that the bucket itself can be found.

    Returns None once the condition is met.

    Raises BespinError when ``hp.until`` stops yielding before that happens.
    """
    if start is None:
        start = datetime.utcnow()

    log.info("Looking for key with last_modified greater than %s", start)
    for _ in hp.until(timeout=timeout, step=5):
        try:
            bucket_obj = self.get_bucket(bucket)
        except BadS3Bucket as error:
            log.error(error)
            continue

        if key == '/':
            log.info("The bucket exists! and that is all we are looking for")
            return

        s3_key = Key(bucket_obj)
        s3_key.key = key

        try:
            s3_key.read()
        except boto.exception.S3ResponseError as error:
            # A 404 just means it isn't there yet, anything else is worth an error
            # Either way we try again on the next iteration
            if error.status == 404:
                log.info("Key doesn't exist yet\tbucket=%s\tkey=%s", bucket_obj.name, key)
            else:
                log.error(error)
            continue

        modified = s3_key.last_modified
        log.info("Found key in the bucket\tbucket=%s\tkey=%s\tlast_modified=%s", bucket_obj.name, key, modified)

        if datetime.strptime(modified, "%a, %d %b %Y %H:%M:%S GMT") > start:
            log.info("Found key and it's newer than our start time!")
            return

        log.info("Found key but it's older than our start time, hasn't been updated yet")

    raise BespinError("Couldn't find the s3 key with a newer last modified")
def run(self):
    """
    Run ``self.command`` on every host in ``self.ips`` and return the output

    An auth file describing ``self.proxy_ssh_key`` and ``self.ssh_key`` is
    written to a temporary file. When ``self.proxy`` is set the connections
    are made through it using the jumpbox plugin. We retry making the
    Cluster until every connection is authenticated or ``hp.until`` stops
    yielding.

    Returns a dictionary of ``{host: {"stdout": [...], "stderr": [...]}}``
    made from every line put onto the console queue.

    Raises BespinError if hosts can't be connected to or authenticated, or
    if the command doesn't complete with an acceptable return code on any host.
    """
    jb = None
    defaults = config.load_default_settings()
    original_paramiko_agent = paramiko.Agent

    def make_identity(agent_key):
        """Make a stand-in object that delegates to this one agent key"""
        # Made in its own function so that each identity closes over its own
        # key and ident. Lambdas made directly in the loop below would all
        # end up looking at the last key from the agent.
        ident = str(uuid.uuid1())
        return type("Identity", (object, ), {
              "__str__": lambda s: ident
            , "get_name": lambda s: agent_key.get_name()
            , "asbytes": lambda s: agent_key.asbytes()
            , "sign_ssh_data": lambda s, *args, **kwargs: agent_key.sign_ssh_data(*args, **kwargs)
            })()

    # NOTE(review): everything stays inside the with block on the assumption
    # that the temp file must exist for as long as the auth file is in use
    with hp.a_temp_file() as fle:
        if self.proxy and self.proxy_ssh_key:
            fle.write("keyfile|{0}|{1}\n".format(self.proxy, self.proxy_ssh_key).encode('utf-8'))
        if self.ssh_key:
            fle.write("keyfile|*|{0}\n".format(self.ssh_key).encode('utf-8'))
        fle.close()

        auth_file = fle.name if (self.ssh_key or self.proxy_ssh_key) else None

        if self.proxy:
            jb = plugins.load_plugin(jumpbox.__file__)
            jb.init(auth=AuthManager(self.proxy_ssh_user, auth_file=auth_file), defaults=defaults)

        login = AuthManager(self.ssh_user, auth_file=auth_file, include_agent=True)
        keys = {}

        for key in login.agent_connection.get_keys():
            identity = make_identity(key)
            keys[identity] = key
            login.deferred_keys[identity] = key

        try:
            outputs = defaultdict(lambda: {"stdout": [], "stderr": []})

            class TwoQueue(object):
                """A queue that also records every line it is given into outputs"""
                def __init__(self):
                    self.q = queue.Queue(300)

                def put(self, thing):
                    (host, is_stderr), line = thing
                    outputs[host][["stdout", "stderr"][is_stderr]].append(line)
                    self.q.put(thing)

                def __getattr__(self, key):
                    # Everything we don't define ourselves comes from the real queue
                    if key in ("q", "put"):
                        return object.__getattribute__(self, key)
                    else:
                        return getattr(self.q, key)

            console = RadSSHConsole(q=TwoQueue())
            connections = [(ip, None) for ip in self.ips]
            if jb:
                jb.add_jumpbox(self.proxy)
                connections = list((ip, sock) for _, ip, sock in jb.do_jumpbox_connections(self.proxy, self.ips))

            cluster = None
            try:
                log.info("Connecting")
                authenticated = False
                for _ in hp.until(timeout=120):
                    if authenticated:
                        break

                    cluster = Cluster(connections, login, console=console, defaults=defaults)
                    for _ in hp.until(timeout=10, step=0.5):
                        if not any(cluster.pending):
                            break

                    if cluster.pending:
                        raise BespinError("Timedout waiting to connect to some hosts", waiting_for=cluster.pending.keys())

                    for _ in hp.until(timeout=10, step=0.5):
                        # Deliberately not called connections, that name is what
                        # gets given to Cluster if we have to try again
                        current = list(cluster.connections.values())
                        failures = [conn for conn in current if isinstance(conn, socket.gaierror)]
                        if failures:
                            raise BespinError("Some connections failed!", failures=failures)

                        if all(conn.authenticated for conn in current):
                            break

                    authenticated = all(conn.authenticated for conn in cluster.connections.values())
                    if not authenticated:
                        unauthenticated = [host for host, conn in cluster.connections.items() if not conn.authenticated]
                        log.info("Failed to authenticate will try to reconnect in 5 seconds\tunauthenticate=%s", unauthenticated)
                        time.sleep(5)

                # Try to reauth if not authenticated yet
                unauthenticated = [host for host, conn in cluster.connections.items() if not conn.authenticated]
                if unauthenticated:
                    for host in unauthenticated:
                        print('{0:14s} : {1}'.format(str(host), cluster.connections[host]))
                    raise BespinError("Timedout waiting to authenticate all the hosts, do you have an ssh-agent running?", unauthenticated=unauthenticated)

                failed = []
                for host, status in cluster.connections.items():
                    print('{0:14s} : {1}'.format(str(host), status))
                    if isinstance(status, socket.gaierror):
                        failed.append(host)

                if failed:
                    raise BespinError("Failed to connect to some hosts", failed=failed)

                cluster.run_command(self.command)

                error = False
                for host, job in cluster.last_result.items():
                    if not job.completed or job.result.return_code not in self.acceptable_return_codes:
                        log.error('%s -%s', host, cluster.connections[host])
                        log.error('%s, %s', job, job.result.status)
                        error = True

                if error:
                    raise BespinError("Failed to run the commands")

                return outputs
            finally:
                if cluster:
                    cluster.close_connections()
        finally:
            paramiko.Agent = original_paramiko_agent
describe BespinCase, "until": @contextmanager def mock_log_and_time(self): """Mock out the log object and time, yield (log, time)""" fake_log = mock.Mock(name="log") fake_time = mock.Mock(name="time") with mock.patch("bespin.helpers.log", fake_log): with mock.patch("bespin.helpers.time", fake_time): yield (fake_log, fake_time) it "yields before doing anything else": done = [] with self.mock_log_and_time() as (fake_log, fake_time): for _ in until(): done.append(1) break self.assertEqual(len(fake_time.time.mock_calls), 0) self.assertEqual(len(fake_log.info.mock_calls), 0) self.assertEqual(done, [1]) it "logs the action each time": done = [] action = mock.Mock(name="action") with self.mock_log_and_time() as (fake_log, fake_time): def timer(): if not done: return 10 else:
def run(self):
    """
    Run ``self.command`` on every host in ``self.ips``

    An auth file describing ``self.proxy_ssh_key`` and ``self.ssh_key`` is
    written to a temporary file. When ``self.proxy`` is set the connections
    are made through it using the jumpbox plugin. We retry making the
    Cluster until every connection is authenticated or ``hp.until`` stops
    yielding.

    ``paramiko.Agent`` is replaced whilst we run and put back at the end.

    Raises BespinError if hosts can't be connected to or authenticated, or
    if the command doesn't complete with an acceptable return code on any host.
    """
    jb = None
    defaults = config.load_default_settings()
    defaults['hostkey.verify'] = 'ignore'
    original_paramiko_agent = paramiko.Agent

    def make_identity(agent_key):
        """Make a stand-in object that delegates to this one agent key"""
        # Made in its own function so that each identity closes over its own
        # key and ident. Lambdas made directly in the loop below would all
        # end up looking at the last key from the agent.
        ident = str(uuid.uuid1())
        return type("Identity", (object, ), {
              "__str__": lambda s: ident
            , "get_name": lambda s: agent_key.get_name()
            , "asbytes": lambda s: agent_key.asbytes()
            , "sign_ssh_data": lambda s, *args, **kwargs: agent_key.sign_ssh_data(*args, **kwargs)
            })()

    # NOTE(review): everything stays inside the with block on the assumption
    # that the temp file must exist for as long as the auth file is in use
    with hp.a_temp_file() as fle:
        if self.proxy and self.proxy_ssh_key:
            fle.write("keyfile|{0}|{1}\n".format(self.proxy, self.proxy_ssh_key).encode('utf-8'))
        if self.ssh_key:
            fle.write("keyfile|*|{0}\n".format(self.ssh_key).encode('utf-8'))
        fle.close()

        auth_file = fle.name if (self.ssh_key or self.proxy_ssh_key) else None

        if self.proxy:
            jb = plugins.load_plugin(jumpbox.__file__)
            jb.init(auth=AuthManager(self.proxy_ssh_user, auth_file=auth_file), defaults=defaults)

        login = AuthManager(self.ssh_user, auth_file=auth_file, include_agent=True)
        keys = {}

        for key in login.agent_connection.get_keys():
            identity = make_identity(key)
            keys[identity] = key
            login.deferred_keys[identity] = key

        # Dirty dirty hack
        # Waiting for https://github.com/radssh/radssh/pull/10
        paramiko.Agent = type("AgentConnection", (object, ), {"get_keys": lambda *args: keys.keys()})

        try:
            console = RadSSHConsole()
            connections = [(ip, None) for ip in self.ips]
            if jb:
                jb.add_jumpbox(self.proxy)
                connections = list((ip, sock) for _, ip, sock in jb.do_jumpbox_connections(self.proxy, self.ips))

            cluster = None
            try:
                log.info("Connecting")
                authenticated = False
                for _ in hp.until(timeout=120):
                    if authenticated:
                        break

                    cluster = Cluster(connections, login, console=console, defaults=defaults)
                    for _ in hp.until(timeout=10, step=0.5):
                        if not any(cluster.pending):
                            break

                    if cluster.pending:
                        raise BespinError("Timedout waiting to connect to some hosts", waiting_for=cluster.pending.keys())

                    for _ in hp.until(timeout=10, step=0.5):
                        if all(conn.authenticated for conn in cluster.connections.values()):
                            break

                    authenticated = all(conn.authenticated for conn in cluster.connections.values())
                    if not authenticated:
                        unauthenticated = [host for host, conn in cluster.connections.items() if not conn.authenticated]
                        log.info("Failed to authenticate will try to reconnect in 5 seconds\tunauthenticate=%s", unauthenticated)
                        time.sleep(5)

                # Try to reauth if not authenticated yet
                unauthenticated = [host for host, conn in cluster.connections.items() if not conn.authenticated]
                if unauthenticated:
                    for host in unauthenticated:
                        print('{0:14s} : {1}'.format(str(host), cluster.connections[host]))
                    raise BespinError("Timedout waiting to authenticate all the hosts, do you have an ssh-agent running?", unauthenticated=unauthenticated)

                failed = []
                for host, status in cluster.connections.items():
                    print('{0:14s} : {1}'.format(str(host), status))
                    if isinstance(status, socket.gaierror):
                        failed.append(host)

                if failed:
                    raise BespinError("Failed to connect to some hosts", failed=failed)

                cluster.run_command(self.command)

                error = False
                for host, job in cluster.last_result.items():
                    if not job.completed or job.result.return_code not in self.acceptable_return_codes:
                        log.error('%s -%s', host, cluster.connections[host])
                        log.error('%s, %s', job, job.result.status)
                        error = True

                if error:
                    raise BespinError("Failed to run the commands")
            finally:
                if cluster:
                    cluster.close_connections()
        finally:
            paramiko.Agent = original_paramiko_agent
def run(self):
    """
    Run ``self.command`` on every host in ``self.ips`` and return the output

    An auth file describing ``self.proxy_ssh_key`` and ``self.ssh_key`` is
    written to a temporary file. When ``self.proxy`` is set the connections
    are made through it using the jumpbox plugin. We retry making the
    Cluster until every connection is authenticated or ``hp.until`` stops
    yielding.

    Returns a dictionary of ``{host: {"stdout": [...], "stderr": [...]}}``
    made from every line put onto the console queue.

    Raises BespinError if hosts can't be connected to or authenticated, or
    if the command doesn't complete with an acceptable return code on any host.
    """
    jb = None
    defaults = config.load_default_settings()
    defaults['hostkey.verify'] = 'ignore'
    original_paramiko_agent = paramiko.Agent

    def make_identity(agent_key):
        """Make a stand-in object that delegates to this one agent key"""
        # Made in its own function so that each identity closes over its own
        # key and ident. Lambdas made directly in the loop below would all
        # end up looking at the last key from the agent.
        ident = str(uuid.uuid1())
        return type("Identity", (object, ), {
              "__str__": lambda s: ident
            , "get_name": lambda s: agent_key.get_name()
            , "asbytes": lambda s: agent_key.asbytes()
            , "sign_ssh_data": lambda s, *args, **kwargs: agent_key.sign_ssh_data(*args, **kwargs)
            })()

    # NOTE(review): everything stays inside the with block on the assumption
    # that the temp file must exist for as long as the auth file is in use
    with hp.a_temp_file() as fle:
        if self.proxy and self.proxy_ssh_key:
            fle.write("keyfile|{0}|{1}\n".format(self.proxy, self.proxy_ssh_key).encode('utf-8'))
        if self.ssh_key:
            fle.write("keyfile|*|{0}\n".format(self.ssh_key).encode('utf-8'))
        fle.close()

        auth_file = fle.name if (self.ssh_key or self.proxy_ssh_key) else None

        if self.proxy:
            jb = plugins.load_plugin(jumpbox.__file__)
            jb.init(auth=AuthManager(self.proxy_ssh_user, auth_file=auth_file), defaults=defaults)

        login = AuthManager(self.ssh_user, auth_file=auth_file, include_agent=True)
        keys = {}

        for key in login.agent_connection.get_keys():
            identity = make_identity(key)
            keys[identity] = key
            login.deferred_keys[identity] = key

        try:
            outputs = defaultdict(lambda: {"stdout": [], "stderr": []})

            class TwoQueue(object):
                """A queue that also records every line it is given into outputs"""
                def __init__(self):
                    self.q = queue.Queue(300)

                def put(self, thing):
                    (host, is_stderr), line = thing
                    outputs[host][["stdout", "stderr"][is_stderr]].append(line)
                    self.q.put(thing)

                def __getattr__(self, key):
                    # Everything we don't define ourselves comes from the real queue
                    if key in ("q", "put"):
                        return object.__getattribute__(self, key)
                    else:
                        return getattr(self.q, key)

            console = RadSSHConsole(q=TwoQueue())
            connections = [(ip, None) for ip in self.ips]
            if jb:
                jb.add_jumpbox(self.proxy)
                connections = list((ip, sock) for _, ip, sock in jb.do_jumpbox_connections(self.proxy, self.ips))

            cluster = None
            try:
                log.info("Connecting")
                authenticated = False
                for _ in hp.until(timeout=120):
                    if authenticated:
                        break

                    cluster = Cluster(connections, login, console=console, defaults=defaults)
                    for _ in hp.until(timeout=10, step=0.5):
                        if not any(cluster.pending):
                            break

                    if cluster.pending:
                        raise BespinError("Timedout waiting to connect to some hosts", waiting_for=cluster.pending.keys())

                    for _ in hp.until(timeout=10, step=0.5):
                        # Deliberately not called connections, that name is what
                        # gets given to Cluster if we have to try again
                        current = list(cluster.connections.values())
                        failures = [conn for conn in current if isinstance(conn, socket.gaierror)]
                        if failures:
                            raise BespinError("Some connections failed!", failures=failures)

                        if all(conn.authenticated for conn in current):
                            break

                    authenticated = all(conn.authenticated for conn in cluster.connections.values())
                    if not authenticated:
                        unauthenticated = [host for host, conn in cluster.connections.items() if not conn.authenticated]
                        log.info("Failed to authenticate will try to reconnect in 5 seconds\tunauthenticate=%s", unauthenticated)
                        time.sleep(5)

                # Try to reauth if not authenticated yet
                unauthenticated = [host for host, conn in cluster.connections.items() if not conn.authenticated]
                if unauthenticated:
                    for host in unauthenticated:
                        print('{0:14s} : {1}'.format(str(host), cluster.connections[host]))
                    raise BespinError("Timedout waiting to authenticate all the hosts, do you have an ssh-agent running?", unauthenticated=unauthenticated)

                failed = []
                for host, status in cluster.connections.items():
                    print('{0:14s} : {1}'.format(str(host), status))
                    if isinstance(status, socket.gaierror):
                        failed.append(host)

                if failed:
                    raise BespinError("Failed to connect to some hosts", failed=failed)

                cluster.run_command(self.command)

                error = False
                for host, job in cluster.last_result.items():
                    if not job.completed or job.result.return_code not in self.acceptable_return_codes:
                        log.error('%s -%s', host, cluster.connections[host])
                        log.error('%s, %s', job, job.result.status)
                        error = True

                if error:
                    raise BespinError("Failed to run the commands")

                return outputs
            finally:
                if cluster:
                    cluster.close_connections()
        finally:
            paramiko.Agent = original_paramiko_agent