def check_mesos_no_duplicate_frameworks():
    """Report the Mesos framework count check on stdout and exit.

    Reads the Mesos master state and the Marathon framework ids, passes both
    to ``assert_framework_count`` and prints the result as an ``OK:`` or
    ``CRITICAL:`` line. Exits 0 when the result is healthy and 2 otherwise,
    including when the master or Marathon cannot be reached.
    """
    mesos_master = get_mesos_master()
    try:
        state = mesos_master.state
    except MasterNotAvailableException as e:
        paasta_print("CRITICAL: %s" % e.message)
        sys.exit(2)

    clients = marathon_tools.get_list_of_marathon_clients()
    try:
        framework_ids = get_marathon_framework_ids(clients)
    except (MarathonError, ValueError) as e:
        paasta_print("CRITICAL: Unable to contact Marathon cluster: {}".format(e))
        sys.exit(2)

    result = assert_framework_count(state=state, marathon_framework_ids=framework_ids)
    if not result.healthy:
        paasta_print("CRITICAL: %s" % result.message)
        sys.exit(2)
    else:
        paasta_print("OK: " + result.message)
        sys.exit(0)
def main(hostnames):
    """Print, as JSON, the resource utilization of the given Mesos slaves.

    Each metric yielded by ``resource_utillizations_from_resource_info`` is
    reported under its name with ``total``, ``used`` and ``perc`` (percentage
    used) entries. A metric whose total is zero is reported as 100 percent
    used. Exits with status 2 if the Mesos master is not available.

    :param hostnames: container of slave hostnames to include in the report
    """
    master = get_mesos_master()
    try:
        mesos_state = master.state
    except MasterNotAvailableException as e:
        paasta_print(PaastaColors.red("CRITICAL: %s" % e.message))
        sys.exit(2)

    selected_slaves = [s for s in mesos_state.get('slaves', []) if s['hostname'] in hostnames]
    all_tasks = get_all_tasks_from_state(mesos_state, include_orphans=True)
    slave_tasks = filter_tasks_for_slaves(selected_slaves, all_tasks)
    resource_info = calculate_resource_utilization_for_slaves(selected_slaves, slave_tasks)
    utilizations = resource_utillizations_from_resource_info(
        total=resource_info['total'],
        free=resource_info['free'],
    )

    report = {}
    for metric in utilizations:
        used = metric.total - metric.free
        # A zero total cannot be divided by; it is reported as fully used.
        percent = 100 if int(metric.total) == 0 else used / float(metric.total) * 100
        report[metric.metric] = {'total': metric.total, 'used': used, 'perc': percent}
    print(json.dumps(report))
def scale_resource(self, current_capacity, target_capacity):
    """Scale this AWS resource from its current capacity to the target.

    Scaling up only sets the new capacity and leaves the rest to AWS.
    Scaling down picks the slaves we would prefer to kill and hands them to
    ``downscale_aws_resource``; nothing is killed when the candidates found
    cannot cover the requested decrease.

    :param current_capacity: integer current resource capacity
    :param target_capacity: target resource capacity (coerced with ``int``)
    """
    target_capacity = int(target_capacity)
    delta = target_capacity - current_capacity

    if delta == 0:
        self.log.info("Already at target capacity: {}".format(target_capacity))
        return

    if delta > 0:
        self.log.info("Increasing resource capacity to: {}".format(target_capacity))
        self.set_capacity(target_capacity)
        return

    if delta < 0:
        summary = get_mesos_master().state_summary()
        pool_slaves = get_mesos_task_count_by_slave(summary, pool=self.resource['pool'])
        candidates = self.filter_aws_slaves(pool_slaves)
        killable_capacity = sum(slave.instance_weight for slave in candidates)
        amount_to_decrease = -delta
        if amount_to_decrease > killable_capacity:
            self.log.error(
                "Didn't find enough candidates to kill. This shouldn't happen so let's not kill anything!"
            )
            return
        self.downscale_aws_resource(
            filtered_slaves=candidates,
            current_capacity=current_capacity,
            target_capacity=target_capacity,
        )
def paasta_sysdig(args):
    """Print (or remotely run) the sysdig command for a service instance.

    Without ``args.local`` the same paasta invocation is re-run with
    ``--local`` over ssh on a Mesos master. Its output is expected to be
    ``<slave>:<command>``; that command is then run over ssh on the slave.
    A non-zero exit from the first ssh prints its output and exits with the
    same code.

    With ``args.local`` the slave is picked from the instance status and the
    command built by ``format_mesos_command`` is printed, using a Marathon
    URL that has the configured username and password embedded.

    :param args: parsed arguments; reads ``local``, ``cluster``, ``service``,
        ``instance`` and ``host``
    """
    if not args.local:
        mesos_master = get_any_mesos_master(cluster=args.cluster)
        ssh_cmd = 'ssh -At -o LogLevel=QUIET {0} "sudo paasta {1} --local"'.format(
            mesos_master, ' '.join(sys.argv[1:]))
        return_code, output = _run(ssh_cmd)
        if return_code != 0:
            # Single-argument print() behaves the same on Python 2 and 3.
            print(output)
            sys.exit(return_code)
        slave, command = output.split(':', 1)
        subprocess.call(shlex.split("ssh -tA {0} '{1}'".format(slave, command.strip())))
        return

    status = get_status_for_instance(cluster=args.cluster, service=args.service, instance=args.instance)
    slave = pick_slave_from_status(status=status, host=args.host)
    marathon_config = load_marathon_config()
    marathon_url = marathon_config.get_url()[0]
    marathon_user = marathon_config.get_username()
    marathon_pass = marathon_config.get_password()
    mesos_url = get_mesos_master().host
    marathon_parsed_url = urlparse(marathon_url)
    # Embed the credentials in the netloc so the printed command can authenticate.
    marathon_creds_url = marathon_parsed_url._replace(
        netloc="{0}:{1}@{2}".format(marathon_user, marathon_pass, marathon_parsed_url.netloc))
    print(format_mesos_command(slave, status.marathon.app_id, mesos_url, marathon_creds_url.geturl()))
def metrics_provider(self):
    """Work out the capacity change for this auto scaling group (ASG).

    Returns ``(0, 0)`` when the ASG is missing (its config file is cleaned
    up) or while AWS is still launching instances. An ASG with no instances
    asks ``get_asg_delta`` for a delta of 1; otherwise the Mesos utilization
    error of the ASG's slaves is passed to ``get_asg_delta``.

    :returns: the ``(0, 0)`` tuple, or whatever ``get_asg_delta`` returns
    """
    if not self.asg:
        self.log.warning("ASG {} not found, removing config file".format(self.resource['id']))
        self.cleanup_cancelled_config(self.resource['id'], self.config_folder, dry_run=self.dry_run)
        return 0, 0

    if self.is_aws_launching_instances():
        # The trailing space matters: adjacent literals are concatenated
        # verbatim and the message previously read "...make anychanges...".
        self.log.warning(
            "ASG still launching new instances so we won't make any "
            "changes this time.")
        return 0, 0

    expected_instances = len(self.instances)
    if expected_instances == 0:
        self.log.warning(
            "This ASG has no instances, delta should be 1 to "
            "launch first instance unless max/min capacity override")
        return self.get_asg_delta(1)

    mesos_state = get_mesos_master().state
    slaves = self.get_aws_slaves(mesos_state)
    error = self.get_mesos_utilization_error(
        slaves=slaves,
        mesos_state=mesos_state,
        expected_instances=expected_instances)
    return self.get_asg_delta(error)
def unreserve_all_resources(hostnames):
    """Dynamically unreserve all available resources on the specified hosts

    For every known slave whose hostname matches, the disk, mem and cpus
    amounts of each reserved role are collected and unreserved in one call.

    :param hostnames: list of hostnames to unreserve resources on
    :raises HTTPError: when unreserving fails for a host; later hosts are not
        processed
    """
    summary = get_mesos_master().state_summary()
    components = hostnames_to_components(hostnames)
    hosts = components_to_hosts(components)
    known_slaves = [s for s in summary['slaves'] if s['hostname'] in hosts]

    for slave in known_slaves:
        hostname = slave['hostname']
        log.info("Unreserving all resources on %s" % hostname)
        slave_id = slave['id']
        resources = [
            Resource(name=name, amount=slave['reserved_resources'][role][name])
            for role in slave['reserved_resources']
            for name in ['disk', 'mem', 'cpus']
        ]
        try:
            unreserve(slave_id=slave_id, resources=resources)
        except HTTPError:
            raise HTTPError(
                "Failed unreserving all of the resources on %s (%s). Aborting." % (hostname, slave_id))
def downscale_spot_fleet_request(resource, filtered_slaves, current_capacity, target_capacity, pool_settings, dry_run):
    """Terminate spot fleet slaves one at a time until the target is reached.

    On every pass the candidates are sorted with ``sort_slaves_to_kill`` and
    the first one is terminated gracefully, unless doing so would take the
    capacity below ``target_capacity``. After each successful termination the
    task counts of the remaining candidates are refreshed from Mesos.

    :param resource: resource being scaled
    :param filtered_slaves: slaves that are candidates for termination
    :param current_capacity: capacity before any termination
    :param target_capacity: capacity to get as close to as possible without
        going below it
    :param pool_settings: pool settings passed on to the termination helper
    :param dry_run: passed on to the termination helper
    """
    while True:
        filtered_sorted_slaves = sort_slaves_to_kill(filtered_slaves)
        if len(filtered_sorted_slaves) == 0:
            break
        log.info("SFR slave kill preference: {0}".format(
            [slave['hostname'] for slave in filtered_sorted_slaves]))
        # reverse() + pop() takes the most preferred (first) slave.
        filtered_sorted_slaves.reverse()
        slave_to_kill = filtered_sorted_slaves.pop()
        instance_capacity = slave_to_kill['instance_weight']
        new_capacity = current_capacity - instance_capacity
        if new_capacity < target_capacity:
            log.info(
                "Terminating instance {0} with weight {1} would take us below our target of {2}, so this is as"
                " close to our target as we can get".format(
                    slave_to_kill['instance_id'],
                    slave_to_kill['instance_weight'],
                    target_capacity))
            break
        try:
            gracefully_terminate_slave(
                resource=resource,
                slave_to_kill=slave_to_kill,
                pool_settings=pool_settings,
                current_capacity=current_capacity,
                new_capacity=new_capacity,
                dry_run=dry_run)
        except HTTPError:
            # Something wrong draining host so try next host
            continue
        except FailSetSpotCapacity:
            break
        current_capacity = new_capacity
        if filtered_sorted_slaves:
            mesos_state = get_mesos_master().state_summary()
            filtered_slaves = get_mesos_task_count_by_slave(
                mesos_state, slaves_list=filtered_sorted_slaves)
        else:
            # Nothing left to refresh. Passing an empty slaves_list to the
            # helper is avoided here, as the other downscale implementations
            # in this codebase do (presumably the helper does not treat an
            # empty list as "no candidates" -- confirm against its definition).
            filtered_slaves = filtered_sorted_slaves
def paasta_sysdig(args):
    """Print (or remotely run) the sysdig command for a service instance.

    Without ``args.local`` the same paasta invocation is re-run with
    ``--local`` over ssh on a Mesos master. Its output is expected to be
    ``<slave>:<command>``; that command is then run over ssh on the slave.
    A non-zero exit from the first ssh prints its output and exits with the
    same code.

    With ``args.local`` the slave is picked from the instance status and the
    command built by ``format_mesos_command`` is printed, using a Marathon
    URL that has the configured username and password embedded.

    :param args: parsed arguments; reads ``local``, ``cluster``, ``service``,
        ``instance`` and ``host``
    """
    if not args.local:
        mesos_master = get_any_mesos_master(cluster=args.cluster)
        ssh_cmd = 'ssh -At -o LogLevel=QUIET {0} "sudo paasta {1} --local"'.format(
            mesos_master, ' '.join(sys.argv[1:]))
        return_code, output = _run(ssh_cmd)
        if return_code != 0:
            # Single-argument print() behaves the same on Python 2 and 3.
            print(output)
            sys.exit(return_code)
        slave, command = output.split(':', 1)
        subprocess.call(
            shlex.split("ssh -tA {0} '{1}'".format(slave, command.strip())))
        return

    status = get_status_for_instance(
        cluster=args.cluster,
        service=args.service,
        instance=args.instance)
    slave = pick_slave_from_status(status=status, host=args.host)
    marathon_config = load_marathon_config()
    marathon_url = marathon_config.get_url()[0]
    marathon_user = marathon_config.get_username()
    marathon_pass = marathon_config.get_password()
    mesos_url = get_mesos_master().host
    marathon_parsed_url = urlparse(marathon_url)
    # Embed the credentials in the netloc so the printed command can authenticate.
    marathon_creds_url = marathon_parsed_url._replace(
        netloc="{0}:{1}@{2}".format(
            marathon_user, marathon_pass, marathon_parsed_url.netloc))
    print(format_mesos_command(
        slave, status.marathon.app_id, mesos_url, marathon_creds_url.geturl()))
def check_registration(threshold_percentage):
    """Check that enough instances of each autoscaling resource are in Mesos.

    For every configured autoscaling resource a dry-run scaler is built and
    the share of its instances that appear as Mesos slaves is compared with
    the threshold. Resources with an unknown type or with no instances are
    skipped.

    :param threshold_percentage: minimum acceptable percentage of registered
        instances (anything ``float`` accepts)
    :returns: False as soon as one resource is below the threshold, otherwise
        True
    """
    mesos_state = get_mesos_master().state
    system_config = load_system_paasta_config()
    autoscaling_resources = system_config.get_cluster_autoscaling_resources()

    for resource in autoscaling_resources.values():
        print("Checking %s" % resource['id'])
        try:
            scaler_class = get_scaler(resource['type'])
            scaler = scaler_class(
                resource=resource,
                pool_settings=None,
                config_folder=None,
                dry_run=True,
            )
        except KeyError:
            print("Couldn't find a metric provider for resource of type: {}".format(resource['type']))
            continue

        if len(scaler.instances) == 0:
            print("No instances for this resource")
            continue

        slaves = scaler.get_aws_slaves(mesos_state)
        percent_registered = len(slaves) / float(len(scaler.instances)) * 100
        if percent_registered < float(threshold_percentage):
            print(
                "CRIT: Only found {}% of instances in {} registered in mesos. "
                "Please check for puppet or AMI baking problems!".format(percent_registered, resource['id']))
            return False

    print(
        "OK: Found more than {}% of instances registered for all paasta resources in this "
        "superregion".format(threshold_percentage))
    return True
def unreserve_all_resources(hostnames):
    """Dynamically unreserve all available resources on the specified hosts

    Only resources reserved under ``MAINTENANCE_ROLE`` are collected (disk,
    mem, cpus and gpus). A slave without that role is still passed to
    ``unreserve`` with an empty resource list.

    :param hostnames: list of hostnames to unreserve resources on
    :raises HTTPError: when unreserving fails for a host; later hosts are not
        processed
    """
    summary = a_sync.block(get_mesos_master().state_summary)
    components = hostnames_to_components(hostnames)
    hosts = components_to_hosts(components)
    known_slaves = [s for s in summary["slaves"] if s["hostname"] in hosts]

    for slave in known_slaves:
        hostname = slave["hostname"]
        log.info("Unreserving all resources on %s" % hostname)
        slave_id = slave["id"]
        resources = []
        if MAINTENANCE_ROLE in slave["reserved_resources"]:
            resources.extend(
                Resource(
                    name=name,
                    amount=slave["reserved_resources"][MAINTENANCE_ROLE][name],
                )
                for name in ["disk", "mem", "cpus", "gpus"]
            )
        try:
            unreserve(slave_id=slave_id, resources=resources)
        except HTTPError:
            raise HTTPError(
                f"Failed unreserving all of the resources on {hostname} ({slave_id}). Aborting."
            )
def main(hostnames: Sequence[str]) -> None:
    """Print, as JSON, the resource utilization of the given Mesos slaves.

    Each metric yielded by ``resource_utillizations_from_resource_info`` is
    reported under its name with ``total``, ``used`` and ``perc`` (percentage
    used) entries. A metric whose total is zero is reported as 100 percent
    used. Exits with status 2 if the Mesos master is not available.

    :param hostnames: slave hostnames to include in the report
    """
    master = get_mesos_master()
    try:
        mesos_state = block(master.state)
    except MasterNotAvailableException as e:
        print(PaastaColors.red("CRITICAL: %s" % e.message))
        sys.exit(2)

    selected_slaves = [
        s for s in mesos_state.get("slaves", []) if s["hostname"] in hostnames
    ]
    all_tasks = get_all_tasks_from_state(mesos_state, include_orphans=True)
    slave_tasks = filter_tasks_for_slaves(selected_slaves, all_tasks)
    resource_info = calculate_resource_utilization_for_slaves(
        selected_slaves, slave_tasks)
    utilizations = resource_utillizations_from_resource_info(
        total=resource_info["total"], free=resource_info["free"])

    report = {}
    for metric in utilizations:
        used = metric.total - metric.free
        # A zero total cannot be divided by; it is reported as fully used.
        percent = 100 if int(metric.total) == 0 else used / float(metric.total) * 100
        report[metric.metric] = {"total": metric.total, "used": used, "perc": percent}
    print(json.dumps(report))
def metrics_provider(self):
    """Work out the current and target capacity for this spot fleet request.

    ``(0, 0)`` is returned on every path the code logs as skipping or doing
    nothing: the SFR is missing or cancelled (its config file is cleaned up),
    it has no instances, it is in a transitional state, AWS is still
    launching instances for an active SFR, or a ``cancelled_running`` SFR
    sits in an under-provisioned pool.

    A ``cancelled_running`` SFR otherwise has its ``min_capacity`` set to 0
    and is scaled down with an error of -1; a resulting target of 1 is
    turned into 0.

    :returns: ``(current, target)`` capacity tuple
    :raises ClusterAutoscalingError: if the SFR is in an unexpected state
    """
    if not self.sfr or self.sfr['SpotFleetRequestState'] == 'cancelled':
        # The message needs a placeholder, otherwise .format() silently
        # drops the resource id.
        self.log.error("SFR {} not found, removing config file.".format(self.resource['id']))
        self.cleanup_cancelled_config(self.resource['id'], self.config_folder, dry_run=self.dry_run)
        return 0, 0
    elif self.sfr['SpotFleetRequestState'] in ['cancelled_running', 'active']:
        expected_instances = len(self.instances)
        if expected_instances == 0:
            self.log.warning(
                "No instances found in SFR, this shouldn't be possible so we "
                "do nothing")
            return 0, 0
        mesos_state = get_mesos_master().state
        slaves = self.get_aws_slaves(mesos_state)
        error = self.get_mesos_utilization_error(
            slaves=slaves,
            mesos_state=mesos_state,
            expected_instances=expected_instances)
    elif self.sfr['SpotFleetRequestState'] in ['submitted', 'modifying', 'cancelled_terminating']:
        self.log.warning(
            "Not scaling an SFR in state: {} so {}, skipping...".format(
                self.sfr['SpotFleetRequestState'], self.resource['id']))
        return 0, 0
    else:
        self.log.error("Unexpected SFR state: {} for {}".format(
            self.sfr['SpotFleetRequestState'], self.resource['id']))
        raise ClusterAutoscalingError

    if self.is_aws_launching_instances() and self.sfr['SpotFleetRequestState'] == 'active':
        self.log.warning(
            "AWS hasn't reached the TargetCapacity that is currently set. We won't make any "
            "changes this time as we should wait for AWS to launch more instances first."
        )
        return 0, 0

    current, target = self.get_spot_fleet_delta(error)
    if self.sfr['SpotFleetRequestState'] == 'cancelled_running':
        self.resource['min_capacity'] = 0
        slaves = self.get_pool_slaves(mesos_state)
        pool_error = self.get_mesos_utilization_error(slaves=slaves, mesos_state=mesos_state)
        if pool_error > 0:
            self.log.info(
                "Not scaling cancelled SFR %s because we are under provisioned" % (self.resource['id']))
            return 0, 0
        current, target = self.get_spot_fleet_delta(-1)
        if target == 1:
            target = 0
    return current, target
def _clean_up_paasta_native_frameworks(context):
    """Terminate every active Mesos framework whose name starts with 'paasta '.

    The Mesos tools cache is always cleared. HTTP errors raised while
    terminating a framework are printed and do not stop the clean-up.

    :param context: test context; frameworks are only cleaned up when it has
        an ``etc_paasta`` attribute
    """
    clear_mesos_tools_cache()
    # context.etc_paasta signals that we actually have configured the mesos-cli.json; without this, we don't know
    # where to connect to clean up paasta native frameworks.
    if not hasattr(context, 'etc_paasta'):
        return

    master = mesos_tools.get_mesos_master()
    for framework in master.frameworks(active_only=True):
        if not framework.name.startswith('paasta '):
            continue
        paasta_print("cleaning up framework %s" % framework.name)
        try:
            mesos_tools.terminate_framework(framework.id)
        except requests.exceptions.HTTPError as e:
            paasta_print("Got exception when terminating framework %s: %s" % (framework.id, e))
def remote_run_stop(args):
    """Tear down the remote-run framework selected by run id or framework id.

    Only active frameworks whose name matches this service and instance are
    considered. With a run id, the first framework whose name ends in
    ``" <run id>"`` is chosen; with a framework id, it must belong to one of
    those frameworks. Every failure prints a red message, emits the
    ``paasta.remote_run.stop.failed`` counter and exits with status 1.

    :param args: parsed arguments; reads ``framework_id`` and ``run_id`` and
        is passed to ``extract_args``
    """
    _, service, _, _, instance, _ = extract_args(args)

    def fail(message):
        # Shared failure path: report, count the failure, exit 1.
        paasta_print(PaastaColors.red(message))
        emit_counter_metric('paasta.remote_run.stop.failed', service, instance)
        sys.exit(1)

    if args.framework_id is None and args.run_id is None:
        fail("Must provide either run id or framework id to stop.")

    # NOTE(review): service and instance are interpolated into this pattern
    # unescaped, and the '.' between them matches any character.
    frameworks = [
        f for f in get_all_frameworks(active_only=True)
        if re.search(f'^paasta-remote {service}.{instance}', f.name)
    ]

    framework_id = args.framework_id
    if framework_id is None:
        # re.search, not re.match: whitespace anywhere in the run id is
        # rejected, not only at its start.
        if re.search(r'\s', args.run_id):
            fail("Run id must not contain whitespace.")
        # Escape the run id so regex metacharacters in it match literally.
        run_id_pattern = ' %s$' % re.escape(args.run_id)
        found = [f for f in frameworks if re.search(run_id_pattern, f.name) is not None]
        if len(found) > 0:
            framework_id = found[0].id
        else:
            fail("Framework with run id %s not found." % args.run_id)
    else:
        found = [f for f in frameworks if f.id == framework_id]
        if len(found) == 0:
            fail(
                "Framework id %s does not match any %s.%s remote-run. Check status to find the correct id." %
                (framework_id, service, instance),
            )

    paasta_print("Tearing down framework %s." % framework_id)
    mesos_master = get_mesos_master()
    teardown = mesos_master.teardown(framework_id)
    if teardown.status_code == 200:
        paasta_print(PaastaColors.green("OK"))
    else:
        paasta_print(teardown.text)
def paasta_sysdig(args):
    """Print (or remotely run) the sysdig command for a service instance.

    Without ``args.local`` the same paasta invocation is re-run with
    ``--local`` over ssh on a Mesos master. Its output is expected to be
    ``<slave>:<command>``; that command is then run over ssh on the slave.
    A non-zero exit from the first ssh prints its output and exits with the
    same code.

    With ``args.local`` the slave is picked from the instance status and the
    command built by ``format_mesos_command`` is printed, using the URL and
    credentials of the Marathon client currently serving the service.

    :param args: parsed arguments; reads ``local``, ``cluster``, ``service``,
        ``instance`` and ``host``
    """
    system_paasta_config = load_system_paasta_config()

    if not args.local:
        master_host = get_any_mesos_master(
            cluster=args.cluster,
            system_paasta_config=system_paasta_config,
        )
        remote_args = ' '.join(sys.argv[1:])
        ssh_template = (
            'ssh -At -o StrictHostKeyChecking=no -o LogLevel=QUIET {0} '
            '"sudo paasta {1} --local"'
        )
        return_code, output = _run(ssh_template.format(master_host, remote_args))
        if return_code != 0:
            paasta_print(output)
            sys.exit(return_code)
        slave, command = output.split(':', 1)
        slave_cmd = "ssh -tA {} '{}'".format(slave, command.strip())
        subprocess.call(shlex.split(slave_cmd))
        return

    status = get_status_for_instance(cluster=args.cluster, service=args.service, instance=args.instance)
    slave = pick_slave_from_status(status=status, host=args.host)
    job_config = load_marathon_service_config(
        service=args.service,
        instance=args.instance,
        cluster=args.cluster,
    )
    marathon_clients = get_marathon_clients(get_marathon_servers(system_paasta_config))

    # Unfortunately, sysdig seems to only be able to take one marathon URL, so hopefully the service in question is
    # not currently moving between shards.
    client = marathon_clients.get_current_client_for_service(job_config=job_config)
    marathon_url = client.servers[0]
    marathon_user, marathon_pass = client.auth
    mesos_url = get_mesos_master().host

    parsed = urlparse(marathon_url)
    netloc_with_creds = "{}:{}@{}".format(marathon_user, marathon_pass, parsed.netloc)
    marathon_creds_url = parsed._replace(netloc=netloc_with_creds)
    paasta_print(
        format_mesos_command(slave, status.marathon.app_id, mesos_url, marathon_creds_url.geturl()))
def downscale_aws_resource(self, filtered_slaves, current_capacity, target_capacity):
    """Terminate slaves one at a time until the target capacity is reached.

    On every pass the candidates are ordered by ``ec2_fitness`` (reversed)
    and the first one is terminated gracefully, unless that would take the
    capacity below ``target_capacity``. A spot fleet request that has not
    killed anything yet is the exception: it kills one slave regardless.
    After each successful termination the task counts of the remaining
    candidates are refreshed from Mesos.

    :param filtered_slaves: slaves that are candidates for termination
    :param current_capacity: capacity before any termination
    :param target_capacity: capacity to get as close to as possible
    """
    killed_slaves = 0
    while True:
        # The slice makes a copy, so popping below never touches filtered_slaves.
        filtered_sorted_slaves = ec2_fitness.sort_by_ec2_fitness(filtered_slaves)[::-1]
        if len(filtered_sorted_slaves) == 0:
            self.log.info("ALL slaves killed so moving on to next resource!")
            break
        self.log.info("Resource slave kill preference: {}".format(
            [slave.hostname for slave in filtered_sorted_slaves]))
        slave_to_kill = filtered_sorted_slaves.pop(0)
        instance_capacity = slave_to_kill.instance_weight
        new_capacity = current_capacity - instance_capacity
        if new_capacity < target_capacity:
            self.log.info(
                "Terminating instance {} with weight {} would take us below our target of {},"
                " so this is as close to our target as we can get".format(
                    slave_to_kill.instance_id,
                    slave_to_kill.instance_weight,
                    target_capacity))
            if self.resource['type'] == 'aws_spot_fleet_request' and killed_slaves == 0:
                self.log.info(
                    "This is a SFR so we must kill at least one slave to prevent the autoscaler "
                    "getting stuck whilst scaling down gradually")
            else:
                break
        try:
            self.gracefully_terminate_slave(
                slave_to_kill=slave_to_kill,
                current_capacity=current_capacity,
                new_capacity=new_capacity)
            killed_slaves += 1
        except HTTPError:
            # Something wrong draining host so try next host. Drop the failed
            # slave from the candidates first: with the list unchanged, the
            # next pass would sort the same slaves and pick this one again.
            filtered_slaves = filtered_sorted_slaves
            continue
        except FailSetResourceCapacity:
            break
        current_capacity = new_capacity
        mesos_state = get_mesos_master().state_summary()
        if filtered_sorted_slaves:
            task_counts = get_mesos_task_count_by_slave(
                mesos_state,
                slaves_list=[{'task_counts': slave.task_counts} for slave in filtered_sorted_slaves])
            for i, slave in enumerate(filtered_sorted_slaves):
                slave.task_counts = task_counts[i]['task_counts']
        filtered_slaves = filtered_sorted_slaves
def spotfleet_metrics_provider(spotfleet_request_id, resource, pool_settings):
    """Work out the current and target capacity for a spot fleet request.

    The SFR's active instances are matched against the Mesos slaves in the
    resource's pool. The utilization of the pool's scarcest resource is then
    compared with the pool's target utilization, and the difference is
    passed to ``get_spot_fleet_delta``.

    Side effects visible in this function: ``sfr['ActiveInstances']`` is
    filled in and the SFR is stored on ``resource['sfr']``.

    :param spotfleet_request_id: id of the spot fleet request
    :param resource: resource dict; reads ``region`` and ``pool``
    :param pool_settings: dict; ``target_utilization`` overrides
        ``DEFAULT_TARGET_UTILIZATION``
    :returns: ``(current, target)`` capacity tuple; ``(0, 0)`` when the SFR
        does not exist, is not active, or has no active instances
    :raises ClusterAutoscalingError: when the share of instances missing from
        Mesos exceeds ``MISSING_SLAVE_PANIC_THRESHOLD``
    """
    mesos_state = get_mesos_master().state
    sfr = get_sfr(spotfleet_request_id, region=resource['region'])
    if not sfr or not sfr['SpotFleetRequestState'] == 'active':
        log.error("Ignoring SFR {0} that does not exist or is not active.".format(spotfleet_request_id))
        return 0, 0

    sfr['ActiveInstances'] = get_spot_fleet_instances(spotfleet_request_id, region=resource['region'])
    resource['sfr'] = sfr
    desired_instances = len(sfr['ActiveInstances'])
    if desired_instances == 0:
        # Without instances the registration ratio below would divide by zero.
        log.warning("SFR {0} has no active instances, so we do nothing".format(spotfleet_request_id))
        return 0, 0

    instance_ips = get_sfr_instance_ips(sfr, region=resource['region'])
    slaves = {
        slave['id']: slave
        for slave in mesos_state.get('slaves', [])
        if slave_pid_to_ip(slave['pid']) in instance_ips and
        slave['attributes'].get('pool', 'default') == resource['pool']
    }
    current_instances = len(slaves)
    registered_ratio = float(current_instances) / desired_instances
    log.info("Found %.2f%% slaves registered in mesos for this SFR (%d/%d)" % (
        registered_ratio * 100, current_instances, desired_instances))

    if registered_ratio < (1.00 - MISSING_SLAVE_PANIC_THRESHOLD):
        # The threshold is compared as a fraction above, so scale it by 100
        # before printing it as a percentage.
        error_message = (
            "We currently have %d instances active in mesos out of a desired %d.\n"
            "Refusing to scale because we either need to wait for the requests to be "
            "filled, or the new instances are not healthy for some reason.\n"
            "(cowardly refusing to go past %.2f%% missing instances)") % (
                current_instances, desired_instances, MISSING_SLAVE_PANIC_THRESHOLD * 100)
        raise ClusterAutoscalingError(error_message)

    # Group with the same 'default' fallback the slave filter above uses, so
    # a slave without a pool attribute does not raise KeyError here.
    pool_utilization_dict = get_resource_utilization_by_grouping(
        lambda slave: slave['attributes'].get('pool', 'default'),
        mesos_state)[resource['pool']]
    log.debug(pool_utilization_dict)
    free_pool_resources = pool_utilization_dict['free']
    total_pool_resources = pool_utilization_dict['total']
    # Utilization is driven by the resource with the smallest free share.
    utilization = 1.0 - min(
        float(free) / float(total)
        for free, total in zip(free_pool_resources, total_pool_resources)
    )
    target_utilization = pool_settings.get('target_utilization', DEFAULT_TARGET_UTILIZATION)
    error = utilization - target_utilization
    current, target = get_spot_fleet_delta(resource, error)
    return current, target
def check_registration(threshold_percentage):
    """Check that enough instances of each autoscaling resource are in Mesos.

    For every configured autoscaling resource a dry-run scaler is built and
    the share of its instances that appear as Mesos slaves is compared with
    the threshold. Resources with an unknown type, with no instances, or
    reported as new by the scaler are skipped. Exits with status 1 if the
    Mesos master is not available.

    :param threshold_percentage: minimum acceptable percentage of registered
        instances (anything ``float`` accepts)
    :returns: False as soon as one resource is below the threshold, otherwise
        True
    """
    try:
        mesos_state = block(get_mesos_master().state)
    except MasterNotAvailableException as e:
        print("Could not find Mesos Master: %s" % e.message)
        sys.exit(1)

    config = load_system_paasta_config()
    autoscaling_resources = config.get_cluster_autoscaling_resources()
    for resource in autoscaling_resources.values():
        print("Checking %s" % resource["id"])
        try:
            scaler_class = get_scaler(resource["type"])
            scaler = scaler_class(
                resource=resource,
                pool_settings=None,
                config_folder=None,
                dry_run=True,
                utilization_error=0.0,
                max_increase=0.0,
                max_decrease=0.0,
            )
        except KeyError:
            print("Couldn't find a metric provider for resource of type: {}".format(resource["type"]))
            continue

        if len(scaler.instances) == 0:
            print("No instances for this resource")
            continue

        if scaler.is_new_autoscaling_resource():
            # See OPS-13784
            threshold = config.get_monitoring_config().get("check_registered_slave_threshold")
            print(f"Autoscaling resource was created within last {threshold}"
                  " seconds and would probably fail this check")
            continue

        slaves = scaler.get_aws_slaves(mesos_state)
        percent_registered = len(slaves) / float(len(scaler.instances)) * 100
        if percent_registered < float(threshold_percentage):
            print(
                "CRIT: Only found {}% of instances in {} registered in mesos. "
                "Please check for puppet or AMI baking problems!".format(percent_registered, resource["id"]))
            return False

    print(
        "OK: Found more than {}% of instances registered for all paasta resources in this "
        "superregion".format(threshold_percentage))
    return True
def scale_aws_spot_fleet_request(resource, current_capacity, target_capacity, pool_settings, dry_run):
    """Scale a spot fleet request from its current capacity to the target.

    Scaling up only sets the target capacity and leaves the rest to AWS.
    Scaling down picks the slaves we would prefer to kill and hands them to
    ``downscale_spot_fleet_request``; nothing is killed when the candidates
    found cannot cover the requested decrease.

    :param resource: resource to scale; reads ``id``, ``region`` and ``pool``
    :param current_capacity: current SFR capacity (coerced with ``int``)
    :param target_capacity: target SFR capacity (coerced with ``int``)
    :param pool_settings: pool settings dict, passed on when scaling down
    :param dry_run: Don't drain or make changes to spot fleet if True
    """
    target_capacity = int(target_capacity)
    current_capacity = int(current_capacity)
    delta = target_capacity - current_capacity
    sfr_id = resource['id']

    if delta == 0:
        log.info("Already at target capacity: {0}".format(target_capacity))
        return

    if delta > 0:
        log.info("Increasing spot fleet capacity to: {0}".format(target_capacity))
        set_spot_fleet_request_capacity(sfr_id, target_capacity, dry_run, region=resource['region'])
        return

    # Both capacities are ints here, so only a negative delta remains.
    summary = get_mesos_master().state_summary()
    pool_slaves = get_mesos_task_count_by_slave(summary, pool=resource['pool'])
    candidates = filter_sfr_slaves(pool_slaves, resource)
    killable_capacity = sum(slave['instance_weight'] for slave in candidates)
    amount_to_decrease = -delta
    if amount_to_decrease > killable_capacity:
        log.error(
            "Didn't find enough candidates to kill. This shouldn't happen so let's not kill anything!"
        )
        return
    downscale_spot_fleet_request(
        resource=resource,
        filtered_slaves=candidates,
        current_capacity=current_capacity,
        target_capacity=target_capacity,
        pool_settings=pool_settings,
        dry_run=dry_run,
    )
def spotfleet_metrics_provider(spotfleet_request_id, resource, pool_settings, config_folder, dry_run=False):
    """Work out the current and target capacity for a spot fleet request.

    ``(0, 0)`` is returned on every path the code logs as skipping or not
    changing anything: the SFR is missing or cancelled (its config file is
    cleaned up), it is in a transitional state, AWS is still launching
    instances for an active SFR, or a ``cancelled_running`` SFR sits in an
    under-provisioned pool.

    A ``cancelled_running`` SFR otherwise has ``resource['min_capacity']``
    set to 0 and is scaled down with an error of -1; a resulting target of 1
    is turned into 0. For active and ``cancelled_running`` SFRs the function
    also fills in ``sfr['ActiveInstances']`` and stores the SFR on
    ``resource['sfr']``.

    :param spotfleet_request_id: id of the spot fleet request
    :param resource: resource dict; reads ``region``
    :param pool_settings: pool settings passed to the utilization helper
    :param config_folder: folder holding the SFR config file to clean up
    :param dry_run: passed on to the config clean-up
    :returns: ``(current, target)`` capacity tuple
    :raises ClusterAutoscalingError: if the SFR is in an unexpected state
    """
    sfr = get_sfr(spotfleet_request_id, region=resource['region'])
    if not sfr or sfr['SpotFleetRequestState'] == 'cancelled':
        # The message needs a placeholder, otherwise .format() silently
        # drops the request id.
        log.error("SFR {0} not found, removing config file.".format(spotfleet_request_id))
        cleanup_cancelled_sfr_config(spotfleet_request_id, config_folder, dry_run=dry_run)
        return 0, 0
    elif sfr['SpotFleetRequestState'] in ['cancelled_running', 'active']:
        sfr['ActiveInstances'] = get_spot_fleet_instances(spotfleet_request_id, region=resource['region'])
        resource['sfr'] = sfr
        desired_instances = len(sfr['ActiveInstances'])
        mesos_state = get_mesos_master().state
        slaves = get_sfr_slaves(resource, mesos_state)
        error = get_mesos_utilization_error(
            spotfleet_request_id,
            resource=resource,
            pool_settings=pool_settings,
            slaves=slaves,
            mesos_state=mesos_state,
            desired_instances=desired_instances)
    elif sfr['SpotFleetRequestState'] in ['submitted', 'modifying', 'cancelled_terminating']:
        log.warning("Not scaling an SFR in state: {0} so {1}, skipping...".format(
            sfr['SpotFleetRequestState'], spotfleet_request_id))
        return 0, 0
    else:
        log.error("Unexpected SFR state: {0} for {1}".format(sfr['SpotFleetRequestState'], spotfleet_request_id))
        raise ClusterAutoscalingError

    if is_aws_launching_sfr_instances(sfr) and sfr['SpotFleetRequestState'] == 'active':
        log.warning(
            "AWS hasn't reached the TargetCapacity that is currently set. We won't make any "
            "changes this time as we should wait for AWS to launch more instances first.")
        return 0, 0

    current, target = get_spot_fleet_delta(resource, error)
    if sfr['SpotFleetRequestState'] == 'cancelled_running':
        resource['min_capacity'] = 0
        slaves = get_pool_slaves(resource, mesos_state)
        pool_error = get_mesos_utilization_error(
            spotfleet_request_id,
            resource=resource,
            pool_settings=pool_settings,
            slaves=slaves,
            mesos_state=mesos_state)
        if pool_error > 0:
            log.info("Not scaling cancelled SFR {0} because we are under provisioned".format(spotfleet_request_id))
            return 0, 0
        current, target = get_spot_fleet_delta(resource, -1)
        if target == 1:
            target = 0
    return current, target
def check_mesos_active_frameworks() -> None:
    """Check that the expected frameworks exist in Mesos, then exit.

    The expected names come from the comma-separated ``expected`` option.
    Exits 0 with an ``OK:`` line when ``assert_frameworks_exist`` reports a
    healthy result, and 2 otherwise, including when the Mesos master is not
    available.
    """
    options = parse_args()
    expected_frameworks = options.expected.split(',')
    mesos_master = get_mesos_master()
    try:
        state = block(mesos_master.state)
    except MasterNotAvailableException as e:
        paasta_print("CRITICAL: %s" % e.args[0])
        sys.exit(2)

    result = assert_frameworks_exist(state, expected_frameworks)
    if not result.healthy:
        paasta_print(result.message)
        sys.exit(2)
    else:
        paasta_print("OK: " + result.message)
        sys.exit(0)
def check_mesos_no_duplicate_frameworks() -> None:
    """Check that the given frameworks are not duplicated in Mesos, then exit.

    The framework names come from the comma-separated ``check`` option.
    Exits 0 with an ``OK:`` line when ``assert_no_duplicate_frameworks``
    reports a healthy result, and 2 otherwise, including when the Mesos
    master is not available.
    """
    options = parse_args()
    frameworks_to_check = options.check.split(",")
    mesos_master = get_mesos_master()
    try:
        state = block(mesos_master.state)
    except MasterNotAvailableException as e:
        print("CRITICAL: %s" % e.args[0])
        sys.exit(2)

    result = assert_no_duplicate_frameworks(state, frameworks_to_check)
    if not result.healthy:
        print(result.message)
        sys.exit(2)
    else:
        print("OK: " + result.message)
        sys.exit(0)
def _clean_up_paasta_native_frameworks(context):
    """Terminate the active Mesos frameworks left behind by a test.

    A framework is terminated when its name starts with ``"paasta_native "``
    or equals ``context.framework_name`` (an empty string when the context
    has no such attribute). The Mesos tools cache is always cleared. HTTP
    errors raised while terminating a framework are printed and do not stop
    the clean-up.

    :param context: test context; frameworks are only cleaned up when it has
        an ``etc_paasta`` attribute
    """
    clear_mesos_tools_cache()
    # context.etc_paasta signals that we actually have configured the mesos-cli.json; without this, we don't know
    # where to connect to clean up paasta native frameworks.
    if not hasattr(context, "etc_paasta"):
        return

    active_frameworks = a_sync.block(
        mesos_tools.get_mesos_master().frameworks, active_only=True)
    for framework in active_frameworks:
        should_terminate = (
            framework.name.startswith("paasta_native ")
            or framework.name == getattr(context, "framework_name", "")
        )
        if not should_terminate:
            continue
        print("cleaning up framework %s" % framework.name)
        try:
            mesos_tools.terminate_framework(framework.id)
        except requests.exceptions.HTTPError as e:
            print(
                f"Got exception when terminating framework {framework.id}: {e}"
            )
def autoscale_local_cluster(config_folder, dry_run=False, log_level=None):
    """Build a scaler for every autoscaling resource and run them in parallel.

    Utilization errors are computed once for all resources from a single
    Mesos state. Each resource then gets a scaler for its type, grouped by
    ``(region, pool)``. The scalers are filtered, sorted and run on the
    asyncio event loop, which is closed afterwards.

    :param config_folder: folder passed to every scaler
    :param dry_run: passed to every scaler; only logged here
    :param log_level: passed to every scaler
    """
    log.debug("Sleep 20s to throttle AWS API calls")
    time.sleep(20)
    if dry_run:
        log.info("Running in dry_run mode, no changes should be made")

    system_config = load_system_paasta_config()
    autoscaling_resources = system_config.get_cluster_autoscaling_resources()
    autoscaling_draining_enabled = system_config.get_cluster_autoscaling_draining_enabled()
    all_pool_settings = system_config.get_resource_pool_settings()
    mesos_state = get_mesos_master().state
    utilization_errors = get_all_utilization_errors(autoscaling_resources, all_pool_settings, mesos_state)

    autoscaling_scalers = defaultdict(list)
    for identifier, resource in autoscaling_resources.items():
        pool_settings = all_pool_settings.get(resource['pool'], {})
        try:
            scaler = get_scaler(resource['type'])(
                resource=resource,
                pool_settings=pool_settings,
                config_folder=config_folder,
                dry_run=dry_run,
                log_level=log_level,
                utilization_error=utilization_errors[(resource['region'], resource['pool'])],
                draining_enabled=autoscaling_draining_enabled,
            )
            autoscaling_scalers[(resource['region'], resource['pool'])].append(scaler)
        except KeyError:
            log.warning(
                "Couldn't find a metric provider for resource of type: {}".format(resource['type']))
            continue
        log.debug("Sleep 3s to throttle AWS API calls")
        time.sleep(3)

    filtered_autoscaling_scalers = filter_scalers(autoscaling_scalers, utilization_errors)
    sorted_autoscaling_scalers = sort_scalers(filtered_autoscaling_scalers)
    event_loop = asyncio.get_event_loop()
    try:
        event_loop.run_until_complete(run_parallel_scalers(sorted_autoscaling_scalers, mesos_state))
    finally:
        # Close the loop even when a scaler raises, so it is not leaked.
        event_loop.close()
def downscale_spot_fleet_request(resource, filtered_slaves, current_capacity, target_capacity, pool_settings, dry_run):
    """Terminate spot fleet slaves one at a time until the target is reached.

    On every pass the candidates are sorted with ``sort_slaves_to_kill`` and
    the first one is terminated gracefully. A termination that would take
    the capacity below ``target_capacity`` stops the loop, except that at
    least one slave is always killed. After each successful termination the
    task counts of the remaining candidates are refreshed from Mesos.

    :param resource: resource being scaled
    :param filtered_slaves: slaves that are candidates for termination
    :param current_capacity: capacity before any termination
    :param target_capacity: capacity to get as close to as possible
    :param pool_settings: pool settings passed on to the termination helper
    :param dry_run: passed on to the termination helper
    """
    killed_slaves = 0
    while True:
        candidates = sort_slaves_to_kill(filtered_slaves)
        if len(candidates) == 0:
            log.info("ALL slaves killed so moving on to next pool!")
            break

        hostnames = [slave['hostname'] for slave in candidates]
        log.info("SFR slave kill preference: {0}".format(hostnames))
        # reverse() + pop() takes the most preferred (first) slave.
        candidates.reverse()
        slave_to_kill = candidates.pop()
        new_capacity = current_capacity - slave_to_kill['instance_weight']

        if new_capacity < target_capacity:
            log.info("Terminating instance {0} with weight {1} would take us below our target of {2}, so this is as"
                     " close to our target as we can get".format(slave_to_kill['instance_id'],
                                                                 slave_to_kill['instance_weight'],
                                                                 target_capacity))
            if killed_slaves != 0:
                break
            log.info("This is a SFR so we must kill at least one slave to prevent the autoscaler "
                     "getting stuck whilst scaling down gradually")

        try:
            gracefully_terminate_slave(
                resource=resource,
                slave_to_kill=slave_to_kill,
                pool_settings=pool_settings,
                current_capacity=current_capacity,
                new_capacity=new_capacity,
                dry_run=dry_run,
            )
            killed_slaves += 1
        except HTTPError:
            # Something wrong draining host so try next host
            # NOTE(review): filtered_slaves is not updated on this path; whether
            # the same slave is picked again depends on whether
            # sort_slaves_to_kill returns a new list -- confirm.
            continue
        except FailSetSpotCapacity:
            break

        current_capacity = new_capacity
        mesos_state = get_mesos_master().state_summary()
        filtered_slaves = (
            get_mesos_task_count_by_slave(mesos_state, slaves_list=candidates)
            if candidates
            else candidates
        )
def resources_utilization(request):
    """Return resource utilization grouped by slave attributes as JSON.

    Slaves are grouped by the attributes in the ``groupings`` request
    parameter (``['superregion']`` when it is absent or None) and filtered
    by the ``filter`` parameter. Each entry of the response body holds the
    group's attribute values under ``groupings`` plus, per resource, its
    ``total``, ``free`` and ``used`` amounts.

    :param request: request object exposing ``swagger_data``
    :returns: a ``Response`` with status code 200
    """
    mesos_state = block(get_mesos_master().state)

    groupings = request.swagger_data.get('groupings', ['superregion'])
    # swagger actually makes the key None if it's not set
    if groupings is None:
        groupings = ['superregion']
    grouping_function = metastatus_lib.key_func_for_attribute_multi(groupings)
    sorting_function = metastatus_lib.sort_func_for_attributes(groupings)

    parsed_filters = parse_filters(request.swagger_data.get('filter', []))
    filter_funcs = [
        metastatus_lib.make_filter_slave_func(attr, vals)
        for attr, vals in parsed_filters.items()
    ]

    resource_info_dict = metastatus_lib.get_resource_utilization_by_grouping(
        grouping_func=grouping_function,
        mesos_state=mesos_state,
        filters=filter_funcs,
        sort_func=sorting_function,
    )

    response_body = []
    for group_key, info in resource_info_dict.items():
        group = {'groupings': {grouping: value for grouping, value in group_key}}
        for name, amount in info['total']._asdict().items():
            group[name] = {'total': amount}
        for name, amount in info['free']._asdict().items():
            group[name]['free'] = amount
        for name in info['free']._fields:
            entry = group[name]
            entry['used'] = entry['total'] - entry['free']
        response_body.append(group)

    return Response(json_body=response_body, status_code=200)
def unreserve_all_resources(hostnames):
    """Dynamically unreserve every reserved resource on the specified hosts.

    For each slave in the Mesos state summary whose hostname matches, the
    ``disk``, ``mem`` and ``cpus`` amounts of every role under
    ``reserved_resources`` are collected and passed to ``unreserve``.

    :param hostnames: list of hostnames to unreserve resources on
    :raises HTTPError: if unreserving fails for a slave; slaves after it are not processed
    """
    state_summary = get_mesos_master().state_summary()
    hosts = components_to_hosts(hostnames_to_components(hostnames))
    matching_slaves = [
        candidate for candidate in state_summary['slaves']
        if candidate['hostname'] in hosts
    ]

    for slave in matching_slaves:
        hostname = slave['hostname']
        log.info("Unreserving all resources on %s" % hostname)
        slave_id = slave['id']

        resources = [
            Resource(name=name, amount=reserved[name])
            for reserved in slave['reserved_resources'].values()
            for name in ('disk', 'mem', 'cpus')
        ]

        try:
            unreserve(slave_id=slave_id, resources=resources)
        except HTTPError:
            raise HTTPError("Failed unreserving all of the resources on %s (%s). Aborting." % (hostname, slave_id))
def check_registration(threshold_percentage):
    """Check that enough spot fleet instances are registered as Mesos slaves.

    Every autoscaling resource of type ``aws_spot_fleet_request`` is looked up;
    resources without any instances are skipped. The check fails on the first
    resource whose share of registered instances is below the threshold.

    :param threshold_percentage: minimum percentage of instances that must be
        registered; converted with ``float()``
    :returns: ``False`` as soon as one resource is below the threshold, ``True`` otherwise
    """
    mesos_state = get_mesos_master().state
    autoscaling_resources = load_system_paasta_config().get_cluster_autoscaling_resources()
    for resource in autoscaling_resources.values():
        if resource['type'] != 'aws_spot_fleet_request':
            continue
        resource['sfr'] = get_sfr(resource['id'], region=resource['region'])
        instances = get_spot_fleet_instances(resource['id'], region=resource['region'])
        resource['sfr']['ActiveInstances'] = instances
        slaves = get_sfr_slaves(resource, mesos_state)
        if len(instances) == 0:
            # Nothing to compare against (and it would divide by zero below).
            continue
        percent_registered = float(len(slaves)) / float(len(instances)) * 100
        if percent_registered < float(threshold_percentage):
            # print(...) with a single argument behaves the same on Python 2 and 3.
            print(
                "CRIT: Only found {0}% of instances in {1} registered in mesos. "
                "Please check for puppet or AMI baking problems!".format(percent_registered, resource['id'])
            )
            return False
    print(
        "OK: Found more than {0}% of instances registered for all paasta resources in this "
        "superregion".format(threshold_percentage)
    )
    return True
def downscale_spot_fleet_request(resource, filtered_slaves, current_capacity, target_capacity, pool_settings, dry_run):
    """Terminate spot fleet slaves one at a time until the target capacity is reached.

    Each pass re-sorts the remaining candidates with ``sort_slaves_to_kill``,
    takes the most preferred one and hands it to ``gracefully_terminate_slave``.
    The loop stops when there are no candidates left, when terminating the next
    candidate would drop capacity below ``target_capacity``, or when
    ``FailSetSpotCapacity`` is raised. The capacity limit is ignored while
    nothing has been killed yet and the spot fleet request state is
    ``cancelled_running``.

    :param resource: resource being scaled; ``resource['sfr']['SpotFleetRequestState']``
        is read here and the whole resource is passed to ``gracefully_terminate_slave``
    :param filtered_slaves: candidate slaves; each entry is read for its
        ``hostname``, ``instance_id`` and ``instance_weight`` keys
    :param current_capacity: capacity of the resource before anything is terminated
    :param target_capacity: capacity to scale down towards
    :param pool_settings: passed through to ``gracefully_terminate_slave``
    :param dry_run: passed through to ``gracefully_terminate_slave``
    """
    killed_slaves = 0
    while True:
        filtered_sorted_slaves = sort_slaves_to_kill(filtered_slaves)
        if not filtered_sorted_slaves:
            log.info("ALL slaves killed so moving on to next pool!")
            break
        log.info("SFR slave kill preference: {0}".format([slave['hostname'] for slave in filtered_sorted_slaves]))
        # reverse() followed by pop() removes the first (most preferred) slave.
        filtered_sorted_slaves.reverse()
        slave_to_kill = filtered_sorted_slaves.pop()
        instance_capacity = slave_to_kill['instance_weight']
        new_capacity = current_capacity - instance_capacity
        if new_capacity < target_capacity:
            log.info("Terminating instance {0} with weight {1} would take us below our target of {2}, so this is as"
                     " close to our target as we can get".format(slave_to_kill['instance_id'],
                                                                 slave_to_kill['instance_weight'],
                                                                 target_capacity))
            if resource['sfr']['SpotFleetRequestState'] == 'cancelled_running' and killed_slaves == 0:
                log.info("This is a cancelled SFR so we must kill at least one slave to prevent it lingering")
            else:
                break
        try:
            gracefully_terminate_slave(resource=resource,
                                       slave_to_kill=slave_to_kill,
                                       pool_settings=pool_settings,
                                       current_capacity=current_capacity,
                                       new_capacity=new_capacity,
                                       dry_run=dry_run)
            killed_slaves += 1
        except HTTPError:
            # Something wrong draining host so try next host. The failed slave
            # has to be dropped from the candidates, otherwise the next pass
            # would sort the same list and pick the same host again.
            filtered_slaves = filtered_sorted_slaves
            continue
        except FailSetSpotCapacity:
            break
        current_capacity = new_capacity
        mesos_state = get_mesos_master().state_summary()
        if filtered_sorted_slaves:
            # Refresh the task counts of the remaining candidates before the next pass.
            filtered_slaves = get_mesos_task_count_by_slave(mesos_state, slaves_list=filtered_sorted_slaves)
        else:
            filtered_slaves = filtered_sorted_slaves
def reserve_all_resources(hostnames):
    """Dynamically reserve every free resource on the specified hosts.

    For each matching slave the free amount of ``disk``, ``mem`` and ``cpus``
    is the slave's total minus what is used and minus what each role already
    has reserved; those amounts are passed to ``reserve``.

    :param hostnames: list of hostnames to reserve resources on
    :raises HTTPError: if reserving fails for a slave; slaves after it are not processed
    """
    state_summary = a_sync.block(get_mesos_master().state_summary)
    hosts = components_to_hosts(hostnames_to_components(hostnames))
    matching_slaves = [
        candidate for candidate in state_summary['slaves']
        if candidate['hostname'] in hosts
    ]

    for slave in matching_slaves:
        hostname = slave['hostname']
        log.info("Reserving all resources on %s" % hostname)
        slave_id = slave['id']

        resources = []
        for name in ('disk', 'mem', 'cpus'):
            amount = slave['resources'][name] - slave['used_resources'][name]
            for reserved in slave['reserved_resources'].values():
                amount -= reserved[name]
            resources.append(Resource(name=name, amount=amount))

        try:
            reserve(slave_id=slave_id, resources=resources)
        except HTTPError:
            raise HTTPError(f"Failed reserving all of the resources on {hostname} ({slave_id}). Aborting.")
def check_mesos_no_duplicate_frameworks():
    """Check the Mesos framework count and exit with a monitoring-style status.

    Prints ``OK: ...`` and exits with status 0 when ``assert_framework_count``
    reports a healthy result. Prints ``CRITICAL: ...`` and exits with status 2
    when the result is unhealthy or the Mesos master is not available.
    """
    master = get_mesos_master()
    try:
        state = master.state
    except MasterNotAvailableException as e:
        # Exceptions only carry ``.message`` by default on Python 2; fall back
        # to the exception itself so this handler cannot fail on Python 3.
        paasta_print("CRITICAL: %s" % getattr(e, 'message', e))
        sys.exit(2)

    system_paasta_config = load_system_paasta_config()
    marathon_servers = get_marathon_servers(system_paasta_config)
    marathon_clients = get_marathon_clients(marathon_servers)
    marathon_framework_ids = get_marathon_framework_ids(marathon_clients)

    result = assert_framework_count(
        state=state,
        marathon_framework_ids=marathon_framework_ids,
    )
    if result.healthy:
        paasta_print("OK: " + result.message)
        sys.exit(0)
    else:
        paasta_print("CRITICAL: %s" % result.message)
        sys.exit(2)
def scale_aws_spot_fleet_request(resource, current_capacity, target_capacity, pool_settings, dry_run):
    """Move a spot fleet request from its current capacity to the target capacity.

    Scaling up only sets the new target capacity on the spot fleet request.
    Scaling down gathers the slaves of the resource's pool that belong to this
    spot fleet request and hands them to ``downscale_spot_fleet_request``;
    nothing is killed if their combined weight cannot cover the decrease.

    :param resource: resource to scale; its ``id``, ``region`` and ``pool`` are read
    :param current_capacity: integer current SFR capacity
    :param target_capacity: target SFR capacity (converted with ``int()``)
    :param pool_settings: pool settings, passed through when scaling down
    :param dry_run: passed to the capacity setter and to the downscale routine
    """
    target_capacity = int(target_capacity)
    delta = target_capacity - current_capacity
    sfr_id = resource['id']

    if delta == 0:
        log.info("Already at target capacity: {0}".format(target_capacity))
    elif delta > 0:
        log.info("Increasing spot fleet capacity to: {0}".format(target_capacity))
        set_spot_fleet_request_capacity(sfr_id, target_capacity, dry_run, region=resource['region'])
    elif delta < 0:
        state_summary = get_mesos_master().state_summary()
        pool_slaves = get_mesos_task_count_by_slave(state_summary, pool=resource['pool'])
        filtered_slaves = filter_sfr_slaves(pool_slaves, resource)
        killable_capacity = sum(candidate['instance_weight'] for candidate in filtered_slaves)
        amount_to_decrease = -delta
        if amount_to_decrease > killable_capacity:
            log.error("Didn't find enough candidates to kill. This shouldn't happen so let's not kill anything!")
            return
        downscale_spot_fleet_request(
            resource=resource,
            filtered_slaves=filtered_slaves,
            current_capacity=current_capacity,
            target_capacity=target_capacity,
            pool_settings=pool_settings,
            dry_run=dry_run,
        )
def print_output(argv: Optional[Sequence[str]] = None) -> None:
    """Run the Mesos, Marathon and Kubernetes health checks and print the results.

    Checks for a system that is not available are replaced by a single healthy
    placeholder result. With ``args.verbose`` above 1 resource utilization
    tables are printed as well, and at 3 or above per-slave / per-node tables.

    :param argv: command line arguments, handed to ``parse_args``
    :raises FatalError: with code 2 if the Mesos master is unavailable, if the
        Mesos or Marathon checks are unhealthy, or if a grouped utilization
        table reports an unhealthy state
    """
    mesos_available = is_mesos_available()
    kube_available = is_kubernetes_available()

    args = parse_args(argv)

    system_paasta_config = load_system_paasta_config()

    if mesos_available:
        master_kwargs = {}
        # we don't want to be passing False to not override a possible True
        # value from system config
        if args.use_mesos_cache:
            master_kwargs["use_mesos_cache"] = True

        master = get_mesos_master(**master_kwargs)

        marathon_servers = get_marathon_servers(system_paasta_config)
        marathon_clients = all_marathon_clients(
            get_marathon_clients(marathon_servers))

        try:
            mesos_state = a_sync.block(master.state)
            all_mesos_results = _run_mesos_checks(mesos_master=master, mesos_state=mesos_state)
        except MasterNotAvailableException as e:
            # if we can't connect to master at all,
            # then bomb out early
            paasta_print(PaastaColors.red("CRITICAL: %s" % "\n".join(e.args)))
            raise FatalError(2)

        marathon_results = _run_marathon_checks(marathon_clients)
    else:
        marathon_results = [
            metastatus_lib.HealthCheckResult(
                message="Marathon is not configured to run here", healthy=True)
        ]
        all_mesos_results = [
            metastatus_lib.HealthCheckResult(
                message="Mesos is not configured to run here", healthy=True)
        ]

    if kube_available:
        kube_client = KubeClient()
        kube_results = _run_kube_checks(kube_client)
    else:
        kube_results = [
            metastatus_lib.HealthCheckResult(
                message="Kubernetes is not configured to run here", healthy=True)
        ]

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    kube_ok = all(metastatus_lib.status_for_results(kube_results))

    mesos_summary = metastatus_lib.generate_summary_for_check(
        "Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check(
        "Marathon", marathon_ok)
    kube_summary = metastatus_lib.generate_summary_for_check(
        "Kubernetes", kube_ok)

    # NOTE(review): kube_ok is printed below but is not part of the exit
    # status - confirm that this is intended.
    healthy_exit = all([mesos_ok, marathon_ok])

    paasta_print(f"Master paasta_tools version: {__version__}")
    paasta_print("Mesos leader: %s" % get_mesos_leader())
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok, all_mesos_results, args.verbose)
    if args.verbose > 1 and mesos_available:
        print_with_indent(
            "Resources Grouped by %s" % ", ".join(args.groupings), 2)
        all_rows, mesos_utilization_ok = utilization_table_by_grouping_from_mesos_state(
            groupings=args.groupings,
            threshold=args.threshold,
            mesos_state=mesos_state,
        )
        # Combine with the check results instead of replacing them, so the
        # exit status does not depend on the verbosity level.
        healthy_exit = healthy_exit and mesos_utilization_ok
        for line in format_table(all_rows):
            print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("Autoscaling resources:", 2)
            headers = [
                field.replace("_", " ").capitalize()
                for field in AutoscalingInfo._fields
            ]
            table = [headers] + [
                [str(x) for x in asi]
                for asi in get_autoscaling_info_for_all_resources(mesos_state)
            ]

            for line in format_table(table):
                print_with_indent(line, 4)

        if args.verbose >= 3:
            print_with_indent("Per Slave Utilization", 2)
            cluster = system_paasta_config.get_cluster()
            service_instance_stats = get_service_instance_stats(
                args.service, args.instance, cluster)
            if service_instance_stats:
                print_with_indent(
                    "Service-Instance stats:" + str(service_instance_stats), 2)
            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            all_rows, _ = utilization_table_by_grouping_from_mesos_state(
                groupings=args.groupings + ["hostname"],
                threshold=args.threshold,
                mesos_state=mesos_state,
                service_instance_stats=service_instance_stats,
            )
            # The last column from utilization_table_by_grouping_from_mesos_state is "Agent count", which will always be
            # 1 for per-slave resources, so delete it.
            for row in all_rows:
                row.pop()

            for line in format_table(all_rows):
                print_with_indent(line, 4)

    metastatus_lib.print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    metastatus_lib.print_results_for_healthchecks(kube_summary, kube_ok, kube_results, args.verbose)
    if args.verbose > 1 and kube_available:
        print_with_indent(
            "Resources Grouped by %s" % ", ".join(args.groupings), 2)
        all_rows, kube_utilization_ok = utilization_table_by_grouping_from_kube(
            groupings=args.groupings,
            threshold=args.threshold,
            kube_client=kube_client,
        )
        # Same as for Mesos: never let a healthy table hide an earlier failure.
        healthy_exit = healthy_exit and kube_utilization_ok
        for line in format_table(all_rows):
            print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("No autoscaling resources for Kubernetes", 2)

        if args.verbose >= 3:
            print_with_indent("Per Node Utilization", 2)
            cluster = system_paasta_config.get_cluster()
            service_instance_stats = get_service_instance_stats(
                args.service, args.instance, cluster)
            if service_instance_stats:
                print_with_indent(
                    "Service-Instance stats:" + str(service_instance_stats), 2)
            # print info about nodes here. Note that we don't make
            # modifications to the healthy_exit variable here, because we don't
            # care about a single node having high usage.
            all_rows, _ = utilization_table_by_grouping_from_kube(
                groupings=args.groupings + ["hostname"],
                threshold=args.threshold,
                kube_client=kube_client,
                service_instance_stats=service_instance_stats,
            )
            # The last column from utilization_table_by_grouping_from_kube is "Agent count", which will always be
            # 1 for per-node resources, so delete it.
            for row in all_rows:
                row.pop()

            for line in format_table(all_rows):
                print_with_indent(line, 4)

    if not healthy_exit:
        raise FatalError(2)
def get_mesos_state():
    """Return the ``state`` of the Mesos master obtained with ``use_mesos_cache=True``."""
    return get_mesos_master(use_mesos_cache=True).state
def main():
    """Run the Mesos, Marathon and Chronos health checks, print them and exit.

    Marathon and Chronos are only checked when their configuration is found;
    otherwise a single healthy placeholder result is reported for them. With
    ``args.verbose`` above 1 a utilization table is printed per grouping, and
    at 3 or above a per-slave table as well.

    Exits with status 2 if the Mesos master, Marathon or Chronos cannot be
    contacted, if any health check is unhealthy, or if any grouped utilization
    check is unhealthy; exits with status 0 otherwise.
    """
    args = parse_args()

    master = get_mesos_master()
    try:
        mesos_state = master.state
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        # (``.message`` only exists by default on Python 2 exceptions)
        print(PaastaColors.red("CRITICAL: %s" % getattr(e, 'message', e)))
        sys.exit(2)

    mesos_state_status = metastatus_lib.get_mesos_state_status(
        mesos_state=mesos_state,
    )

    metrics = master.metrics_snapshot()
    mesos_metrics_status = metastatus_lib.get_mesos_resource_utilization_health(mesos_metrics=metrics,
                                                                                mesos_state=mesos_state)
    framework_metrics_healthchecks = metastatus_lib.get_framework_metrics_status(metrics=metrics)

    all_mesos_results = mesos_state_status + mesos_metrics_status + framework_metrics_healthchecks

    # Check to see if Marathon should be running here by checking for config
    marathon_config = marathon_tools.load_marathon_config()

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if marathon_config:
        marathon_client = metastatus_lib.get_marathon_client(marathon_config)
        try:
            marathon_results = metastatus_lib.get_marathon_status(marathon_client)
        except MarathonError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)
    else:
        marathon_results = [metastatus_lib.HealthCheckResult(message='Marathon is not configured to run here',
                                                             healthy=True)]

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except (chronos.ChronosAPIError) as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [metastatus_lib.HealthCheckResult(message='Chronos is not configured to run here',
                                                            healthy=True)]

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = all([mesos_ok, marathon_ok, chronos_ok])

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Master paasta_tools version: {0}".format(__version__))
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok, all_mesos_results, args.verbose)
    if args.verbose > 1:
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            grouping_function = metastatus_lib.key_func_for_attribute(grouping)
            resource_info_by_value = metastatus_lib.get_resource_utilization_by_grouping(grouping_function,
                                                                                         mesos_state)
            all_rows = [[grouping.capitalize(), 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_by_value.items():
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                # Accumulate instead of overwriting: one unhealthy group, or an
                # unhealthy service check above, must not be hidden by a later
                # healthy group.
                healthy_exit = healthy_exit and all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize
                ))
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        if args.verbose >= 3:
            print_with_indent('Per Slave Utilization', 2)
            slave_resource_dict = metastatus_lib.get_resource_utilization_by_grouping(lambda slave: slave['hostname'],
                                                                                      mesos_state)
            all_rows = [['Hostname', 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]

            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            table_rows = []
            for attribute_value, resource_info_dict in slave_resource_dict.items():
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize
                ))
            # Sort once over every slave so that rows come out ordered by hostname.
            all_rows.extend(sorted(table_rows, key=lambda x: x[0]))
            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)

    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)
def main(argv=None):
    """Run the Mesos, Marathon and Chronos health checks, print them and exit.

    Chronos is only checked when its configuration is found; otherwise a
    single healthy placeholder result is reported for it. With
    ``args.verbose`` above 1 a utilization table is printed per grouping
    (plus autoscaling information when ``args.autoscaling_info`` is set), and
    at 3 or above a per-slave table as well.

    Exits with status 2 if the Mesos master or Chronos cannot be contacted, if
    any health check is unhealthy, or if any grouped utilization check is
    unhealthy; exits with status 0 otherwise.

    :param argv: command line arguments, handed to ``parse_args``
    """
    args = parse_args(argv)

    system_paasta_config = load_system_paasta_config()

    master_kwargs = {}
    # we don't want to be passing False to not override a possible True
    # value from system config
    if args.use_mesos_cache:
        master_kwargs['use_mesos_cache'] = True
    master = get_mesos_master(**master_kwargs)

    marathon_servers = get_marathon_servers(system_paasta_config)
    marathon_clients = all_marathon_clients(get_marathon_clients(marathon_servers))

    try:
        mesos_state = master.state
        all_mesos_results = _run_mesos_checks(
            mesos_master=master,
            mesos_state=mesos_state,
            marathon_clients=marathon_clients,
        )
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        # (``.message`` only exists by default on Python 2 exceptions)
        paasta_print(PaastaColors.red("CRITICAL: %s" % getattr(e, 'message', e)))
        sys.exit(2)

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config, cached=True)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except (chronos.ChronosAPIError) as e:
            paasta_print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [metastatus_lib.HealthCheckResult(
            message='Chronos is not configured to run here',
            healthy=True,
        )]

    marathon_results = _run_marathon_checks(marathon_clients)

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = all([mesos_ok, marathon_ok, chronos_ok])

    paasta_print("Master paasta_tools version: {}".format(__version__))
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok, all_mesos_results, args.verbose)
    if args.verbose > 1:
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            grouping_function = metastatus_lib.key_func_for_attribute(grouping)
            resource_info_by_value = metastatus_lib.get_resource_utilization_by_grouping(
                grouping_function,
                mesos_state,
            )
            all_rows = [[
                grouping.capitalize(), 'CPU (used/total)', 'RAM (used/total)', 'Disk (used/total)',
                'GPU (used/total)', 'Agent count',
            ]]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_by_value.items():
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization,
                        args.threshold,
                    )
                    for utilization in resource_utilizations
                ]
                # Accumulate instead of overwriting: one unhealthy group, or an
                # unhealthy service check above, must not be hidden by a later
                # healthy group.
                healthy_exit = healthy_exit and all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize,
                ) + [str(resource_info_dict['slave_count'])])
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("Autoscaling resources:", 2)
            headers = [field.replace("_", " ").capitalize() for field in AutoscalingInfo._fields]
            # Header row followed by one row per autoscaling resource.
            table = [headers] + list(get_autoscaling_info_for_all_resources(mesos_state))

            for line in format_table(table):
                print_with_indent(line, 4)

        if args.verbose >= 3:
            print_with_indent('Per Slave Utilization', 2)
            slave_resource_dict = metastatus_lib.get_resource_utilization_by_grouping(
                lambda slave: slave['hostname'],
                mesos_state,
            )
            all_rows = [['Hostname', 'CPU (used/total)', 'RAM (used/total)', 'Disk (used/total)', 'GPU (used/total)']]

            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            table_rows = []
            for attribute_value, resource_info_dict in slave_resource_dict.items():
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization,
                        args.threshold,
                    )
                    for utilization in resource_utilizations
                ]
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize,
                ))
            # Sort once over every slave so that rows come out ordered by hostname.
            all_rows.extend(sorted(table_rows, key=lambda x: x[0]))
            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)

    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)