def setup_paasta_api():
    """Initialise the module-level ``settings`` used by the PaaSTA API.

    Configures root logging (DEBUG when the ``PAASTA_API_DEBUG`` environment
    variable is set to a non-empty value, WARNING otherwise), disables the
    ``service_configuration_lib`` YAML cache, stores the system config,
    cluster name, Marathon servers and Marathon clients on ``settings``, and
    installs a ``requests_cache`` cache named ``paasta-api`` using the
    ``memory`` backend with ``expire_after=5``.
    """
    if os.environ.get("PAASTA_API_DEBUG"):
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    # pyinotify is a better solution than turning off file caching completely
    service_configuration_lib.disable_yaml_cache()

    settings.system_paasta_config = load_system_paasta_config()
    settings.cluster = settings.system_paasta_config.get_cluster()

    # Resolve the servers once and build the clients once from them.  An
    # earlier revision built a first set of clients that was immediately
    # overwritten by the assignment below.
    settings.marathon_servers = marathon_tools.get_marathon_servers(
        system_paasta_config=settings.system_paasta_config,
    )
    settings.marathon_clients = marathon_tools.get_marathon_clients(
        marathon_servers=settings.marathon_servers,
        cached=False,
    )

    # Set up transparent cache for http API calls. With expire_after, responses
    # are removed only when the same request is made. Expired storage is not a
    # concern here. Thus remove_expired_responses is not needed.
    requests_cache.install_cache("paasta-api", backend="memory", expire_after=5)
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """Run one autoscaling pass over the services to scale in this cluster.

    Logs a warning and returns without doing anything when autoscaling is
    paused, and logs a warning when the autoscaling lock is already held.
    An exception raised while handling a single service config is reported
    through ``write_to_log`` and the remaining configs are still processed.

    :param soa_dir: The SOA config directory to read service configs from
    """
    if autoscaling_is_paused():
        log.warning("Skipping autoscaling because autoscaler paused")
        return

    try:
        with create_autoscaling_lock():
            paasta_config = load_system_paasta_config()
            service_configs = get_configs_of_services_to_scale(
                cluster=paasta_config.get_cluster(),
                soa_dir=soa_dir,
            )
            servers = get_marathon_servers(paasta_config)
            clients = get_marathon_clients(servers)
            apps_and_clients = get_marathon_apps_with_clients(
                clients.get_all_clients(),
                embed_tasks=True,
            )
            running_mesos_tasks = get_all_running_tasks()

            # Nothing follows the scaling loop, so bail out early when there
            # is nothing to scale.
            if not service_configs:
                return

            with ZookeeperPool():
                for service_config in service_configs:
                    try:
                        marathon_tasks, mesos_tasks = filter_autoscaling_tasks(
                            [app for app, _client in apps_and_clients],
                            running_mesos_tasks,
                            service_config,
                        )
                        autoscale_marathon_instance(
                            service_config,
                            list(marathon_tasks.values()),
                            mesos_tasks,
                        )
                    except Exception as e:
                        write_to_log(
                            config=service_config,
                            line='Caught Exception %s' % e,
                        )
    except LockHeldException:
        log.warning(
            "Skipping autoscaling run for services because the lock is held"
        )
def autoscale_service_configs(
    service_configs: Sequence[MarathonServiceConfig],
    system_paasta_config: SystemPaastaConfig,
) -> None:
    """Autoscale each of the given Marathon service configs.

    Logs a warning and returns immediately when autoscaling is paused.
    An exception raised while handling one config is reported through
    ``write_to_log`` at ``debug`` level and does not stop the loop.

    :param service_configs: The service configs to consider for scaling
    :param system_paasta_config: System config used to locate Marathon
    """
    if autoscaling_is_paused():
        log.warning("Skipping autoscaling because autoscaler paused")
        return

    servers = get_marathon_servers(system_paasta_config)
    clients = get_marathon_clients(servers)
    apps_and_clients = get_marathon_apps_with_clients(
        clients.get_all_clients(),
        embed_tasks=True,
    )
    running_mesos_tasks = a_sync.block(get_all_running_tasks)

    with ZookeeperPool():
        for service_config in service_configs:
            try:
                marathon_tasks, mesos_tasks = filter_autoscaling_tasks(
                    [app for app, _client in apps_and_clients],
                    running_mesos_tasks,
                    service_config,
                    system_paasta_config,
                )
                autoscale_marathon_instance(
                    service_config,
                    system_paasta_config,
                    list(marathon_tasks.values()),
                    mesos_tasks,
                )
            except Exception as e:
                write_to_log(
                    config=service_config,
                    line="Caught Exception %s" % e,
                    level="debug",
                )
def setup(self) -> None:
    """Load the system config and cache Marathon handles on the instance.

    Sets ``marathon_servers``, ``marathon_clients`` and ``max_failures``
    (taken from ``get_deployd_max_service_instance_failures()``).
    """
    paasta_config = load_system_paasta_config()
    servers = marathon_tools.get_marathon_servers(paasta_config)
    self.marathon_servers = servers
    self.marathon_clients = marathon_tools.get_marathon_clients(servers)
    self.max_failures = paasta_config.get_deployd_max_service_instance_failures()
def marathon(self) -> marathon_tools.MarathonClients:
    """Return the Marathon clients, building them on first use.

    The clients are stored on ``self._marathon`` so later calls reuse the
    same object instead of reloading the system config.
    """
    if self._marathon is not None:
        return self._marathon

    servers = marathon_tools.get_marathon_servers(load_system_paasta_config())
    self._marathon = marathon_tools.get_marathon_clients(servers, cached=True)
    return self._marathon
def setup(self) -> None:
    """Load the system config and cache Marathon handles on the instance.

    Sets ``marathon_servers`` and ``marathon_clients``.
    """
    servers = marathon_tools.get_marathon_servers(load_system_paasta_config())
    self.marathon_servers = servers
    self.marathon_clients = marathon_tools.get_marathon_clients(servers)
def main() -> None:
    """Deploy the marathon service instances named on the command line.

    Exits 1 if any service.instance failed to deploy (or had an invalid
    name), 0 otherwise.

    Steps, in order:
    - Parse arguments and configure logging
    - Install an in-process HTTP cache for the duration of the run
    - Load the system configuration and connect to marathon
    - For each service.instance: decompose its job id and deploy/bounce it
    - Remove the HTTP cache and exit with the overall status
    """
    args = parse_args()
    soa_dir = args.soa_dir
    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.WARNING,
    )

    # Setting up transparent cache for http API calls
    requests_cache.install_cache("setup_marathon_jobs", backend="memory")

    paasta_config = load_system_paasta_config()
    servers = marathon_tools.get_marathon_servers(paasta_config)
    clients = marathon_tools.get_marathon_clients(servers)
    apps_with_clients = marathon_tools.get_marathon_apps_with_clients(
        clients.get_all_clients(),
        embed_tasks=True,
    )

    failed = 0
    for service_instance in args.service_instance_list:
        try:
            service, instance, _, __ = decompose_job_id(service_instance)
        except InvalidJobNameError:
            log.error(
                "Invalid service instance specified. Format is service%sinstance."
                % SPACER
            )
            failed += 1
            continue

        # The first element of the result is truthy when the deploy failed.
        deploy_result = deploy_marathon_service(
            service, instance, clients, soa_dir, apps_with_clients
        )
        if deploy_result[0]:
            failed += 1

    requests_cache.uninstall_cache()

    log.debug(
        "%d out of %d service.instances failed to deploy."
        % (failed, len(args.service_instance_list))
    )

    sys.exit(1 if failed else 0)
def cleanup_apps(soa_dir, kill_threshold=0.5, force=False):
    """Clean up old or invalid jobs/apps from marathon. Retrieves
    both a list of apps currently in marathon and a list of valid
    app ids in order to determine what to kill.

    Apps whose id cannot be parsed by ``deformat_job_id`` are skipped with a
    warning and are never considered for deletion.

    :param soa_dir: The SOA config directory to read from
    :param kill_threshold: The decimal fraction of apps we think is
        sane to kill when this job runs.
    :param force: Force the cleanup if we are above the kill_threshold
    :raises DontKillEverythingError: if the fraction of apps to kill exceeds
        ``kill_threshold`` and ``force`` is not set
    """
    log.info("Loading marathon configuration")
    system_paasta_config = load_system_paasta_config()
    log.info("Connecting to marathon")
    clients = marathon_tools.get_marathon_clients(
        marathon_tools.get_marathon_servers(system_paasta_config))

    valid_services = get_services_for_cluster(instance_type='marathon', soa_dir=soa_dir)
    all_apps_with_clients = marathon_tools.get_marathon_apps_with_clients(
        clients.get_all_clients())

    app_ids_with_clients = []
    for (app, client) in all_apps_with_clients:
        try:
            app_id = marathon_tools.deformat_job_id(app.id.lstrip('/'))
        except InvalidJobNameError:
            # Logger.warn is a deprecated alias; use Logger.warning.
            log.warning(
                "%s doesn't conform to paasta naming conventions? Skipping."
                % app.id)
            continue
        app_ids_with_clients.append((app_id, client))

    # An app is killed when its (service, instance) pair is no longer valid.
    apps_to_kill = [
        ((service, instance, git_sha, config_sha), client)
        for (service, instance, git_sha, config_sha), client in app_ids_with_clients
        if (service, instance) not in valid_services
    ]

    log.debug("Running apps: %s" % app_ids_with_clients)
    log.debug("Valid apps: %s" % valid_services)
    log.debug("Terminating: %s" % apps_to_kill)

    # Guarding on a non-empty list avoids dividing by zero below.
    if app_ids_with_clients:
        above_kill_threshold = float(len(apps_to_kill)) / float(
            len(app_ids_with_clients)) > float(kill_threshold)
        if above_kill_threshold and not force:
            log.critical(
                "Paasta was about to kill more than %s of the running services, this "
                "is probably a BAD mistake!, run again with --force if you "
                "really need to destroy everything" % kill_threshold,
            )
            raise DontKillEverythingError

    for id_tuple, client in apps_to_kill:
        app_id = marathon_tools.format_job_id(*id_tuple)
        delete_app(
            app_id=app_id,
            client=client,
            soa_dir=soa_dir,
        )
def get_mesos_tasks_and_slaves(
    system_paasta_config: SystemPaastaConfig,
) -> Tuple[Sequence[MarathonTask], List[Any]]:
    """Collect the tasks of every Marathon client plus the Mesos slaves.

    :param system_paasta_config: System config used to locate Marathon
    :returns: A ``(all_tasks, mesos_slaves)`` pair; tasks are listed in the
        order the clients are returned by ``get_all_clients()``
    """
    marathon_servers = get_marathon_servers(system_paasta_config)
    all_clients: Sequence[MarathonClient] = get_marathon_clients(
        marathon_servers
    ).get_all_clients()
    all_tasks: List[MarathonTask] = [
        task for client in all_clients for task in client.list_tasks()
    ]
    mesos_slaves = a_sync.block(get_slaves)
    return all_tasks, mesos_slaves
def paasta_sysdig(args):
    """Print (or remotely run) the sysdig command for a service instance.

    Without ``args.local`` the same paasta invocation is re-run with
    ``--local`` on a Mesos master over ssh; its output is split on the first
    ``:`` into a slave and a command, and that command is then run on the
    slave over ssh.  With ``args.local`` the command is built here from the
    instance status and Marathon/Mesos connection details and printed.
    """
    system_paasta_config = load_system_paasta_config()

    if not args.local:
        mesos_master = get_any_mesos_master(
            cluster=args.cluster,
            system_paasta_config=system_paasta_config,
        )
        remote_template = (
            'ssh -At -o StrictHostKeyChecking=no -o LogLevel=QUIET {0} '
            '"sudo paasta {1} --local"'
        )
        ssh_cmd = remote_template.format(mesos_master, ' '.join(sys.argv[1:]))
        return_code, output = _run(ssh_cmd)
        if return_code != 0:
            paasta_print(output)
            sys.exit(return_code)
        slave, command = output.split(':', 1)
        slave_cmd = "ssh -tA {} '{}'".format(slave, command.strip())
        subprocess.call(shlex.split(slave_cmd))
        return

    status = get_status_for_instance(
        cluster=args.cluster,
        service=args.service,
        instance=args.instance,
    )
    slave = pick_slave_from_status(status=status, host=args.host)
    job_config = load_marathon_service_config(
        service=args.service,
        instance=args.instance,
        cluster=args.cluster,
    )
    marathon_clients = get_marathon_clients(
        get_marathon_servers(system_paasta_config)
    )

    # Unfortunately, sysdig seems to only be able to take one marathon URL, so hopefully the service in question is not
    # currently moving between shards.
    client = marathon_clients.get_current_client_for_service(job_config=job_config)
    marathon_url = client.servers[0]
    marathon_user, marathon_pass = client.auth
    mesos_url = get_mesos_master().host

    # Embed the credentials into the network location of the marathon URL.
    marathon_parsed_url = urlparse(marathon_url)
    netloc_with_creds = "{}:{}@{}".format(
        marathon_user,
        marathon_pass,
        marathon_parsed_url.netloc,
    )
    marathon_creds_url = marathon_parsed_url._replace(netloc=netloc_with_creds)

    mesos_command = format_mesos_command(
        slave,
        status.marathon.app_id,
        mesos_url,
        marathon_creds_url.geturl(),
    )
    paasta_print(mesos_command)
def setup_paasta_api():
    """Initialise the module-level ``settings`` used by the PaaSTA API.

    Configures root logging (DEBUG when ``PAASTA_API_DEBUG`` is set to a
    non-empty value, WARNING otherwise), disables the
    ``service_configuration_lib`` YAML cache and fills in ``settings``:

    - ``system_paasta_config`` from ``load_system_paasta_config()``
    - ``cluster`` from the ``PAASTA_API_CLUSTER`` environment variable when
      it is set to a non-empty value, else from the system config
    - ``marathon_servers`` / ``marathon_clients``
    - ``kubernetes_client``, or ``None`` when ``KubeClient()`` raises

    Finally installs a ``requests_cache`` cache named ``paasta-api`` using
    the ``memory`` backend with ``expire_after=5``.
    """
    if os.environ.get("PAASTA_API_DEBUG"):
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    # pyinotify is a better solution than turning off file caching completely
    service_configuration_lib.disable_yaml_cache()

    settings.system_paasta_config = load_system_paasta_config()
    if os.environ.get("PAASTA_API_CLUSTER"):
        settings.cluster = os.environ.get("PAASTA_API_CLUSTER")
    else:
        settings.cluster = settings.system_paasta_config.get_cluster()

    # Resolve the servers once and build the clients once from them.  An
    # earlier revision built a first set of clients that was immediately
    # overwritten by the assignment below.
    settings.marathon_servers = marathon_tools.get_marathon_servers(
        system_paasta_config=settings.system_paasta_config,
    )
    settings.marathon_clients = marathon_tools.get_marathon_clients(
        marathon_servers=settings.marathon_servers,
        cached=False,
    )

    # Kubernetes is optional: any failure to build a client leaves the
    # setting as None instead of aborting API start-up.
    try:
        settings.kubernetes_client = kubernetes_tools.KubeClient()
    except FileNotFoundError:
        log.info('Kubernetes not found')
        settings.kubernetes_client = None
    except Exception:
        log.exception('Error while initializing KubeClient')
        settings.kubernetes_client = None

    # Set up transparent cache for http API calls. With expire_after, responses
    # are removed only when the same request is made. Expired storage is not a
    # concern here. Thus remove_expired_responses is not needed.
    requests_cache.install_cache("paasta-api", backend="memory", expire_after=5)
def create_marathon_dashboard(
    cluster: str,
    soa_dir: str = DEFAULT_SOA_DIR,
    marathon_clients: MarathonClients = None,
    system_paasta_config: SystemPaastaConfig = None,
) -> Marathon_Dashboard:
    """Build the per-cluster list of marathon service instances and shards.

    For every marathon service instance in ``cluster`` the result contains an
    item with its ``service``, ``instance`` and ``shard_url``.  The shard URL
    is the first server of the instance's current Marathon client, replaced
    by a ``Marathon RO`` dashboard link for the cluster when one is
    configured.  When the cluster has no dashboard links entry, or no link
    exists for the matching shard, the client's own URL is kept.

    :param cluster: Name of the cluster to build the dashboard for
    :param soa_dir: The SOA config directory to read from
    :param marathon_clients: Clients to use; built from the system config
        when ``None``
    :param system_paasta_config: System config to use; loaded when ``None``
    :returns: ``{cluster: [item, ...]}``; the list is empty when the service
        lookup raises ``FileNotFoundError``
    """
    try:
        instances: List = get_services_for_cluster(
            cluster=cluster,
            instance_type='marathon',
            soa_dir=soa_dir,
        )
    except FileNotFoundError:
        instances = []
    dashboard: Marathon_Dashboard = {cluster: []}
    if system_paasta_config is None:
        system_paasta_config = load_system_paasta_config()
    marathon_servers = get_marathon_servers(
        system_paasta_config=system_paasta_config)
    if marathon_clients is None:
        marathon_clients = get_marathon_clients(
            marathon_servers=marathon_servers, cached=False)

    # The dashboard links are only needed when there is something to list.
    if not instances:
        return dashboard

    # Loop-invariant: read the links once.  Using .get() keeps a cluster
    # without a dashboard links entry from raising KeyError.
    dashboard_links: Dict = system_paasta_config.get_dashboard_links()
    marathon_links = dashboard_links.get(cluster, {}).get('Marathon RO')

    for service_instance in instances:
        service: str = service_instance[0]
        instance: str = service_instance[1]
        service_config: MarathonServiceConfig = load_marathon_service_config(
            service=service,
            instance=instance,
            cluster=cluster,
            load_deployments=False,
            soa_dir=soa_dir,
        )
        client: MarathonClient = marathon_clients.get_current_client_for_service(
            job_config=service_config)
        client_url: str = client.servers[0]
        shard_url: str = client_url
        if isinstance(marathon_links, list):
            # Links are matched to shards by position; zip() stops at the
            # shorter sequence so a missing link cannot raise IndexError.
            for shard, marathon_link in zip(marathon_servers.current, marathon_links):
                if shard.url[0] == client_url:
                    shard_url = marathon_link
                    break
        elif isinstance(marathon_links, str):
            shard_url = marathon_links.split(' ')[0]
        service_info: Marathon_Dashboard_Item = {
            'service': service,
            'instance': instance,
            'shard_url': shard_url,
        }
        dashboard[cluster].append(service_info)
    return dashboard
def test_list_instances():
    """marathon_dashboard returns an empty list for the configured cluster.

    Builds a system config with three marathon servers and matching
    ``Marathon RO`` links for ``testcluster``, while ``settings.cluster`` is
    ``fake_cluster``.
    """
    settings.cluster = 'fake_cluster'

    fake_servers = [
        {
            "user": "******",
            "password": "******",
            "url": [
                "http://marathon:8080",
            ],
        },
        {
            "user": "******",
            "password": "******",
            "url": [
                "http://marathon1:8080",
            ],
        },
        {
            "user": "******",
            "password": "******",
            "url": [
                "http://marathon2:8080",
            ],
        },
    ]
    fake_links = {
        "testcluster": {
            "Marathon RO": [
                "http://accessible-marathon",
                "http://accessible-marathon1",
                "http://accessible-marathon2",
            ],
        },
    }
    system_paasta_config_dict = {
        "marathon_servers": fake_servers,
        "dashboard_links": fake_links,
    }
    system_paasta_config = SystemPaastaConfig(
        config=system_paasta_config_dict,
        directory='unused',
    )

    settings.marathon_clients = marathon_tools.get_marathon_clients(
        marathon_servers=marathon_tools.get_marathon_servers(system_paasta_config),
        cached=False,
    )
    request = testing.DummyRequest()
    settings.system_paasta_config = system_paasta_config

    response = marathon_dashboard(request)

    assert response == {settings.cluster: []}
def main():
    """Print one marathon job id per line, then exit 0.

    With ``--minimal`` only the service instances that need bouncing are
    listed; otherwise every marathon service instance of the cluster is.
    """
    args = parse_args()
    soa_dir = args.soa_dir
    cluster = args.cluster

    if args.minimal:
        marathon_clients = get_marathon_clients(
            get_marathon_servers(load_system_paasta_config())
        )
        service_instances = get_service_instances_that_need_bouncing(
            marathon_clients=marathon_clients,
            soa_dir=soa_dir,
        )
    else:
        cluster_instances = get_services_for_cluster(
            cluster=cluster,
            instance_type="marathon",
            soa_dir=soa_dir,
        )
        service_instances = [
            compose_job_id(name, instance) for name, instance in cluster_instances
        ]

    print("\n".join(service_instances))
    sys.exit(0)
def check_mesos_no_duplicate_frameworks():
    """Check the Mesos master state for an unexpected framework count.

    Prints ``OK: ...`` and exits 0 when ``assert_framework_count`` reports a
    healthy result; prints ``CRITICAL: ...`` and exits 2 when it does not or
    when the Mesos master state cannot be read.
    """
    master = get_mesos_master()
    try:
        state = master.state
    except MasterNotAvailableException as e:
        # Exceptions have no ``message`` attribute on Python 3 unless the
        # class defines one, so fall back to the exception arguments.
        message = getattr(e, "message", None)
        if message is None:
            message = "\n".join(str(arg) for arg in e.args)
        paasta_print("CRITICAL: %s" % message)
        sys.exit(2)

    system_paasta_config = load_system_paasta_config()
    marathon_servers = get_marathon_servers(system_paasta_config)
    marathon_clients = get_marathon_clients(marathon_servers)
    marathon_framework_ids = get_marathon_framework_ids(marathon_clients)
    result = assert_framework_count(
        state=state,
        marathon_framework_ids=marathon_framework_ids,
    )
    if result.healthy:
        paasta_print("OK: " + result.message)
        sys.exit(0)
    else:
        paasta_print("CRITICAL: %s" % result.message)
        sys.exit(2)
def main():
    """Check replication for every deployed marathon instance in the cluster.

    Instances without a docker image are treated as not deployed and are
    skipped with a debug log line.
    """
    args = parse_args()
    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.WARNING,
    )

    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()

    marathon_servers = marathon_tools.get_marathon_servers(system_paasta_config)
    all_clients = marathon_tools.get_marathon_clients(marathon_servers).get_all_clients()
    all_tasks = [task for client in all_clients for task in client.list_tasks()]

    mesos_slaves = a_sync.block(get_slaves)
    replication_checker = MesosSmartstackReplicationChecker(
        mesos_slaves,
        system_paasta_config,
    )

    for service in list_services(soa_dir=args.soa_dir):
        config_loader = PaastaServiceConfigLoader(
            service=service,
            soa_dir=args.soa_dir,
        )
        marathon_configs = config_loader.instance_configs(
            cluster=cluster,
            instance_type_class=marathon_tools.MarathonServiceConfig,
        )
        for instance_config in marathon_configs:
            if not instance_config.get_docker_image():
                log.debug(
                    '%s is not deployed. Skipping replication monitoring.'
                    % instance_config.job_id,
                )
                continue
            check_service_replication(
                instance_config=instance_config,
                all_tasks=all_tasks,
                smartstack_replication_checker=replication_checker,
            )
def main(argv: Optional[List[str]] = None) -> None:
    """Print Mesos, Marathon and Chronos health and exit with the result.

    Exits 2 when the Mesos master is not available, when Chronos is
    configured but its API raises ``ChronosAPIError``, or when the final
    ``healthy_exit`` flag is false; exits 0 otherwise.

    :param argv: Arguments handed to ``parse_args``
    """
    chronos_config = None
    args = parse_args(argv)

    system_paasta_config = load_system_paasta_config()

    master_kwargs = {}
    # we don't want to be passing False to not override a possible True
    # value from system config
    if args.use_mesos_cache:
        master_kwargs['use_mesos_cache'] = True
    master = get_mesos_master(**master_kwargs)

    marathon_servers = get_marathon_servers(system_paasta_config)
    marathon_clients = all_marathon_clients(
        get_marathon_clients(marathon_servers))

    try:
        mesos_state = a_sync.block(master.state)
        all_mesos_results = _run_mesos_checks(
            mesos_master=master,
            mesos_state=mesos_state,
        )
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        paasta_print(PaastaColors.red("CRITICAL: %s" % '\n'.join(e.args)))
        sys.exit(2)

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config, cached=True)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except (chronos.ChronosAPIError) as e:
            paasta_print(
                PaastaColors.red(
                    "CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        # No Chronos config: report a single healthy placeholder result.
        chronos_results = [
            metastatus_lib.HealthCheckResult(
                message='Chronos is not configured to run here',
                healthy=True,
            )
        ]

    marathon_results = _run_marathon_checks(marathon_clients)

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check(
        "Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check(
        "Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check(
        "Chronos", chronos_ok)

    healthy_exit = True if all([mesos_ok, marathon_ok, chronos_ok]) else False

    paasta_print(f"Master paasta_tools version: {__version__}")
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok,
                                                  all_mesos_results,
                                                  args.verbose)
    if args.verbose > 1:
        print_with_indent(
            'Resources Grouped by %s' % ", ".join(args.groupings), 2)
        # NOTE(review): this replaces the healthy_exit computed from the
        # three health checks above with the table's own value - confirm
        # that is intended.
        all_rows, healthy_exit = utilization_table_by_grouping_from_mesos_state(
            groupings=args.groupings,
            threshold=args.threshold,
            mesos_state=mesos_state,
        )
        for line in format_table(all_rows):
            print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("Autoscaling resources:", 2)
            # Header names are derived from the AutoscalingInfo field names.
            headers = [
                field.replace("_", " ").capitalize()
                for field in AutoscalingInfo._fields
            ]
            table = [headers] + [[
                str(x) for x in asi
            ] for asi in get_autoscaling_info_for_all_resources(mesos_state)]

            for line in format_table(table):
                print_with_indent(line, 4)

        if args.verbose >= 3:
            print_with_indent('Per Slave Utilization', 2)
            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            all_rows, _ = utilization_table_by_grouping_from_mesos_state(
                groupings=args.groupings + ["hostname"],
                threshold=args.threshold,
                mesos_state=mesos_state,
            )
            # The last column from utilization_table_by_grouping_from_mesos_state is "Agent count", which will always be
            # 1 for per-slave resources, so delete it.
            for row in all_rows:
                row.pop()

            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(marathon_summary,
                                                  marathon_ok,
                                                  marathon_results,
                                                  args.verbose)
    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok,
                                                  chronos_results,
                                                  args.verbose)

    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)
def setup_marathon_clients():
    """Build the Marathon clients from a freshly set up system config.

    :returns: A ``(clients, marathon_servers, system_paasta_config)`` tuple
    """
    paasta_config = setup_system_paasta_config()
    servers = marathon_tools.get_marathon_servers(paasta_config)
    return (marathon_tools.get_marathon_clients(servers), servers, paasta_config)
def create_marathon_dashboard(
    cluster: str,
    soa_dir: str=DEFAULT_SOA_DIR,
    marathon_clients: MarathonClients=None,
    system_paasta_config: SystemPaastaConfig=None,
) -> Marathon_Dashboard:
    """Build the per-cluster list of marathon service instances and shards.

    Each item carries the ``service``, the ``instance`` and a ``shard_url``.
    When the cluster's ``Marathon RO`` dashboard link is a single string,
    its first space-separated token is used as the shard URL of every
    instance.  When it is a list, links are matched by position to
    ``marathon_servers.current`` and used to translate the first server URL
    of the instance's current Marathon client; URLs without a matching link
    are returned unchanged.

    :param cluster: Name of the cluster to build the dashboard for
    :param soa_dir: The SOA config directory to read from
    :param marathon_clients: Clients to use; built from the system config
        when ``None``
    :param system_paasta_config: System config to use; loaded when ``None``
    :returns: ``{cluster: [item, ...]}``; the list is empty when the service
        lookup raises ``FileNotFoundError``
    """
    try:
        instances: List = get_services_for_cluster(
            cluster=cluster,
            instance_type='marathon',
            soa_dir=soa_dir,
        )
    except FileNotFoundError:
        instances = []
    dashboard: Marathon_Dashboard = {cluster: []}
    if system_paasta_config is None:
        system_paasta_config = load_system_paasta_config()
    marathon_servers = get_marathon_servers(system_paasta_config=system_paasta_config)
    if marathon_clients is None:
        marathon_clients = get_marathon_clients(marathon_servers=marathon_servers, cached=False)

    dashboard_links: Dict = system_paasta_config.get_dashboard_links()
    marathon_links = dashboard_links.get(cluster, {}).get('Marathon RO')

    # e.g. 'http://10.64.97.75:5052': 'http://marathon-norcal-prod.yelpcorp.com'
    shard_url_to_marathon_link_dict: Dict[str, str] = {}
    if isinstance(marathon_links, list):
        # Sanity check and log error if necessary
        if len(marathon_links) != len(marathon_servers.current):
            log.error('len(marathon_links) != len(marathon_servers.current). This may be a cause of concern')
        # zip() stops at the shorter sequence, so the mismatch logged above
        # cannot turn into an IndexError here.
        for shard, marathon_link in zip(marathon_servers.current, marathon_links):
            shard_url_to_marathon_link_dict[shard.url[0]] = marathon_link
    elif isinstance(marathon_links, str):
        # In this case, the shard url will be the same for every service instance
        static_shard_url = marathon_links.split(' ')[0]
        return {cluster: [{'service': si[0], 'instance': si[1], 'shard_url': static_shard_url} for si in instances]}

    # Setup with service as key since will instantiate 1 PSCL per service
    service_instances_dict: Dict[str, Set[str]] = defaultdict(set)
    for si in instances:
        service_instances_dict[si[0]].add(si[1])

    for service, instance_set in service_instances_dict.items():
        pscl = PaastaServiceConfigLoader(
            service=service,
            soa_dir=soa_dir,
            load_deployments=False,
        )
        for marathon_service_config in pscl.instance_configs(cluster, MarathonServiceConfig):
            # Report the instance of the config being processed; the name
            # left over from the grouping loop above would be the last
            # instance in ``instances`` for every item.
            instance = marathon_service_config.get_instance()
            if instance in instance_set:
                client: MarathonClient = \
                    marathon_clients.get_current_client_for_service(job_config=marathon_service_config)
                ip_url: str = client.servers[0]
                # Convert to a marathon link if possible else default to the original IP address
                shard_url: str = shard_url_to_marathon_link_dict.get(ip_url, ip_url)
                service_info: Marathon_Dashboard_Item = {
                    'service': service,
                    'instance': instance,
                    'shard_url': shard_url,
                }
                dashboard[cluster].append(service_info)
    return dashboard
def print_output(argv: Optional[Sequence[str]] = None) -> None:
    """Print Mesos, Marathon and Kubernetes health for this cluster.

    Mesos/Marathon checks run only when ``is_mesos_available()`` is true and
    Kubernetes checks only when ``is_kubernetes_available()`` is true;
    otherwise a healthy placeholder result is reported for that system.

    :param argv: Arguments handed to ``parse_args``
    :raises FatalError: with code 2 when the Mesos master is not available
        or when the final ``healthy_exit`` flag is false
    """
    mesos_available = is_mesos_available()
    kube_available = is_kubernetes_available()

    args = parse_args(argv)

    system_paasta_config = load_system_paasta_config()

    if mesos_available:
        master_kwargs = {}
        # we don't want to be passing False to not override a possible True
        # value from system config
        if args.use_mesos_cache:
            master_kwargs["use_mesos_cache"] = True

        master = get_mesos_master(**master_kwargs)

        marathon_servers = get_marathon_servers(system_paasta_config)
        marathon_clients = all_marathon_clients(
            get_marathon_clients(marathon_servers))

        try:
            mesos_state = a_sync.block(master.state)
            all_mesos_results = _run_mesos_checks(mesos_master=master,
                                                  mesos_state=mesos_state)
        except MasterNotAvailableException as e:
            # if we can't connect to master at all,
            # then bomb out early
            paasta_print(PaastaColors.red("CRITICAL: %s" % "\n".join(e.args)))
            raise FatalError(2)

        marathon_results = _run_marathon_checks(marathon_clients)
    else:
        # Mesos is not available: report healthy placeholders for both
        # Marathon and Mesos.
        marathon_results = [
            metastatus_lib.HealthCheckResult(
                message="Marathon is not configured to run here",
                healthy=True)
        ]
        all_mesos_results = [
            metastatus_lib.HealthCheckResult(
                message="Mesos is not configured to run here", healthy=True)
        ]

    if kube_available:
        kube_client = KubeClient()
        kube_results = _run_kube_checks(kube_client)
    else:
        kube_results = [
            metastatus_lib.HealthCheckResult(
                message="Kubernetes is not configured to run here",
                healthy=True)
        ]

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    kube_ok = all(metastatus_lib.status_for_results(kube_results))

    mesos_summary = metastatus_lib.generate_summary_for_check(
        "Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check(
        "Marathon", marathon_ok)
    kube_summary = metastatus_lib.generate_summary_for_check(
        "Kubernetes", kube_ok)

    # NOTE(review): kube_ok is not part of this flag, so failing Kubernetes
    # checks alone do not make the run unhealthy - confirm that is intended.
    healthy_exit = True if all([mesos_ok, marathon_ok]) else False

    paasta_print(f"Master paasta_tools version: {__version__}")
    # NOTE(review): the leader is looked up even when mesos_available is
    # false - confirm get_mesos_leader() copes with that.
    paasta_print("Mesos leader: %s" % get_mesos_leader())
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok,
                                                  all_mesos_results,
                                                  args.verbose)
    if args.verbose > 1 and mesos_available:
        print_with_indent(
            "Resources Grouped by %s" % ", ".join(args.groupings), 2)
        # Replaces the healthy_exit computed from the health checks above.
        all_rows, healthy_exit = utilization_table_by_grouping_from_mesos_state(
            groupings=args.groupings,
            threshold=args.threshold,
            mesos_state=mesos_state)
        for line in format_table(all_rows):
            print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("Autoscaling resources:", 2)
            # Header names are derived from the AutoscalingInfo field names.
            headers = [
                field.replace("_", " ").capitalize()
                for field in AutoscalingInfo._fields
            ]
            table = [headers] + [[
                str(x) for x in asi
            ] for asi in get_autoscaling_info_for_all_resources(mesos_state)]

            for line in format_table(table):
                print_with_indent(line, 4)

        if args.verbose >= 3:
            print_with_indent("Per Slave Utilization", 2)
            cluster = system_paasta_config.get_cluster()
            service_instance_stats = get_service_instance_stats(
                args.service, args.instance, cluster)
            if service_instance_stats:
                print_with_indent(
                    "Service-Instance stats:" + str(service_instance_stats), 2)
            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            all_rows, _ = utilization_table_by_grouping_from_mesos_state(
                groupings=args.groupings + ["hostname"],
                threshold=args.threshold,
                mesos_state=mesos_state,
                service_instance_stats=service_instance_stats,
            )
            # The last column from utilization_table_by_grouping_from_mesos_state is "Agent count", which will always be
            # 1 for per-slave resources, so delete it.
            for row in all_rows:
                row.pop()

            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(marathon_summary,
                                                  marathon_ok,
                                                  marathon_results,
                                                  args.verbose)
    metastatus_lib.print_results_for_healthchecks(kube_summary, kube_ok,
                                                  kube_results, args.verbose)
    if args.verbose > 1 and kube_available:
        print_with_indent(
            "Resources Grouped by %s" % ", ".join(args.groupings), 2)
        # Replaces healthy_exit again, this time with the Kubernetes table's
        # value.
        all_rows, healthy_exit = utilization_table_by_grouping_from_kube(
            groupings=args.groupings,
            threshold=args.threshold,
            kube_client=kube_client)
        for line in format_table(all_rows):
            print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("No autoscaling resources for Kubernetes", 2)

        if args.verbose >= 3:
            print_with_indent("Per Node Utilization", 2)
            cluster = system_paasta_config.get_cluster()
            service_instance_stats = get_service_instance_stats(
                args.service, args.instance, cluster)
            if service_instance_stats:
                print_with_indent(
                    "Service-Instance stats:" + str(service_instance_stats), 2)
            # print info about nodes here. Note that we don't make
            # modifications to the healthy_exit variable here, because we don't
            # care about a single node having high usage.
            all_rows, _ = utilization_table_by_grouping_from_kube(
                groupings=args.groupings + ["hostname"],
                threshold=args.threshold,
                kube_client=kube_client,
                service_instance_stats=service_instance_stats,
            )
            # The last column from utilization_table_by_grouping_from_kube is "Agent count", which will always be
            # 1 for per-node resources, so delete it.
            for row in all_rows:
                row.pop()

            for line in format_table(all_rows):
                print_with_indent(line, 4)

    if not healthy_exit:
        raise FatalError(2)
def get_marathon_clients_from_config() -> MarathonClients:
    """Return Marathon clients built from the loaded system config."""
    servers = get_marathon_servers(load_system_paasta_config())
    return get_marathon_clients(servers)
def main(argv=None):
    """Print Mesos, Marathon and Chronos health and exit with the result.

    Exits 2 when the Mesos master is not available, when Chronos is
    configured but its API raises ``ChronosAPIError``, or when the final
    ``healthy_exit`` flag is false; exits 0 otherwise.

    :param argv: Arguments handed to ``parse_args``
    """
    chronos_config = None
    args = parse_args(argv)

    system_paasta_config = load_system_paasta_config()

    master_kwargs = {}
    # we don't want to be passing False to not override a possible True
    # value from system config
    if args.use_mesos_cache:
        master_kwargs['use_mesos_cache'] = True
    master = get_mesos_master(**master_kwargs)

    marathon_servers = get_marathon_servers(system_paasta_config)
    marathon_clients = all_marathon_clients(get_marathon_clients(marathon_servers))

    try:
        mesos_state = master.state
        all_mesos_results = _run_mesos_checks(
            mesos_master=master,
            mesos_state=mesos_state,
            marathon_clients=marathon_clients,
        )
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        # Exceptions have no ``message`` attribute on Python 3 unless the
        # class defines one, so fall back to the exception arguments.
        message = getattr(e, "message", None)
        if message is None:
            message = "\n".join(str(arg) for arg in e.args)
        paasta_print(PaastaColors.red("CRITICAL: %s" % message))
        sys.exit(2)

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config, cached=True)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except (chronos.ChronosAPIError) as e:
            paasta_print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        # No Chronos config: report a single healthy placeholder result.
        chronos_results = [metastatus_lib.HealthCheckResult(
            message='Chronos is not configured to run here',
            healthy=True,
        )]

    marathon_results = _run_marathon_checks(marathon_clients)

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = True if all([mesos_ok, marathon_ok, chronos_ok]) else False

    paasta_print("Master paasta_tools version: {}".format(__version__))
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok, all_mesos_results, args.verbose)
    if args.verbose > 1:
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            grouping_function = metastatus_lib.key_func_for_attribute(grouping)
            resource_info_dict = metastatus_lib.get_resource_utilization_by_grouping(
                grouping_function,
                mesos_state,
            )
            all_rows = [[
                grouping.capitalize(), 'CPU (used/total)', 'RAM (used/total)', 'Disk (used/total)',
                'GPU (used/total)', 'Agent count',
            ]]
            table_rows = []
            # The loop target reuses the name of the dict being iterated;
            # .items() is evaluated once before the first rebinding.
            for attribute_value, resource_info_dict in resource_info_dict.items():
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization,
                        args.threshold,
                    )
                    for utilization in resource_utilizations
                ]
                # NOTE(review): healthy_exit is overwritten on every row, so
                # only the last row processed decides the exit status -
                # confirm that is intended.
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize,
                ) + [str(resource_info_dict['slave_count'])])
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("Autoscaling resources:", 2)
            headers = [field.replace("_", " ").capitalize() for field in AutoscalingInfo._fields]
            # Builds [headers, row, row, ...] from the autoscaling info rows.
            table = functools.reduce(
                lambda x, y: x + [(y)],
                get_autoscaling_info_for_all_resources(mesos_state),
                [headers],
            )

            for line in format_table(table):
                print_with_indent(line, 4)

        if args.verbose >= 3:
            print_with_indent('Per Slave Utilization', 2)
            slave_resource_dict = metastatus_lib.get_resource_utilization_by_grouping(
                lambda slave: slave['hostname'],
                mesos_state,
            )
            all_rows = [['Hostname', 'CPU (used/total)', 'RAM (used//total)', 'Disk (used//total)', 'GPU (used/total)']]

            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            for attribute_value, resource_info_dict in slave_resource_dict.items():
                table_rows = []
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization,
                        args.threshold,
                    )
                    for utilization in resource_utilizations
                ]
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize,
                ))
                table_rows = sorted(table_rows, key=lambda x: x[0])
                all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)

    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)