def process_default(self, event):
    """Handle an inotify event on the public (system) paasta config.

    Reloads the system config when it changed on disk and, if the new config
    differs from the cached one, enqueues a rate-limited bounce for every
    marathon service instance whose config sha changed as a result.

    :param event: pyinotify event describing the filesystem change
    """
    self.log.debug(event)
    # New directories must be added to the inotify watch before filtering.
    self.watch_new_folder(event)
    # filter_event returns a falsy value for events we don't care about.
    event = self.filter_event(event)
    if event:
        self.log.debug("Public config changed on disk, loading new config")
        try:
            new_config = load_system_paasta_config()
        except ValueError:
            self.log.error("Couldn't load public config, the JSON is invalid!")
            return
        service_instances = []
        if new_config != self.public_config:
            self.log.info("Public config has changed, now checking if it affects any services config shas")
            self.public_config = new_config
            all_service_instances = get_services_for_cluster(cluster=self.public_config.get_cluster(), instance_type='marathon', soa_dir=DEFAULT_SOA_DIR)
            # Only instances whose computed app id changed need a bounce.
            service_instances = get_service_instances_with_changed_id(self.marathon_client, all_service_instances, self.public_config.get_cluster())
        if service_instances:
            self.log.info("Found config change affecting {} service instances, " "now doing a staggered bounce".format(len(service_instances)))
            # Stagger the bounce so a cluster-wide config change does not
            # restart everything at once.
            bounce_rate = self.public_config.get_deployd_big_bounce_rate()
            service_instances = rate_limit_instances(instances=service_instances, number_per_minute=bounce_rate, watcher_name=self.__class__.__name__)
        for service_instance in service_instances:
            self.filewatcher.inbox_q.put(service_instance)
def main() -> None:
    """Scale every autoscaled kubernetes instance in this cluster up to its max_instances.

    For each service with a kubernetes instance, any instance config that defines
    max_instances has its formatted application's replica count set to that maximum
    and the change is pushed to the cluster via the application wrapper.
    """
    system_paasta_config = load_system_paasta_config()
    kube_client = KubeClient()
    # Deduplicate to unique service names; instance configs are enumerated
    # per-service by PaastaServiceConfigLoader below.
    services = {
        service
        for service, instance in get_services_for_cluster(
            cluster=system_paasta_config.get_cluster(), instance_type="kubernetes")
    }
    for service in services:
        pscl = PaastaServiceConfigLoader(service=service, load_deployments=False)
        for instance_config in pscl.instance_configs(
            cluster=system_paasta_config.get_cluster(),
            instance_type_class=KubernetesDeploymentConfig,
        ):
            max_instances = instance_config.get_max_instances()
            # Only autoscaled instances (max_instances set) are touched.
            if max_instances is not None:
                formatted_application = instance_config.format_kubernetes_app()
                formatted_application.spec.replicas = max_instances
                wrapper = get_application_wrapper(formatted_application)
                wrapper.soa_config = instance_config
                print(f"Scaling up {service}.{instance_config.instance}")
                wrapper.update(kube_client)
def get_desired_marathon_configs(soa_dir):
    """Build desired marathon app dicts for every marathon instance in this cluster.

    :param soa_dir: the SOA configuration directory to read service configs from
    :returns: a tuple (formatted_marathon_configs, job_configs); both dicts are
        keyed by the marathon app id with its leading "/" stripped
    """
    cluster = load_system_paasta_config().get_cluster()
    instances = get_services_for_cluster(instance_type="marathon", cluster=cluster, soa_dir=soa_dir)
    job_configs = dict()
    formatted_marathon_configs = dict()
    for service, instance in instances:
        try:
            job_config = load_marathon_service_config(service=service, instance=instance, cluster=cluster, soa_dir=soa_dir)
            formatted_config = job_config.format_marathon_app_dict()
            formatted_marathon_configs[formatted_config["id"].lstrip(
                "/")] = formatted_config
            job_configs[formatted_config["id"].lstrip("/")] = job_config
        # Not ideal but we rely on a lot of user input to create the app dict
        # and we really can't afford to bail if just one app definition is malformed:
        # log the failure at debug level and keep processing the remaining apps.
        except Exception as errormsg:
            _log(
                service=service,
                line=str(errormsg),
                component="deploy",
                level="debug",
                cluster=cluster,
                instance=instance,
            )
    return formatted_marathon_configs, job_configs
def main():
    """Check smartstack replication for every marathon service instance in the cluster."""
    args = parse_args()
    # Verbosity flag only selects the root logging level.
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)
    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    service_instances = get_services_for_cluster(
        cluster=cluster,
        instance_type='marathon',
        soa_dir=args.soa_dir,
    )
    config = marathon_tools.load_marathon_config()
    client = marathon_tools.get_marathon_client(config.get_url(), config.get_username(), config.get_password())
    # Fetch the task list and slave inventory once up front so every
    # per-instance check reuses the same snapshot instead of re-querying.
    all_tasks = client.list_tasks()
    mesos_slaves = get_slaves()
    smartstack_replication_checker = SmartstackReplicationChecker(
        mesos_slaves, system_paasta_config)
    for service, instance in service_instances:
        check_service_replication(
            service=service,
            instance=instance,
            cluster=cluster,
            all_tasks=all_tasks,
            soa_dir=args.soa_dir,
            smartstack_replication_checker=smartstack_replication_checker,
        )
def get_paasta_native_jobs_for_cluster(cluster=None, soa_dir=DEFAULT_SOA_DIR):
    """A paasta_native-specific wrapper around utils.get_services_for_cluster

    :param cluster: The cluster to read the configuration for
    :param soa_dir: The SOA config directory to read from
    :returns: A list of tuples of (service, job_name)"""
    return get_services_for_cluster(
        cluster=cluster,
        instance_type='paasta_native',
        soa_dir=soa_dir,
    )
def test_get_services_for_cluster():
    """Verify get_services_for_cluster flattens per-directory instance lists.

    Fix: ``contextlib.nested`` was removed in Python 3; the three patches are
    now combined in a single multi-manager ``with`` statement, which behaves
    identically.
    """
    cluster = 'honey_bunches_of_oats'
    soa_dir = 'completely_wholesome'
    # instances.pop() serves dir2 first, then dir1, so the expected order is
    # the reverse of the list literal below.
    instances = [['this_is_testing', 'all_the_things'], ['my_nerf_broke']]
    expected = ['my_nerf_broke', 'this_is_testing', 'all_the_things']
    with mock.patch(
        'os.path.abspath', autospec=True, return_value='chex_mix',
    ) as abspath_patch, mock.patch(
        'os.listdir', autospec=True, return_value=['dir1', 'dir2'],
    ) as listdir_patch, mock.patch(
        'paasta_tools.utils.get_service_instance_list',
        side_effect=lambda a, b, c, d: instances.pop(),
    ) as get_instances_patch:
        actual = utils.get_services_for_cluster(cluster, soa_dir=soa_dir)
        assert expected == actual
        abspath_patch.assert_called_once_with(soa_dir)
        listdir_patch.assert_called_once_with('chex_mix')
        get_instances_patch.assert_any_call('dir1', cluster, None, soa_dir)
        get_instances_patch.assert_any_call('dir2', cluster, None, soa_dir)
        assert get_instances_patch.call_count == 2
def cleanup_apps(soa_dir):
    """Clean up old or invalid jobs/apps from marathon.

    Retrieves both a list of apps currently in marathon and a list of
    valid app ids in order to determine what to kill.

    :param soa_dir: The SOA config directory to read from
    """
    log.info("Loading marathon configuration")
    marathon_config = marathon_tools.load_marathon_config()
    log.info("Connecting to marathon")
    client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )
    valid_services = get_services_for_cluster(instance_type='marathon', soa_dir=soa_dir)
    running_app_ids = marathon_tools.list_all_marathon_app_ids(client)
    for app_id in running_app_ids:
        log.debug("Checking app id %s", app_id)
        try:
            service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
        except InvalidJobNameError:
            # Fix: log.warn is a deprecated alias of log.warning.
            log.warning(
                "%s doesn't conform to paasta naming conventions? Skipping." % app_id)
            continue
        # Any app whose (service, instance) is no longer configured gets deleted.
        if (service, instance) not in valid_services:
            delete_app(
                app_id=app_id,
                client=client,
                soa_dir=soa_dir,
            )
def get_desired_marathon_configs(soa_dir):
    """Build desired marathon app dicts for every marathon instance in this cluster.

    :param soa_dir: the SOA configuration directory to read service configs from
    :returns: dict of formatted marathon app dicts keyed by app id with the
        leading "/" stripped

    Fix: ``_log`` was handed the exception object itself as ``line``; pass
    ``str(errormsg)`` instead (consistent with the other callers of ``_log``
    in this codebase, which stringify the exception).
    """
    cluster = load_system_paasta_config().get_cluster()
    instances = get_services_for_cluster(
        instance_type='marathon',
        cluster=cluster,
        soa_dir=soa_dir,
    )
    marathon_configs = dict()
    for service, instance in instances:
        try:
            marathon_config = load_marathon_service_config(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=soa_dir,
            ).format_marathon_app_dict()
            marathon_configs[marathon_config['id'].lstrip(
                '/')] = marathon_config
        except NoSlavesAvailableError as errormsg:
            _log(
                service=service,
                line=str(errormsg),
                component='deploy',
                level='event',
                cluster=cluster,
                instance=instance,
            )
        # Instances without a deployment or docker image are simply skipped.
        except (NoDeploymentsAvailable, NoDockerImageError):
            pass
    return marathon_configs
def main():
    """Run replication checks for every marathon service instance in the cluster."""
    args = parse_args()
    soa_dir = args.soa_dir
    logging.basicConfig()
    # Verbosity flag only selects the module logger's level.
    if args.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.WARNING)
    cluster = load_system_paasta_config().get_cluster()
    service_instances = get_services_for_cluster(
        cluster=cluster, instance_type='marathon', soa_dir=args.soa_dir)
    config = marathon_tools.load_marathon_config()
    client = marathon_tools.get_marathon_client(config.get_url(),
                                                config.get_username(),
                                                config.get_password())
    for service, instance in service_instances:
        check_service_replication(
            client=client,
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
        )
def emit_metrics_for_type(instance_type):
    """Emit paasta.service.{cpus,mem,disk,instances} gauges for every instance of a type.

    :param instance_type: the paasta instance type to enumerate (e.g. 'marathon')
    """
    cluster = load_system_paasta_config().get_cluster()
    instances = get_services_for_cluster(
        cluster=cluster,
        instance_type=instance_type,
    )
    for service, instance in instances:
        service_instance_config = get_instance_config(
            service=service,
            instance=instance,
            cluster=cluster,
        )
        # Dimensions identify the metric series per service instance.
        dimensions = {
            'paasta_service': service_instance_config.service,
            'paasta_cluster': service_instance_config.cluster,
            'paasta_instance': service_instance_config.instance,
        }
        log.info(f"Emitting paasta.service.* with dimensions {dimensions}")
        gauge = yelp_meteorite.create_gauge('paasta.service.cpus', dimensions)
        gauge.set(service_instance_config.get_cpus())
        gauge = yelp_meteorite.create_gauge('paasta.service.mem', dimensions)
        gauge.set(service_instance_config.get_mem())
        gauge = yelp_meteorite.create_gauge('paasta.service.disk', dimensions)
        gauge.set(service_instance_config.get_disk())
        # Only emit an instance-count gauge for configs that expose
        # get_instances() and are not autoscaled (max_instances unset);
        # presumably autoscaled counts are reported elsewhere — TODO confirm.
        if hasattr(service_instance_config, 'get_instances'):
            if service_instance_config.get_max_instances() is None:
                gauge = yelp_meteorite.create_gauge('paasta.service.instances',
                                                    dimensions)
                gauge.set(service_instance_config.get_instances())
def emit_metrics_for_type(instance_type):
    """Emit paasta.service.{cpus,mem,disk,instances} gauges for every instance of a type.

    Variant that also tags each series with the instance's scheduling pool.

    :param instance_type: the paasta instance type to enumerate (e.g. 'kubernetes')
    """
    cluster = load_system_paasta_config().get_cluster()
    instances = get_services_for_cluster(cluster=cluster, instance_type=instance_type)
    for service, instance in instances:
        service_instance_config = get_instance_config(
            service=service, instance=instance, cluster=cluster
        )
        # Dimensions identify the metric series per service instance.
        dimensions = {
            "paasta_service": service_instance_config.service,
            "paasta_cluster": service_instance_config.cluster,
            "paasta_instance": service_instance_config.instance,
            "paasta_pool": service_instance_config.get_pool(),
        }
        log.info(f"Emitting paasta.service.* with dimensions {dimensions}")
        gauge = yelp_meteorite.create_gauge("paasta.service.cpus", dimensions)
        gauge.set(service_instance_config.get_cpus())
        gauge = yelp_meteorite.create_gauge("paasta.service.mem", dimensions)
        gauge.set(service_instance_config.get_mem())
        gauge = yelp_meteorite.create_gauge("paasta.service.disk", dimensions)
        gauge.set(service_instance_config.get_disk())
        # Only emit an instance-count gauge for configs that expose
        # get_instances() and are not autoscaled (max_instances unset);
        # presumably autoscaled counts are reported elsewhere — TODO confirm.
        if hasattr(service_instance_config, "get_instances"):
            if service_instance_config.get_max_instances() is None:
                gauge = yelp_meteorite.create_gauge(
                    "paasta.service.instances", dimensions
                )
                gauge.set(service_instance_config.get_instances())
def main():
    """Print marathon service instance ids, one per line, then exit 0.

    With --minimal, only instances that actually need bouncing are listed;
    otherwise every marathon instance configured for the cluster is printed.
    """
    args = parse_args()
    soa_dir = args.soa_dir
    cluster = args.cluster
    if args.minimal:
        marathon_config = load_marathon_config()
        marathon_client = get_marathon_client(
            url=marathon_config.get_url(),
            user=marathon_config.get_username(),
            passwd=marathon_config.get_password(),
        )
        service_instances = get_service_instances_that_need_bouncing(
            marathon_client=marathon_client,
            soa_dir=soa_dir,
        )
    else:
        service_instances = [
            compose_job_id(service, instance)
            for service, instance in get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            )
        ]
    paasta_print('\n'.join(service_instances))
    sys.exit(0)
def cleanup_apps(soa_dir):
    """Clean up old or invalid jobs/apps from marathon.

    Retrieves both a list of apps currently in marathon and a list of
    valid app ids in order to determine what to kill.

    :param soa_dir: The SOA config directory to read from
    """
    log.info("Loading marathon configuration")
    marathon_config = marathon_tools.load_marathon_config()
    log.info("Connecting to marathon")
    client = marathon_tools.get_marathon_client(marathon_config.get_url(),
                                                marathon_config.get_username(),
                                                marathon_config.get_password())
    valid_services = get_services_for_cluster(instance_type='marathon', soa_dir=soa_dir)
    running_app_ids = marathon_tools.list_all_marathon_app_ids(client)
    for app_id in running_app_ids:
        log.debug("Checking app id %s", app_id)
        try:
            service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
        except InvalidJobNameError:
            # Fix: log.warn is a deprecated alias of log.warning.
            log.warning("%s doesn't conform to paasta naming conventions? Skipping." % app_id)
            continue
        # Any app whose (service, instance) is no longer configured gets deleted.
        if (service, instance) not in valid_services:
            delete_app(
                app_id=app_id,
                client=client,
                soa_dir=soa_dir,
            )
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """Autoscale every eligible marathon instance in this cluster.

    Takes the cluster-wide autoscaling lock, collects configs that define
    max_instances and whose desired state is 'start', then runs the marathon
    autoscaler for each config whose decision policy is not 'bespoke'.

    :param soa_dir: SOA configuration directory to read service configs from

    Fix: the per-config exception handler previously did ``raise e`` and only
    then called ``write_to_log`` — the log call was unreachable dead code and
    the re-raise aborted the whole loop on the first failing instance. The
    sibling implementations log and continue; do the same here.
    """
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            services = get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            )
            configs = []
            for service, instance in services:
                service_config = load_marathon_service_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
                if service_config.get_max_instances(
                ) and service_config.get_desired_state() == 'start':
                    configs.append(service_config)
            if configs:
                marathon_config = load_marathon_config()
                all_marathon_tasks = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password(),
                ).list_tasks()
                all_mesos_tasks = get_running_tasks_from_active_frameworks(
                    '')  # empty string matches all app ids
                with ZookeeperPool():
                    for config in configs:
                        if config.get_autoscaling_params(
                        )['decision_policy'] != 'bespoke':
                            try:
                                job_id = format_job_id(config.service, config.instance)
                                # Only tasks belonging to this job that have
                                # health check results count as data points.
                                marathon_tasks = {
                                    task.id: task
                                    for task in all_marathon_tasks
                                    if job_id == get_short_job_id(task.id)
                                    and task.health_check_results
                                }
                                if not marathon_tasks:
                                    raise MetricsProviderNoDataError(
                                        "Couldn't find any healthy marathon tasks"
                                    )
                                mesos_tasks = [
                                    task for task in all_mesos_tasks
                                    if task['id'] in marathon_tasks
                                ]
                                autoscale_marathon_instance(
                                    config, list(marathon_tasks.values()),
                                    mesos_tasks)
                            except Exception as e:
                                # Log and keep going; one bad instance must not
                                # stop autoscaling for the rest.
                                write_to_log(config=config, line='Caught Exception %s' % e)
    except LockHeldException:
        # Another autoscaler run holds the lock; skip this cycle.
        pass
def get_chronos_jobs_for_cluster(cluster=None, soa_dir=DEFAULT_SOA_DIR):
    """A chronos-specific wrapper around utils.get_services_for_cluster

    :param cluster: The cluster to read the configuration for
    :param soa_dir: The SOA config directory to read from
    :returns: A list of tuples of (service, job_name)"""
    return get_services_for_cluster(
        cluster=cluster,
        instance_type='chronos',
        soa_dir=soa_dir,
    )
def get_configs_of_services_to_scale(cluster, soa_dir=DEFAULT_SOA_DIR):
    """Return the marathon service configs in ``cluster`` eligible for autoscaling.

    A config qualifies when it defines max_instances, its desired state is
    'start', and its decision policy is not 'bespoke'. Instances without a
    deployment yet are skipped with a debug log.
    """
    configs = []
    for service, instance in get_services_for_cluster(
        cluster=cluster,
        instance_type='marathon',
        soa_dir=soa_dir,
    ):
        try:
            service_config = load_marathon_service_config(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=soa_dir,
            )
        except NoDeploymentsAvailable:
            log.debug(
                "%s is not deployed yet, refusing to do autoscaling calculations for it"
                % compose_job_id(service, instance))
            continue
        if (
            service_config.get_max_instances()
            and service_config.get_desired_state() == 'start'
            and service_config.get_autoscaling_params()['decision_policy'] != 'bespoke'
        ):
            configs.append(service_config)
    return configs
def get_configs_of_services_to_scale(cluster, soa_dir=DEFAULT_SOA_DIR):
    """Return the marathon service configs in ``cluster`` eligible for autoscaling.

    A config qualifies when it defines max_instances, its desired state is
    'start', and its decision policy is not 'bespoke'.

    :param cluster: cluster name to enumerate instances for
    :param soa_dir: SOA configuration directory to read from
    :returns: list of marathon service config objects
    """
    services = get_services_for_cluster(
        cluster=cluster,
        instance_type='marathon',
        soa_dir=soa_dir,
    )
    configs = []
    for service, instance in services:
        try:
            service_config = load_marathon_service_config(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=soa_dir,
            )
        except NoDeploymentsAvailable:
            # Not deployed yet: nothing to scale, skip quietly.
            log.debug("%s is not deployed yet, refusing to do autoscaling calculations for it" %
                      compose_job_id(service, instance))
            continue
        if service_config.get_max_instances() and service_config.get_desired_state() == 'start' \
                and service_config.get_autoscaling_params()['decision_policy'] != 'bespoke':
            configs.append(service_config)
    return configs
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """Autoscale every eligible marathon instance in this cluster.

    Takes the cluster-wide autoscaling lock, collects configs that define
    max_instances and whose desired state is 'start', then runs the marathon
    autoscaler for each, logging (and continuing) on per-config failures.

    :param soa_dir: SOA configuration directory to read service configs from
    """
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            services = get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            )
            configs = []
            for service, instance in services:
                service_config = load_marathon_service_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
                if service_config.get_max_instances() and service_config.get_desired_state() == 'start':
                    configs.append(service_config)
            if configs:
                # Fetch the marathon and mesos task snapshots once and share
                # them across all configs.
                marathon_config = load_marathon_config()
                marathon_tasks = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password(),
                ).list_tasks()
                mesos_tasks = get_running_tasks_from_active_frameworks('')
                for config in configs:
                    try:
                        autoscale_marathon_instance(config, marathon_tasks, mesos_tasks)
                    except Exception as e:
                        # One bad instance must not stop the rest.
                        write_to_log(config=config, line='Caught Exception %s' % e, level='event')
    except LockHeldException:
        # Another autoscaler run holds the lock; skip this cycle.
        pass
def main():
    """Print every marathon service.instance job id in the cluster, one per line.

    Fix: the original used the Python 2 ``print`` statement, which is a syntax
    error under Python 3. A single-argument ``print(...)`` call behaves
    identically on both Python 2 and 3.
    """
    args = parse_args()
    instances = get_services_for_cluster(cluster=args.cluster,
                                         instance_type='marathon',
                                         soa_dir=args.soa_dir)
    composed = []
    for name, instance in instances:
        composed.append(compose_job_id(name, instance))
    print('\n'.join(composed))
    sys.exit(0)
def add_all_services(self):
    """Enqueue every marathon service instance of this cluster for processing,
    throttled at the configured deployd startup bounce rate."""
    all_instances = get_services_for_cluster(
        cluster=self.config.get_cluster(),
        instance_type='marathon',
        soa_dir=DEFAULT_SOA_DIR,
    )
    throttled_instances = rate_limit_instances(
        instances=all_instances,
        number_per_minute=self.config.get_deployd_startup_bounce_rate(),
        watcher_name='daemon_start',
    )
    for service_instance in throttled_instances:
        self.inbox_q.put(service_instance)
def cleanup_apps(soa_dir, kill_threshold=0.5, force=False):
    """Clean up old or invalid jobs/apps from marathon.

    Retrieves both a list of apps currently in marathon and a list of
    valid app ids in order to determine what to kill.

    :param soa_dir: The SOA config directory to read from
    :param kill_threshold: The decimal fraction of apps we think is sane to
        kill when this job runs.
    :param force: Force the cleanup if we are above the kill_threshold
    :raises DontKillEverythingError: when the kill fraction exceeds
        kill_threshold and force is not set
    """
    log.info("Loading marathon configuration")
    marathon_config = marathon_tools.load_marathon_config()
    log.info("Connecting to marathon")
    client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )
    valid_services = get_services_for_cluster(instance_type='marathon', soa_dir=soa_dir)
    running_app_ids = marathon_tools.list_all_marathon_app_ids(client)
    running_apps = []
    for app_id in running_app_ids:
        try:
            app_id = marathon_tools.deformat_job_id(app_id)
        except InvalidJobNameError:
            # Fix: log.warn is a deprecated alias of log.warning.
            log.warning(
                "%s doesn't conform to paasta naming conventions? Skipping." % app_id)
            continue
        running_apps.append(app_id)
    apps_to_kill = [(service, instance, git_sha, config_sha)
                    for service, instance, git_sha, config_sha in running_apps
                    if (service, instance) not in valid_services]
    log.debug("Running apps: %s" % running_apps)
    log.debug("Valid apps: %s" % valid_services)
    log.debug("Terminating: %s" % apps_to_kill)
    if running_apps:
        # Safety valve: refuse to delete a large fraction of the cluster
        # unless explicitly forced.
        above_kill_threshold = float(len(apps_to_kill)) / float(
            len(running_apps)) > float(kill_threshold)
        if above_kill_threshold and not force:
            log.critical(
                "Paasta was about to kill more than %s of the running services, this "
                "is probably a BAD mistake!, run again with --force if you "
                "really need to destroy everything" % kill_threshold,
            )
            raise DontKillEverythingError
    for running_app in apps_to_kill:
        app_id = marathon_tools.format_job_id(*running_app)
        delete_app(
            app_id=app_id,
            client=client,
            soa_dir=soa_dir,
        )
def validate_chronos(service_path):
    """Check that any chronos configurations are valid.

    Validates every chronos instance of the service in every cluster it is
    configured for: the instance's own validation plus checks that parent
    jobs exist and that a job does not depend on itself.

    :param service_path: path of the form <soa_dir>/<service>
    :returns: True when all instances validate, False otherwise
    """
    soa_dir, service = path_to_soa_dir_service(service_path)
    instance_type = 'chronos'
    chronos_spacer = paasta_tools.chronos_tools.INTERNAL_SPACER
    returncode = True
    # The TMP_JOB_IDENTIFIER prefix is reserved for temporary jobs, so a real
    # service may not start with it.
    if service.startswith(TMP_JOB_IDENTIFIER):
        paasta_print((
            "Services using scheduled tasks cannot be named %s, as it clashes with the "
            "identifier used for temporary jobs" % TMP_JOB_IDENTIFIER))
        return False
    for cluster in list_clusters(service, soa_dir, instance_type):
        services_in_cluster = get_services_for_cluster(cluster=cluster,
                                                       instance_type='chronos',
                                                       soa_dir=soa_dir)
        # All "service<spacer>instance" ids that a parent reference may point at.
        valid_services = {
            f"{name}{chronos_spacer}{instance}"
            for name, instance in services_in_cluster
        }
        for instance in list_all_instances_for_service(
            service=service,
            clusters=[cluster],
            instance_type=instance_type,
            soa_dir=soa_dir,
        ):
            cjc = load_chronos_job_config(service, instance, cluster, False, soa_dir)
            parents = cjc.get_parents() or []
            checks_passed, check_msgs = cjc.validate()
            for parent in parents:
                # Malformed parent ids are skipped here; presumably reported by
                # another validation — TODO confirm.
                if not check_parent_format(parent):
                    continue
                if f"{service}{chronos_spacer}{instance}" == parent:
                    checks_passed = False
                    check_msgs.append("Job %s cannot depend on itself" % parent)
                elif parent not in valid_services:
                    checks_passed = False
                    check_msgs.append("Parent job %s could not be found" % parent)
            # Remove duplicate check_msgs
            unique_check_msgs = list(set(check_msgs))
            if not checks_passed:
                paasta_print(
                    invalid_chronos_instance(cluster, instance,
                                             "\n ".join(unique_check_msgs)))
                returncode = False
            else:
                paasta_print(valid_chronos_instance(cluster, instance))
    return returncode
def main():
    """Print every kubernetes service.instance job id in the cluster, one per line."""
    args = parse_args()
    soa_dir = args.soa_dir
    cluster = args.cluster
    instances = get_services_for_cluster(
        cluster=cluster, instance_type="kubernetes", soa_dir=soa_dir
    )
    service_instances = [
        compose_job_id(service, instance) for service, instance in instances
    ]
    paasta_print("\n".join(service_instances))
    sys.exit(0)
def create_marathon_dashboard(
    cluster: str,
    soa_dir: str = DEFAULT_SOA_DIR,
    marathon_clients: MarathonClients = None,
    system_paasta_config: SystemPaastaConfig = None,
) -> Marathon_Dashboard:
    """Build the marathon dashboard payload for one cluster.

    For every marathon service instance, resolves which marathon shard serves
    it and maps that shard's internal URL to the public dashboard link from
    the system config's 'Marathon RO' dashboard links.

    NOTE(review): marathon_clients / system_paasta_config default to None but
    are not annotated Optional — consider Optional[...] annotations.

    :param cluster: cluster to build the dashboard for
    :param soa_dir: SOA configuration directory to read from
    :param marathon_clients: preconstructed clients; built from the system
        config when None
    :param system_paasta_config: loaded when None
    :returns: a dict mapping the cluster name to a list of per-instance items
    """
    try:
        instances: List = get_services_for_cluster(
            cluster=cluster,
            instance_type='marathon',
            soa_dir=soa_dir,
        )
    except FileNotFoundError:
        # No soaconfigs for this cluster: emit an empty dashboard.
        instances = []
    dashboard: Marathon_Dashboard = {cluster: []}
    if system_paasta_config is None:
        system_paasta_config = load_system_paasta_config()
    marathon_servers = get_marathon_servers(
        system_paasta_config=system_paasta_config)
    if marathon_clients is None:
        marathon_clients = get_marathon_clients(
            marathon_servers=marathon_servers, cached=False)
    for service_instance in instances:
        service: str = service_instance[0]
        instance: str = service_instance[1]
        service_config: MarathonServiceConfig = load_marathon_service_config(
            service=service,
            instance=instance,
            cluster=cluster,
            load_deployments=False,
            soa_dir=soa_dir,
        )
        client: MarathonClient = marathon_clients.get_current_client_for_service(
            job_config=service_config)
        # NOTE(review): assumes dashboard_links always has an entry for this
        # cluster; a missing key would raise KeyError — TODO confirm.
        dashboard_links: Dict = system_paasta_config.get_dashboard_links()
        shard_url: str = client.servers[0]
        if 'Marathon RO' in dashboard_links[cluster]:
            marathon_links = dashboard_links[cluster]['Marathon RO']
            # The links entry is either a list parallel to the shard list, or
            # a single space-separated string whose first token is used.
            if isinstance(marathon_links, list):
                for shard_number, shard in enumerate(marathon_servers.current):
                    if shard.url[0] == shard_url:
                        shard_url = marathon_links[shard_number]
            elif isinstance(marathon_links, str):
                shard_url = marathon_links.split(' ')[0]
        service_info: Marathon_Dashboard_Item = {
            'service': service,
            'instance': instance,
            'shard_url': shard_url,
        }
        dashboard[cluster].append(service_info)
    return dashboard
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """Autoscale every eligible marathon instance in this cluster.

    Takes the cluster-wide autoscaling lock, collects configs that define
    max_instances, have desired state 'start', and do not use a 'bespoke'
    decision policy, then runs the marathon autoscaler per config using a
    shared snapshot of marathon and mesos tasks.

    :param soa_dir: SOA configuration directory to read service configs from
    """
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            services = get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            )
            configs = []
            for service, instance in services:
                service_config = load_marathon_service_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
                if service_config.get_max_instances() and service_config.get_desired_state() == 'start' \
                        and service_config.get_autoscaling_params()['decision_policy'] != 'bespoke':
                    configs.append(service_config)
            if configs:
                marathon_config = load_marathon_config()
                marathon_client = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password())
                all_marathon_tasks = marathon_client.list_tasks()
                all_mesos_tasks = get_running_tasks_from_active_frameworks('')  # empty string matches all app ids
                with ZookeeperPool():
                    for config in configs:
                        try:
                            job_id = format_job_id(config.service, config.instance)
                            # Get a dict of healthy tasks, we assume tasks with no healthcheck defined
                            # are healthy. We assume tasks with no healthcheck results but a defined
                            # healthcheck to be unhealthy.
                            marathon_tasks = {task.id: task for task in all_marathon_tasks
                                              if job_id == get_short_job_id(task.id) and
                                              (is_task_healthy(task) or not
                                               marathon_client.get_app(task.app_id).health_checks)}
                            if not marathon_tasks:
                                raise MetricsProviderNoDataError("Couldn't find any healthy marathon tasks")
                            mesos_tasks = [task for task in all_mesos_tasks if task['id'] in marathon_tasks]
                            autoscale_marathon_instance(config, list(marathon_tasks.values()), mesos_tasks)
                        except Exception as e:
                            # Log and keep going; one bad instance must not
                            # stop autoscaling for the rest.
                            write_to_log(config=config, line='Caught Exception %s' % e)
    except LockHeldException:
        # Another autoscaler run holds the lock; skip this cycle.
        pass
def add_all_services(self):
    """Enqueue every marathon service instance of this cluster for a future
    bounce, throttled at the configured deployd startup bounce rate."""
    all_instances = get_services_for_cluster(
        cluster=self.config.get_cluster(),
        instance_type='marathon',
        soa_dir=DEFAULT_SOA_DIR,
    )
    throttled_instances = rate_limit_instances(
        instances=all_instances,
        cluster=self.config.get_cluster(),
        number_per_minute=self.config.get_deployd_startup_bounce_rate(),
        watcher_name='daemon_start',
        priority=99,
    )
    for service_instance in throttled_instances:
        self.instances_that_need_to_be_bounced_in_the_future.put(service_instance)
def main():
    """Print every kubernetes app name in the cluster, one per line.

    With --sanitise, names are emitted in kubernetes app-name form; otherwise
    as paasta job ids.
    """
    args = parse_args()
    soa_dir = args.soa_dir
    cluster = args.cluster
    instances = get_services_for_cluster(cluster=cluster,
                                         instance_type="kubernetes",
                                         soa_dir=soa_dir)
    if args.sanitise:
        service_instances = [
            kubernetes_tools.get_kubernetes_app_name(service, instance)
            for service, instance in instances
        ]
    else:
        service_instances = [
            compose_job_id(service, instance)
            for service, instance in instances
        ]
    print("\n".join(service_instances))
    sys.exit(0)
def process_default(self, event: pyinotify.Event) -> None:
    """Handle an inotify event on the public (system) paasta config.

    Reloads the system config when it changed on disk and, if it differs from
    the cached copy, enqueues a bounce for every marathon service instance
    whose computed config needs updating.

    :param event: pyinotify event describing the filesystem change
    """
    self.log.debug(event)
    # New directories must be added to the inotify watch before filtering.
    self.watch_new_folder(event)
    # filter_event returns a falsy value for events we don't care about.
    event = self.filter_event(event)
    if event:
        self.log.debug(
            "Public config changed on disk, loading new config.")
        try:
            new_config = load_system_paasta_config()
        except ValueError:
            self.log.error(
                "Couldn't load public config, the JSON is invalid!")
            return
        service_instance_configs: List[Tuple[str, str, MarathonServiceConfig, str]] = []
        if new_config != self.public_config:
            self.log.info(
                "Public config has changed, now checking if it affects any services config shas."
            )
            self.public_config = new_config
            all_service_instances = get_services_for_cluster(
                cluster=self.public_config.get_cluster(),
                instance_type="marathon",
                soa_dir=DEFAULT_SOA_DIR,
            )
            # Only instances whose marathon app would change need a bounce.
            service_instance_configs = get_service_instances_needing_update(
                self.marathon_clients,
                all_service_instances,
                self.public_config.get_cluster(),
            )
        if service_instance_configs:
            self.log.info(
                f"{len(service_instance_configs)} service instances affected. Doing a staggered bounce."
            )
            for service, instance, config, _ in service_instance_configs:
                self.filewatcher.instances_to_bounce.put(
                    ServiceInstance(
                        service=service,
                        instance=instance,
                        watcher=type(self).__name__,
                        # Deadline in the future staggers the bounce so a
                        # cluster-wide change does not restart everything at once.
                        bounce_by=time.time() + self.public_config.
                        get_deployd_big_bounce_deadline(),
                        wait_until=time.time(),
                        enqueue_time=time.time(),
                        bounce_start_time=time.time(),
                    ))
def validate_chronos(service_path):
    """Check that any chronos configurations are valid.

    Validates every chronos instance of the service in every cluster it is
    configured for: the instance's own validation plus checks that parent
    jobs exist and that a job does not depend on itself.

    :param service_path: path of the form <soa_dir>/<service>
    :returns: True when all instances validate, False otherwise

    Fix: the original used Python 2 ``print`` statements, which are syntax
    errors under Python 3; single-argument ``print(...)`` calls behave
    identically on both Python 2 and 3.
    """
    soa_dir, service = path_to_soa_dir_service(service_path)
    instance_type = 'chronos'
    chronos_spacer = paasta_tools.chronos_tools.INTERNAL_SPACER
    returncode = True
    for cluster in list_clusters(service, soa_dir, instance_type):
        services_in_cluster = get_services_for_cluster(cluster=cluster,
                                                       instance_type='chronos',
                                                       soa_dir=soa_dir)
        # All "service<spacer>instance" ids a parent reference may point at.
        valid_services = set([
            "%s%s%s" % (name, chronos_spacer, instance)
            for name, instance in services_in_cluster
        ])
        for instance in list_all_instances_for_service(
                service=service,
                clusters=[cluster],
                instance_type=instance_type,
                soa_dir=soa_dir):
            cjc = load_chronos_job_config(service, instance, cluster, False, soa_dir)
            parents = cjc.get_parents() or []
            checks_passed, check_msgs = cjc.validate()
            for parent in parents:
                if not check_parent_format(parent):
                    continue
                if "%s%s%s" % (service, chronos_spacer, instance) == parent:
                    checks_passed = False
                    check_msgs.append("Job %s cannot depend on itself" % parent)
                elif parent not in valid_services:
                    checks_passed = False
                    check_msgs.append("Parent job %s could not be found" % parent)
            # Remove duplicate check_msgs
            unique_check_msgs = list(set(check_msgs))
            if not checks_passed:
                print(invalid_chronos_instance(cluster, instance,
                                               "\n ".join(unique_check_msgs)))
                returncode = False
            else:
                print(valid_chronos_instance(cluster, instance))
    return returncode
def cleanup_apps(soa_dir, kill_threshold=0.5, force=False):
    """Clean up old or invalid jobs/apps from marathon.

    Retrieves both a list of apps currently in marathon and a list of
    valid app ids in order to determine what to kill.

    :param soa_dir: The SOA config directory to read from
    :param kill_threshold: The decimal fraction of apps we think is sane to
        kill when this job runs.
    :param force: Force the cleanup if we are above the kill_threshold
    :raises DontKillEverythingError: when the kill fraction exceeds
        kill_threshold and force is not set
    """
    log.info("Loading marathon configuration")
    marathon_config = marathon_tools.load_marathon_config()
    log.info("Connecting to marathon")
    client = marathon_tools.get_marathon_client(marathon_config.get_url(),
                                                marathon_config.get_username(),
                                                marathon_config.get_password())
    valid_services = get_services_for_cluster(instance_type='marathon', soa_dir=soa_dir)
    running_app_ids = marathon_tools.list_all_marathon_app_ids(client)
    running_apps = []
    for app_id in running_app_ids:
        try:
            app_id = marathon_tools.deformat_job_id(app_id)
        except InvalidJobNameError:
            # Fix: log.warn is a deprecated alias of log.warning.
            log.warning("%s doesn't conform to paasta naming conventions? Skipping." % app_id)
            continue
        running_apps.append(app_id)
    apps_to_kill = [(service, instance, git_sha, config_sha)
                    for service, instance, git_sha, config_sha in running_apps
                    if (service, instance) not in valid_services]
    log.debug("Running apps: %s" % running_apps)
    log.debug("Valid apps: %s" % valid_services)
    log.debug("Terminating: %s" % apps_to_kill)
    if running_apps:
        # Safety valve: refuse to delete a large fraction of the cluster
        # unless explicitly forced.
        above_kill_threshold = float(len(apps_to_kill)) / float(len(running_apps)) > float(kill_threshold)
        if above_kill_threshold and not force:
            log.critical("Paasta was about to kill more than %s of the running services, this "
                         "is probably a BAD mistake!, run again with --force if you "
                         "really need to destroy everything" % kill_threshold)
            raise DontKillEverythingError
    for running_app in apps_to_kill:
        app_id = marathon_tools.format_job_id(*running_app)
        delete_app(
            app_id=app_id,
            client=client,
            soa_dir=soa_dir,
        )
def main(sys_argv):
    """Send sensu events for recent OOM kills of services in this cluster.

    :param sys_argv: full argv list; sys_argv[1:] is handed to parse_args
    """
    args = parse_args(sys_argv[1:])
    cluster = load_system_paasta_config().get_cluster()
    # Map of (service, instance) -> recent OOM events.
    victims = latest_oom_events(cluster, args.superregion)
    for (service, instance) in get_services_for_cluster(cluster, soa_dir=args.soa_dir):
        try:
            instance_config = get_instance_config(
                service=service,
                instance=instance,
                cluster=cluster,
                load_deployments=False,
                soa_dir=args.soa_dir,
            )
            # An empty event list still triggers an (OK) sensu event.
            oom_events = victims.get((service, instance), [])
            send_sensu_event(instance_config, oom_events, args)
        except NotImplementedError:
            # When instance_type is not supported by get_instance_config
            pass
def test_get_services_for_cluster():
    """Verify get_services_for_cluster flattens per-directory instance lists.

    Fix: ``contextlib.nested`` was removed in Python 3; the three patches are
    now combined in a single multi-manager ``with`` statement, which behaves
    identically.
    """
    cluster = "honey_bunches_of_oats"
    soa_dir = "completely_wholesome"
    # instances.pop() serves dir2 first, then dir1, so the expected order is
    # the reverse of the list literal below.
    instances = [["this_is_testing", "all_the_things"], ["my_nerf_broke"]]
    expected = ["my_nerf_broke", "this_is_testing", "all_the_things"]
    with mock.patch(
        "os.path.abspath", autospec=True, return_value="chex_mix",
    ) as abspath_patch, mock.patch(
        "os.listdir", autospec=True, return_value=["dir1", "dir2"],
    ) as listdir_patch, mock.patch(
        "paasta_tools.utils.get_service_instance_list",
        side_effect=lambda a, b, c, d: instances.pop(),
    ) as get_instances_patch:
        actual = utils.get_services_for_cluster(cluster, soa_dir=soa_dir)
        assert expected == actual
        abspath_patch.assert_called_once_with(soa_dir)
        listdir_patch.assert_called_once_with("chex_mix")
        get_instances_patch.assert_any_call("dir1", cluster, None, soa_dir)
        get_instances_patch.assert_any_call("dir2", cluster, None, soa_dir)
        assert get_instances_patch.call_count == 2
def cleanup_unused_apps(soa_dir: str, kill_threshold: float = 0.5, force: bool = False) -> None:
    """Clean up old or invalid jobs/apps from kubernetes.

    Retrieves both a list of apps currently in kubernetes and a list of valid
    app ids in order to determine what to kill.

    :param soa_dir: The SOA config directory to read from
    :param kill_threshold: The decimal fraction of apps we think is sane to
        kill when this job runs.
    :param force: Force the cleanup if we are above the kill_threshold
    """
    log.info("Creating KubeClient")
    kube_client = KubeClient()
    log.info("Loading running Kubernetes apps")
    applications = list_namespaced_applications(kube_client, "paasta", APPLICATION_TYPES)
    log.info("Retrieving valid apps from yelpsoa_configs")
    valid_services = set(
        get_services_for_cluster(instance_type="kubernetes", soa_dir=soa_dir))
    log.info("Determining apps to be killed")
    applications_to_kill = []
    for app in applications:
        deployment = app.kube_deployment
        if (deployment.service, deployment.instance) not in valid_services:
            applications_to_kill.append(app)
    log.debug("Running apps: %s" % applications)
    log.debug("Valid apps: %s" % valid_services)
    log.debug("Terminating: %s" % applications_to_kill)
    if applications_to_kill:
        # Safety valve: refuse to delete a large fraction of the cluster
        # unless explicitly forced.
        kill_fraction = float(len(applications_to_kill)) / float(len(applications))
        if kill_fraction > float(kill_threshold) and not force:
            log.critical(
                "Paasta was about to kill more than %s of the running services, this "
                "is probably a BAD mistake!, run again with --force if you "
                "really need to destroy everything" % kill_threshold)
            raise DontKillEverythingError
        for app in applications_to_kill:
            with alert_state_change(app, soa_dir):
                app.deep_delete(kube_client)
def create_prometheus_adapter_config(paasta_cluster: str, soa_dir: Path) -> PrometheusAdapterConfig:
    """
    Given a paasta cluster and a soaconfigs directory, create the necessary Prometheus adapter
    config to autoscale services.
    Currently supports the following metrics providers:
        * uwsgi
    """
    rules: List[PrometheusAdapterRule] = []
    # get_services_for_cluster() returns (service, instance) tuples, which would
    # hand us duplicate service names since PaastaServiceConfigLoader has no way
    # to fetch a single instance's config by name. Deduplicate to service names
    # and let the loader enumerate the instances for each one below.
    unique_services = {
        service
        for service, _ in get_services_for_cluster(
            cluster=paasta_cluster,
            instance_type="kubernetes",
            soa_dir=str(soa_dir),
        )
    }

    for service in unique_services:
        loader = PaastaServiceConfigLoader(service=service, soa_dir=str(soa_dir))
        for deployment_config in loader.instance_configs(
            cluster=paasta_cluster,
            instance_type_class=KubernetesDeploymentConfig,
        ):
            autoscaling_params = deployment_config.get_autoscaling_params()
            new_rules = get_rules_for_service_instance(
                service_name=service,
                instance_name=deployment_config.instance,
                autoscaling_config=autoscaling_params,
                paasta_cluster=paasta_cluster,
            )
            rules.extend(new_rules)

    # Sorting makes two generated configmaps directly comparable; otherwise
    # we'd need order-independent comparisons, since iteration order above is
    # not deterministic.
    return {
        "rules": sorted(rules, key=lambda rule: rule["name"]["as"]),
    }
def process_default(self, event):
    """Handle a filesystem event: if the public config on disk changed, find
    the service instances whose config shas are affected and enqueue them for
    a rate-limited (staggered) bounce."""
    self.log.debug(event)
    self.watch_new_folder(event)
    event = self.filter_event(event)
    if not event:
        return

    self.log.debug("Public config changed on disk, loading new config.")
    try:
        fresh_config = load_system_paasta_config()
    except ValueError:
        self.log.error("Couldn't load public config, the JSON is invalid!")
        return

    affected_instances: List[Tuple[str, str]] = []
    if fresh_config != self.public_config:
        self.log.info(
            "Public config has changed, now checking if it affects any services config shas."
        )
        self.public_config = fresh_config
        candidates = get_services_for_cluster(
            cluster=self.public_config.get_cluster(),
            instance_type='marathon',
            soa_dir=DEFAULT_SOA_DIR,
        )
        affected_instances = get_service_instances_needing_update(
            self.marathon_clients,
            candidates,
            self.public_config.get_cluster(),
        )

    if not affected_instances:
        return

    self.log.info(
        f"{len(affected_instances)} service instances affected. Doing a staggered bounce."
    )
    bounce_rate = self.public_config.get_deployd_big_bounce_rate()
    rate_limited = rate_limit_instances(
        instances=affected_instances,
        cluster=self.public_config.get_cluster(),
        number_per_minute=bounce_rate,
        watcher_name=type(self).__name__,
        priority=99,
    )
    for service_instance in rate_limited:
        self.filewatcher.instances_that_need_to_be_bounced_in_the_future.put(
            service_instance
        )
def add_all_services(self) -> None:
    """Enqueue a bounce for every marathon service instance in the cluster.

    Used at daemon start-up: each instance is queued with watcher
    "daemon_start" and a bounce deadline taken from the deployd startup
    bounce deadline config.
    """
    instances = get_services_for_cluster(
        cluster=self.config.get_cluster(),
        instance_type="marathon",
        soa_dir=DEFAULT_SOA_DIR,
    )
    for service, instance in instances:
        # Read the clock once per instance so bounce_by / wait_until /
        # bounce_start_time / enqueue_time are mutually consistent instead of
        # drifting across four separate time.time() calls.
        now = time.time()
        self.instances_to_bounce.put(
            ServiceInstance(
                service=service,
                instance=instance,
                watcher="daemon_start",
                bounce_by=now + self.config.get_deployd_startup_bounce_deadline(),
                wait_until=now,
                failures=0,
                bounce_start_time=now,
                enqueue_time=now,
            )
        )
def main():
    """Print the job id of every marathon service instance in the cluster,
    one per line; with --minimal, only the instances that need bouncing."""
    args = parse_args()
    soa_dir = args.soa_dir
    cluster = args.cluster
    if args.minimal:
        system_paasta_config = load_system_paasta_config()
        marathon_servers = get_marathon_servers(system_paasta_config)
        marathon_clients = get_marathon_clients(marathon_servers)
        service_instances = get_service_instances_that_need_bouncing(
            marathon_clients=marathon_clients, soa_dir=soa_dir
        )
    else:
        service_instances = [
            compose_job_id(name, instance)
            for name, instance in get_services_for_cluster(
                cluster=cluster, instance_type="marathon", soa_dir=soa_dir
            )
        ]
    print("\n".join(service_instances))
    sys.exit(0)
def validate_chronos(service_path):
    """Check that any chronos configurations are valid.

    :param service_path: path to a service directory inside a soa-configs dir
    :returns: True if every chronos instance validates, False otherwise
    """
    soa_dir, service = path_to_soa_dir_service(service_path)
    instance_type = 'chronos'
    chronos_spacer = paasta_tools.chronos_tools.INTERNAL_SPACER
    returncode = True

    if service.startswith(TMP_JOB_IDENTIFIER):
        # print() with a single argument behaves identically under Python 2's
        # print statement and Python 3's print function; the bare print
        # statements used previously are a SyntaxError on Python 3.
        print(
            "Services using scheduled tasks cannot be named %s, as it clashes with the"
            " identifier used for temporary jobs" % TMP_JOB_IDENTIFIER
        )
        return False
    for cluster in list_clusters(service, soa_dir, instance_type):
        services_in_cluster = get_services_for_cluster(
            cluster=cluster, instance_type='chronos', soa_dir=soa_dir)
        # Fully-qualified job ids ("service<spacer>instance") that parent jobs
        # may legally reference.
        valid_services = {
            "%s%s%s" % (name, chronos_spacer, instance)
            for name, instance in services_in_cluster
        }
        for instance in list_all_instances_for_service(
                service=service, clusters=[cluster],
                instance_type=instance_type, soa_dir=soa_dir):
            cjc = load_chronos_job_config(service, instance, cluster, False, soa_dir)
            parents = cjc.get_parents() or []
            checks_passed, check_msgs = cjc.validate()

            for parent in parents:
                if not check_parent_format(parent):
                    continue
                if "%s%s%s" % (service, chronos_spacer, instance) == parent:
                    checks_passed = False
                    check_msgs.append("Job %s cannot depend on itself" % parent)
                elif parent not in valid_services:
                    checks_passed = False
                    check_msgs.append("Parent job %s could not be found" % parent)

            # Remove duplicate check_msgs
            unique_check_msgs = list(set(check_msgs))

            if not checks_passed:
                print(invalid_chronos_instance(cluster, instance, "\n ".join(unique_check_msgs)))
                returncode = False
            else:
                print(valid_chronos_instance(cluster, instance))
    return returncode
def main():
    """Print the job id of every marathon service instance in the cluster,
    one per line; with --minimal, only the instances that need bouncing."""
    args = parse_args()
    soa_dir = args.soa_dir
    cluster = args.cluster
    if args.minimal:
        marathon_config = load_marathon_config()
        marathon_client = get_marathon_client(
            url=marathon_config.get_url(),
            user=marathon_config.get_username(),
            passwd=marathon_config.get_password(),
        )
        service_instances = get_service_instances_that_need_bouncing(
            marathon_client=marathon_client, soa_dir=soa_dir)
    else:
        instances = get_services_for_cluster(
            cluster=cluster, instance_type='marathon', soa_dir=soa_dir)
        service_instances = []
        for name, instance in instances:
            service_instances.append(compose_job_id(name, instance))
    # print() with a single argument behaves identically under Python 2's
    # print statement and Python 3's print function; the bare print statement
    # used previously is a SyntaxError on Python 3.
    print('\n'.join(service_instances))
    sys.exit(0)
def get_desired_marathon_configs(soa_dir):
    """Build a mapping of marathon app id -> formatted marathon app dict for
    every marathon instance in the current cluster.

    Instances whose docker image has not been deployed yet are skipped.
    """
    cluster = load_system_paasta_config().get_cluster()
    marathon_configs = {}
    for service, instance in get_services_for_cluster(
        instance_type='marathon',
        cluster=cluster,
        soa_dir=soa_dir,
    ):
        try:
            app_dict = load_marathon_service_config(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=soa_dir,
            ).format_marathon_app_dict()
        except NoDockerImageError:
            # This service hasn't been deployed yet
            continue
        marathon_configs[app_dict['id'].lstrip('/')] = app_dict
    return marathon_configs
def validate_chronos(service_path):
    """Check that any chronos configurations are valid.

    :param service_path: path to a service directory inside a soa-configs dir
    :returns: True if every chronos instance validates, False otherwise
    """
    soa_dir, service = path_to_soa_dir_service(service_path)
    instance_type = "chronos"
    chronos_spacer = paasta_tools.chronos_tools.INTERNAL_SPACER
    returncode = True
    for cluster in list_clusters(service, soa_dir, instance_type):
        services_in_cluster = get_services_for_cluster(
            cluster=cluster, instance_type="chronos", soa_dir=soa_dir)
        # Fully-qualified job ids ("service<spacer>instance") that parent jobs
        # may legally reference.
        valid_services = {
            "%s%s%s" % (name, chronos_spacer, instance)
            for name, instance in services_in_cluster
        }
        for instance in list_all_instances_for_service(
            service=service, clusters=[cluster], instance_type=instance_type,
            soa_dir=soa_dir
        ):
            cjc = load_chronos_job_config(service, instance, cluster, False, soa_dir)
            parents = cjc.get_parents() or []
            checks_passed, check_msgs = cjc.validate()

            for parent in parents:
                if not check_parent_format(parent):
                    continue
                if "%s%s%s" % (service, chronos_spacer, instance) == parent:
                    checks_passed = False
                    check_msgs.append("Job %s cannot depend on itself" % parent)
                elif parent not in valid_services:
                    checks_passed = False
                    check_msgs.append("Parent job %s could not be found" % parent)

            # Remove duplicate check_msgs
            unique_check_msgs = list(set(check_msgs))

            if not checks_passed:
                # print() with a single argument behaves identically under
                # Python 2's print statement and Python 3's print function; the
                # bare print statements used previously are a SyntaxError on
                # Python 3.
                print(invalid_chronos_instance(cluster, instance, "\n ".join(unique_check_msgs)))
                returncode = False
            else:
                print(valid_chronos_instance(cluster, instance))
    return returncode