def main() -> None:
    system_paasta_config = load_system_paasta_config()
    kube_client = KubeClient()
    services = {
        service
        for service, instance in get_services_for_cluster(
            cluster=system_paasta_config.get_cluster(),
            instance_type="kubernetes",
        )
    }
    for service in services:
        pscl = PaastaServiceConfigLoader(service=service, load_deployments=False)
        for instance_config in pscl.instance_configs(
            cluster=system_paasta_config.get_cluster(),
            instance_type_class=KubernetesDeploymentConfig,
        ):
            max_instances = instance_config.get_max_instances()
            if max_instances is not None:
                formatted_application = instance_config.format_kubernetes_app()
                formatted_application.spec.replicas = max_instances
                wrapper = get_application_wrapper(formatted_application)
                wrapper.soa_config = instance_config
                print(f"Scaling up {service}.{instance_config.instance}")
                wrapper.update(kube_client)
def check_all_kubernetes_services_replication(soa_dir: str) -> None:
    kube_client = KubeClient()
    all_pods = get_all_pods(kube_client)
    all_nodes = get_all_nodes(kube_client)
    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    smartstack_replication_checker = KubeSmartstackReplicationChecker(
        nodes=all_nodes,
        system_paasta_config=system_paasta_config,
    )
    for service in list_services(soa_dir=soa_dir):
        service_config = PaastaServiceConfigLoader(service=service, soa_dir=soa_dir)
        for instance_config in service_config.instance_configs(
            cluster=cluster,
            instance_type_class=kubernetes_tools.KubernetesDeploymentConfig,
        ):
            if instance_config.get_docker_image():
                check_service_replication(
                    instance_config=instance_config,
                    all_pods=all_pods,
                    smartstack_replication_checker=smartstack_replication_checker,
                )
            else:
                log.debug(
                    '%s is not deployed. Skipping replication monitoring.'
                    % instance_config.job_id,
                )
def create_prometheus_adapter_config(paasta_cluster: str, soa_dir: Path) -> PrometheusAdapterConfig:
    """
    Given a paasta cluster and a soaconfigs directory, create the necessary
    Prometheus adapter config to autoscale services.

    Currently supports the following metrics providers:
        * uwsgi
    """
    rules: List[PrometheusAdapterRule] = []
    # get_services_for_cluster() returns a list of (service, instance) tuples, but this
    # is not great for us: if we were to iterate over that, we'd end up with duplicates
    # for every service, since PaastaServiceConfigLoader does not expose a way to get
    # configs for a single instance by name. Instead, we get the unique set of service
    # names and then let PaastaServiceConfigLoader iterate over instances for us later.
    services = {
        service_name
        for service_name, _ in get_services_for_cluster(
            cluster=paasta_cluster, instance_type="kubernetes", soa_dir=str(soa_dir)
        )
    }
    for service_name in services:
        config_loader = PaastaServiceConfigLoader(service=service_name, soa_dir=str(soa_dir))
        for instance_config in config_loader.instance_configs(
            cluster=paasta_cluster,
            instance_type_class=KubernetesDeploymentConfig,
        ):
            rules.extend(
                get_rules_for_service_instance(
                    service_name=service_name,
                    instance_name=instance_config.instance,
                    autoscaling_config=instance_config.get_autoscaling_params(),
                    paasta_cluster=paasta_cluster,
                )
            )

    return {
        # we sort our rules so that we can easily compare between two different
        # configmaps; otherwise we'd need to do fancy order-independent comparisons
        # between the two sets of rules later, since we're not iterating in a
        # deterministic way and can add rules in any arbitrary order
        "rules": sorted(rules, key=lambda rule: rule["name"]["as"]),
    }
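# A minimal usage sketch (an addition, not part of the original code): rendering the
# adapter config so it can be compared against, or written into, a ConfigMap. The
# yaml import, the cluster name, and the soaconfigs path are assumptions for
# illustration only.
def example_render_adapter_config() -> str:
    import yaml  # assumed available; PaaSTA configs are YAML-based

    config = create_prometheus_adapter_config(
        paasta_cluster="example-cluster",  # hypothetical cluster name
        soa_dir=Path("/nail/etc/services"),  # hypothetical soaconfigs checkout
    )
    # Because the "rules" list is sorted, this dump is stable across runs, which
    # is what makes a simple string comparison against the deployed config viable.
    return yaml.safe_dump(config)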
def get_configs_of_services_to_scale(
    cluster: str,
    soa_dir: str = DEFAULT_SOA_DIR,
    services: Optional[Sequence[str]] = None,
) -> Sequence[MarathonServiceConfig]:
    if not services:
        services = list_services(soa_dir=soa_dir)
    configs = []
    for service in services:
        service_config = PaastaServiceConfigLoader(service=service, soa_dir=soa_dir)
        for instance_config in service_config.instance_configs(
            cluster=cluster, instance_type_class=MarathonServiceConfig
        ):
            if (
                instance_config.get_max_instances()
                and instance_config.get_desired_state() == "start"
                and instance_config.get_autoscaling_params()["decision_policy"] != "bespoke"
            ):
                configs.append(instance_config)

    return configs
def check_services_replication(
    soa_dir: str,
    cluster: str,
    service_instances: Sequence[str],
    instance_type_class: Type[InstanceConfig_T],
    check_service_replication: CheckServiceReplication,
    replication_checker: ReplicationChecker,
    all_tasks_or_pods: Sequence[Union[MarathonTask, V1Pod]],
    dry_run: bool = False,
) -> Tuple[int, int]:
    service_instances_set = set(service_instances)
    replication_statuses: List[bool] = []

    for service in list_services(soa_dir=soa_dir):
        service_config = PaastaServiceConfigLoader(service=service, soa_dir=soa_dir)
        for instance_config in service_config.instance_configs(
            cluster=cluster, instance_type_class=instance_type_class
        ):
            if (
                service_instances_set
                and f"{service}{SPACER}{instance_config.instance}" not in service_instances_set
            ):
                continue
            if instance_config.get_docker_image():
                is_well_replicated = check_service_replication(
                    instance_config=instance_config,
                    all_tasks_or_pods=all_tasks_or_pods,
                    replication_checker=replication_checker,
                    dry_run=dry_run,
                )
                if is_well_replicated is not None:
                    replication_statuses.append(is_well_replicated)
            else:
                log.debug(
                    "%s is not deployed. Skipping replication monitoring."
                    % instance_config.job_id
                )

    num_under_replicated = len(
        [status for status in replication_statuses if status is False]
    )
    return num_under_replicated, len(replication_statuses)
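# Illustrative helper (an addition, not in the original module): the tuple returned
# by check_services_replication can be turned into the percentage of under-replicated
# instances, e.g. for comparison against an alerting threshold.
def example_under_replication_pct(num_under_replicated: int, total: int) -> float:
    # Guard against division by zero when no instances were checked.
    if total == 0:
        return 0.0
    return num_under_replicated / total * 100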
def main():
    args = parse_args()
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()

    clients = marathon_tools.get_marathon_clients(
        marathon_tools.get_marathon_servers(system_paasta_config)
    )
    all_clients = clients.get_all_clients()
    all_tasks = []
    for client in all_clients:
        all_tasks.extend(client.list_tasks())

    mesos_slaves = a_sync.block(get_slaves)
    smartstack_replication_checker = MesosSmartstackReplicationChecker(
        mesos_slaves, system_paasta_config
    )

    for service in list_services(soa_dir=args.soa_dir):
        service_config = PaastaServiceConfigLoader(service=service, soa_dir=args.soa_dir)
        for instance_config in service_config.instance_configs(
            cluster=cluster,
            instance_type_class=marathon_tools.MarathonServiceConfig,
        ):
            if instance_config.get_docker_image():
                check_service_replication(
                    instance_config=instance_config,
                    all_tasks=all_tasks,
                    smartstack_replication_checker=smartstack_replication_checker,
                )
            else:
                log.debug(
                    '%s is not deployed. Skipping replication monitoring.'
                    % instance_config.job_id,
                )
def check_all_kubernetes_based_services_replication(
    soa_dir: str,
    service_instances: Sequence[str],
    instance_type_class: Type[InstanceConfig_T],
    check_service_replication: CheckServiceReplication,
    namespace: str,
) -> None:
    kube_client = KubeClient()
    all_pods = get_all_pods(kube_client=kube_client, namespace=namespace)
    all_nodes = get_all_nodes(kube_client)
    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    smartstack_replication_checker = KubeSmartstackReplicationChecker(
        nodes=all_nodes, system_paasta_config=system_paasta_config
    )
    service_instances_set = set(service_instances)

    for service in list_services(soa_dir=soa_dir):
        service_config = PaastaServiceConfigLoader(service=service, soa_dir=soa_dir)
        for instance_config in service_config.instance_configs(
            cluster=cluster, instance_type_class=instance_type_class
        ):
            if (
                service_instances_set
                and f"{service}{SPACER}{instance_config.instance}" not in service_instances_set
            ):
                continue
            if instance_config.get_docker_image():
                check_service_replication(
                    instance_config=instance_config,
                    all_pods=all_pods,
                    smartstack_replication_checker=smartstack_replication_checker,
                )
            else:
                log.debug(
                    "%s is not deployed. Skipping replication monitoring."
                    % instance_config.job_id
                )
def check_services_replication(
    soa_dir: str,
    cluster: str,
    service_instances: Sequence[str],
    instance_type_class: Type[InstanceConfig_T],
    check_service_replication: CheckServiceReplication,
    replication_checker: SmartstackReplicationChecker,
    all_tasks_or_pods: Sequence[Union[MarathonTask, V1Pod]],
) -> float:
    service_instances_set = set(service_instances)
    replication_statuses: List[bool] = []

    for service in list_services(soa_dir=soa_dir):
        service_config = PaastaServiceConfigLoader(service=service, soa_dir=soa_dir)
        for instance_config in service_config.instance_configs(
            cluster=cluster, instance_type_class=instance_type_class
        ):
            if (
                service_instances_set
                and f"{service}{SPACER}{instance_config.instance}" not in service_instances_set
            ):
                continue
            if instance_config.get_docker_image():
                is_well_replicated = check_service_replication(
                    instance_config=instance_config,
                    all_tasks_or_pods=all_tasks_or_pods,
                    smartstack_replication_checker=replication_checker,
                )
                if is_well_replicated is not None:
                    replication_statuses.append(is_well_replicated)
            else:
                log.debug(
                    "%s is not deployed. Skipping replication monitoring."
                    % instance_config.job_id
                )

    return calculate_pct_under_replicated(replication_statuses)
def create_marathon_dashboard(
    cluster: str,
    soa_dir: str = DEFAULT_SOA_DIR,
    marathon_clients: Optional[MarathonClients] = None,
    system_paasta_config: Optional[SystemPaastaConfig] = None,
) -> Marathon_Dashboard:
    try:
        instances: List = get_services_for_cluster(
            cluster=cluster,
            instance_type='marathon',
            soa_dir=soa_dir,
        )
    except FileNotFoundError:
        instances = []
    dashboard: Marathon_Dashboard = {cluster: []}
    if system_paasta_config is None:
        system_paasta_config = load_system_paasta_config()
    marathon_servers = get_marathon_servers(system_paasta_config=system_paasta_config)
    if marathon_clients is None:
        marathon_clients = get_marathon_clients(marathon_servers=marathon_servers, cached=False)
    dashboard_links: Dict = system_paasta_config.get_dashboard_links()
    marathon_links = dashboard_links.get(cluster, {}).get('Marathon RO')

    # e.g. 'http://10.64.97.75:5052': 'http://marathon-norcal-prod.yelpcorp.com'
    shard_url_to_marathon_link_dict: Dict[str, str] = {}
    if isinstance(marathon_links, list):
        # Sanity check and log an error if necessary
        if len(marathon_links) != len(marathon_servers.current):
            log.error('len(marathon_links) != len(marathon_servers.current). This may be a cause for concern')
        for shard_number, shard in enumerate(marathon_servers.current):
            shard_url_to_marathon_link_dict[shard.url[0]] = marathon_links[shard_number]
    elif isinstance(marathon_links, str):
        # In this case, the shard url will be the same for every service instance
        static_shard_url = marathon_links.split(' ')[0]
        return {
            cluster: [
                {'service': si[0], 'instance': si[1], 'shard_url': static_shard_url}
                for si in instances
            ],
        }

    # Key the dict on service, since we instantiate one PSCL per service
    service_instances_dict: Dict[str, Set[str]] = defaultdict(set)
    for si in instances:
        service, instance = si[0], si[1]
        service_instances_dict[service].add(instance)
    for service, instance_set in service_instances_dict.items():
        pscl = PaastaServiceConfigLoader(
            service=service,
            soa_dir=soa_dir,
            load_deployments=False,
        )
        for marathon_service_config in pscl.instance_configs(cluster, MarathonServiceConfig):
            if marathon_service_config.get_instance() in instance_set:
                client: MarathonClient = marathon_clients.get_current_client_for_service(
                    job_config=marathon_service_config,
                )
                ip_url: str = client.servers[0]
                # Convert to a Marathon link if possible, else default to the original IP address
                shard_url: str = shard_url_to_marathon_link_dict.get(ip_url, ip_url)
                service_info: Marathon_Dashboard_Item = {
                    'service': service,
                    'instance': marathon_service_config.get_instance(),
                    'shard_url': shard_url,
                }
                dashboard[cluster].append(service_info)
    return dashboard
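# Illustrative note (an addition, not from the original code): the returned
# Marathon_Dashboard maps the cluster name to a list of items with 'service',
# 'instance', and 'shard_url' keys. A hypothetical consumer might render it as:
def example_print_dashboard(dashboard: Marathon_Dashboard, cluster: str) -> None:
    for item in dashboard.get(cluster, []):
        print(f"{item['service']}.{item['instance']} -> {item['shard_url']}")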
def wait_for_deployment(service, deploy_group, git_sha, soa_dir, timeout):
    # Currently only 'marathon' instances are supported for wait_for_deployment,
    # because they are the only things worth waiting on.
    service_configs = PaastaServiceConfigLoader(
        service=service, soa_dir=soa_dir, load_deployments=False,
    )

    total_instances = 0
    clusters_data = []
    api_endpoints = load_system_paasta_config().get_api_endpoints()
    for cluster in service_configs.clusters:
        if cluster not in api_endpoints:
            paasta_print(
                PaastaColors.red(
                    'Cluster %s is NOT in paasta-api endpoints config.' % cluster,
                ),
            )
            raise NoSuchCluster

        instances_queue = Queue()
        for instance_config in service_configs.instance_configs(
            cluster=cluster,
            instance_type_class=MarathonServiceConfig,
        ):
            if instance_config.get_deploy_group() == deploy_group:
                instances_queue.put(instance_config)
                total_instances += 1
        if not instances_queue.empty():
            clusters_data.append(
                ClusterData(
                    cluster=cluster,
                    service=service,
                    git_sha=git_sha,
                    instances_queue=instances_queue,
                ),
            )

    if not clusters_data:
        _log(
            service=service,
            component='deploy',
            line=(
                "Couldn't find any marathon instances for service {} in deploy group {}. Exiting."
                .format(service, deploy_group)
            ),
            level='event',
        )
        return

    paasta_print(
        "Waiting for deployment of {} for '{}' to complete...".format(git_sha, deploy_group),
    )

    deadline = time.time() + timeout
    green_light = Event()
    green_light.set()

    with progressbar.ProgressBar(maxval=total_instances) as bar:
        while time.time() < deadline:
            _query_clusters(clusters_data, green_light)
            if not green_light.is_set():
                raise KeyboardInterrupt
            bar.update(
                total_instances - sum(c.instances_queue.qsize() for c in clusters_data),
            )
            if all(cluster.instances_queue.empty() for cluster in clusters_data):
                sys.stdout.flush()
                return 0
            else:
                time.sleep(min(60, timeout))
        sys.stdout.flush()

    _log(
        service=service,
        component='deploy',
        line=compose_timeout_message(clusters_data, timeout, deploy_group, service, git_sha),
        level='event',
    )
    raise TimeoutError
def main() -> None:
    args = parse_args()
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    instances = []
    return_codes = []
    command = args.command
    if args.service_instance:
        service_instance = args.service_instance
        service, instance, _, __ = decompose_job_id(service_instance)
        instances.append(instance)
    elif args.service and args.instances:
        service = args.service
        instances = args.instances.split(',')
    else:
        log.error("The name of the service or instance to inspect is missing. Exiting.")
        sys.exit(1)

    # Set up a transparent cache for HTTP API calls
    requests_cache.install_cache("paasta_serviceinit", backend="memory")

    cluster = load_system_paasta_config().get_cluster()
    actual_deployments = get_actual_deployments(service, args.soa_dir)
    clients = PaastaClients(cached=(command == 'status'))

    instance_types = ['marathon', 'chronos', 'paasta_native', 'adhoc']
    instance_types_map: Dict[str, List[str]] = {it: [] for it in instance_types}
    for instance in instances:
        try:
            instance_type = validate_service_instance(
                service, instance, cluster, args.soa_dir,
            )
        except Exception:
            log.error(
                'Exception raised while looking at service {} instance {}:'.format(
                    service, instance,
                ),
            )
            log.error(traceback.format_exc())
            return_codes.append(1)
            continue

        if instance_type not in instance_types:
            log.error(
                ("I calculated an instance_type of {} for {} which I don't "
                 "know how to handle.").format(
                    instance_type, compose_job_id(service, instance),
                ),
            )
            return_codes.append(1)
        else:
            instance_types_map[instance_type].append(instance)

    remote_run_frameworks = None
    if len(instance_types_map['adhoc']) > 0:
        remote_run_frameworks = paasta_remote_run.remote_run_frameworks()

    service_config_loader = PaastaServiceConfigLoader(service)

    for instance_type in instance_types:
        if instance_type == 'marathon':
            job_configs = {
                jc.instance: jc
                for jc in service_config_loader.instance_configs(
                    cluster=cluster,
                    instance_type_class=marathon_tools.MarathonServiceConfig,
                )
            }

        for instance in instance_types_map[instance_type]:
            try:
                version = get_deployment_version(actual_deployments, cluster, instance)
                paasta_print('instance: %s' % PaastaColors.blue(instance))
                paasta_print('Git sha: %s (desired)' % version)

                if instance_type == 'marathon':
                    return_code = marathon_serviceinit.perform_command(
                        command=command,
                        service=service,
                        instance=instance,
                        cluster=cluster,
                        verbose=args.verbose,
                        soa_dir=args.soa_dir,
                        app_id=args.app_id,
                        clients=clients.marathon(),
                        job_config=job_configs[instance],
                    )
                elif instance_type == 'chronos':
                    return_code = chronos_serviceinit.perform_command(
                        command=command,
                        service=service,
                        instance=instance,
                        cluster=cluster,
                        verbose=args.verbose,
                        soa_dir=args.soa_dir,
                        client=clients.chronos(),
                    )
                elif instance_type == 'paasta_native':
                    return_code = paasta_native_serviceinit.perform_command(
                        command=command,
                        service=service,
                        instance=instance,
                        cluster=cluster,
                        verbose=args.verbose,
                        soa_dir=args.soa_dir,
                    )
                elif instance_type == 'adhoc':
                    if command != 'status':
                        raise NotImplementedError
                    paasta_remote_run.remote_run_list_report(
                        service=service,
                        instance=instance,
                        cluster=cluster,
                        frameworks=remote_run_frameworks,
                    )
                    return_code = 0
            except Exception:
                log.error(
                    'Exception raised while looking at service {} instance {}:'.format(
                        service, instance,
                    ),
                )
                log.error(traceback.format_exc())
                return_code = 1

            return_codes.append(return_code)

    sys.exit(max(return_codes))
def sync_boto_secrets(
    kube_client: KubeClient,
    cluster: str,
    service: str,
    secret_provider_name: str,
    vault_cluster_config: Mapping[str, str],
    soa_dir: str,
    namespace: str,
) -> bool:
    # Update boto key secrets
    config_loader = PaastaServiceConfigLoader(service=service, soa_dir=soa_dir)
    for instance_config in config_loader.instance_configs(
        cluster=cluster, instance_type_class=KubernetesDeploymentConfig
    ):
        instance = instance_config.instance
        boto_keys = instance_config.config_dict.get("boto_keys", [])
        if not boto_keys:
            continue
        boto_keys.sort()
        secret_data = {}
        for key in boto_keys:
            for filetype in ["sh", "yaml", "json", "cfg"]:
                this_key = key + "." + filetype
                sanitised_key = this_key.replace(".", "-").replace("_", "--")
                try:
                    with open(f"/etc/boto_cfg_private/{this_key}") as f:
                        secret_data[sanitised_key] = base64.b64encode(
                            f.read().encode("utf-8")
                        ).decode("utf-8")
                except IOError:
                    log.warning(
                        f"Boto key {this_key} required for {service} could not be found."
                    )
        if not secret_data:
            continue

        # To avoid slamming the k8s API, add some artificial delay here
        time.sleep(0.3)

        app_name = get_kubernetes_app_name(service, instance)
        secret = limit_size_with_hash(f"paasta-boto-key-{app_name}")
        hashable_data = "".join([secret_data[key] for key in secret_data])
        signature = hashlib.sha1(hashable_data.encode("utf-8")).hexdigest()
        kubernetes_signature = get_kubernetes_secret_signature(
            kube_client=kube_client,
            secret=secret,
            service=service,
            namespace=namespace,
        )
        if not kubernetes_signature:
            log.info(f"{secret} for {service} in {namespace} not found, creating")
            try:
                create_plaintext_dict_secret(
                    kube_client=kube_client,
                    secret_name=secret,
                    secret_data=secret_data,
                    service=service,
                    namespace=namespace,
                )
            except ApiException as e:
                if e.status == 409:
                    log.warning(
                        f"Secret {secret} for {service} already exists in {namespace} "
                        f"but no signature found. Updating secret and signature."
                    )
                    update_plaintext_dict_secret(
                        kube_client=kube_client,
                        secret_name=secret,
                        secret_data=secret_data,
                        service=service,
                        namespace=namespace,
                    )
                else:
                    raise
            create_kubernetes_secret_signature(
                kube_client=kube_client,
                secret=secret,
                service=service,
                secret_signature=signature,
                namespace=namespace,
            )
        elif signature != kubernetes_signature:
            log.info(f"{secret} for {service} in {namespace} needs updating as signature changed")
            update_plaintext_dict_secret(
                kube_client=kube_client,
                secret_name=secret,
                secret_data=secret_data,
                service=service,
                namespace=namespace,
            )
            update_kubernetes_secret_signature(
                kube_client=kube_client,
                secret=secret,
                service=service,
                secret_signature=signature,
                namespace=namespace,
            )
        else:
            log.info(f"{secret} for {service} in {namespace} up to date")
    return True
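# Sketch of the key sanitisation used above (an illustrative addition; the example
# file name is hypothetical). Mapping "_" to "--" rather than "-" keeps a key
# containing "_" from colliding with one containing "." in the same position.
def example_sanitise_boto_key(this_key: str) -> str:
    return this_key.replace(".", "-").replace("_", "--")

# e.g. example_sanitise_boto_key("my_service.yaml") == "my--service-yaml"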