def create_marathon_dashboard(
    cluster: str,
    soa_dir: str = DEFAULT_SOA_DIR,
    marathon_clients: MarathonClients = None,
    system_paasta_config: SystemPaastaConfig = None,
) -> Marathon_Dashboard:
    """Map every marathon service instance of a cluster to its shard URL.

    :param cluster: name of the cluster whose marathon instances are listed
    :param soa_dir: soa-configs directory the instances and configs are read from
    :param marathon_clients: clients used to find each instance's shard; built
        (uncached) from the system config's marathon servers when None
    :param system_paasta_config: system config to use; loaded when None
    :returns: ``{cluster: [item, ...]}`` where each item has the keys
        ``'service'``, ``'instance'`` and ``'shard_url'``
    """
    try:
        instances: List = get_services_for_cluster(
            cluster=cluster,
            instance_type='marathon',
            soa_dir=soa_dir,
        )
    except FileNotFoundError:
        # No service configuration on disk: report an empty dashboard.
        instances = []
    dashboard: Marathon_Dashboard = {cluster: []}
    if system_paasta_config is None:
        system_paasta_config = load_system_paasta_config()
    marathon_servers = get_marathon_servers(system_paasta_config=system_paasta_config)
    if marathon_clients is None:
        marathon_clients = get_marathon_clients(marathon_servers=marathon_servers, cached=False)
    if not instances:
        # Nothing to resolve, so the dashboard links are never consulted.
        return dashboard

    # The links do not depend on the service instance: look them up once.
    # A cluster without a dashboard links entry simply gets no translation
    # instead of raising KeyError.
    dashboard_links: Dict = system_paasta_config.get_dashboard_links()
    marathon_links = dashboard_links.get(cluster, {}).get('Marathon RO')

    for service_instance in instances:
        service: str = service_instance[0]
        instance: str = service_instance[1]
        service_config: MarathonServiceConfig = load_marathon_service_config(
            service=service,
            instance=instance,
            cluster=cluster,
            load_deployments=False,
            soa_dir=soa_dir,
        )
        client: MarathonClient = marathon_clients.get_current_client_for_service(
            job_config=service_config,
        )
        # Default to the client's own server URL; replace it with the
        # configured read-only link when one is available.
        shard_url: str = client.servers[0]
        if isinstance(marathon_links, list):
            # One link per shard, matched by position in marathon_servers.current.
            for shard_number, shard in enumerate(marathon_servers.current):
                if shard.url[0] == shard_url:
                    shard_url = marathon_links[shard_number]
                    # Stop here so the translated link is never compared
                    # against the remaining shards.
                    break
        elif isinstance(marathon_links, str):
            # A single link string: only its first space-separated token is used.
            shard_url = marathon_links.split(' ')[0]
        service_info: Marathon_Dashboard_Item = {
            'service': service,
            'instance': instance,
            'shard_url': shard_url,
        }
        dashboard[cluster].append(service_info)
    return dashboard
def deploy_service(
    service: str,
    instance: str,
    marathon_jobid: str,
    config: marathon_tools.FormattedMarathonAppDict,
    clients: marathon_tools.MarathonClients,
    marathon_apps_with_clients: Collection[Tuple[MarathonApp, MarathonClient]],
    bounce_method: str,
    drain_method_name: str,
    drain_method_params: Dict[str, Any],
    nerve_ns: str,
    bounce_health_params: Dict[str, Any],
    soa_dir: str,
    job_config: marathon_tools.MarathonServiceConfig,
    bounce_margin_factor: float = 1.0,
) -> Tuple[int, str, Optional[float]]:
    """Deploy the service to marathon, either directly or via a bounce if needed.
    Called by setup_service when it's time to actually deploy.

    :param service: The name of the service to deploy
    :param instance: The instance of the service to deploy
    :param marathon_jobid: Full id of the marathon job
    :param config: The complete configuration dict to send to marathon
    :param clients: A MarathonClients object
    :param marathon_apps_with_clients: (app, client) pairs that are searched for
                                       apps already belonging to this service instance
    :param bounce_method: The bounce method to use, if needed
    :param drain_method_name: The name of the traffic draining method to use.
    :param drain_method_params: Parameters handed to the drain method factory
    :param nerve_ns: The nerve namespace to look in.
    :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks.
    :param soa_dir: The soa-configs directory, forwarded to do_bounce
    :param job_config: The service config; used to pick the marathon client
                       and forwarded to do_bounce
    :param bounce_margin_factor: the multiplication factor used to calculate
                                 the number of instances to be drained
    :returns: A tuple of (status, output, bounce_in_seconds) to be used with send_sensu_event"""

    def log_deploy_error(errormsg: str, level: str = 'event') -> None:
        # Honour the requested level; callers below pass level='debug' for
        # lock contention and unexpected exceptions.
        return _log(
            service=service,
            line=errormsg,
            component='deploy',
            level=level,
            cluster=cluster,
            instance=instance,
        )

    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    existing_apps_with_clients = marathon_tools.get_matching_apps_with_clients(
        service=service,
        instance=instance,
        marathon_apps_with_clients=marathon_apps_with_clients,
    )

    new_client = clients.get_current_client_for_service(job_config)

    # Split the existing apps into "the app we want, on the shard we want"
    # and everything else (which the bounce will eventually get rid of).
    new_apps_with_clients_list: List[Tuple[MarathonApp, MarathonClient]] = []
    other_apps_with_clients: List[Tuple[MarathonApp, MarathonClient]] = []

    for a, c in existing_apps_with_clients:
        if a.id == '/%s' % config['id'] and c == new_client:
            new_apps_with_clients_list.append((a, c))
        else:
            other_apps_with_clients.append((a, c))

    serviceinstance = "%s.%s" % (service, instance)

    if new_apps_with_clients_list:
        new_app, new_client = new_apps_with_clients_list[0]
        if len(new_apps_with_clients_list) != 1:
            raise ValueError(
                "Only expected one app per ID per shard; found %d" % len(new_apps_with_clients_list))
        new_app_running = True
        happy_new_tasks = bounce_lib.get_happy_tasks(
            new_app, service, nerve_ns, system_paasta_config,
            **bounce_health_params,
        )
    else:
        new_app_running = False
        happy_new_tasks = []

    try:
        drain_method = drain_lib.get_drain_method(
            drain_method_name,
            service=service,
            instance=instance,
            nerve_ns=nerve_ns,
            drain_method_params=drain_method_params,
        )
    except KeyError:
        errormsg = 'ERROR: drain_method not recognized: %s. Must be one of (%s)' % \
            (drain_method_name, ', '.join(drain_lib.list_drain_methods()))
        log_deploy_error(errormsg)
        return (1, errormsg, None)

    try:
        draining_hosts = get_draining_hosts()
    except ReadTimeout as e:
        errormsg = "ReadTimeout encountered trying to get draining hosts: %s" % e
        return (1, errormsg, 60)

    (
        old_app_live_happy_tasks,
        old_app_live_unhappy_tasks,
        old_app_draining_tasks,
        old_app_at_risk_tasks,
    ) = get_tasks_by_state(
        other_apps_with_clients=other_apps_with_clients,
        drain_method=drain_method,
        service=service,
        nerve_ns=nerve_ns,
        bounce_health_params=bounce_health_params,
        system_paasta_config=system_paasta_config,
        log_deploy_error=log_deploy_error,
        draining_hosts=draining_hosts,
    )

    # The first thing we need to do is take up the "slack" of old apps, to stop
    # them from launching new things that we are going to have to end up draining
    # and killing anyway.
    for a, c in other_apps_with_clients:
        marathon_tools.take_up_slack(app=a, client=c)

    num_at_risk_tasks = 0
    if new_app_running:
        num_at_risk_tasks = get_num_at_risk_tasks(new_app, draining_hosts=draining_hosts)
        if new_app.instances < config['instances'] + num_at_risk_tasks:
            log.info("Scaling %s up from %d to %d instances." %
                     (new_app.id, new_app.instances, config['instances'] + num_at_risk_tasks))
            new_client.scale_app(app_id=new_app.id, instances=config['instances'] + num_at_risk_tasks, force=True)
        # If we have more than the specified number of instances running, we will want to drain some of them.
        # We will start by draining any tasks running on at-risk hosts.
        elif new_app.instances > config['instances']:
            num_tasks_to_scale = max(
                min(len(new_app.tasks), new_app.instances) - config['instances'], 0)
            task_dict = get_tasks_by_state_for_app(
                app=new_app,
                drain_method=drain_method,
                service=service,
                nerve_ns=nerve_ns,
                bounce_health_params=bounce_health_params,
                system_paasta_config=system_paasta_config,
                log_deploy_error=log_deploy_error,
                draining_hosts=draining_hosts,
            )
            scaling_app_happy_tasks = list(task_dict['happy'])
            scaling_app_unhappy_tasks = list(task_dict['unhappy'])
            scaling_app_draining_tasks = list(task_dict['draining'])
            scaling_app_at_risk_tasks = list(task_dict['at_risk'])

            # Hand the surplus tasks of the new app to the "old" buckets so the
            # bounce drains/kills them. The budget is spent in this order:
            # already draining, unhappy, at-risk, and only then happy tasks.
            tasks_to_move_draining = min(len(scaling_app_draining_tasks), num_tasks_to_scale)
            old_app_draining_tasks[(new_app.id, new_client)] = set(
                scaling_app_draining_tasks[:tasks_to_move_draining])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_draining

            tasks_to_move_unhappy = min(len(scaling_app_unhappy_tasks), num_tasks_to_scale)
            old_app_live_unhappy_tasks[(new_app.id, new_client)] = set(
                scaling_app_unhappy_tasks[:tasks_to_move_unhappy],
            )
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_unhappy

            tasks_to_move_at_risk = min(len(scaling_app_at_risk_tasks), num_tasks_to_scale)
            old_app_at_risk_tasks[(new_app.id, new_client)] = set(
                scaling_app_at_risk_tasks[:tasks_to_move_at_risk])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_at_risk

            tasks_to_move_happy = min(len(scaling_app_happy_tasks), num_tasks_to_scale)
            old_app_live_happy_tasks[(new_app.id, new_client)] = set(
                scaling_app_happy_tasks[:tasks_to_move_happy])
            happy_new_tasks = scaling_app_happy_tasks[tasks_to_move_happy:]

            # slack represents remaining the extra remaining instances that are configured
            # in marathon that don't have a launched task yet. When scaling down we want to
            # reduce this slack so marathon doesn't get a chance to launch a new task in
            # that space that we will then have to drain and kill again.
            marathon_tools.take_up_slack(client=new_client, app=new_app)

        # TODO: don't take actions in deploy_service.
        undrain_tasks(
            to_undrain=new_app.tasks,
            leave_draining=old_app_draining_tasks.get((new_app.id, new_client), []),
            drain_method=drain_method,
            log_deploy_error=log_deploy_error,
        )

    # log all uncaught exceptions and raise them again
    try:
        try:
            bounce_func = bounce_lib.get_bounce_method_func(bounce_method)
        except KeyError:
            errormsg = 'ERROR: bounce_method not recognized: %s. Must be one of (%s)' % \
                (bounce_method, ', '.join(bounce_lib.list_bounce_methods()))
            log_deploy_error(errormsg)
            return (1, errormsg, None)

        bounce_again_in_seconds = do_bounce(
            bounce_func=bounce_func,
            drain_method=drain_method,
            config=config,
            new_app_running=new_app_running,
            happy_new_tasks=happy_new_tasks,
            old_app_live_happy_tasks=old_app_live_happy_tasks,
            old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
            old_app_draining_tasks=old_app_draining_tasks,
            old_app_at_risk_tasks=old_app_at_risk_tasks,
            service=service,
            bounce_method=bounce_method,
            serviceinstance=serviceinstance,
            cluster=cluster,
            instance=instance,
            marathon_jobid=marathon_jobid,
            clients=clients,
            soa_dir=soa_dir,
            job_config=job_config,
            bounce_margin_factor=bounce_margin_factor,
        )
    except bounce_lib.LockHeldException:
        logline = 'Failed to get lock to create marathon app for %s.%s' % (service, instance)
        log_deploy_error(logline, level='debug')
        return (0, "Couldn't get marathon lock, skipping until next time", None)
    except Exception:
        logline = 'Exception raised during deploy of service %s:\n%s' % (service, traceback.format_exc())
        log_deploy_error(logline, level='debug')
        raise
    # Come back in a minute while at-risk tasks exist or the new app is still
    # over its configured instance count.
    if num_at_risk_tasks:
        bounce_again_in_seconds = 60
    elif new_app_running:
        if new_app.instances > config['instances']:
            bounce_again_in_seconds = 60
    return (0, 'Service deployed.', bounce_again_in_seconds)
def do_bounce(
    bounce_func: bounce_lib.BounceMethod,
    drain_method: drain_lib.DrainMethod,
    config: marathon_tools.FormattedMarathonAppDict,
    new_app_running: bool,
    happy_new_tasks: List[Tuple[MarathonTask, MarathonClient]],
    old_app_live_happy_tasks: Dict[Tuple[str, MarathonClient], Set[MarathonTask]],
    old_app_live_unhappy_tasks: Dict[Tuple[str, MarathonClient], Set[MarathonTask]],
    old_app_draining_tasks: Dict[Tuple[str, MarathonClient], Set[MarathonTask]],
    old_app_at_risk_tasks: Dict[Tuple[str, MarathonClient], Set[MarathonTask]],
    service: str,
    bounce_method: str,
    serviceinstance: str,
    cluster: str,
    instance: str,
    marathon_jobid: str,
    clients: marathon_tools.MarathonClients,
    soa_dir: str,
    job_config: marathon_tools.MarathonServiceConfig,
    bounce_margin_factor: float = 1.0,
) -> Optional[float]:
    """Run one step of a bounce: create the new app if asked to, drain and
    kill old tasks, and remove old apps that have no tasks left.

    :param bounce_func: decides what to do; its result is read for the keys
                        'create_app' and 'tasks_to_drain'
    :param drain_method: the drain method used to drain old tasks
    :param config: the marathon app configuration of the new app
    :param new_app_running: whether the new app already exists
    :param happy_new_tasks: the healthy tasks of the new app
    :param old_app_live_happy_tasks: happy old tasks keyed by (app id, client)
    :param old_app_live_unhappy_tasks: unhappy old tasks keyed by (app id, client)
    :param old_app_draining_tasks: draining old tasks keyed by (app id, client)
    :param old_app_at_risk_tasks: at-risk old tasks keyed by (app id, client)
    :param service: service name, used for logging
    :param bounce_method: bounce method name, used for logging
    :param serviceinstance: "service.instance", used for logging
    :param cluster: cluster name, used for logging
    :param instance: instance name, used for logging
    :param marathon_jobid: id of the new marathon app
    :param clients: MarathonClients used to pick the client of the new app
    :param soa_dir: not used by this function
    :param job_config: service config used to pick the client of the new app
    :param bounce_margin_factor: passed to bounce_func as margin_factor
    :returns: 60 when another bounce iteration is needed, None when done
    """
    def log_bounce_action(line: str, level: str = 'debug') -> None:
        return _log(
            service=service,
            line=line,
            component='deploy',
            level=level,
            cluster=cluster,
            instance=instance,
        )

    # log if we're not in a steady state.
    if any([
        (not new_app_running),
        old_app_live_happy_tasks.keys(),
    ]):
        log_bounce_action(
            line=' '.join([
                '%s bounce in progress on %s.' % (bounce_method, serviceinstance),
                'New marathon app %s %s.' % (marathon_jobid, ('exists' if new_app_running else 'not created yet')),
                '%d new tasks to bring up.' % (config['instances'] - len(happy_new_tasks)),
                '%d old tasks receiving traffic and happy.' % len(bounce_lib.flatten_tasks(old_app_live_happy_tasks)),
                '%d old tasks unhappy.' % len(bounce_lib.flatten_tasks(old_app_live_unhappy_tasks)),
                '%d old tasks draining.' % len(bounce_lib.flatten_tasks(old_app_draining_tasks)),
                '%d old tasks at risk.' % len(bounce_lib.flatten_tasks(old_app_at_risk_tasks)),
                '%d old apps.' % len(old_app_live_happy_tasks.keys()),
            ]),
            level='event',
        )
    else:
        log.debug("Nothing to do, bounce is in a steady state")

    new_client = clients.get_current_client_for_service(job_config)

    old_non_draining_tasks = list(
        old_app_tasks_to_task_client_pairs(old_app_live_happy_tasks),
    ) + list(
        old_app_tasks_to_task_client_pairs(old_app_live_unhappy_tasks),
    ) + list(
        old_app_tasks_to_task_client_pairs(old_app_at_risk_tasks),
    )

    actions = bounce_func(
        new_config=config,
        new_app_running=new_app_running,
        happy_new_tasks=happy_new_tasks,
        old_non_draining_tasks=old_non_draining_tasks,
        margin_factor=bounce_margin_factor,
    )

    if actions['create_app'] and not new_app_running:
        log_bounce_action(
            line='%s bounce creating new app with app_id %s' % (bounce_method, marathon_jobid),
        )
        with requests_cache.disabled():
            try:
                bounce_lib.create_marathon_app(
                    app_id=marathon_jobid,
                    config=config,
                    client=new_client,
                )
            except MarathonHttpError as e:
                if e.status_code == 409:
                    log.warning(
                        "Failed to create, app %s already exists. This means another bounce beat us to it."
                        " Skipping the rest of the bounce for this run" % marathon_jobid,
                    )
                    return 60
                raise

    tasks_to_kill = drain_tasks_and_find_tasks_to_kill(
        tasks_to_drain=actions['tasks_to_drain'],
        already_draining_tasks=old_app_tasks_to_task_client_pairs(old_app_draining_tasks),
        drain_method=drain_method,
        log_bounce_action=log_bounce_action,
        bounce_method=bounce_method,
        at_risk_tasks=old_app_tasks_to_task_client_pairs(old_app_at_risk_tasks),
    )

    # tasks_to_kill holds (task, client) pairs. Group the tasks per client for
    # the kill calls, and keep the bare tasks for the membership tests below.
    tasks_to_kill_by_client: Dict[MarathonClient, List[MarathonTask]] = defaultdict(list)
    killed_tasks: Set[MarathonTask] = set()
    for task, client in tasks_to_kill:
        tasks_to_kill_by_client[client].append(task)
        killed_tasks.add(task)

    for client, tasks in tasks_to_kill_by_client.items():
        kill_given_tasks(client=client, task_ids=[task.id for task in tasks], scale=True)

    # Reserve the resources of at-risk hosts whose task was just killed.
    # The lookup must use the bare tasks: testing a task against the
    # (task, client) pairs in tasks_to_kill never matches.
    for task in bounce_lib.flatten_tasks(old_app_at_risk_tasks):
        if task in killed_tasks:
            hostname = task.host
            try:
                reserve_all_resources([hostname])
            except HTTPError:
                log.warning("Failed to reserve resources on %s" % hostname)

    # An old app can be removed once none of its tasks survives this round.
    apps_to_kill: List[Tuple[str, MarathonClient]] = []
    for app, client in old_app_live_happy_tasks.keys():
        if app != '/%s' % marathon_jobid or client != new_client:
            live_happy_tasks = old_app_live_happy_tasks[(app, client)]
            live_unhappy_tasks = old_app_live_unhappy_tasks[(app, client)]
            draining_tasks = old_app_draining_tasks[(app, client)]
            at_risk_tasks = old_app_at_risk_tasks[(app, client)]

            remaining_tasks = (
                live_happy_tasks | live_unhappy_tasks | draining_tasks | at_risk_tasks
            ) - killed_tasks

            if 0 == len(remaining_tasks):
                apps_to_kill.append((app, client))

    if apps_to_kill:
        log_bounce_action(
            line='%s bounce removing old unused apps with app_ids: %s' %
            (
                bounce_method,
                ', '.join([app for app, client in apps_to_kill]),
            ),
        )
        with requests_cache.disabled():
            for app_id, client in apps_to_kill:
                bounce_lib.kill_old_ids([app_id], client)

    all_old_tasks: Set[MarathonTask] = set()
    all_old_tasks = set.union(all_old_tasks, *old_app_live_happy_tasks.values())
    all_old_tasks = set.union(all_old_tasks, *old_app_live_unhappy_tasks.values())
    all_old_tasks = set.union(all_old_tasks, *old_app_draining_tasks.values())
    all_old_tasks = set.union(all_old_tasks, *old_app_at_risk_tasks.values())

    if all_old_tasks or (not new_app_running):
        # Still have work more work to do, try again in 60 seconds
        return 60
    else:
        # log if we appear to be finished
        # NOTE: all_old_tasks is empty on this branch, so the last condition
        # can only hold when nothing was killed in this round.
        if all([
            (apps_to_kill or tasks_to_kill),
            apps_to_kill == list(old_app_live_happy_tasks),
            tasks_to_kill == all_old_tasks,
        ]):
            log_bounce_action(
                line='%s bounce on %s finishing. Now running %s' %
                (
                    bounce_method,
                    serviceinstance,
                    marathon_jobid,
                ),
                level='event',
            )
        return None
def create_marathon_dashboard(
    cluster: str,
    soa_dir: str = DEFAULT_SOA_DIR,
    marathon_clients: MarathonClients = None,
    system_paasta_config: SystemPaastaConfig = None,
) -> Marathon_Dashboard:
    """Map every marathon service instance of a cluster to its shard URL.

    :param cluster: name of the cluster whose marathon instances are listed
    :param soa_dir: soa-configs directory the instances and configs are read from
    :param marathon_clients: clients used to find each instance's shard; built
        (uncached) from the system config's marathon servers when None
    :param system_paasta_config: system config to use; loaded when None
    :returns: ``{cluster: [item, ...]}`` where each item has the keys
        ``'service'``, ``'instance'`` and ``'shard_url'``
    """
    try:
        instances: List = get_services_for_cluster(
            cluster=cluster,
            instance_type='marathon',
            soa_dir=soa_dir,
        )
    except FileNotFoundError:
        # No service configuration on disk: report an empty dashboard.
        instances = []
    dashboard: Marathon_Dashboard = {cluster: []}
    if system_paasta_config is None:
        system_paasta_config = load_system_paasta_config()
    marathon_servers = get_marathon_servers(system_paasta_config=system_paasta_config)
    if marathon_clients is None:
        marathon_clients = get_marathon_clients(marathon_servers=marathon_servers, cached=False)

    dashboard_links: Dict = system_paasta_config.get_dashboard_links()
    marathon_links = dashboard_links.get(cluster, {}).get('Marathon RO')

    # e.g. 'http://10.64.97.75:5052': 'http://marathon-norcal-prod.yelpcorp.com'
    shard_url_to_marathon_link_dict: Dict[str, str] = {}
    if isinstance(marathon_links, list):
        # Sanity check and log error if necessary
        if len(marathon_links) != len(marathon_servers.current):
            log.error('len(marathon_links) != len(marathon_servers.current). This may be a cause of concern')
        # Pair shards with links by position. zip stops at the shorter
        # sequence, so a length mismatch leaves the unmatched shards on their
        # raw URL instead of raising IndexError right after the log above.
        for shard, marathon_link in zip(marathon_servers.current, marathon_links):
            shard_url_to_marathon_link_dict[shard.url[0]] = marathon_link
    elif isinstance(marathon_links, str):
        # In this case, the shard url will be the same for every service instance
        static_shard_url = marathon_links.split(' ')[0]
        return {cluster: [{'service': si[0], 'instance': si[1], 'shard_url': static_shard_url} for si in instances]}

    # Setup with service as key since will instantiate 1 PSCL per service
    service_instances_dict: Dict[str, Set[str]] = defaultdict(set)
    for si in instances:
        service_instances_dict[si[0]].add(si[1])

    for service, instance_set in service_instances_dict.items():
        pscl = PaastaServiceConfigLoader(
            service=service,
            soa_dir=soa_dir,
            load_deployments=False,
        )
        for marathon_service_config in pscl.instance_configs(cluster, MarathonServiceConfig):
            # Take the instance name from the config being processed. A
            # variable left over from the grouping loop above would label
            # every item with the last instance seen there.
            instance: str = marathon_service_config.get_instance()
            if instance not in instance_set:
                continue
            client: MarathonClient = \
                marathon_clients.get_current_client_for_service(job_config=marathon_service_config)
            ip_url: str = client.servers[0]
            # Convert to a marathon link if possible else default to the originalIP address
            shard_url: str = shard_url_to_marathon_link_dict.get(ip_url, ip_url)
            service_info: Marathon_Dashboard_Item = {
                'service': service,
                'instance': instance,
                'shard_url': shard_url,
            }
            dashboard[cluster].append(service_info)
    return dashboard
def perform_command(
    command: str,
    service: str,
    instance: str,
    cluster: str,
    verbose: int,
    soa_dir: str,
    clients: marathon_tools.MarathonClients,
    job_config: marathon_tools.MarathonServiceConfig,
    app_id: str = None,
) -> int:
    """Run a "restart" or "status" command against a marathon service instance.

    :param command: "restart" or "status"; anything else raises NotImplementedError
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: int verbosity level
    :param soa_dir: soa-configs directory used for the status lookups
    :param clients: MarathonClients used to find the client for this instance
    :param job_config: the marathon service config of the instance
    :param app_id: marathon app id; derived from job_config when falsy
    :returns: A unix-style return code (1 when the app id cannot be derived
              because the docker image is missing, 0 otherwise)
    """
    paasta_config = load_system_paasta_config()

    if not app_id:
        try:
            app_id = job_config.format_marathon_app_dict()["id"]
        except NoDockerImageError:
            paasta_print(
                "Docker image for %s not in deployments.json. Exiting. Has Jenkins deployed it?"
                % compose_job_id(service, instance)
            )
            return 1

    expected_instances = job_config.get_instances()
    client = clients.get_current_client_for_service(job_config)

    if command == "restart":
        restart_marathon_job(service, instance, app_id, client, cluster)
        return 0

    if command != "status":
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)

    # Everything below is the "status" report, printed section by section.
    desired_state = status_desired_state(service, instance, client, job_config)
    paasta_print(desired_state)

    dashboards = get_marathon_dashboard_links(clients, paasta_config)
    tasks, marathon_status = status_marathon_job(
        service=service,
        instance=instance,
        cluster=cluster,
        soa_dir=soa_dir,
        dashboards=dashboards,
        normal_instance_count=expected_instances,
        clients=clients,
        job_config=job_config,
        desired_app_id=app_id,
        verbose=verbose,
    )
    paasta_print(marathon_status)

    namespace_config = marathon_tools.load_service_namespace_config(
        service=service,
        namespace=job_config.get_nerve_namespace(),
        soa_dir=soa_dir,
    )

    mesos_status = status_mesos_tasks(service, instance, expected_instances, verbose)
    paasta_print(mesos_status)

    # The smartstack section is only reported when the namespace has a proxy port.
    if namespace_config.get("proxy_port") is None:
        return 0

    expected_backends = marathon_tools.get_expected_instance_count_for_namespace(
        service, instance, cluster
    )
    smartstack_status = status_smartstack_backends(
        service=service,
        instance=instance,
        cluster=cluster,
        job_config=job_config,
        service_namespace_config=namespace_config,
        tasks=tasks,
        expected_count=expected_backends,
        soa_dir=soa_dir,
        verbose=verbose > 0,
        synapse_port=paasta_config.get_synapse_port(),
        synapse_haproxy_url_format=paasta_config.get_synapse_haproxy_url_format(),
        system_deploy_blacklist=paasta_config.get_deploy_blacklist(),
        system_deploy_whitelist=paasta_config.get_deploy_whitelist(),
    )
    paasta_print(smartstack_status)
    return 0