def test_get_draining_hosts(
    mock_get_hosts_with_state,
):
    """Check the single call get_draining_hosts() makes on the injected mock.

    The mock must have been called exactly once, with
    ``state='draining_machines'`` as its only argument.
    """
    get_draining_hosts()

    wanted_call = mock.call(state='draining_machines')
    assert mock_get_hosts_with_state.call_count == 1
    assert mock_get_hosts_with_state.call_args == wanted_call
def get_service_instances_that_need_bouncing(marathon_client, soa_dir):
    """Return the service instances whose Marathon app differs from the desired state.

    An instance is reported when any of the following holds:

    * its app id appears in exactly one of the desired configs and the apps
      listed by ``marathon_client``;
    * the running app's ``instances`` differs from the desired ``'instances'``;
    * ``get_num_at_risk_tasks`` returns non-zero for the app against the
      current draining hosts.

    :param marathon_client: client whose ``list_apps()`` supplies the running apps
    :param soa_dir: forwarded to ``get_desired_marathon_configs``
    :returns: a generator of short job ids with every ``--`` replaced by ``_``
    """
    wanted_configs = get_desired_marathon_configs(soa_dir)

    running_apps = {}
    for running_app in marathon_client.list_apps():
        running_apps[running_app.id.lstrip('/')] = running_app

    # Ids known to only one side (running vs. desired) always need a bounce.
    mismatched_ids = set(running_apps) ^ set(wanted_configs)
    to_bounce = {long_job_id_to_short_job_id(long_id) for long_id in mismatched_ids}

    draining_hosts = get_draining_hosts()
    for long_id, running_app in running_apps.items():
        short_id = long_job_id_to_short_job_id(long_id)
        if short_id in to_bounce:
            continue
        # Reaching here means long_id is also a key of wanted_configs;
        # one-sided ids were all added to to_bounce above.
        wrong_instance_count = running_app.instances != wanted_configs[long_id]['instances']
        if wrong_instance_count or get_num_at_risk_tasks(running_app, draining_hosts) != 0:
            to_bounce.add(short_id)

    return (short_id.replace('--', '_') for short_id in to_bounce)
def get_service_instances_that_need_bouncing(marathon_clients, soa_dir):
    """Return the service instances whose Marathon app differs from the desired state.

    Apps are compared as ``(app id, client)`` pairs, so an app running under a
    client other than the one returned by
    ``marathon_clients.get_current_client_for_service`` counts as a mismatch.
    An instance is also reported when the running app's ``instances`` differs
    from the desired ``'instances'``, or when ``get_num_at_risk_tasks`` returns
    non-zero for it against the current draining hosts.

    :param marathon_clients: object providing ``get_current_client_for_service``
        and ``get_all_clients``
    :param soa_dir: forwarded to ``get_desired_marathon_configs``
    :returns: a generator of short job ids with every ``--`` replaced by ``_``
    """
    formatted_configs, job_configs = get_desired_marathon_configs(soa_dir)

    wanted_pairs = set()
    for long_id, job_config in job_configs.items():
        wanted_client = marathon_clients.get_current_client_for_service(job_config)
        wanted_pairs.add((long_id, wanted_client))

    running_by_pair = {}
    apps_and_clients = get_marathon_apps_with_clients(marathon_clients.get_all_clients())
    for running_app, its_client in apps_and_clients:
        running_by_pair[(running_app.id.lstrip('/'), its_client)] = running_app

    # Pairs known to only one side (running vs. desired) always need a bounce.
    mismatched_pairs = set(running_by_pair) ^ wanted_pairs
    to_bounce = {long_job_id_to_short_job_id(long_id) for long_id, _ in mismatched_pairs}

    draining_hosts = get_draining_hosts()
    for (long_id, _), running_app in running_by_pair.items():
        short_id = long_job_id_to_short_job_id(long_id)
        if short_id in to_bounce:
            continue
        wrong_instance_count = running_app.instances != formatted_configs[long_id]['instances']
        if wrong_instance_count or get_num_at_risk_tasks(running_app, draining_hosts) != 0:
            to_bounce.add(short_id)

    return (short_id.replace('--', '_') for short_id in to_bounce)
def get_tasks_by_state_for_app(app, drain_method, service, nerve_ns, bounce_health_params, system_paasta_config):
    """Sort an app's tasks into 'happy', 'unhappy', 'draining' and 'at_risk' sets.

    Classification, first match wins:

    * ``drain_method.is_draining(task)`` is true -> 'draining'
    * task is not among ``bounce_lib.get_happy_tasks(...)`` -> 'unhappy'
    * task's host is in ``get_draining_hosts()`` -> 'at_risk'
    * otherwise -> 'happy'

    :param app: object whose ``tasks`` are classified
    :param drain_method: object providing ``is_draining(task)``
    :param service: forwarded to ``bounce_lib.get_happy_tasks``
    :param nerve_ns: forwarded to ``bounce_lib.get_happy_tasks``
    :param bounce_health_params: keyword arguments for ``bounce_lib.get_happy_tasks``
    :param system_paasta_config: forwarded to ``bounce_lib.get_happy_tasks``
    :returns: a dict mapping each of the four state names to a set of tasks
    """
    buckets = {state_name: set() for state_name in ('happy', 'unhappy', 'draining', 'at_risk')}
    healthy = bounce_lib.get_happy_tasks(
        app, service, nerve_ns, system_paasta_config, **bounce_health_params
    )
    draining_hosts = get_draining_hosts()

    for task in app.tasks:
        if drain_method.is_draining(task):
            bucket = 'draining'
        elif task not in healthy:
            bucket = 'unhappy'
        elif task.host in draining_hosts:
            bucket = 'at_risk'
        else:
            bucket = 'happy'
        buckets[bucket].add(task)

    return buckets
def reserve_all_resources_on_draining_hosts():
    """Call ``reserve_all_resources`` for the hosts returned by ``get_draining_hosts``.

    When there are no draining hosts, only a debug message is logged.
    """
    log.debug("Reserving all resources on draining hosts")
    hosts = get_draining_hosts()
    if not hosts:
        log.debug("No draining hosts")
        return
    reserve_all_resources(hostnames=hosts)
def unreserve_all_resources_on_non_draining_hosts():
    """Unreserve all resources on non-draining hosts.

    The target hosts are the hostnames returned by ``get_slaves()`` minus
    those returned by ``get_draining_hosts()``. When that difference is
    empty, ``unreserve_all_resources`` is not called and a debug message is
    logged instead.
    """
    log.debug("Unreserving all resources on non-draining hosts")
    slaves = get_slaves()
    hostnames = [slave['hostname'] for slave in slaves]
    draining_hosts = get_draining_hosts()
    non_draining_hosts = list(set(hostnames) - set(draining_hosts))
    # Skip the unreserve call entirely when there is nothing to act on.
    if non_draining_hosts:
        unreserve_all_resources(hostnames=non_draining_hosts)
    else:
        log.debug("No non-draining hosts")
def unreserve_all_resources_on_non_draining_hosts():
    """Call ``unreserve_all_resources`` for every slave host that is not draining.

    Hostnames come from ``get_slaves()``; those also returned by
    ``get_draining_hosts()`` are excluded. When nothing is left, only a debug
    message is logged.
    """
    log.debug("Unreserving all resources on non-draining hosts")
    known_hostnames = {slave['hostname'] for slave in get_slaves()}
    targets = list(known_hostnames - set(get_draining_hosts()))
    if not targets:
        log.debug("No non-draining hosts")
        return
    unreserve_all_resources(hostnames=targets)
def get_new_draining_hosts(self):
    """Sync ``self.draining`` with the current draining hosts and return the additions.

    If ``get_draining_hosts()`` raises ``RequestException``, the error is
    logged and the already-tracked hosts are used as the current list, so
    nothing is added or removed on that pass.

    :returns: list of hosts that were not in ``self.draining`` before this call
    """
    try:
        reported = get_draining_hosts()
    except RequestException as e:
        self.log.error(f"Unable to get list of draining hosts from mesos: {e}")
        reported = list(self.draining)

    # Work out the additions in full before mutating self.draining.
    newly_draining = [host for host in reported if host not in self.draining]
    for fresh_host in newly_draining:
        self.draining.add(fresh_host)

    # Snapshot the stale entries first: we cannot remove while iterating.
    no_longer_draining = [host for host in self.draining if host not in reported]
    for stale_host in no_longer_draining:
        self.draining.remove(stale_host)

    return newly_draining
def run(self):
    """Poll for draining hosts forever, queueing at-risk service instances.

    On each pass, hosts returned by ``get_draining_hosts()`` that are not in
    ``self.draining`` are treated as new; ``self.draining`` is then replaced
    by the returned value. For new hosts, every item from
    ``self.get_at_risk_service_instances`` is put on ``self.inbox_q``. The
    loop sleeps 20 seconds between passes and never returns.
    """
    self.is_ready = True
    while True:
        current = get_draining_hosts()
        fresh = [host for host in current if host not in self.draining]
        self.draining = current
        if fresh:
            self.log.info("Found new draining hosts: {}".format(fresh))
            for service_instance in self.get_at_risk_service_instances(fresh):
                self.inbox_q.put(service_instance)
        time.sleep(20)
def get_num_at_risk_tasks(app):
    """Count the tasks of an application that run on at-risk
    (Mesos Maintenance Draining) hosts.

    :param app: A marathon application
    :returns: An integer count of tasks whose host is in ``get_draining_hosts()``
    """
    task_hosts = [task.host for task in app.tasks]
    draining_hosts = get_draining_hosts()
    at_risk_count = sum(1 for host in task_hosts if host in draining_hosts)
    log.debug("%s has %d tasks running on at-risk hosts." % (app.id, at_risk_count))
    return at_risk_count
def deploy_service(
    service: str,
    instance: str,
    marathon_jobid: str,
    config: marathon_tools.FormattedMarathonAppDict,
    clients: marathon_tools.MarathonClients,
    marathon_apps_with_clients: Collection[Tuple[MarathonApp, MarathonClient]],
    bounce_method: str,
    drain_method_name: str,
    drain_method_params: Dict[str, Any],
    nerve_ns: str,
    bounce_health_params: Dict[str, Any],
    soa_dir: str,
    job_config: marathon_tools.MarathonServiceConfig,
    bounce_margin_factor: float = 1.0,
) -> Tuple[int, str, Optional[float]]:
    """Deploy the service to marathon, either directly or via a bounce if needed.
    Called by setup_service when it's time to actually deploy.

    :param service: The name of the service to deploy
    :param instance: The instance of the service to deploy
    :param marathon_jobid: Full id of the marathon job
    :param config: The complete configuration dict to send to marathon
    :param clients: A MarathonClients object
    :param marathon_apps_with_clients: (app, client) pairs that are searched for
        apps matching this service and instance
    :param bounce_method: The bounce method to use, if needed
    :param drain_method_name: The name of the traffic draining method to use.
    :param drain_method_params: Passed to drain_lib.get_drain_method.
    :param nerve_ns: The nerve namespace to look in.
    :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks.
    :param soa_dir: Passed through to do_bounce.
    :param job_config: Used to select the current client for the service and
        passed through to do_bounce.
    :param bounce_margin_factor: the multiplication factor used to calculate
        the number of instances to be drained
    :returns: A tuple of (status, output, bounce_in_seconds) to be used with
        send_sensu_event. Status is 1 for an unrecognized drain or bounce
        method and for a ReadTimeout while fetching draining hosts (the latter
        with bounce_in_seconds=60); 0 otherwise, including when the marathon
        lock is held.
    :raises ValueError: if more than one existing app matches the new app id
        on the current client
    """

    def log_deploy_error(errormsg: str, level: str = 'event') -> None:
        # Honour the requested level so that level='debug' call sites are not
        # emitted as events.
        return _log(
            service=service,
            line=errormsg,
            component='deploy',
            level=level,
            cluster=cluster,
            instance=instance,
        )

    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    existing_apps_with_clients = marathon_tools.get_matching_apps_with_clients(
        service=service,
        instance=instance,
        marathon_apps_with_clients=marathon_apps_with_clients,
    )

    new_client = clients.get_current_client_for_service(job_config)

    # Split the matching apps into "the app we want, on the client we want"
    # and everything else (candidates for draining/killing).
    new_apps_with_clients_list: List[Tuple[MarathonApp, MarathonClient]] = []
    other_apps_with_clients: List[Tuple[MarathonApp, MarathonClient]] = []

    for a, c in existing_apps_with_clients:
        if a.id == '/%s' % config['id'] and c == new_client:
            new_apps_with_clients_list.append((a, c))
        else:
            other_apps_with_clients.append((a, c))

    serviceinstance = "%s.%s" % (service, instance)

    if new_apps_with_clients_list:
        new_app, new_client = new_apps_with_clients_list[0]
        if len(new_apps_with_clients_list) != 1:
            raise ValueError(
                "Only expected one app per ID per shard; found %d" % len(new_apps_with_clients_list))
        new_app_running = True
        happy_new_tasks = bounce_lib.get_happy_tasks(
            new_app, service, nerve_ns, system_paasta_config,
            **bounce_health_params,
        )
    else:
        new_app_running = False
        happy_new_tasks = []

    try:
        drain_method = drain_lib.get_drain_method(
            drain_method_name,
            service=service,
            instance=instance,
            nerve_ns=nerve_ns,
            drain_method_params=drain_method_params,
        )
    except KeyError:
        errormsg = 'ERROR: drain_method not recognized: %s. Must be one of (%s)' % \
            (drain_method_name, ', '.join(drain_lib.list_drain_methods()))
        log_deploy_error(errormsg)
        return (1, errormsg, None)

    try:
        draining_hosts = get_draining_hosts()
    except ReadTimeout as e:
        errormsg = "ReadTimeout encountered trying to get draining hosts: %s" % e
        return (1, errormsg, 60)

    (
        old_app_live_happy_tasks,
        old_app_live_unhappy_tasks,
        old_app_draining_tasks,
        old_app_at_risk_tasks,
    ) = get_tasks_by_state(
        other_apps_with_clients=other_apps_with_clients,
        drain_method=drain_method,
        service=service,
        nerve_ns=nerve_ns,
        bounce_health_params=bounce_health_params,
        system_paasta_config=system_paasta_config,
        log_deploy_error=log_deploy_error,
        draining_hosts=draining_hosts,
    )

    # The first thing we need to do is take up the "slack" of old apps, to stop
    # them from launching new things that we are going to have to end up draining
    # and killing anyway.
    for a, c in other_apps_with_clients:
        marathon_tools.take_up_slack(app=a, client=c)

    num_at_risk_tasks = 0
    if new_app_running:
        num_at_risk_tasks = get_num_at_risk_tasks(
            new_app, draining_hosts=draining_hosts)
        if new_app.instances < config['instances'] + num_at_risk_tasks:
            log.info("Scaling %s up from %d to %d instances." %
                     (new_app.id, new_app.instances, config['instances'] + num_at_risk_tasks))
            new_client.scale_app(app_id=new_app.id, instances=config['instances'] + num_at_risk_tasks, force=True)
        # If we have more than the specified number of instances running, we will want to drain some of them.
        # We will start by draining any tasks running on at-risk hosts.
        elif new_app.instances > config['instances']:
            num_tasks_to_scale = max(
                min(len(new_app.tasks), new_app.instances) - config['instances'], 0)
            task_dict = get_tasks_by_state_for_app(
                app=new_app,
                drain_method=drain_method,
                service=service,
                nerve_ns=nerve_ns,
                bounce_health_params=bounce_health_params,
                system_paasta_config=system_paasta_config,
                log_deploy_error=log_deploy_error,
                draining_hosts=draining_hosts,
            )
            scaling_app_happy_tasks = list(task_dict['happy'])
            scaling_app_unhappy_tasks = list(task_dict['unhappy'])
            scaling_app_draining_tasks = list(task_dict['draining'])
            scaling_app_at_risk_tasks = list(task_dict['at_risk'])

            # Hand surplus tasks of the new app over to the "old" buckets in
            # this order: draining, unhappy, at-risk, then happy.
            tasks_to_move_draining = min(len(scaling_app_draining_tasks), num_tasks_to_scale)
            old_app_draining_tasks[(new_app.id, new_client)] = set(
                scaling_app_draining_tasks[:tasks_to_move_draining])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_draining

            tasks_to_move_unhappy = min(len(scaling_app_unhappy_tasks), num_tasks_to_scale)
            old_app_live_unhappy_tasks[(new_app.id, new_client)] = set(
                scaling_app_unhappy_tasks[:tasks_to_move_unhappy],
            )
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_unhappy

            tasks_to_move_at_risk = min(len(scaling_app_at_risk_tasks), num_tasks_to_scale)
            old_app_at_risk_tasks[(new_app.id, new_client)] = set(
                scaling_app_at_risk_tasks[:tasks_to_move_at_risk])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_at_risk

            tasks_to_move_happy = min(len(scaling_app_happy_tasks), num_tasks_to_scale)
            old_app_live_happy_tasks[(new_app.id, new_client)] = set(
                scaling_app_happy_tasks[:tasks_to_move_happy])
            happy_new_tasks = scaling_app_happy_tasks[tasks_to_move_happy:]

            # slack represents remaining the extra remaining instances that are configured
            # in marathon that don't have a launched task yet. When scaling down we want to
            # reduce this slack so marathon doesn't get a chance to launch a new task in
            # that space that we will then have to drain and kill again.
            marathon_tools.take_up_slack(client=new_client, app=new_app)

        # TODO: don't take actions in deploy_service.
        undrain_tasks(
            to_undrain=new_app.tasks,
            leave_draining=old_app_draining_tasks.get((new_app.id, new_client), []),
            drain_method=drain_method,
            log_deploy_error=log_deploy_error,
        )

    # log all uncaught exceptions and raise them again
    try:
        try:
            bounce_func = bounce_lib.get_bounce_method_func(bounce_method)
        except KeyError:
            errormsg = 'ERROR: bounce_method not recognized: %s. Must be one of (%s)' % \
                (bounce_method, ', '.join(bounce_lib.list_bounce_methods()))
            log_deploy_error(errormsg)
            return (1, errormsg, None)

        bounce_again_in_seconds = do_bounce(
            bounce_func=bounce_func,
            drain_method=drain_method,
            config=config,
            new_app_running=new_app_running,
            happy_new_tasks=happy_new_tasks,
            old_app_live_happy_tasks=old_app_live_happy_tasks,
            old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
            old_app_draining_tasks=old_app_draining_tasks,
            old_app_at_risk_tasks=old_app_at_risk_tasks,
            service=service,
            bounce_method=bounce_method,
            serviceinstance=serviceinstance,
            cluster=cluster,
            instance=instance,
            marathon_jobid=marathon_jobid,
            clients=clients,
            soa_dir=soa_dir,
            job_config=job_config,
            bounce_margin_factor=bounce_margin_factor,
        )
    except bounce_lib.LockHeldException:
        logline = 'Failed to get lock to create marathon app for %s.%s' % (
            service, instance)
        log_deploy_error(logline, level='debug')
        return (0, "Couldn't get marathon lock, skipping until next time", None)
    except Exception:
        logline = 'Exception raised during deploy of service %s:\n%s' % (
            service, traceback.format_exc())
        log_deploy_error(logline, level='debug')
        raise

    # Ask to be run again in 60 seconds while tasks are still at risk or the
    # new app still has more instances than configured.
    if num_at_risk_tasks:
        bounce_again_in_seconds = 60
    elif new_app_running:
        if new_app.instances > config['instances']:
            bounce_again_in_seconds = 60
    return (0, 'Service deployed.', bounce_again_in_seconds)
def deploy_service(
    service,
    instance,
    marathon_jobid,
    config,
    client,
    marathon_apps,
    bounce_method,
    drain_method_name,
    drain_method_params,
    nerve_ns,
    bounce_health_params,
    soa_dir,
    bounce_margin_factor=1.0,
):
    """Deploy the service to marathon, either directly or via a bounce if needed.
    Called by setup_service when it's time to actually deploy.

    :param service: The name of the service to deploy
    :param instance: The instance of the service to deploy
    :param marathon_jobid: Full id of the marathon job
    :param config: The complete configuration dict to send to marathon
    :param client: A MarathonClient object
    :param marathon_apps: Apps that are searched for ones matching this
        service and instance
    :param bounce_method: The bounce method to use, if needed
    :param drain_method_name: The name of the traffic draining method to use.
    :param drain_method_params: Passed to drain_lib.get_drain_method.
    :param nerve_ns: The nerve namespace to look in.
    :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks.
    :param soa_dir: Passed through to do_bounce.
    :param bounce_margin_factor: the multiplication factor used to calculate
        the number of instances to be drained
    :returns: A tuple of (status, output, bounce_in_seconds) to be used with
        send_sensu_event. Status is 1 for an unrecognized drain or bounce
        method and for a ReadTimeout while fetching draining hosts (the latter
        with bounce_in_seconds=60); 0 otherwise, including when the marathon
        lock is held.
    :raises ValueError: if more than one existing app matches the new app id
    """

    def log_deploy_error(errormsg, level='event'):
        # Honour the requested level so that level='debug' call sites are not
        # emitted as events.
        return _log(
            service=service,
            line=errormsg,
            component='deploy',
            level=level,
            cluster=cluster,
            instance=instance
        )

    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    existing_apps = marathon_tools.get_matching_apps(service, instance, marathon_apps)
    # Split the matching apps into the app we want and everything else
    # (candidates for draining/killing).
    new_app_list = [a for a in existing_apps if a.id == '/%s' % config['id']]
    other_apps = [a for a in existing_apps if a.id != '/%s' % config['id']]
    serviceinstance = "%s.%s" % (service, instance)

    if new_app_list:
        new_app = new_app_list[0]
        if len(new_app_list) != 1:
            raise ValueError("Only expected one app per ID; found %d" % len(new_app_list))
        new_app_running = True
        happy_new_tasks = bounce_lib.get_happy_tasks(new_app, service, nerve_ns, system_paasta_config,
                                                     **bounce_health_params)
    else:
        new_app_running = False
        happy_new_tasks = []

    try:
        drain_method = drain_lib.get_drain_method(
            drain_method_name,
            service=service,
            instance=instance,
            nerve_ns=nerve_ns,
            drain_method_params=drain_method_params,
        )
    except KeyError:
        errormsg = 'ERROR: drain_method not recognized: %s. Must be one of (%s)' % \
            (drain_method_name, ', '.join(drain_lib.list_drain_methods()))
        log_deploy_error(errormsg)
        return (1, errormsg, None)

    try:
        draining_hosts = get_draining_hosts()
    except ReadTimeout as e:
        errormsg = "ReadTimeout encountered trying to get draining hosts: %s" % e
        return (1, errormsg, 60)

    (old_app_live_happy_tasks,
     old_app_live_unhappy_tasks,
     old_app_draining_tasks,
     old_app_at_risk_tasks,
     ) = get_tasks_by_state(
        other_apps=other_apps,
        drain_method=drain_method,
        service=service,
        nerve_ns=nerve_ns,
        bounce_health_params=bounce_health_params,
        system_paasta_config=system_paasta_config,
        log_deploy_error=log_deploy_error,
        draining_hosts=draining_hosts,
    )

    num_at_risk_tasks = 0
    if new_app_running:
        num_at_risk_tasks = get_num_at_risk_tasks(new_app, draining_hosts=draining_hosts)
        if new_app.instances < config['instances'] + num_at_risk_tasks:
            log.info("Scaling %s from %d to %d instances." %
                     (new_app.id, new_app.instances, config['instances'] + num_at_risk_tasks))
            client.scale_app(app_id=new_app.id, instances=config['instances'] + num_at_risk_tasks, force=True)
        # If we have more than the specified number of instances running, we will want to drain some of them.
        # We will start by draining any tasks running on at-risk hosts.
        elif new_app.instances > config['instances']:
            num_tasks_to_scale = max(min(len(new_app.tasks), new_app.instances) - config['instances'], 0)
            task_dict = get_tasks_by_state_for_app(
                app=new_app,
                drain_method=drain_method,
                service=service,
                nerve_ns=nerve_ns,
                bounce_health_params=bounce_health_params,
                system_paasta_config=system_paasta_config,
                log_deploy_error=log_deploy_error,
                draining_hosts=draining_hosts,
            )
            scaling_app_happy_tasks = list(task_dict['happy'])
            scaling_app_unhappy_tasks = list(task_dict['unhappy'])
            scaling_app_draining_tasks = list(task_dict['draining'])
            scaling_app_at_risk_tasks = list(task_dict['at_risk'])

            # Hand surplus tasks of the new app over to the "old" buckets in
            # this order: draining, unhappy, at-risk, then happy.
            tasks_to_move_draining = min(len(scaling_app_draining_tasks), num_tasks_to_scale)
            old_app_draining_tasks[new_app.id] = set(scaling_app_draining_tasks[:tasks_to_move_draining])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_draining

            tasks_to_move_unhappy = min(len(scaling_app_unhappy_tasks), num_tasks_to_scale)
            old_app_live_unhappy_tasks[new_app.id] = set(scaling_app_unhappy_tasks[:tasks_to_move_unhappy])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_unhappy

            tasks_to_move_at_risk = min(len(scaling_app_at_risk_tasks), num_tasks_to_scale)
            old_app_at_risk_tasks[new_app.id] = set(scaling_app_at_risk_tasks[:tasks_to_move_at_risk])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_at_risk

            tasks_to_move_happy = min(len(scaling_app_happy_tasks), num_tasks_to_scale)
            old_app_live_happy_tasks[new_app.id] = set(scaling_app_happy_tasks[:tasks_to_move_happy])
            happy_new_tasks = scaling_app_happy_tasks[tasks_to_move_happy:]

        # TODO: don't take actions in deploy_service.
        undrain_tasks(
            to_undrain=new_app.tasks,
            leave_draining=old_app_draining_tasks.get(new_app.id, []),
            drain_method=drain_method,
            log_deploy_error=log_deploy_error,
        )

    # log all uncaught exceptions and raise them again
    try:
        try:
            bounce_func = bounce_lib.get_bounce_method_func(bounce_method)
        except KeyError:
            errormsg = 'ERROR: bounce_method not recognized: %s. Must be one of (%s)' % \
                (bounce_method, ', '.join(bounce_lib.list_bounce_methods()))
            log_deploy_error(errormsg)
            return (1, errormsg, None)

        bounce_again_in_seconds = do_bounce(
            bounce_func=bounce_func,
            drain_method=drain_method,
            config=config,
            new_app_running=new_app_running,
            happy_new_tasks=happy_new_tasks,
            old_app_live_happy_tasks=old_app_live_happy_tasks,
            old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
            old_app_draining_tasks=old_app_draining_tasks,
            old_app_at_risk_tasks=old_app_at_risk_tasks,
            service=service,
            bounce_method=bounce_method,
            serviceinstance=serviceinstance,
            cluster=cluster,
            instance=instance,
            marathon_jobid=marathon_jobid,
            client=client,
            soa_dir=soa_dir,
            bounce_margin_factor=bounce_margin_factor,
        )
    except bounce_lib.LockHeldException:
        logline = 'Failed to get lock to create marathon app for %s.%s' % (service, instance)
        log_deploy_error(logline, level='debug')
        return (0, "Couldn't get marathon lock, skipping until next time", None)
    except Exception:
        logline = 'Exception raised during deploy of service %s:\n%s' % (service, traceback.format_exc())
        log_deploy_error(logline, level='debug')
        raise

    # Ask to be run again in 60 seconds while tasks are still on at-risk hosts.
    if num_at_risk_tasks:
        bounce_again_in_seconds = 60
    return (0, 'Service deployed.', bounce_again_in_seconds)
def reserve_all_resources_on_draining_hosts():
    """Reserve all resources on draining hosts.

    When ``get_draining_hosts()`` returns nothing, ``reserve_all_resources``
    is not called and a debug message is logged instead.
    """
    log.debug("Reserving all resources on draining hosts")
    draining_hosts = get_draining_hosts()
    # Skip the reserve call entirely when there is nothing to act on.
    if draining_hosts:
        reserve_all_resources(hostnames=draining_hosts)
    else:
        log.debug("No draining hosts")
def test_get_draining_hosts(
    mock_get_hosts_with_state,
):
    """Check the single call get_draining_hosts() makes on the injected mock.

    The mock must have been called exactly once, with
    ``state="draining_machines"`` and ``system_paasta_config=None``.
    """
    get_draining_hosts()

    wanted_call = mock.call(state="draining_machines", system_paasta_config=None)
    assert mock_get_hosts_with_state.call_count == 1
    assert mock_get_hosts_with_state.call_args == wanted_call