def test_flatten_tasks(self):
    """flatten_tasks should merge every per-app task set into one set."""
    tasks = [mock.Mock(task_id="id_%d" % index) for index in range(10)]
    tasks_by_app = {
        "app_id_1": set(tasks[:5]),
        "app_id_2": set(tasks[5:]),
    }

    flattened = bounce_lib.flatten_tasks(tasks_by_app)

    # Every task from both apps must be present, and nothing else.
    assert flattened == set(tasks)
def test_flatten_tasks(self):
    """Check that flatten_tasks returns the union of the per-app task sets."""
    mock_tasks = [mock.Mock(task_id='id_%d' % num) for num in range(10)]
    head, tail = mock_tasks[:5], mock_tasks[5:]
    want = set(mock_tasks)

    got = bounce_lib.flatten_tasks({'app_id_1': set(head), 'app_id_2': set(tail)})

    assert got == want
def do_bounce(
    bounce_func,
    drain_method,
    config,
    new_app_running,
    happy_new_tasks,
    old_app_live_happy_tasks,
    old_app_live_unhappy_tasks,
    old_app_draining_tasks,
    service,
    bounce_method,
    serviceinstance,
    cluster,
    instance,
    marathon_jobid,
    client,
    soa_dir,
):
    """Run one iteration of a bounce from the old marathon apps to the new one.

    Logs the bounce state (or sends a Sensu keepalive when nothing is in
    flight), asks ``bounce_func`` what to do, creates the new app if requested,
    drains old tasks, kills the tasks that are safe to kill and removes old
    apps that have no tasks left.

    :param bounce_func: callable invoked with the keyword arguments
        ``new_config``, ``new_app_running``, ``happy_new_tasks``,
        ``old_app_live_happy_tasks`` and ``old_app_live_unhappy_tasks``; it
        must return a mapping with ``'create_app'`` and ``'tasks_to_drain'``.
    :param drain_method: object providing ``drain(task)`` and
        ``is_safe_to_kill(task)``.
    :param config: new app configuration; ``config['instances']`` is read.
    :param new_app_running: truthy when the new app already exists.
    :param happy_new_tasks: sized collection of healthy tasks of the new app.
    :param old_app_live_happy_tasks: dict of app id -> set of tasks.
    :param old_app_live_unhappy_tasks: dict of app id -> set of tasks; indexed
        with every key of ``old_app_live_happy_tasks``.
    :param old_app_draining_tasks: dict of app id -> set of tasks; indexed
        with every key of ``old_app_live_happy_tasks``.
    :param service: service name, used for logging and the keepalive.
    :param bounce_method: bounce method name, used in log lines only.
    :param serviceinstance: label used in log lines only.
    :param cluster: cluster name, used for logging and the keepalive.
    :param instance: instance name, used for logging and the keepalive.
    :param marathon_jobid: id of the new app (compared as ``'/<id>'``).
    :param client: marathon client handed to the create/kill helpers.
    :param soa_dir: passed through to ``send_sensu_bounce_keepalive``.
    :returns: None
    """
    def log_bounce_action(line, level='debug'):
        return _log(service=service, line=line, component='deploy', level=level, cluster=cluster, instance=instance)

    # log if we're not in a steady state.
    if any([(not new_app_running), old_app_live_happy_tasks.keys()]):
        log_bounce_action(
            line=' '.join([
                '%s bounce in progress on %s.' % (bounce_method, serviceinstance),
                'New marathon app %s %s.' % (marathon_jobid, ('exists' if new_app_running else 'not created yet')),
                '%d new tasks to bring up.' % (config['instances'] - len(happy_new_tasks)),
                '%d old tasks receiving traffic and happy.' % len(bounce_lib.flatten_tasks(old_app_live_happy_tasks)),
                '%d old tasks unhappy.' % len(bounce_lib.flatten_tasks(old_app_live_unhappy_tasks)),
                '%d old tasks draining.' % len(bounce_lib.flatten_tasks(old_app_draining_tasks)),
                '%d old apps.' % len(old_app_live_happy_tasks.keys()),
            ]),
            level='event',
        )
    else:
        # In a steady state. Let's let Sensu know everything is fine.
        send_sensu_bounce_keepalive(
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
        )

    all_draining_tasks = set()
    actions = bounce_func(
        new_config=config,
        new_app_running=new_app_running,
        happy_new_tasks=happy_new_tasks,
        old_app_live_happy_tasks=old_app_live_happy_tasks,
        old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
    )

    if actions['create_app'] and not new_app_running:
        log_bounce_action(
            line='%s bounce creating new app with app_id %s' % (bounce_method, marathon_jobid),
        )
        bounce_lib.create_marathon_app(marathon_jobid, config, client)

    if len(actions['tasks_to_drain']) > 0:
        # Group by app id so that we emit one log line per app, not per task.
        tasks_to_drain_by_app_id = defaultdict(set)
        for task in actions['tasks_to_drain']:
            tasks_to_drain_by_app_id[task.app_id].add(task)
        for app_id, tasks in tasks_to_drain_by_app_id.items():
            log_bounce_action(
                line='%s bounce draining %d old tasks with app_id %s' % (bounce_method, len(tasks), app_id),
            )
        for task in actions['tasks_to_drain']:
            all_draining_tasks.add(task)
            drain_method.drain(task)

    # Tasks that were already draining before this run are kill candidates too.
    for app, tasks in old_app_draining_tasks.items():
        for task in tasks:
            all_draining_tasks.add(task)

    tasks_to_kill = set()
    for task in all_draining_tasks:
        if drain_method.is_safe_to_kill(task):
            tasks_to_kill.add(task)
            log_bounce_action(line='%s bounce killing drained task %s' % (bounce_method, task.id))

    kill_given_tasks(client=client, task_ids=[task.id for task in tasks_to_kill], scale=True)

    # An old app can be removed once every one of its tasks is being killed.
    apps_to_kill = []
    for app in old_app_live_happy_tasks.keys():
        if app != '/%s' % marathon_jobid:
            live_happy_tasks = old_app_live_happy_tasks[app]
            live_unhappy_tasks = old_app_live_unhappy_tasks[app]
            draining_tasks = old_app_draining_tasks[app]

            if 0 == len((live_happy_tasks | live_unhappy_tasks | draining_tasks) - tasks_to_kill):
                apps_to_kill.append(app)

    if apps_to_kill:
        log_bounce_action(
            line='%s bounce removing old unused apps with app_ids: %s' % (bounce_method, ', '.join(apps_to_kill)),
        )
        bounce_lib.kill_old_ids(apps_to_kill, client)

    all_old_tasks = set.union(set(), *old_app_live_happy_tasks.values())
    all_old_tasks = set.union(all_old_tasks, *old_app_live_unhappy_tasks.values())
    all_old_tasks = set.union(all_old_tasks, *old_app_draining_tasks.values())

    # log if we appear to be finished
    if all([
        (apps_to_kill or tasks_to_kill),
        # Compare list with list: on Python 3 a list never equals a dict
        # keys view, which would make this check permanently False.
        apps_to_kill == list(old_app_live_happy_tasks),
        tasks_to_kill == all_old_tasks,
    ]):
        log_bounce_action(
            line='%s bounce on %s finishing. Now running %s' % (bounce_method, serviceinstance, marathon_jobid),
            level='event',
        )
def do_bounce(
    bounce_func: bounce_lib.BounceMethod,
    drain_method: drain_lib.DrainMethod,
    config: marathon_tools.FormattedMarathonAppDict,
    new_app_running: bool,
    happy_new_tasks: List[Tuple[MarathonTask, MarathonClient]],
    old_app_live_happy_tasks: Dict[Tuple[str, MarathonClient], Set[MarathonTask]],
    old_app_live_unhappy_tasks: Dict[Tuple[str, MarathonClient], Set[MarathonTask]],
    old_app_draining_tasks: Dict[Tuple[str, MarathonClient], Set[MarathonTask]],
    old_app_at_risk_tasks: Dict[Tuple[str, MarathonClient], Set[MarathonTask]],
    service: str,
    bounce_method: str,
    serviceinstance: str,
    cluster: str,
    instance: str,
    marathon_jobid: str,
    clients: marathon_tools.MarathonClients,
    soa_dir: str,
    job_config: marathon_tools.MarathonServiceConfig,
    bounce_margin_factor: float = 1.0,
) -> Optional[float]:
    """Run one iteration of a bounce from the old marathon apps to the new one.

    Logs the bounce state, asks ``bounce_func`` what to do, creates the new
    app if requested, drains/kills old tasks (grouped per marathon client),
    reserves resources on the hosts of killed at-risk tasks, and removes old
    apps that have no tasks left.

    :param bounce_func: called with ``new_config``, ``new_app_running``,
        ``happy_new_tasks``, ``old_non_draining_tasks`` and ``margin_factor``;
        its result is indexed with ``'create_app'`` and ``'tasks_to_drain'``.
    :param drain_method: forwarded to ``drain_tasks_and_find_tasks_to_kill``.
    :param config: new app configuration; ``config['instances']`` is read.
    :param new_app_running: whether the new app already exists.
    :param happy_new_tasks: healthy (task, client) pairs of the new app.
    :param old_app_live_happy_tasks: (app id, client) -> tasks.
    :param old_app_live_unhappy_tasks: (app id, client) -> tasks; indexed with
        every key of ``old_app_live_happy_tasks``.
    :param old_app_draining_tasks: (app id, client) -> tasks; same key
        requirement as above.
    :param old_app_at_risk_tasks: (app id, client) -> tasks; same key
        requirement as above.
    :param service: service name, used for logging.
    :param bounce_method: bounce method name, used in log lines and forwarded
        to ``drain_tasks_and_find_tasks_to_kill``.
    :param serviceinstance: label used in log lines only.
    :param cluster: cluster name, used for logging.
    :param instance: instance name, used for logging.
    :param marathon_jobid: id of the new app (compared as ``'/<id>'``).
    :param clients: used to look up the client that hosts the new app.
    :param soa_dir: not used by this function body.
    :param job_config: passed to ``clients.get_current_client_for_service``.
    :param bounce_margin_factor: forwarded to ``bounce_func`` as
        ``margin_factor``.
    :returns: ``60`` when there is still work to do (per the inline comment,
        the caller should try again in 60 seconds), including when creating
        the app failed with HTTP 409; ``None`` when no old tasks remain and
        the new app is running.
    :raises MarathonHttpError: re-raised when app creation fails with any
        status code other than 409.
    """
    def log_bounce_action(line: str, level: str = 'debug') -> None:
        return _log(
            service=service,
            line=line,
            component='deploy',
            level=level,
            cluster=cluster,
            instance=instance,
        )

    # log if we're not in a steady state.
    if any([
        (not new_app_running),
        old_app_live_happy_tasks.keys(),
    ]):
        log_bounce_action(
            line=' '.join([
                '%s bounce in progress on %s.' % (bounce_method, serviceinstance),
                'New marathon app %s %s.' % (marathon_jobid, ('exists' if new_app_running else 'not created yet')),
                '%d new tasks to bring up.' % (config['instances'] - len(happy_new_tasks)),
                '%d old tasks receiving traffic and happy.' % len(bounce_lib.flatten_tasks(old_app_live_happy_tasks)),
                '%d old tasks unhappy.' % len(bounce_lib.flatten_tasks(old_app_live_unhappy_tasks)),
                '%d old tasks draining.' % len(bounce_lib.flatten_tasks(old_app_draining_tasks)),
                '%d old tasks at risk.' % len(bounce_lib.flatten_tasks(old_app_at_risk_tasks)),
                '%d old apps.' % len(old_app_live_happy_tasks.keys()),
            ]),
            level='event',
        )
    else:
        log.debug("Nothing to do, bounce is in a steady state")

    new_client = clients.get_current_client_for_service(job_config)

    old_non_draining_tasks = list(
        old_app_tasks_to_task_client_pairs(old_app_live_happy_tasks),
    ) + list(
        old_app_tasks_to_task_client_pairs(old_app_live_unhappy_tasks),
    ) + list(
        old_app_tasks_to_task_client_pairs(old_app_at_risk_tasks),
    )

    actions = bounce_func(
        new_config=config,
        new_app_running=new_app_running,
        happy_new_tasks=happy_new_tasks,
        old_non_draining_tasks=old_non_draining_tasks,
        margin_factor=bounce_margin_factor,
    )

    if actions['create_app'] and not new_app_running:
        log_bounce_action(
            line='%s bounce creating new app with app_id %s' % (bounce_method, marathon_jobid),
        )
        with requests_cache.disabled():
            try:
                bounce_lib.create_marathon_app(
                    app_id=marathon_jobid,
                    config=config,
                    client=new_client,
                )
            except MarathonHttpError as e:
                if e.status_code == 409:
                    log.warning(
                        "Failed to create, app %s already exists. This means another bounce beat us to it."
                        " Skipping the rest of the bounce for this run" % marathon_jobid,
                    )
                    return 60
                raise

    tasks_to_kill = drain_tasks_and_find_tasks_to_kill(
        tasks_to_drain=actions['tasks_to_drain'],
        already_draining_tasks=old_app_tasks_to_task_client_pairs(old_app_draining_tasks),
        drain_method=drain_method,
        log_bounce_action=log_bounce_action,
        bounce_method=bounce_method,
        at_risk_tasks=old_app_tasks_to_task_client_pairs(old_app_at_risk_tasks),
    )

    # tasks_to_kill holds (task, client) pairs. Keep the bare tasks as well,
    # because the old_app_* dicts and flatten_tasks deal in bare tasks.
    killed_tasks: Set[MarathonTask] = set()
    tasks_to_kill_by_client: Dict[MarathonClient, List[MarathonTask]] = defaultdict(list)
    for task, client in tasks_to_kill:
        tasks_to_kill_by_client[client].append(task)
        killed_tasks.add(task)

    for client, tasks in tasks_to_kill_by_client.items():
        kill_given_tasks(client=client, task_ids=[task.id for task in tasks], scale=True)

    for task in bounce_lib.flatten_tasks(old_app_at_risk_tasks):
        # Membership must be tested against bare tasks: a bare task is never
        # "in" a collection of (task, client) pairs, so testing against
        # tasks_to_kill would silently skip the reservation every time.
        if task in killed_tasks:
            hostname = task.host
            try:
                reserve_all_resources([hostname])
            except HTTPError:
                log.warning("Failed to reserve resources on %s" % hostname)

    # An old app can be removed once every one of its tasks is being killed.
    # The new app itself (same id on the new client) is never removed.
    apps_to_kill: List[Tuple[str, MarathonClient]] = []
    for app, client in old_app_live_happy_tasks.keys():
        if app != '/%s' % marathon_jobid or client != new_client:
            live_happy_tasks = old_app_live_happy_tasks[(app, client)]
            live_unhappy_tasks = old_app_live_unhappy_tasks[(app, client)]
            draining_tasks = old_app_draining_tasks[(app, client)]
            at_risk_tasks = old_app_at_risk_tasks[(app, client)]

            remaining_tasks = (
                live_happy_tasks | live_unhappy_tasks | draining_tasks | at_risk_tasks
            ) - killed_tasks

            if 0 == len(remaining_tasks):
                apps_to_kill.append((app, client))

    if apps_to_kill:
        log_bounce_action(
            line='%s bounce removing old unused apps with app_ids: %s' % (
                bounce_method,
                ', '.join([app for app, client in apps_to_kill]),
            ),
        )
        with requests_cache.disabled():
            for app_id, client in apps_to_kill:
                bounce_lib.kill_old_ids([app_id], client)

    all_old_tasks: Set[MarathonTask] = set()
    all_old_tasks = set.union(all_old_tasks, *old_app_live_happy_tasks.values())
    all_old_tasks = set.union(all_old_tasks, *old_app_live_unhappy_tasks.values())
    all_old_tasks = set.union(all_old_tasks, *old_app_draining_tasks.values())
    all_old_tasks = set.union(all_old_tasks, *old_app_at_risk_tasks.values())

    if all_old_tasks or (not new_app_running):
        # Still have more work to do, try again in 60 seconds
        return 60
    else:
        # log if we appear to be finished
        if all([
            (apps_to_kill or tasks_to_kill),
            apps_to_kill == list(old_app_live_happy_tasks),
            # all_old_tasks is empty on this branch, so this is equivalent to
            # the previous pair-set comparison while comparing like with like.
            killed_tasks == all_old_tasks,
        ]):
            log_bounce_action(
                line='%s bounce on %s finishing. Now running %s' % (
                    bounce_method,
                    serviceinstance,
                    marathon_jobid,
                ),
                level='event',
            )
        return None
def do_bounce(
    bounce_func,
    drain_method,
    config,
    new_app_running,
    happy_new_tasks,
    old_app_live_happy_tasks,
    old_app_live_unhappy_tasks,
    old_app_draining_tasks,
    service,
    bounce_method,
    serviceinstance,
    cluster,
    instance,
    marathon_jobid,
    client,
    soa_dir,
):
    """Perform a single bounce step from the old marathon apps to the new one.

    Logs the bounce state (or sends a Sensu keepalive when nothing is in
    flight), asks ``bounce_func`` for the actions to take, creates the new app
    if requested, drains old tasks, kills those that are safe to kill through
    ``client.kill_given_tasks`` and removes old apps without remaining tasks.

    :param bounce_func: callable invoked with the keyword arguments
        ``new_config``, ``new_app_running``, ``happy_new_tasks``,
        ``old_app_live_happy_tasks`` and ``old_app_live_unhappy_tasks``; it
        must return a mapping with ``'create_app'`` and ``'tasks_to_drain'``.
    :param drain_method: object providing ``drain(task)`` and
        ``is_safe_to_kill(task)``.
    :param config: new app configuration; ``config['instances']`` is read.
    :param new_app_running: truthy when the new app already exists.
    :param happy_new_tasks: sized collection of healthy tasks of the new app.
    :param old_app_live_happy_tasks: dict of app id -> set of tasks.
    :param old_app_live_unhappy_tasks: dict of app id -> set of tasks; indexed
        with every key of ``old_app_live_happy_tasks``.
    :param old_app_draining_tasks: dict of app id -> set of tasks; indexed
        with every key of ``old_app_live_happy_tasks``.
    :param service: service name, used for logging and the keepalive.
    :param bounce_method: bounce method name, used in log lines only.
    :param serviceinstance: label used in log lines only.
    :param cluster: cluster name, used for logging and the keepalive.
    :param instance: instance name, used for logging and the keepalive.
    :param marathon_jobid: id of the new app (compared as ``'/<id>'``).
    :param client: marathon client; must provide ``kill_given_tasks`` and is
        handed to the ``bounce_lib`` create/kill helpers.
    :param soa_dir: passed through to ``send_sensu_bounce_keepalive``.
    :returns: None
    """
    def log_bounce_action(line, level='debug'):
        return _log(
            service=service,
            line=line,
            component='deploy',
            level=level,
            cluster=cluster,
            instance=instance
        )

    # log if we're not in a steady state.
    if any([
        (not new_app_running),
        old_app_live_happy_tasks.keys()
    ]):
        log_bounce_action(
            line=' '.join([
                '%s bounce in progress on %s.' % (bounce_method, serviceinstance),
                'New marathon app %s %s.' % (marathon_jobid, ('exists' if new_app_running else 'not created yet')),
                '%d new tasks to bring up.' % (config['instances'] - len(happy_new_tasks)),
                '%d old tasks receiving traffic and happy.' % len(bounce_lib.flatten_tasks(old_app_live_happy_tasks)),
                '%d old tasks unhappy.' % len(bounce_lib.flatten_tasks(old_app_live_unhappy_tasks)),
                '%d old tasks draining.' % len(bounce_lib.flatten_tasks(old_app_draining_tasks)),
                '%d old apps.' % len(old_app_live_happy_tasks.keys()),
            ]),
            level='event',
        )
    else:
        # In a steady state. Let's let Sensu know everything is fine.
        send_sensu_bounce_keepalive(
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
        )

    all_draining_tasks = set()
    actions = bounce_func(
        new_config=config,
        new_app_running=new_app_running,
        happy_new_tasks=happy_new_tasks,
        old_app_live_happy_tasks=old_app_live_happy_tasks,
        old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
    )

    if actions['create_app'] and not new_app_running:
        log_bounce_action(
            line='%s bounce creating new app with app_id %s' % (bounce_method, marathon_jobid),
        )
        bounce_lib.create_marathon_app(marathon_jobid, config, client)

    if len(actions['tasks_to_drain']) > 0:
        # Group by app id so that we emit one log line per app, not per task.
        tasks_to_drain_by_app_id = defaultdict(set)
        for task in actions['tasks_to_drain']:
            tasks_to_drain_by_app_id[task.app_id].add(task)
        for app_id, tasks in tasks_to_drain_by_app_id.items():
            log_bounce_action(
                line='%s bounce draining %d old tasks with app_id %s' % (bounce_method, len(tasks), app_id),
            )
        for task in actions['tasks_to_drain']:
            all_draining_tasks.add(task)
            drain_method.drain(task)

    # Tasks that were already draining before this run are kill candidates too.
    for app, tasks in old_app_draining_tasks.items():
        for task in tasks:
            all_draining_tasks.add(task)

    tasks_to_kill = set()
    for task in all_draining_tasks:
        if drain_method.is_safe_to_kill(task):
            tasks_to_kill.add(task)
            log_bounce_action(line='%s bounce killing drained task %s' % (bounce_method, task.id))

    client.kill_given_tasks(task_ids=[task.id for task in tasks_to_kill], scale=True)

    # An old app can be removed once every one of its tasks is being killed.
    apps_to_kill = []
    for app in old_app_live_happy_tasks.keys():
        if app != '/%s' % marathon_jobid:
            live_happy_tasks = old_app_live_happy_tasks[app]
            live_unhappy_tasks = old_app_live_unhappy_tasks[app]
            draining_tasks = old_app_draining_tasks[app]

            if 0 == len((live_happy_tasks | live_unhappy_tasks | draining_tasks) - tasks_to_kill):
                apps_to_kill.append(app)

    if apps_to_kill:
        log_bounce_action(
            line='%s bounce removing old unused apps with app_ids: %s' % (
                bounce_method,
                ', '.join(apps_to_kill)
            ),
        )
        bounce_lib.kill_old_ids(apps_to_kill, client)

    all_old_tasks = set.union(set(), *old_app_live_happy_tasks.values())
    all_old_tasks = set.union(all_old_tasks, *old_app_live_unhappy_tasks.values())
    all_old_tasks = set.union(all_old_tasks, *old_app_draining_tasks.values())

    # log if we appear to be finished
    if all([
        (apps_to_kill or tasks_to_kill),
        # Compare list with list: on Python 3 a list never equals a dict
        # keys view, which would make this check permanently False.
        apps_to_kill == list(old_app_live_happy_tasks),
        tasks_to_kill == all_old_tasks,
    ]):
        log_bounce_action(
            line='%s bounce on %s finishing. Now running %s' % (
                bounce_method,
                serviceinstance,
                marathon_jobid
            ),
            level='event',
        )
def do_bounce(
    bounce_func,
    drain_method,
    config,
    new_app_running,
    happy_new_tasks,
    old_app_live_happy_tasks,
    old_app_live_unhappy_tasks,
    old_app_draining_tasks,
    old_app_at_risk_tasks,
    service,
    bounce_method,
    serviceinstance,
    cluster,
    instance,
    marathon_jobid,
    client,
    soa_dir,
    bounce_margin_factor=1.0,
):
    """Advance the bounce of a service instance by one step.

    The step consists of: logging the current state, asking ``bounce_func``
    which actions to take, creating the new app when asked to, draining and
    killing old tasks, reserving resources on hosts of killed at-risk tasks,
    and deleting old apps that have no tasks left. Returns None in all cases;
    when app creation fails with HTTP 409 the rest of the step is skipped.
    """
    def log_bounce_action(line, level='debug'):
        return _log(
            service=service,
            line=line,
            component='deploy',
            level=level,
            cluster=cluster,
            instance=instance,
        )

    # Anything other than "new app up, no old apps" counts as bouncing.
    if (not new_app_running) or old_app_live_happy_tasks.keys():
        new_app_state = 'exists' if new_app_running else 'not created yet'
        status_fragments = [
            '%s bounce in progress on %s.' % (bounce_method, serviceinstance),
            'New marathon app %s %s.' % (marathon_jobid, new_app_state),
            '%d new tasks to bring up.' % (config['instances'] - len(happy_new_tasks)),
            '%d old tasks receiving traffic and happy.' % len(bounce_lib.flatten_tasks(old_app_live_happy_tasks)),
            '%d old tasks unhappy.' % len(bounce_lib.flatten_tasks(old_app_live_unhappy_tasks)),
            '%d old tasks draining.' % len(bounce_lib.flatten_tasks(old_app_draining_tasks)),
            '%d old tasks at risk.' % len(bounce_lib.flatten_tasks(old_app_at_risk_tasks)),
            '%d old apps.' % len(old_app_live_happy_tasks.keys()),
        ]
        log_bounce_action(line=' '.join(status_fragments), level='event')
    else:
        log.debug("Nothing to do, bounce is in a steady state")

    actions = bounce_func(
        new_config=config,
        new_app_running=new_app_running,
        happy_new_tasks=happy_new_tasks,
        old_app_live_happy_tasks=old_app_live_happy_tasks,
        old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
        margin_factor=bounce_margin_factor,
    )

    if actions['create_app'] and not new_app_running:
        creating_msg = '%s bounce creating new app with app_id %s' % (bounce_method, marathon_jobid)
        log_bounce_action(line=creating_msg)
        with requests_cache.disabled():
            try:
                bounce_lib.create_marathon_app(marathon_jobid, config, client)
            except MarathonHttpError as e:
                if e.status_code != 409:
                    raise
                log.warning(
                    "Failed to create, app %s already exists. This means another bounce beat us to it."
                    " Skipping the rest of the bounce for this run" % marathon_jobid
                )
                return

    already_draining = bounce_lib.flatten_tasks(old_app_draining_tasks)
    at_risk = bounce_lib.flatten_tasks(old_app_at_risk_tasks)
    tasks_to_kill = drain_tasks_and_find_tasks_to_kill(
        tasks_to_drain=actions['tasks_to_drain'],
        already_draining_tasks=already_draining,
        drain_method=drain_method,
        log_bounce_action=log_bounce_action,
        bounce_method=bounce_method,
        at_risk_tasks=at_risk,
    )

    doomed_task_ids = [task.id for task in tasks_to_kill]
    kill_given_tasks(client=client, task_ids=doomed_task_ids, scale=True)

    # Reserve the hosts of at-risk tasks we just killed; failures are only logged.
    for task in bounce_lib.flatten_tasks(old_app_at_risk_tasks):
        if task not in tasks_to_kill:
            continue
        hostname = task.host
        try:
            reserve_all_resources([hostname])
        except HTTPError:
            log.warning("Failed to reserve resources on %s" % hostname)

    # Old apps whose every task is being killed can be removed outright.
    new_app_id = '/%s' % marathon_jobid
    apps_to_kill = []
    for app in old_app_live_happy_tasks:
        if app == new_app_id:
            continue
        app_tasks = (
            old_app_live_happy_tasks[app]
            | old_app_live_unhappy_tasks[app]
            | old_app_draining_tasks[app]
            | old_app_at_risk_tasks[app]
        )
        survivors = app_tasks - tasks_to_kill
        if not survivors:
            apps_to_kill.append(app)

    if apps_to_kill:
        removing_msg = '%s bounce removing old unused apps with app_ids: %s' % (
            bounce_method, ', '.join(apps_to_kill),
        )
        log_bounce_action(line=removing_msg)
        with requests_cache.disabled():
            bounce_lib.kill_old_ids(apps_to_kill, client)

    all_old_tasks = set()
    for tasks_by_app in (
        old_app_live_happy_tasks,
        old_app_live_unhappy_tasks,
        old_app_draining_tasks,
        old_app_at_risk_tasks,
    ):
        all_old_tasks.update(*tasks_by_app.values())

    # log if we appear to be finished
    did_something = bool(apps_to_kill or tasks_to_kill)
    killed_every_app = apps_to_kill == list(old_app_live_happy_tasks)
    killed_every_task = tasks_to_kill == all_old_tasks
    if did_something and killed_every_app and killed_every_task:
        log_bounce_action(
            line='%s bounce on %s finishing. Now running %s' % (bounce_method, serviceinstance, marathon_jobid),
            level='event',
        )
def do_bounce(
    bounce_func,
    drain_method,
    config,
    new_app_running,
    happy_new_tasks,
    old_app_live_happy_tasks,
    old_app_live_unhappy_tasks,
    old_app_draining_tasks,
    old_app_at_risk_tasks,
    service,
    bounce_method,
    serviceinstance,
    cluster,
    instance,
    marathon_jobid,
    client,
    soa_dir,
    bounce_margin_factor=1.0,
):
    """Run one iteration of a bounce from the old marathon apps to the new one.

    Logs the bounce state, asks ``bounce_func`` what to do, creates the new
    app if requested, drains/kills old tasks, reserves resources on the hosts
    of killed at-risk tasks, and removes old apps that have no tasks left.

    :param bounce_func: callable invoked with the keyword arguments
        ``new_config``, ``new_app_running``, ``happy_new_tasks``,
        ``old_app_live_happy_tasks``, ``old_app_live_unhappy_tasks`` and
        ``margin_factor``; it must return a mapping with ``'create_app'`` and
        ``'tasks_to_drain'``.
    :param drain_method: forwarded to ``drain_tasks_and_find_tasks_to_kill``.
    :param config: new app configuration; ``config['instances']`` is read.
    :param new_app_running: truthy when the new app already exists.
    :param happy_new_tasks: sized collection of healthy tasks of the new app.
    :param old_app_live_happy_tasks: dict of app id -> set of tasks.
    :param old_app_live_unhappy_tasks: dict of app id -> set of tasks; indexed
        with every key of ``old_app_live_happy_tasks``.
    :param old_app_draining_tasks: dict of app id -> set of tasks; same key
        requirement as above.
    :param old_app_at_risk_tasks: dict of app id -> set of tasks; same key
        requirement as above.
    :param service: service name, used for logging.
    :param bounce_method: bounce method name, used in log lines and forwarded
        to ``drain_tasks_and_find_tasks_to_kill``.
    :param serviceinstance: label used in log lines only.
    :param cluster: cluster name, used for logging.
    :param instance: instance name, used for logging.
    :param marathon_jobid: id of the new app (compared as ``'/<id>'``).
    :param client: marathon client handed to the create/kill helpers.
    :param soa_dir: not used by this function body.
    :param bounce_margin_factor: forwarded to ``bounce_func`` as
        ``margin_factor``.
    :returns: None
    """
    def log_bounce_action(line, level='debug'):
        return _log(
            service=service,
            line=line,
            component='deploy',
            level=level,
            cluster=cluster,
            instance=instance
        )

    # log if we're not in a steady state.
    if any([
        (not new_app_running),
        old_app_live_happy_tasks.keys()
    ]):
        log_bounce_action(
            line=' '.join([
                '%s bounce in progress on %s.' % (bounce_method, serviceinstance),
                'New marathon app %s %s.' % (marathon_jobid, ('exists' if new_app_running else 'not created yet')),
                '%d new tasks to bring up.' % (config['instances'] - len(happy_new_tasks)),
                '%d old tasks receiving traffic and happy.' % len(bounce_lib.flatten_tasks(old_app_live_happy_tasks)),
                '%d old tasks unhappy.' % len(bounce_lib.flatten_tasks(old_app_live_unhappy_tasks)),
                '%d old tasks draining.' % len(bounce_lib.flatten_tasks(old_app_draining_tasks)),
                '%d old tasks at risk.' % len(bounce_lib.flatten_tasks(old_app_at_risk_tasks)),
                '%d old apps.' % len(old_app_live_happy_tasks.keys()),
            ]),
            level='event',
        )
    else:
        log.debug("Nothing to do, bounce is in a steady state")

    actions = bounce_func(
        new_config=config,
        new_app_running=new_app_running,
        happy_new_tasks=happy_new_tasks,
        old_app_live_happy_tasks=old_app_live_happy_tasks,
        old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
        margin_factor=bounce_margin_factor,
    )

    if actions['create_app'] and not new_app_running:
        log_bounce_action(
            line='%s bounce creating new app with app_id %s' % (bounce_method, marathon_jobid),
        )
        with requests_cache.disabled():
            bounce_lib.create_marathon_app(marathon_jobid, config, client)

    tasks_to_kill = drain_tasks_and_find_tasks_to_kill(
        tasks_to_drain=actions['tasks_to_drain'],
        already_draining_tasks=bounce_lib.flatten_tasks(old_app_draining_tasks),
        drain_method=drain_method,
        log_bounce_action=log_bounce_action,
        bounce_method=bounce_method,
        at_risk_tasks=bounce_lib.flatten_tasks(old_app_at_risk_tasks),
    )

    kill_given_tasks(client=client, task_ids=[task.id for task in tasks_to_kill], scale=True)

    # Reserve the hosts of at-risk tasks that were just killed.
    for task in bounce_lib.flatten_tasks(old_app_at_risk_tasks):
        if task in tasks_to_kill:
            hostname = task.host
            reserve_all_resources([hostname])

    # An old app can be removed once every one of its tasks is being killed.
    apps_to_kill = []
    for app in old_app_live_happy_tasks.keys():
        if app != '/%s' % marathon_jobid:
            live_happy_tasks = old_app_live_happy_tasks[app]
            live_unhappy_tasks = old_app_live_unhappy_tasks[app]
            draining_tasks = old_app_draining_tasks[app]
            at_risk_tasks = old_app_at_risk_tasks[app]

            if 0 == len((live_happy_tasks | live_unhappy_tasks | draining_tasks | at_risk_tasks) - tasks_to_kill):
                apps_to_kill.append(app)

    if apps_to_kill:
        log_bounce_action(
            line='%s bounce removing old unused apps with app_ids: %s' % (
                bounce_method,
                ', '.join(apps_to_kill)
            ),
        )
        with requests_cache.disabled():
            bounce_lib.kill_old_ids(apps_to_kill, client)

    all_old_tasks = set.union(set(), *old_app_live_happy_tasks.values())
    all_old_tasks = set.union(all_old_tasks, *old_app_live_unhappy_tasks.values())
    all_old_tasks = set.union(all_old_tasks, *old_app_draining_tasks.values())
    all_old_tasks = set.union(all_old_tasks, *old_app_at_risk_tasks.values())

    # log if we appear to be finished
    if all([
        (apps_to_kill or tasks_to_kill),
        # Compare list with list: on Python 3 a list never equals a dict
        # keys view, which would make this check permanently False.
        apps_to_kill == list(old_app_live_happy_tasks),
        tasks_to_kill == all_old_tasks,
    ]):
        log_bounce_action(
            line='%s bounce on %s finishing. Now running %s' % (
                bounce_method,
                serviceinstance,
                marathon_jobid
            ),
            level='event',
        )