def test_agent_failure(dcos_launchpad, cluster, vip_apps):
    """Terminate every running agent and check that the cluster recovers.

    The test proceeds in four stages:

    1. Confirm both VIP apps respond before anything is disrupted.
    2. Terminate all running instances in the ``PublicSlaveServerGroup`` and
       ``SlaveServerGroup`` groups and wait for EC2 to report them terminated.
    3. Schedule the old agents for maintenance and mark them down in Mesos so
       that their work is rescheduled.
    4. Wait until each group is back to its previous size, point ``cluster``
       at the replacement agents, then verify ``cluster.wait_for_dcos()`` and
       both VIP apps succeed again.

    Args:
        dcos_launchpad: fixture providing ``get_auto_scaling_instances`` and
            ``boto_wrapper`` for the stack under test.
        cluster: fixture used to talk to the cluster. Its ``slaves``,
            ``public_slaves`` and ``all_slaves`` attributes are overwritten
            with the sorted private IPs of the replacement agents.
        vip_apps: sequence whose first two entries are indexed with ``[1]``
            to obtain the argument handed to ``wait_for_pong``.
    """
    @retry_boto_rate_limits
    def get_running_agents(group_name):
        # Reading ``state`` goes through the AWS resource object, so the
        # whole listing is retried when the rate limit is hit.
        return [
            i for i in dcos_launchpad.get_auto_scaling_instances(group_name)
            if i.state['Name'] == 'running'
        ]

    # make sure the app works before starting
    test_util.helpers.wait_for_pong(vip_apps[0][1], 120)
    test_util.helpers.wait_for_pong(vip_apps[1][1], 10)

    agents = [
        i.instance_id
        for i in (get_running_agents('PublicSlaveServerGroup') +
                  get_running_agents('SlaveServerGroup'))
    ]
    # Fail fast with a clear message instead of sending EC2 an empty ID list.
    assert agents, 'No running agents found to terminate'

    # Agents are in auto-scaling groups, so they will automatically be replaced.
    # The terminate call is rate-limit wrapped like every other AWS call here.
    ec2 = dcos_launchpad.boto_wrapper.client('ec2')
    retry_boto_rate_limits(ec2.terminate_instances)(InstanceIds=agents)
    waiter = ec2.get_waiter('instance_terminated')
    retry_boto_rate_limits(waiter.wait)(InstanceIds=agents)

    # Tell mesos the machines are "down" and not coming up so things get rescheduled.
    down_hosts = [{'hostname': slave, 'ip': slave} for slave in cluster.all_slaves]
    cluster.post(
        '/mesos/maintenance/schedule',
        json={'windows': [{
            'machine_ids': down_hosts,
            'unavailability': {'start': {'nanoseconds': 0}}
        }]}).raise_for_status()
    cluster.post('/mesos/machine/down', json=down_hosts).raise_for_status()

    # Wait for replacements: each group must return to its pre-failure size.
    test_util.helpers.wait_for_len(
        partial(get_running_agents, 'SlaveServerGroup'), len(cluster.slaves), 600)
    test_util.helpers.wait_for_len(
        partial(get_running_agents, 'PublicSlaveServerGroup'), len(cluster.public_slaves), 600)

    # Reset the cluster to have the replacement agents
    cluster.slaves = sorted(
        [agent.private_ip_address for agent in get_running_agents('SlaveServerGroup')])
    cluster.public_slaves = sorted(
        [agent.private_ip_address for agent in get_running_agents('PublicSlaveServerGroup')])
    cluster.all_slaves = sorted(cluster.slaves + cluster.public_slaves)

    # verify that everything else is still working
    cluster.wait_for_dcos()

    # finally verify that the app is again running somewhere with its VIPs
    # Give marathon five minutes to deploy both the apps
    test_util.helpers.wait_for_pong(vip_apps[0][1], 300)
    test_util.helpers.wait_for_pong(vip_apps[1][1], 10)
def test_agent_failure(dcos_launchpad, cluster, vip_apps):
    """Kill all agents, let the auto-scaling groups replace them, verify recovery.

    Flow: check the VIP apps respond, terminate every running public and
    private agent instance, mark the old agents down in Mesos, wait for the
    groups to return to their previous sizes, repoint ``cluster`` at the new
    agents, and finally check that the cluster and both VIP apps are healthy.

    Args:
        dcos_launchpad: fixture providing ``get_auto_scaling_instances`` and
            ``boto_wrapper`` for the stack under test.
        cluster: fixture used to talk to the cluster; ``slaves``,
            ``public_slaves`` and ``all_slaves`` are replaced with the sorted
            private IPs of the new agents.
        vip_apps: sequence whose first two entries are indexed with ``[1]``
            to obtain the argument handed to ``wait_for_pong``.
    """
    private_group = 'SlaveServerGroup'
    public_group = 'PublicSlaveServerGroup'

    @retry_boto_rate_limits
    def get_running_agents(group_name):
        # ``state`` is read from the AWS resource object, so the listing as a
        # whole is what gets retried on a rate-limit error.
        instances = dcos_launchpad.get_auto_scaling_instances(group_name)
        return [i for i in instances if i.state['Name'] == 'running']

    def running_private_ips(group_name):
        # Sorted private IPs of the agents currently running in the group.
        return sorted(
            agent.private_ip_address for agent in get_running_agents(group_name))

    # make sure the app works before starting
    test_util.helpers.wait_for_pong(vip_apps[0][1], 120)
    test_util.helpers.wait_for_pong(vip_apps[1][1], 10)

    doomed = get_running_agents(public_group) + get_running_agents(private_group)
    agents = [i.instance_id for i in doomed]
    # An empty ID list would make the EC2 calls below meaningless; stop here
    # with an explicit message instead.
    assert agents, 'No running agents found to terminate'

    # Agents are in autoscaling groups, so they will automatically be replaced.
    # Termination goes through the same rate-limit retry as the other AWS calls.
    ec2 = dcos_launchpad.boto_wrapper.client('ec2')
    retry_boto_rate_limits(ec2.terminate_instances)(InstanceIds=agents)
    waiter = ec2.get_waiter('instance_terminated')
    retry_boto_rate_limits(waiter.wait)(InstanceIds=agents)

    # Tell mesos the machines are "down" and not coming up so things get rescheduled.
    down_hosts = [{'hostname': slave, 'ip': slave} for slave in cluster.all_slaves]
    maintenance_window = {
        'machine_ids': down_hosts,
        'unavailability': {'start': {'nanoseconds': 0}},
    }
    cluster.post(
        '/mesos/maintenance/schedule',
        json={'windows': [maintenance_window]}).raise_for_status()
    cluster.post('/mesos/machine/down', json=down_hosts).raise_for_status()

    # Wait for replacements
    test_util.helpers.wait_for_len(
        partial(get_running_agents, private_group), len(cluster.slaves), 600)
    test_util.helpers.wait_for_len(
        partial(get_running_agents, public_group), len(cluster.public_slaves), 600)

    # Reset the cluster to have the replacement agents
    cluster.slaves = running_private_ips(private_group)
    cluster.public_slaves = running_private_ips(public_group)
    cluster.all_slaves = sorted(cluster.slaves + cluster.public_slaves)

    # verify that everything else is still working
    cluster.wait_for_dcos()

    # finally verify that the app is again running somewhere with its VIPs
    # Give marathon five minutes to deploy both the apps
    test_util.helpers.wait_for_pong(vip_apps[0][1], 300)
    test_util.helpers.wait_for_pong(vip_apps[1][1], 10)
def test_agent_failure(dcos_stack, boto_wrapper, dcos_api_session, vip_apps):
    """Terminate every running agent and check that the cluster recovers.

    The test confirms both VIP apps respond, terminates all running public
    and private agent instances, marks the old agents down in Mesos, polls
    until both agent groups are back to their previous sizes, repoints
    ``dcos_api_session`` at the replacements, and finally checks that the
    cluster and both VIP apps are healthy again.

    Args:
        dcos_stack: fixture exposing ``public_agent_instances`` and
            ``private_agent_instances`` for the stack under test.
        boto_wrapper: fixture whose ``client('ec2')`` is used to terminate
            instances and wait for termination.
        dcos_api_session: fixture used to talk to the cluster; ``slaves``,
            ``public_slaves`` and ``all_slaves`` are replaced with the sorted
            private IPs of the new agents.
        vip_apps: sequence whose first two entries are indexed with ``[1]``
            to obtain the argument handed to ``wait_for_pong``.
    """
    # Accessing AWS Resource objects will trigger a client describe call.
    # As such, any method that touches AWS APIs must be wrapped to avoid
    # CI collapse when rate limits are inevitably reached
    @retry_boto_rate_limits
    def get_running_instances(instance_iter):
        return [i for i in instance_iter if i.state['Name'] == 'running']

    @retry_boto_rate_limits
    def get_instance_ids(instance_iter):
        return [i.instance_id for i in instance_iter]

    @retry_boto_rate_limits
    def get_private_ips(instance_iter):
        return sorted(
            [i.private_ip_address for i in get_running_instances(instance_iter)])

    # make sure the app works before starting
    test_util.helpers.wait_for_pong(vip_apps[0][1], 120)
    test_util.helpers.wait_for_pong(vip_apps[1][1], 10)

    agent_ids = get_instance_ids(
        get_running_instances(dcos_stack.public_agent_instances) +
        get_running_instances(dcos_stack.private_agent_instances))
    # Fail fast with a clear message instead of sending EC2 an empty ID list.
    assert agent_ids, 'No running agents found to terminate'

    # Agents are in auto-scaling groups, so they will automatically be replaced.
    # The terminate call touches the AWS API too, so it gets the same
    # rate-limit retry as everything else.
    ec2 = boto_wrapper.client('ec2')
    retry_boto_rate_limits(ec2.terminate_instances)(InstanceIds=agent_ids)
    waiter = ec2.get_waiter('instance_terminated')
    retry_boto_rate_limits(waiter.wait)(InstanceIds=agent_ids)

    # Tell mesos the machines are "down" and not coming up so things get rescheduled.
    down_hosts = [
        {'hostname': slave, 'ip': slave} for slave in dcos_api_session.all_slaves]
    dcos_api_session.post(
        '/mesos/maintenance/schedule',
        json={'windows': [{
            'machine_ids': down_hosts,
            'unavailability': {'start': {'nanoseconds': 0}}
        }]}).raise_for_status()
    dcos_api_session.post('/mesos/machine/down', json=down_hosts).raise_for_status()

    # Remember the expected group sizes before the session is repointed.
    public_agent_count = len(dcos_api_session.public_slaves)
    private_agent_count = len(dcos_api_session.slaves)

    # Poll once a minute for up to fifteen minutes; a ``False`` result means
    # "not ready yet, try again".
    @retrying.retry(
        wait_fixed=60 * 1000,
        retry_on_result=lambda res: res is False,
        stop_max_delay=900 * 1000)
    def wait_for_agents_to_refresh():
        public_agents = get_running_instances(dcos_stack.public_agent_instances)
        if len(public_agents) != public_agent_count:
            log.info('Waiting for {} public agents. Current: {}'.format(
                public_agent_count, len(public_agents)))
            return False

        private_agents = get_running_instances(dcos_stack.private_agent_instances)
        if len(private_agents) != private_agent_count:
            log.info('Waiting for {} private agents. Current: {}'.format(
                private_agent_count, len(private_agents)))
            return False

        # Only touch the session once BOTH groups are fully replaced, so a
        # retry or timeout never leaves it half-updated.
        dcos_api_session.public_slaves = get_private_ips(public_agents)
        dcos_api_session.slaves = get_private_ips(private_agents)
        dcos_api_session.all_slaves = sorted(
            dcos_api_session.slaves + dcos_api_session.public_slaves)

    wait_for_agents_to_refresh()

    # verify that everything else is still working
    dcos_api_session.wait_for_dcos()

    # finally verify that the app is again running somewhere with its VIPs
    # Give marathon five minutes to deploy both the apps
    test_util.helpers.wait_for_pong(vip_apps[0][1], 300)
    test_util.helpers.wait_for_pong(vip_apps[1][1], 10)