def storage_server(docker_client, container_network):
    """Yield a ``storage_server`` container and remove it when done.

    The container is obtained from ``get_container`` using the
    ``centos:centos7`` image and the storage server bootstrap script.
    Before yielding, TCP port 22 (the standard SSH port) on the container's
    IP is polled once per second, for at most 30 minutes, until it accepts
    a connection.  If the deadline passes, a warning is logged and the
    container is yielded anyway.

    :param docker_client: Docker client; ``api.remove_container`` is called
        on it during cleanup.
    :param container_network: Passed through to ``get_container``.
    :return: Generator yielding the value returned by ``get_container``
        (read here as a mapping with ``"ip"`` and ``"Id"`` keys).
    """
    bootstrap_script = "/twindb-backup/support/bootstrap/storage_server.sh"
    container = get_container(
        "storage_server",
        docker_client,
        container_network,
        bootstrap_script=bootstrap_script,
        image="centos:centos7",
        last_n=3,
    )

    # The outer try/finally guarantees the container is removed even when
    # the readiness wait itself raises.
    try:
        deadline = time.time() + 30 * 60
        while time.time() < deadline:
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            try:
                if sock.connect_ex((container["ip"], 22)) == 0:
                    break
            finally:
                # Close every probe socket; otherwise one descriptor is
                # leaked per polling iteration.
                sock.close()
            time.sleep(1)
        else:
            LOG.warning(
                "Timed out waiting for port 22 on container %s",
                container["Id"],
            )

        yield container
    finally:
        if container:
            LOG.info("Removing container %s", container["Id"])
            docker_client.api.remove_container(
                container=container["Id"], force=True
            )
def storage_server(docker_client, container_network):
    """Yield a ``storage_server`` container and remove it when done.

    The container is obtained from ``get_container`` using the
    ``centos:centos7`` image and the storage server bootstrap script.
    Before yielding, TCP port 22 (the standard SSH port) on the container's
    IP is polled once per second, for at most 30 minutes, until it accepts
    a connection.  If the deadline passes, a warning is logged and the
    container is yielded anyway.

    :param docker_client: Docker client; ``api.remove_container`` is called
        on it during cleanup.
    :param container_network: Passed through to ``get_container``.
    :return: Generator yielding the value returned by ``get_container``
        (read here as a mapping with ``'ip'`` and ``'Id'`` keys).
    """
    bootstrap_script = '/twindb-backup/support/bootstrap/storage_server.sh'
    container = get_container(
        'storage_server',
        docker_client,
        container_network,
        bootstrap_script=bootstrap_script,
        image="centos:centos7",
        last_n=2
    )

    # The outer try/finally guarantees the container is removed even when
    # the readiness wait itself raises.
    try:
        deadline = time.time() + 30 * 60
        while time.time() < deadline:
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            try:
                if sock.connect_ex((container['ip'], 22)) == 0:
                    break
            finally:
                # Close every probe socket; otherwise one descriptor is
                # leaked per polling iteration.
                sock.close()
            time.sleep(1)
        else:
            LOG.warning('Timed out waiting for port 22 on container %s',
                        container['Id'])

        yield container
    finally:
        if container:
            LOG.info('Removing container %s', container['Id'])
            docker_client.api.remove_container(container=container['Id'],
                                               force=True)
def test__update_with_host_maintenance_and_agent_down(stateless_job, maintenance):
    """
    1. Create a large stateless job (that takes up more than two-thirds of
       the cluster resources) with MaximumUnavailableInstances=2.
    2. Start host maintenance on one of the hosts (say A) having pods of
       the job. MaximumUnavailableInstances=2 ensures that not more than
       2 pods are unavailable due to host maintenance at a time.
    3. Take down another host which has pods running on it. This will cause
       TASK_LOST to be sent for all pods on the host after 75 seconds.
    4. Start an update to modify the instance spec of one of the pods.
    5. Since TASK_LOST would cause the job SLA to be violated, instances on
       host A should not be killed once the LOST event is received.
       Verify that host A does not transition to DOWN.
    """
    stateless_job.job_spec.instance_count = 30
    stateless_job.job_spec.default_spec.containers[0].resource.cpu_limit = 0.3
    stateless_job.job_spec.sla.maximum_unavailable_instances = 2
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    hosts = [h.hostname for h in query_hosts([]).host_infos]
    host_to_task_count = get_host_to_task_count(hosts, stateless_job)
    # Host names ordered by their value in host_to_task_count, largest first.
    sorted_hosts = [
        t[0]
        for t in sorted(
            host_to_task_count.items(),
            key=operator.itemgetter(1),
            reverse=True,
        )
    ]

    # Pick a host that has pods running on it to start maintenance on it.
    test_host = sorted_hosts[0]
    # Pick another host which has pods of the job to take down.
    host_container = get_container([sorted_hosts[1]])

    try:
        host_container.stop()
        maintenance["start"]([test_host])

        stateless_job.job_spec.instance_spec[10].containers.extend([
            pod_pb2.ContainerSpec(resource=pod_pb2.ResourceSpec(
                disk_limit_mb=20))
        ])
        update = StatelessUpdate(stateless_job,
                                 updated_job_spec=stateless_job.job_spec,
                                 batch_size=0)
        update.create()
        update.wait_for_state(goal_state="SUCCEEDED")

        stateless_job.stop()

        wait_for_host_state(test_host, host_pb2.HOST_STATE_DOWN)
    except Exception:
        # Expected path: something above failed (presumably the wait for
        # DOWN timing out), so the host must still be draining.
        # NOTE(review): assumes wait_for_host_state signals a timeout with
        # an Exception subclass (e.g. AssertionError) -- confirm.
        assert is_host_in_state(test_host, host_pb2.HOST_STATE_DRAINING)
    else:
        # Raised outside the ``try`` body so the handler above cannot
        # swallow it: reaching this point means the host went DOWN.
        raise AssertionError('Host should not transition to DOWN')
    finally:
        # Always bring the stopped host container back.
        host_container.start()
def master2(docker_client, container_network):
    """Yield a ``master2`` container and remove it when done.

    The container is obtained from ``get_container`` with the master2
    bootstrap script.  Before yielding, TCP port 22 (the standard SSH port)
    on the container's IP is polled once per second, for at most
    30 minutes, until it accepts a connection.  If the deadline passes, a
    warning is logged and the container is yielded anyway.

    :param docker_client: Docker client; passed to ``get_container`` and
        used for ``api.remove_container`` during cleanup.
    :param container_network: Passed through to ``get_container``.
    :return: Generator yielding the value returned by ``get_container``
        (read here as a mapping with ``'ip'`` and ``'Id'`` keys).
    """
    bootstrap_script = '/twindb-backup/support/bootstrap/master2.sh'
    container = get_container('master2', bootstrap_script, docker_client,
                              container_network, 2)

    # The outer try/finally guarantees the container is removed even when
    # the readiness wait itself raises.
    try:
        deadline = time.time() + 30 * 60
        while time.time() < deadline:
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            try:
                if sock.connect_ex((container['ip'], 22)) == 0:
                    break
            finally:
                # Close every probe socket; otherwise one descriptor is
                # leaked per polling iteration.
                sock.close()
            time.sleep(1)
        else:
            LOG.warning('Timed out waiting for port 22 on container %s',
                        container['Id'])

        yield container
    finally:
        if container:
            LOG.info('Removing container %s', container['Id'])
            docker_client.api.remove_container(container=container['Id'],
                                               force=True)
def master1(docker_client, container_network):
    """Yield a ``master1`` container with MySQL grants applied.

    The container is obtained from ``get_container`` with the master1
    bootstrap script.  TCP port 3306 (the standard MySQL port) on the
    container's IP is polled once per second, for at most 30 minutes, until
    it accepts a connection; if the deadline passes, a warning is logged
    and setup continues.  The ``mysql_grants.sql`` file is then loaded into
    the ``mysql`` database inside the container, and the container is
    yielded.  The container is force-removed afterwards.

    :param docker_client: Docker client; passed to ``get_container`` and
        used for ``containers.get`` and ``api.remove_container``.
    :param container_network: Passed through to ``get_container``.
    :return: Generator yielding the value returned by ``get_container``
        (read here as a mapping with ``'ip'`` and ``'Id'`` keys).
    """
    bootstrap_script = '/twindb-backup/support/bootstrap/master1.sh'
    container = get_container('master1', bootstrap_script, docker_client,
                              container_network, 1)

    # The outer try/finally guarantees the container is removed even when
    # the readiness wait or the grants step raises.
    try:
        deadline = time.time() + 30 * 60
        while time.time() < deadline:
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            try:
                if sock.connect_ex((container['ip'], 3306)) == 0:
                    break
            finally:
                # Close every probe socket; otherwise one descriptor is
                # leaked per polling iteration.
                sock.close()
            time.sleep(1)
        else:
            LOG.warning('Timed out waiting for port 3306 on container %s',
                        container['Id'])

        raw_container = docker_client.containers.get(container['Id'])
        privileges_file = "/twindb-backup/vagrant/environment/puppet/" \
                          "modules/profile/files/mysql_grants.sql"
        # NOTE(review): the result of exec_run is ignored, so a failed
        # grants import goes unnoticed here -- confirm this is intended.
        raw_container.exec_run('bash -c "mysql mysql < %s"'
                               % privileges_file)

        yield container
    finally:
        if container:
            LOG.info('Removing container %s', container['Id'])
            docker_client.api.remove_container(container=container['Id'],
                                               force=True)