def test_launch_cluster_all_nodes_successful(self):
  """Drive the launcher through a full successful launch.

  Repeatedly feeds the same offer (minus consumed resources) until all
  nodes are launched, reports every task RUNNING, then waits for a master
  to be elected and checks the resulting ZK layout.
  """
  # Each launch() consumes part of the offer; feed the remainder back in.
  for i in range(self._cluster.num_nodes):
    task_id, remaining = self._launcher.launch(self._offer)
    del self._offer.resources[:]
    self._offer.resources.extend(remaining)
    assert task_id == "mysos-cluster0-%s" % i

  tasks = self._driver.method_calls["launchTasks"]
  assert len(tasks) == self._cluster.num_nodes

  # No new tasks are launched once the cluster is fully provisioned.
  assert self._launcher.launch(self._offer)[0] is None
  assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes

  # All 3 nodes have successfully started.
  status = mesos_pb2.TaskStatus()
  status.state = mesos_pb2.TASK_RUNNING  # Valid state.
  status.slave_id.value = self._offer.slave_id.value
  for i in range(self._cluster.num_nodes):
    status.task_id.value = "mysos-cluster0-%s" % i
    self._launcher.status_update(status)

  # Block until a master shows up under the cluster's ZK path.
  deadline(
      lambda: wait_for_master(
          get_cluster_path(self._zk_url, self._cluster.name),
          self._zk_client),
      Amount(5, Time.SECONDS))

  # The first slave is elected.
  assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
  # Two slaves.
  assert len([x for x in self._storage.paths.keys() if x.startswith(
      "/mysos/test/cluster0/slaves/member_")]) == 2
def _shutdown(self, status_result):
  """Stop the runner and checkers, send a final status update, and arrange
  for the driver to stop.

  :param status_result: the status (with .status and .reason) observed by the
      caller that triggered the shutdown.
  """
  # Capture the runner's status BEFORE stopping it: stop() would otherwise
  # mask whether the runner had already reached a terminal state on its own.
  runner_status = self._runner.status

  try:
    deadline(self._runner.stop, timeout=self.STOP_TIMEOUT)
  except Timeout:
    log.error('Failed to stop runner within deadline.')

  try:
    deadline(self._chained_checker.stop, timeout=self.STOP_TIMEOUT)
  except Timeout:
    log.error('Failed to stop all checkers within deadline.')

  # If the runner was alive when _shutdown was called (runner_status is falsy),
  # defer to status_result; otherwise the runner's own terminal state wins.
  exit_status = runner_status or status_result

  self.send_update(
      self._driver,
      self._task_id,
      self.translate_exit_state_to_mesos(exit_status.status),
      status_result.reason)

  self.terminated.set()
  # Delay the driver stop so the final update has time to be persisted/sent.
  defer(self._driver.stop, delay=self.PERSISTENCE_WAIT)
def test_scheduler_runs():
  """
    Verifies that the scheduler successfully launches 3 "no-op" MySQL tasks.

    NOTE: Due to the limitation of zake the scheduler's ZK operations are not
    propagated to executors in separate processes but they are unit-tested
    separately.
  """
  # Imported here because the native mesos bindings are only needed (and
  # available) for this end-to-end test.
  import mesos.native

  # Make sure fake_mysos_executor.pex is available to be fetched by Mesos slave.
  assert os.path.isfile('dist/fake_mysos_executor.pex')

  storage = FakeStorage(SequentialThreadingHandler())
  zk_client = FakeClient(storage=storage)
  zk_client.start()

  zk_url = "zk://fake_host/home/mysos/clusters"
  cluster_name = "test_cluster"
  num_nodes = 3

  state_provider = LocalStateProvider(safe_mkdtemp())

  framework_info = FrameworkInfo(
      user=getpass.getuser(),
      name="mysos",
      checkpoint=False)
  state = Scheduler(framework_info)

  scheduler = MysosScheduler(
      state,
      state_provider,
      getpass.getuser(),
      os.path.abspath("dist/fake_mysos_executor.pex"),
      "./fake_mysos_executor.pex",
      zk_client,
      zk_url,
      Amount(40, Time.SECONDS),
      "/fakepath",
      gen_encryption_key())

  scheduler_driver = mesos.native.MesosSchedulerDriver(
      scheduler,
      framework_info,
      "local")
  scheduler_driver.start()

  # Wait until the scheduler is connected and becomes available.
  assert scheduler.connected.wait(30)

  scheduler.create_cluster(cluster_name, "mysql_user", num_nodes)

  # A slave is promoted to be the master.
  deadline(
      lambda: wait_for_master(
          get_cluster_path(posixpath.join(zk_url, 'discover'), cluster_name),
          zk_client),
      Amount(40, Time.SECONDS))

  assert scheduler_driver.stop() == DRIVER_STOPPED
def wait_until_not(thing, clock=time, timeout=1.0):
  """Block until the callable `thing` evaluates falsy.

  Polls via `clock.sleep(1.0)` between checks and gives up silently once
  `timeout` elapses.
  """
  def poll():
    while thing():
      clock.sleep(1.0)

  try:
    deadline(poll, timeout=timeout, daemon=True)
  except Timeout:
    pass
def wait_until_not(thing, timeout=EVENT_WAIT_TIMEOUT_SECS):
  """Poll until the callable `thing` evaluates falsy.

  Returns True if it became falsy within `timeout` seconds, False otherwise.
  """
  def poll():
    while thing():
      time.sleep(0.1)

  try:
    deadline(poll, timeout=timeout, daemon=True)
  except Timeout:
    return False
  return True
def test_launcher_kill(self):
  """Launch a full cluster, then kill it and verify all state is cleaned up."""
  # Each launch() consumes part of the offer; feed the remainder back in.
  for i in range(self._cluster.num_nodes):
    task_id, remaining = self._launcher.launch(self._offer)
    del self._offer.resources[:]
    self._offer.resources.extend(remaining)
    assert task_id == "mysos-cluster0-%s" % i

  tasks = self._driver.method_calls["launchTasks"]
  assert len(tasks) == self._cluster.num_nodes

  # No new tasks are launched.
  assert self._launcher.launch(self._offer)[0] is None
  assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes

  # All 3 nodes have successfully started.
  status = mesos_pb2.TaskStatus()
  status.state = mesos_pb2.TASK_RUNNING  # Valid state.
  status.slave_id.value = self._offer.slave_id.value
  for i in range(self._cluster.num_nodes):
    status.task_id.value = "mysos-cluster0-%s" % i
    self._launcher.status_update(status)

  deadline(
      lambda: wait_for_master(
          get_cluster_path(self._zk_url, self._cluster.name),
          self._zk_client),
      Amount(5, Time.SECONDS))

  # The first slave is elected.
  assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
  # Two slaves.
  assert len([x for x in self._storage.paths.keys() if x.startswith(
      "/mysos/test/cluster0/slaves/member_")]) == 2

  # Kill the cluster: a wrong password must be rejected.
  with pytest.raises(MySQLClusterLauncher.PermissionError):
    self._launcher.kill("wrong_password")

  self._launcher.kill(self._cluster.password)  # Correct password.

  # All 3 nodes are successfully killed.
  status = mesos_pb2.TaskStatus()
  status.state = mesos_pb2.TASK_KILLED
  status.slave_id.value = self._offer.slave_id.value
  for i in range(self._cluster.num_nodes):
    status.task_id.value = "mysos-cluster0-%s" % i
    self._launcher.status_update(status)

  assert "/mysos/test/cluster0" not in self._storage.paths  # ServerSets removed.
  assert not self._state_provider.load_cluster_state("cluster0")  # State removed.
def test_launcher_kill(self):
  """Launch a full cluster, then kill it (decrypting the stored password) and
  verify all ZK and persisted state is cleaned up."""
  # Each launch() consumes part of the offer; feed the remainder back in.
  for i in range(self._cluster.num_nodes):
    task_id, remaining = self._launcher.launch(self._offer)
    del self._offer.resources[:]
    self._offer.resources.extend(remaining)
    assert task_id == "mysos-cluster0-%s" % i

  tasks = self._driver.method_calls["launchTasks"]
  assert len(tasks) == self._cluster.num_nodes

  # No new tasks are launched.
  assert self._launcher.launch(self._offer)[0] is None
  assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes

  # All 3 nodes have successfully started.
  status = mesos_pb2.TaskStatus()
  status.state = mesos_pb2.TASK_RUNNING  # Valid state.
  status.slave_id.value = self._offer.slave_id.value
  for i in range(self._cluster.num_nodes):
    status.task_id.value = "mysos-cluster0-%s" % i
    self._launcher.status_update(status)

  deadline(
      lambda: wait_for_master(
          get_cluster_path(self._zk_url, self._cluster.name),
          self._zk_client),
      Amount(5, Time.SECONDS))

  # The first slave is elected.
  assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
  # Two slaves.
  assert len([x for x in self._storage.paths.keys() if x.startswith(
      "/mysos/test/cluster0/slaves/member_")]) == 2

  # Kill the cluster: a wrong password must be rejected.
  with pytest.raises(MySQLClusterLauncher.PermissionError):
    self._launcher.kill("wrong_password")

  # Correct password.
  self._launcher.kill(self._password_box.decrypt(self._cluster.encrypted_password))

  # All 3 nodes are successfully killed.
  status = mesos_pb2.TaskStatus()
  status.state = mesos_pb2.TASK_KILLED
  status.slave_id.value = self._offer.slave_id.value
  for i in range(self._cluster.num_nodes):
    status.task_id.value = "mysos-cluster0-%s" % i
    self._launcher.status_update(status)

  assert "/mysos/test/cluster0" not in self._storage.paths  # ServerSets removed.
  assert not self._state_provider.load_cluster_state("cluster0")  # State removed.
def _initialize_sandbox(self, driver, assigned_task):
  """Create the task sandbox, aborting the task on timeout or failure.

  Returns True on success. On failure the task is failed via _die() and the
  method returns None (falsy) — callers should test truthiness, not identity.
  """
  self._sandbox = self._sandbox_provider.from_assigned_task(assigned_task)
  # Signal that the sandbox object exists before the (possibly slow) create().
  self.sandbox_initialized.set()
  try:
    deadline(self._sandbox.create, timeout=self.SANDBOX_INITIALIZATION_TIMEOUT,
             daemon=True, propagate=True)
  except Timeout:
    self._die(driver, mesos_pb.TASK_FAILED, 'Timed out waiting for sandbox to initialize!')
    return
  except self._sandbox.Error as e:
    self._die(driver, mesos_pb.TASK_FAILED, 'Failed to initialize sandbox: %s' % e)
    return
  self.sandbox_created.set()
  return True
def test_scheduler_runs():
  """
    Verifies that the scheduler successfully launches 3 "no-op" MySQL tasks.

    NOTE: Due to the limitation of zake the scheduler's ZK operations are not
    propagated to executors in separate processes but they are unit-tested
    separately.
  """
  # Imported here because the native mesos bindings are only needed (and
  # available) for this end-to-end test.
  import mesos.native

  # Make sure fake_mysos_executor.pex is available to be fetched by Mesos slave.
  assert os.path.isfile('dist/fake_mysos_executor.pex')

  storage = FakeStorage(SequentialThreadingHandler())
  zk_client = FakeClient(storage=storage)
  zk_client.start()

  zk_url = "zk://fake_host/home/mysos/clusters"
  cluster_name = "test_cluster"
  num_nodes = 3

  state_provider = LocalStateProvider(safe_mkdtemp())

  framework_info = FrameworkInfo(user=getpass.getuser(), name="mysos", checkpoint=False)
  state = Scheduler(framework_info)

  scheduler = MysosScheduler(
      state,
      state_provider,
      getpass.getuser(),
      os.path.abspath("dist/fake_mysos_executor.pex"),
      "./fake_mysos_executor.pex",
      zk_client,
      zk_url,
      Amount(40, Time.SECONDS),
      "/fakepath",
      gen_encryption_key())

  scheduler_driver = mesos.native.MesosSchedulerDriver(
      scheduler,
      framework_info,
      "local")
  scheduler_driver.start()

  # Wait until the scheduler is connected and becomes available.
  assert scheduler.connected.wait(30)

  scheduler.create_cluster(cluster_name, "mysql_user", num_nodes)

  # A slave is promoted to be the master.
  deadline(
      lambda: wait_for_master(
          get_cluster_path(posixpath.join(zk_url, 'discover'), cluster_name),
          zk_client),
      Amount(40, Time.SECONDS))

  assert scheduler_driver.stop() == DRIVER_STOPPED
def fetch(self, uri, directory):
  """Copy the package at `uri` from HDFS into the local `directory`.

  :param uri: HDFS URI of the package to fetch.
  :param directory: local destination directory.
  :raises self.Error: if the HDFS copy fails, or does not complete within
      the configured timeout.
  """
  log.info("Fetching %s from HDFS" % uri)
  if "JAVA_HOME" in os.environ:
    log.info("Using JAVA_HOME '%s' for HDFS commands" % os.environ["JAVA_HOME"])
  config = os.environ.get("HADOOP_CONF_DIR", HADOOP_CONF_DIR)
  h = HDFSHelper(config, heap_limit=Amount(256, Data.MB))
  try:
    # Run the copy in a daemon thread so a hung HDFS client can't wedge us.
    f = lambda: h.copy_to_local(uri, directory)
    deadline(f, timeout=self._timeout, propagate=True, daemon=True)
  except HDFSHelper.InternalError as e:
    raise self.Error('Unable to fetch HDFS package: %s' % e)
  except Timeout as e:
    # BUG FIX: the format string previously had one '%s' but two arguments
    # ("... within : %s" % (self._timeout, e)), which raised TypeError here
    # instead of the intended Error.
    raise self.Error("Failed to fetch package from HDFS within %s: %s" % (self._timeout, e))
def test_demote(self):
  """When another member is promoted over us, the runner must stop."""
  control = FakeTaskControl()
  runner = MysosTaskRunner(
      self._self_instance,
      self._client,
      "/home/test/my_cluster",
      NoopPackageInstaller(),
      control,
      self._state_manager)
  manager = ClusterManager(self._client, "/home/test/my_cluster")
  runner.start()

  member_id = manager.add_member(self._self_instance)
  # 'self_instance' becomes the master.
  manager.promote_member(member_id)
  runner.promoted.wait(1)

  other_member = manager.add_member(ServiceInstance(Endpoint("another_host", 10000)))
  # This demotes 'self_instance', which should cause the runner to stop.
  manager.promote_member(other_member)

  assert deadline(runner.join, Amount(1, Time.SECONDS))
def _start_runner(self, driver, assigned_task, mesos_task, portmap):
  """Start the task runner within START_TIMEOUT.

  Returns:
    True if the runner started successfully; False if the task was killed,
    failed to initialize, or timed out (in which case _die() has already
    sent the terminal status update).
  """
  if self.runner_aborted.is_set():
    self._die(driver, mesos_pb.TASK_KILLED, 'Task killed during initialization.')
    # BUG FIX: previously fell through here and attempted to start the
    # runner even though the task had already been killed.
    return False

  try:
    deadline(self._runner.start, timeout=self.START_TIMEOUT, propagate=True)
  except TaskError as e:
    self._die(driver, mesos_pb.TASK_FAILED, 'Task initialization failed: %s' % e)
    return False
  except Timeout:
    self._die(driver, mesos_pb.TASK_LOST, 'Timed out waiting for task to start!')
    return False

  self.runner_started.set()
  log.debug('Task started.')
  return True
def test_launch_cluster_insufficient_resources(self):
  """All but one node in the cluster are launched successfully: the offer
  cannot accommodate the final task."""
  del self._offer.resources[:]
  resources = create_resources(
      cpus=DEFAULT_TASK_CPUS * 3,
      mem=DEFAULT_TASK_MEM * 3,
      disk=DEFAULT_TASK_DISK * 3 - Amount(1, Data.MB),  # 1mb less than required disk space.
      ports=set([10000, 10001, 10002]))
  self._offer.resources.extend(resources)

  # There is one fewer port than required to launch the entire cluster.
  # NOTE(review): the offer actually has 3 ports but is 1 MB short on disk;
  # confirm which resource is the binding constraint and fix this comment.
  for i in range(self._cluster.num_nodes - 1):
    task_id, remaining = self._launcher.launch(self._offer)
    del self._offer.resources[:]
    self._offer.resources.extend(remaining)
    assert task_id == "mysos-cluster0-%s" % i

  tasks = self._driver.method_calls["launchTasks"]
  assert len(tasks) == self._cluster.num_nodes - 1

  # The final task cannot get launched.
  assert self._launcher.launch(self._offer)[0] is None
  assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes - 1

  # The two nodes have successfully started.
  status = mesos_pb2.TaskStatus()
  status.state = mesos_pb2.TASK_RUNNING  # Valid state.
  status.slave_id.value = self._offer.slave_id.value
  for i in range(self._cluster.num_nodes - 1):
    status.task_id.value = "mysos-cluster0-%s" % i
    self._launcher.status_update(status)

  deadline(
      lambda: wait_for_master(
          get_cluster_path(self._zk_url, self._cluster.name),
          self._zk_client),
      Amount(5, Time.SECONDS))

  # The first slave is elected.
  assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
  # One slave.
  assert len([x for x in self._storage.paths.keys() if x.startswith(
      "/mysos/test/cluster0/slaves/member_")]) == 1
def test_mysqld_error(self):
  """A failing mysqld's exit code is surfaced through runner.join()."""
  control = FakeTaskControl(mysqld="exit 123")
  runner = MysosTaskRunner(
      self._self_instance,
      self._client,
      "/home/test/my_cluster",
      NoopPackageInstaller(),
      control,
      self._state_manager)
  runner.start()

  exit_code = deadline(runner.join, Amount(1, Time.SECONDS))
  assert exit_code == 123
def test_stop(self):
  """stop() terminates a healthy runner via SIGTERM and join() reports it."""
  control = FakeTaskControl()
  runner = MysosTaskRunner(
      self._self_instance,
      self._client,
      "/home/test/my_cluster",
      NoopPackageInstaller(),
      control,
      self._state_manager)
  runner.start()

  assert runner.stop()

  # Killed by SIGTERM.
  exit_code = deadline(runner.join, Amount(1, Time.SECONDS))
  assert exit_code == -signal.SIGTERM
def test_stop_interminable(self):
  """A task that traps SIGTERM keeps running, so stop() must escalate to SIGKILL."""
  cmd = """trap "echo Trapped SIGTERM!" TERM
while :
do
  sleep 60
done
"""
  task_control = FakeTaskControl(mysqld=cmd)
  runner = MysosTaskRunner(
      self._self_instance,
      self._client,
      "/home/test/my_cluster",
      NoopPackageInstaller(),
      task_control,
      self._state_manager)
  # NOTE(review): 'mysqld' was already passed to the constructor above; this
  # direct assignment looks redundant — confirm before removing.
  task_control._mysqld = cmd
  runner.start()

  # stop() gives the process `timeout` seconds to honor SIGTERM, then SIGKILLs.
  assert runner.stop(timeout=1)
  assert deadline(runner.join, Amount(1, Time.SECONDS)) == -signal.SIGKILL
def test_reparent(self):
  """The runner observes a newly promoted master, then shuts down cleanly."""
  control = FakeTaskControl()
  runner = MysosTaskRunner(
      self._self_instance,
      self._client,
      "/home/test/my_cluster",
      NoopPackageInstaller(),
      control,
      self._state_manager)
  manager = ClusterManager(self._client, "/home/test/my_cluster")
  runner.start()

  # Promote another instance.
  new_master = ServiceInstance(Endpoint("another_host", 10000))
  member_id = manager.add_member(new_master)
  manager.promote_member(member_id)

  assert runner.master.get(True, 1) == new_master
  assert runner.stop()
  assert deadline(runner.join, Amount(1, Time.SECONDS))
def test_deadline_no_timeout():
  """Without a timeout, deadline() simply returns the callable's result."""
  result = deadline(lambda: 'success')
  assert result == 'success'
def propagate_deadline(*args, **kwargs):
  """Invoke deadline() in a daemon thread, re-raising any exception it hits."""
  return deadline(*args, daemon=True, propagate=True, **kwargs)
def test_deadline_no_timeout():
  """deadline() without a timeout returns whatever the callable produced."""
  def produce():
    return "success"

  assert deadline(produce) == "success"
def test_master_failover(self):
  """After the elected master fails, the node with the highest log position is
  promoted, and a subsequent offer replaces the lost node."""
  # Each launch() consumes part of the offer; feed the remainder back in.
  for i in range(self._cluster.num_nodes):
    task_id, remaining = self._launcher.launch(self._offer)
    del self._offer.resources[:]
    self._offer.resources.extend(remaining)
    assert task_id == "mysos-cluster0-%s" % i

  tasks = self._driver.method_calls["launchTasks"]
  assert len(tasks) == self._cluster.num_nodes

  # All 3 nodes have successfully started.
  status = mesos_pb2.TaskStatus()
  status.state = mesos_pb2.TASK_RUNNING
  status.slave_id.value = self._offer.slave_id.value
  for i in range(self._cluster.num_nodes):
    status.task_id.value = "mysos-cluster0-%s" % i
    self._launcher.status_update(status)

  # No log positions queries are sent for the first epoch.
  assert "sendFrameworkMessage" not in self._driver.method_calls

  # Wait for the election to complete.
  deadline(
      lambda: wait_for_master(
          get_cluster_path(self._zk_url, self._cluster.name),
          self._zk_client),
      Amount(5, Time.SECONDS))

  # The first slave is elected.
  assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths

  # Now fail the master task.
  status.task_id.value = "mysos-cluster0-0"
  status.state = mesos_pb2.TASK_FAILED
  self._launcher.status_update(status)

  assert len(self._launcher._cluster.running_tasks) == 2

  # Log positions queries are sent.
  self._launcher._elector._elect()
  assert len(self._driver.method_calls["sendFrameworkMessage"]) >= 2

  # Remaining nodes report their log positions; node i reports position i.
  for i in range(1, self._cluster.num_nodes):
    self._launcher.framework_message(
        "mysos-cluster0-%s" % i,
        self._offer.slave_id.value,
        json.dumps(dict(epoch=1, position=str(i))))

  # Wait for the election to complete.
  deadline(
      lambda: wait_for_master(
          get_cluster_path(self._zk_url, self._cluster.name),
          self._zk_client),
      Amount(5, Time.SECONDS))

  # The slave with the highest position is elected.
  assert "/mysos/test/cluster0/master/member_0000000002" in self._storage.paths
  assert len(self._launcher._cluster.running_tasks) == 2

  # When a new offer comes in, a new task is launched.
  del self._offer.resources[:]
  resources = create_resources(
      cpus=DEFAULT_TASK_CPUS,
      mem=DEFAULT_TASK_MEM,
      disk=DEFAULT_TASK_DISK,
      ports=set([10000]))
  self._offer.resources.extend(resources)
  task_id, _ = self._launcher.launch(self._offer)
  assert task_id == "mysos-cluster0-3"

  launched = self._driver.method_calls["launchTasks"]
  # One task is relaunched to make up for the failed one.
  assert len(launched) == self._cluster.num_nodes + 1
def test_deadline_default_timeout():
  """A callable slower than deadline()'s default timeout raises Timeout."""
  slow = partial(time.sleep, 0.5)
  with pytest.raises(Timeout):
    deadline(slow)
def test_deadline_custom_timeout():
  """An explicit timeout shorter than the callable's runtime raises Timeout."""
  with pytest.raises(Timeout):
    deadline(partial(time.sleep, 0.2), 0.1)
def test_launcher_recovery_before_election_completed(self):
  """A launcher rebuilt from persisted state completes an in-flight election."""
  # 1. Launch a cluster on the running launcher.
  for i in range(self._cluster.num_nodes):
    task_id, remaining = self._launcher.launch(self._offer)
    del self._offer.resources[:]
    self._offer.resources.extend(remaining)
    assert task_id == "mysos-cluster0-%s" % i

  tasks = self._driver.method_calls["launchTasks"]
  assert len(tasks) == self._cluster.num_nodes

  # No new tasks are launched.
  assert self._launcher.launch(self._offer)[0] is None
  assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes

  # All 3 nodes have successfully started.
  status = mesos_pb2.TaskStatus()
  status.state = mesos_pb2.TASK_RUNNING
  status.slave_id.value = self._offer.slave_id.value
  for i in range(self._cluster.num_nodes):
    status.task_id.value = "mysos-cluster0-%s" % i
    self._launcher.status_update(status)

  deadline(
      lambda: wait_for_master(
          get_cluster_path(self._zk_url, self._cluster.name),
          self._zk_client),
      Amount(5, Time.SECONDS))

  # The first slave is elected.
  assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
  # Two slaves.
  assert len([x for x in self._storage.paths.keys() if x.startswith(
      "/mysos/test/cluster0/slaves/member_")]) == 2

  # Now fail the master task which leads to re-election.
  status.task_id.value = "mysos-cluster0-0"
  status.state = mesos_pb2.TASK_FAILED
  self._launcher.status_update(status)

  # 2. Recover the launcher from the state the old one persisted.
  self._cluster = self._state_provider.load_cluster_state(self._cluster.name)
  self._launcher = MySQLClusterLauncher(
      self._driver,
      self._cluster,
      self._state_provider,
      self._zk_url,
      self._zk_client,
      self._framework_user,
      "./executor.pex",
      "cmd.sh",
      Amount(5, Time.SECONDS),
      "/etc/mysos/admin_keyfile.yml",
      self._scheduler_key,
      query_interval=Amount(150, Time.MILLISECONDS))

  # The remaining nodes report log positions to the recovered launcher.
  for i in range(1, self._cluster.num_nodes):
    self._launcher.framework_message(
        "mysos-cluster0-%s" % i,
        self._offer.slave_id.value,
        json.dumps(dict(epoch=2, position=str(i))))

  deadline(
      lambda: wait_for_master(
          get_cluster_path(self._zk_url, self._cluster.name),
          self._zk_client),
      Amount(5, Time.SECONDS))

  # The second slave has the larger position and is elected.
  assert "/mysos/test/cluster0/master/member_0000000002" in self._storage.paths
def test_master_failover(self):
  """After the elected master fails, the node with the highest log position is
  promoted, and a subsequent offer replaces the lost node."""
  # Each launch() consumes part of the offer; feed the remainder back in.
  for i in range(self._cluster.num_nodes):
    task_id, remaining = self._launcher.launch(self._offer)
    del self._offer.resources[:]
    self._offer.resources.extend(remaining)
    assert task_id == "mysos-cluster0-%s" % i

  tasks = self._driver.method_calls["launchTasks"]
  assert len(tasks) == self._cluster.num_nodes

  # All 3 nodes have successfully started.
  status = mesos_pb2.TaskStatus()
  status.state = mesos_pb2.TASK_RUNNING
  status.slave_id.value = self._offer.slave_id.value
  for i in range(self._cluster.num_nodes):
    status.task_id.value = "mysos-cluster0-%s" % i
    self._launcher.status_update(status)

  # No log positions queries are sent for the first epoch.
  assert "sendFrameworkMessage" not in self._driver.method_calls

  # Wait for the election to complete.
  deadline(
      lambda: wait_for_master(
          get_cluster_path(self._zk_url, self._cluster.name),
          self._zk_client),
      Amount(5, Time.SECONDS))

  # The first slave is elected.
  assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths

  # Now fail the master task.
  status.task_id.value = "mysos-cluster0-0"
  status.state = mesos_pb2.TASK_FAILED
  self._launcher.status_update(status)

  assert len(self._launcher._cluster.running_tasks) == 2

  # Log positions queries are sent.
  self._launcher._elector._elect()
  assert len(self._driver.method_calls["sendFrameworkMessage"]) >= 2

  # Remaining nodes report their log positions; node i reports position i.
  for i in range(1, self._cluster.num_nodes):
    self._launcher.framework_message(
        "mysos-cluster0-%s" % i,
        self._offer.slave_id.value,
        json.dumps(dict(epoch=1, position=str(i))))

  # Wait for the election to complete.
  deadline(
      lambda: wait_for_master(
          get_cluster_path(self._zk_url, self._cluster.name),
          self._zk_client),
      Amount(5, Time.SECONDS))

  # The slave with the highest position is elected.
  assert "/mysos/test/cluster0/master/member_0000000002" in self._storage.paths
  assert len(self._launcher._cluster.running_tasks) == 2

  # When a new offer comes in, a new task is launched.
  # NOTE(review): a sibling copy of this test builds the offer from
  # DEFAULT_TASK_CPUS/MEM/DISK; this one hard-codes cpus=1, mem=512 and omits
  # disk — confirm the hard-coded values still satisfy the task's requirements.
  del self._offer.resources[:]
  resources = create_resources(cpus=1, mem=512, ports=set([10000]))
  self._offer.resources.extend(resources)
  task_id, _ = self._launcher.launch(self._offer)
  assert task_id == "mysos-cluster0-3"

  launched = self._driver.method_calls["launchTasks"]
  # One task is relaunched to make up for the failed one.
  assert len(launched) == self._cluster.num_nodes + 1