def test_existing_zk(self):
  """
    ClusterManager needs to be able to recover from an existing ZK group for scheduler failover.
  """
  manager = ClusterManager(self.client, "/home/my_cluster")

  instance1 = ServiceInstance(Endpoint("host1", 10000))
  member1 = manager.add_member(instance1)
  instance2 = ServiceInstance(Endpoint("host2", 10000))
  member2 = manager.add_member(instance2)

  assert (self.storage.paths["/home/my_cluster/slaves/member_0000000000"]["data"] ==
          ServiceInstance.pack(instance1))
  assert (self.storage.paths["/home/my_cluster/slaves/member_0000000001"]["data"] ==
          ServiceInstance.pack(instance2))

  manager.promote_member(member1)

  # Test the new ClusterManager.
  manager2 = ClusterManager(self.client, "/home/my_cluster")
  assert len(manager2._cluster.members) == 2
  assert member1 in manager2._cluster.members
  assert member2 in manager2._cluster.members
  assert manager2._cluster.members[member1] == ServiceInstance.pack(instance1)
def test_endpoint_from_dict():
  expected = {
      Endpoint('smfd-akb-12-sr1', 31181):
          {'host': 'smfd-akb-12-sr1', 'port': 31181},
      Endpoint('smfd-akb-12-sr1', 31181, '1.2.3.4'):
          {'host': 'smfd-akb-12-sr1', 'port': 31181, 'inet': '1.2.3.4'},
      Endpoint('smfd-akb-12-sr1', 31181, '1.2.3.4', '2001:db8:5678:ffff:ffff:ffff:ffff:ffff'):
          {'host': 'smfd-akb-12-sr1', 'port': 31181, 'inet': '1.2.3.4',
           'inet6': '2001:db8:5678:ffff:ffff:ffff:ffff:ffff'},
      Endpoint('smfd-akb-12-sr1', 31181, None, '2001:db8:5678:ffff:ffff:ffff:ffff:ffff'):
          {'host': 'smfd-akb-12-sr1', 'port': 31181,
           'inet6': '2001:db8:5678:ffff:ffff:ffff:ffff:ffff'}
  }
  for (endpoint, dic) in expected.items():
    assert Endpoint.to_dict(endpoint) == dic
    assert Endpoint.from_dict(dic) == endpoint
def test_callbacks(self):
  manager = ClusterManager(self.client, "/home/my_cluster")

  # Set up 2 listeners.
  instance1 = ServiceInstance(Endpoint("host1", 10000))
  handler1 = CallbackHandler()
  listener1 = ClusterListener(
      self.client,
      "/home/my_cluster",
      instance1,
      handler1.promotion_callback,
      handler1.demotion_callback,
      handler1.master_callback,
      handler1.termination_callback)
  listener1.start()
  member1 = manager.add_member(instance1)

  instance2 = ServiceInstance(Endpoint("host2", 10000))
  handler2 = CallbackHandler()
  listener2 = ClusterListener(
      self.client,
      "/home/my_cluster",
      instance2,
      handler2.promotion_callback,
      handler2.demotion_callback,
      handler2.master_callback)
  listener2.start()
  member2 = manager.add_member(instance2)

  # Test promotion.
  manager.promote_member(member1)

  assert handler1.promoted.wait(1)
  assert handler2.detected.get(True, 1) == instance1

  assert (self.storage.paths["/home/my_cluster/master/member_0000000000"]["data"] ==
          ServiceInstance.pack(instance1))
  assert (self.storage.paths["/home/my_cluster/slaves/member_0000000001"]["data"] ==
          ServiceInstance.pack(instance2))

  manager.promote_member(member2)

  assert handler1.demoted.wait(1)
  assert handler2.promoted.wait(1)

  assert (self.storage.paths["/home/my_cluster/master/member_0000000001"]["data"] ==
          ServiceInstance.pack(instance2))
  assert "/home/my_cluster/master/member_0000000000" not in self.storage.paths

  manager.remove_member(member2)
  assert handler2.demoted.wait(1)

  # Test removing cluster.
  manager.remove_member(member1)
  manager.delete_cluster()
  assert handler1.terminated.wait(1)
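# CallbackHandler is used by test_callbacks and test_invalid_znode but is not defined in this
# section. Below is a minimal sketch reconstructed from its usage above (promoted, demoted and
# terminated are waited on as events; detected is read as a queue of master instances); the
# actual helper in the test module may differ.
import threading

try:
  import Queue  # Python 2, consistent with the rest of this codebase.
except ImportError:
  import queue as Queue  # Python 3 fallback.


class CallbackHandler(object):
  """Records ClusterListener callbacks so tests can block on or inspect them."""

  def __init__(self):
    self.promoted = threading.Event()
    self.demoted = threading.Event()
    self.detected = Queue.Queue()  # Masters detected by the listener (None for invalid data).
    self.terminated = threading.Event()

  def promotion_callback(self):
    self.promoted.set()

  def demotion_callback(self):
    self.demoted.set()

  def master_callback(self, master):
    self.detected.put(master)

  def termination_callback(self):
    self.terminated.set()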
def test_demote(self):
  task_control = FakeTaskControl()
  runner = MysosTaskRunner(
      self._self_instance,
      self._client,
      "/home/test/my_cluster",
      NoopPackageInstaller(),
      task_control,
      self._state_manager)

  manager = ClusterManager(self._client, "/home/test/my_cluster")
  runner.start()

  self_member = manager.add_member(self._self_instance)

  # 'self_instance' becomes the master.
  manager.promote_member(self_member)

  runner.promoted.wait(1)

  another_member = manager.add_member(ServiceInstance(Endpoint("another_host", 10000)))

  # This demotes 'self_instance', which should cause runner to stop.
  manager.promote_member(another_member)

  assert deadline(runner.join, Amount(1, Time.SECONDS))
def test_add_member(self):
  manager = ClusterManager(self.client, "/home/my_cluster")

  instance1 = ServiceInstance(Endpoint("host1", 10000))
  member1 = manager.add_member(instance1)
  assert member1 == manager.add_member(instance1)  # Second insertion is ignored.

  instance2 = ServiceInstance(Endpoint("host2", 10000))
  manager.add_member(instance2)

  assert len(manager._cluster.members) == 2
  assert (self.storage.paths["/home/my_cluster/slaves/member_0000000000"]["data"] ==
          ServiceInstance.pack(instance1))
  assert (self.storage.paths["/home/my_cluster/slaves/member_0000000001"]["data"] ==
          ServiceInstance.pack(instance2))
def test_remove_cluster(self):
  manager = ClusterManager(self.client, "/home/my_cluster")

  instance1 = ServiceInstance(Endpoint("host1", 10000))
  member1 = manager.add_member(instance1)
  instance2 = ServiceInstance(Endpoint("host2", 10000))
  member2 = manager.add_member(instance2)

  manager.promote_member(member1)

  # The cluster cannot be deleted while it still has members.
  with pytest.raises(ClusterManager.Error):
    manager.delete_cluster()

  manager.remove_member(member1)
  manager.remove_member(member2)
  manager.delete_cluster()

  assert "/home/my_cluster" not in self.storage.paths
def test_endpoint_inequality():
  assert Endpoint('host', 8340) != Endpoint('xhost', 8340)
  assert Endpoint('host', 8340) != Endpoint('host', 8341)
  assert (Endpoint('host', 8340, '1.2.3.4', '2001:db8:1234:ffff:ffff:ffff:ffff:ffff') !=
          Endpoint('host', 8340, '5.6.7.8', '2001:db8:5678:ffff:ffff:ffff:ffff:ffff'))
  assert (Endpoint('host', 8340, None, '2001:db8:1234:ffff:ffff:ffff:ffff:ffff') !=
          Endpoint('host', 8340, None, '2001:db8:5678:ffff:ffff:ffff:ffff:ffff'))
def test_remove_member(self):
  manager = ClusterManager(self.client, "/home/my_cluster")

  instance = ServiceInstance(Endpoint("host", 10000))
  member = manager.add_member(instance)

  assert manager.remove_member(member)
  assert not manager.remove_member(member)  # The second deletion is ignored.

  assert "/home/my_cluster/master/member_0000000000" not in self.storage.paths
def test_endpoint_equality():
  assert Endpoint('host', 8340) == Endpoint('host', 8340)
  assert Endpoint('host', 8340, '1.2.3.4') == Endpoint('host', 8340, '1.2.3.4')
  assert (Endpoint('host', 8340, '1.2.3.4', '2001:db8:1234:ffff:ffff:ffff:ffff:ffff') ==
          Endpoint('host', 8340, '1.2.3.4', '2001:db8:1234:ffff:ffff:ffff:ffff:ffff'))
  assert (Endpoint('host', 8340, None, '2001:db8:1234:ffff:ffff:ffff:ffff:ffff') ==
          Endpoint('host', 8340, None, '2001:db8:1234:ffff:ffff:ffff:ffff:ffff'))
def test_service_instance_to_json():
  json = """{
    "additionalEndpoints": {
        "aurora": {
            "host": "hostname",
            "inet6": "2001:db8:1234:ffff:ffff:ffff:ffff:ffff",
            "port": 22
        },
        "health": {
            "host": "hostname",
            "inet": "1.2.3.4",
            "port": 23
        },
        "http": {
            "host": "hostname",
            "inet": "1.2.3.4",
            "inet6": "2001:db8:1234:ffff:ffff:ffff:ffff:ffff",
            "port": 23
        }
    },
    "serviceEndpoint": {
        "host": "hostname",
        "port": 24
    },
    "shard": 1,
    "status": "ALIVE"
}"""

  # The endpoints must match the JSON above: 'aurora' carries only inet6, 'health' only inet.
  service_instance = ServiceInstance(
      Endpoint("hostname", 24),
      {
          "aurora": Endpoint("hostname", 22, None, "2001:db8:1234:ffff:ffff:ffff:ffff:ffff"),
          "health": Endpoint("hostname", 23, "1.2.3.4"),
          "http": Endpoint("hostname", 23, "1.2.3.4", "2001:db8:1234:ffff:ffff:ffff:ffff:ffff"),
      },
      'ALIVE',
      1)

  assert ServiceInstance.unpack(json) == service_instance
  assert ServiceInstance.unpack(ServiceInstance.pack(service_instance)) == service_instance
def test_promote_member(self):
  manager = ClusterManager(self.client, "/home/my_cluster")

  instance = ServiceInstance(Endpoint("host", 10000))
  member = manager.add_member(instance)

  assert manager.promote_member(member)
  assert not manager.promote_member(member)  # The 2nd promotion is a no-op.

  assert (self.storage.paths["/home/my_cluster/master/member_0000000000"]["data"] ==
          ServiceInstance.pack(instance))
def test_invalid_znode(self):
  instance1 = ServiceInstance(Endpoint("host1", 10000))
  handler1 = CallbackHandler()
  listener1 = ClusterListener(
      self.client,
      "/home/my_cluster",
      instance1,
      handler1.promotion_callback,
      handler1.demotion_callback,
      handler1.master_callback)
  listener1.start()

  self.client.ensure_path("/home/my_cluster/master")
  self.client.create("/home/my_cluster/master/member_", "Invalid Data", sequence=True)

  # Invalid ZNode data translates into a 'None' return.
  assert handler1.detected.get(True, 1) is None
def from_task(self, task, sandbox):
  data = json.loads(task.data)
  cluster_name, port, zk_url = data['cluster'], data['port'], data['zk_url']
  _, servers, path = zookeeper.parse(zk_url)
  zk_client = FakeClient()
  zk_client.start()
  self_instance = ServiceInstance(
      Endpoint(socket.gethostbyname(socket.gethostname()), port))
  task_control = self._task_control_provider.from_task(task, sandbox)

  return MysosTaskRunner(
      self_instance,
      zk_client,
      posixpath.join(path, cluster_name),
      NoopPackageInstaller(),
      task_control,
      Fake())
def test_endpoint_constructor():
  # Check that these do not throw.
  Endpoint('host', 8340)
  Endpoint('host', 8340, '1.2.3.4')
  Endpoint('host', 8340, None, '2001:db8:1234:ffff:ffff:ffff:ffff:ffff')
  Endpoint('host', 8340, '1.2.3.4', '2001:db8:1234:ffff:ffff:ffff:ffff:ffff')

  with pytest.raises(ValueError):
    Endpoint('host', 8340, 'not an IP')
  with pytest.raises(ValueError):
    Endpoint('host', 8340, None, 'not an IPv6')
def from_task(self, task, sandbox):
  data = json.loads(task.data)
  cluster_name, host, port, zk_url = (
      data['cluster'], data['host'], data['port'], data['zk_url'])
  _, servers, path = parse(zk_url)
  kazoo = KazooClient(servers)
  kazoo.start()
  self_instance = ServiceInstance(Endpoint(host, port))

  try:
    task_control = self._task_control_provider.from_task(task, sandbox)
    installer = self._installer_provider.from_task(task, sandbox)
    backup_store = self._backup_store_provider.from_task(task, sandbox)
  except (TaskControl.Error, PackageInstaller.Error) as e:
    raise TaskError(e.message)

  state_manager = StateManager(sandbox, backup_store)

  return MysosTaskRunner(
      self_instance,
      kazoo,
      get_cluster_path(path, cluster_name),
      installer,
      task_control,
      state_manager)
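# get_cluster_path() is called above but not defined in this section. A minimal sketch, assuming
# it simply joins the ZK chroot path with the cluster name; this is consistent with the fake
# runner factory earlier in this section, which uses posixpath.join(path, cluster_name) directly.
import posixpath


def get_cluster_path(zk_root, cluster_name):
  """Returns the ZooKeeper path under which this cluster's group ZNodes live."""
  return posixpath.join(zk_root, cluster_name)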
def test_reparent(self):
  task_control = FakeTaskControl()
  runner = MysosTaskRunner(
      self._self_instance,
      self._client,
      "/home/test/my_cluster",
      NoopPackageInstaller(),
      task_control,
      self._state_manager)

  manager = ClusterManager(self._client, "/home/test/my_cluster")
  runner.start()

  # Promote another instance.
  master = ServiceInstance(Endpoint("another_host", 10000))
  another_member = manager.add_member(master)
  manager.promote_member(another_member)

  assert runner.master.get(True, 1) == master

  assert runner.stop()
  assert deadline(runner.join, Amount(1, Time.SECONDS))
def test_endpoint_inequality():
  assert Endpoint('host', 8340) != Endpoint('xhost', 8341)
def setUp(self):
  self._storage = FakeStorage(SequentialThreadingHandler())
  self._client = FakeClient(storage=self._storage)
  self._client.start()
  self._self_instance = ServiceInstance(Endpoint("host", 10000))
  self._state_manager = FakeStateManager()
def test_endpoint_equality():
  assert Endpoint('host', 8340) == Endpoint('host', 8340)
def test_endpoint_hash_equality():
  assert hash(Endpoint('host', 8340)) == hash(Endpoint('host', 8340))
def status_update(self, status):
  """
    Handle the status update for a task of this cluster.

    NOTE: Duplicate status updates may be handled by either the same scheduler instance or a new
    instance with the restored state.
  """
  with self._lock:
    task_id = status.task_id.value

    if task_id not in self._cluster.tasks:
      log.warn("Ignoring status update for unknown task %s" % task_id)
      return

    task = self._cluster.tasks[task_id]
    previous_state = task.state

    # We don't want to ignore a duplicate update if the previous one was not successfully
    # handled. Therefore, we should not checkpoint the status change until we have finished all
    # operations.
    if previous_state == status.state:
      log.info('Ignoring duplicate status update %s for task %s' % (
          mesos_pb2.TaskState.Name(status.state), task_id))
      return

    if is_terminal(previous_state):
      log.info('Ignoring status update %s for task %s as it is in terminal state %s' % (
          mesos_pb2.TaskState.Name(status.state),
          task_id,
          mesos_pb2.TaskState.Name(previous_state)))
      return

    log.info('Updating state of task %s of cluster %s from %s to %s' % (
        status.task_id.value,
        self.cluster_name,
        mesos_pb2.TaskState.Name(previous_state),
        mesos_pb2.TaskState.Name(status.state)))
    task.state = status.state

    if status.state == mesos_pb2.TASK_RUNNING:
      # Register this cluster member.
      endpoint = Endpoint(
          self._cluster.tasks[task_id].hostname, self._cluster.tasks[task_id].port)

      # If the scheduler fails over after ZK is updated but before the state change is
      # checkpointed, it will receive the same status update again and try to publish a
      # duplicate member to ZK. ClusterManager.add_member() is idempotent and doesn't update ZK
      # in this case.
      member_id = self._cluster_manager.add_member(ServiceInstance(endpoint))
      log.info('Added %s (member id=%s) to cluster %s' % (
          endpoint, member_id, self.cluster_name))
      self._cluster.members[task_id] = member_id

      # Checkpoint the status update here. It's OK if the elector fails to launch later because
      # the new scheduler instance will retry based on the fact that there are running instances
      # of the cluster but no master.
      self._state_provider.dump_cluster_state(self._cluster)

      # If a MySQL master is already elected for this cluster, don't bother adding the new
      # instance to the elector.
      if self._cluster.master_id:
        log.info(
            "MySQL slave task %s on %s started after a master is already elected for cluster %s"
            % (task_id, endpoint.host, self.cluster_name))
        return

      if not self._elector:
        self._elector = self._new_elector()
        # Add current slaves.
        for t in self._cluster.running_tasks:
          self._elector.add_slave(t.task_id, t.mesos_slave_id)
        self._elector.start()
      else:
        self._elector.add_slave(task_id, status.slave_id.value)
    elif status.state == mesos_pb2.TASK_FINISHED:
      raise self.Error("Task %s is in unexpected state %s with message '%s'" % (
          status.task_id.value,
          mesos_pb2.TaskState.Name(status.state),
          status.message))
    elif is_terminal(status.state):
      if status.state == mesos_pb2.TASK_KILLED:
        log.info("Task %s was successfully killed" % status.task_id.value)
      else:
        log.error("Task %s is now in terminal state %s with message '%s'" % (
            status.task_id.value,
            mesos_pb2.TaskState.Name(status.state),
            status.message))
      del self._cluster.tasks[task_id]

      if task_id in self._cluster.members:
        member_id = self._cluster.members[task_id]
        del self._cluster.members[task_id]

        # If the scheduler fails over after ZK is updated but before its result is persisted,
        # it will receive the same status update and try to remove the non-existent member.
        # Removing a non-existent member is a no-op for ClusterManager.remove_member(). Note
        # that if the order is reversed, the scheduler will fail to clean up the orphan ZK
        # entry.
        self._cluster_manager.remove_member(member_id)

        if member_id == self._cluster.master_id:
          self._cluster.master_id = None
          log.info("Master of cluster %s has terminated. Restarting election" %
              self.cluster_name)

          assert not self._elector, "Election must not be running since there is a current master"
          self._elector = self._new_elector()

          # Add current slaves after removing the terminated task.
          for t in self._cluster.running_tasks:
            self._elector.add_slave(t.task_id, t.mesos_slave_id)
          self._elector.start()
        else:
          # It will be rescheduled next time the launcher is given an offer.
          log.info("Slave %s of cluster %s has terminated" % (task_id, self.cluster_name))
      else:
        assert previous_state != mesos_pb2.TASK_RUNNING, (
            "Task must exist in ClusterManager if it was running")
        log.warn("Slave %s of cluster %s failed to start running" % (
            task_id, self.cluster_name))

      if self.terminated:
        log.info("Shutting down launcher for cluster %s" % self.cluster_name)
        self._shutdown()
        return

    # Finally, checkpoint the status update.
    self._state_provider.dump_cluster_state(self._cluster)
    log.info("Checkpointed the status update for task %s of cluster %s" % (
        task_id, self.cluster_name))
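# is_terminal() is referenced throughout status_update() but not defined in this section. A
# minimal sketch, assuming it simply checks for the standard Mesos terminal task states; the
# real helper may differ in which states it covers.
from mesos.interface import mesos_pb2  # Assumed import path, matching the mesos_pb2 usage above.


def is_terminal(state):
  """Returns True if a task in this state will never transition to another state."""
  return state in (
      mesos_pb2.TASK_FINISHED,
      mesos_pb2.TASK_FAILED,
      mesos_pb2.TASK_KILLED,
      mesos_pb2.TASK_LOST,
  )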
def test_endpoint_hash_inequality():
  assert hash(Endpoint('host', 8340)) != hash(Endpoint('xhost', 8341))