Example #1
    def test_existing_zk(self):
        """
      ClusterManager needs to be able to recover from an existing ZK group for scheduler failover.
    """
        manager = ClusterManager(self.client, "/home/my_cluster")

        instance1 = ServiceInstance(Endpoint("host1", 10000))
        member1 = manager.add_member(instance1)
        instance2 = ServiceInstance(Endpoint("host2", 10000))
        member2 = manager.add_member(instance2)

        assert (self.storage.paths["/home/my_cluster/slaves/member_0000000000"]
                ["data"] == ServiceInstance.pack(instance1))
        assert (self.storage.paths["/home/my_cluster/slaves/member_0000000001"]
                ["data"] == ServiceInstance.pack(instance2))

        manager.promote_member(member1)

        # Test the new ClusterManager.
        manager2 = ClusterManager(self.client, "/home/my_cluster")
        assert len(manager2._cluster.members) == 2
        assert member1 in manager2._cluster.members
        assert member2 in manager2._cluster.members
        assert manager2._cluster.members[member1] == ServiceInstance.pack(
            instance1)
Example #2
    def test_callbacks(self):
        manager = ClusterManager(self.client, "/home/my_cluster")

        # Set up 2 listeners.
        instance1 = ServiceInstance(Endpoint("host1", 10000))
        handler1 = CallbackHandler()
        listener1 = ClusterListener(self.client, "/home/my_cluster", instance1,
                                    handler1.promotion_callback,
                                    handler1.demotion_callback,
                                    handler1.master_callback,
                                    handler1.termination_callback)
        listener1.start()
        member1 = manager.add_member(instance1)

        instance2 = ServiceInstance(Endpoint("host2", 10000))
        handler2 = CallbackHandler()
        listener2 = ClusterListener(self.client, "/home/my_cluster", instance2,
                                    handler2.promotion_callback,
                                    handler2.demotion_callback,
                                    handler2.master_callback)
        listener2.start()
        member2 = manager.add_member(instance2)

        # Test promotion.
        manager.promote_member(member1)

        assert handler1.promoted.wait(1)
        assert handler2.detected.get(True, 1) == instance1

        assert (self.storage.paths["/home/my_cluster/master/member_0000000000"]
                ["data"] == ServiceInstance.pack(instance1))
        assert (self.storage.paths["/home/my_cluster/slaves/member_0000000001"]
                ["data"] == ServiceInstance.pack(instance2))

        manager.promote_member(member2)

        assert handler1.demoted.wait(1)
        assert handler2.promoted.wait(1)

        assert (self.storage.paths["/home/my_cluster/master/member_0000000001"]
                ["data"] == ServiceInstance.pack(instance2))
        assert "/home/my_cluster/master/member_0000000000" not in self.storage.paths

        manager.remove_member(member2)
        assert handler2.demoted.wait(1)

        # Test removing cluster.
        manager.remove_member(member1)
        manager.delete_cluster()
        assert handler1.terminated.wait(1)
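CallbackHandler itself does not appear in these examples. Below is a minimal sketch that is consistent with how test_callbacks drives it (threading.Event flags for promotion, demotion, and termination, plus a Queue of detected masters); the class body is an assumption reconstructed from the calls above, not the original helper:

import threading
from Queue import Queue  # Python 2, matching the rest of these examples


class CallbackHandler(object):
    """Hypothetical reconstruction of the handler used in test_callbacks."""

    def __init__(self):
        self.promoted = threading.Event()
        self.demoted = threading.Event()
        self.terminated = threading.Event()
        self.detected = Queue()  # ServiceInstances announced via master_callback.

    def promotion_callback(self):
        self.promoted.set()

    def demotion_callback(self):
        self.demoted.set()

    def master_callback(self, master):
        self.detected.put(master)

    def termination_callback(self):
        self.terminated.set()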
Example #3
  def test_demote(self):
    task_control = FakeTaskControl()
    runner = MysosTaskRunner(
        self._self_instance,
        self._client,
        "/home/test/my_cluster",
        NoopPackageInstaller(),
        task_control,
        self._state_manager)

    manager = ClusterManager(self._client, "/home/test/my_cluster")
    runner.start()

    self_member = manager.add_member(self._self_instance)

    # 'self_instance' becomes the master.
    manager.promote_member(self_member)

    assert runner.promoted.wait(1)

    another_member = manager.add_member(ServiceInstance(Endpoint("another_host", 10000)))

    # This demotes 'self_instance', which should cause runner to stop.
    manager.promote_member(another_member)

    assert deadline(runner.join, Amount(1, Time.SECONDS))
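deadline, Amount, and Time are not defined in this snippet; they most likely come from the twitter.commons packages that Mysos builds on (an assumption about the exact module paths):

from twitter.common.concurrent import deadline
from twitter.common.quantity import Amount, Time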
Example #4
    def test_add_member(self):
        manager = ClusterManager(self.client, "/home/my_cluster")

        instance1 = ServiceInstance(Endpoint("host1", 10000))
        member1 = manager.add_member(instance1)
        # The second insertion is ignored.
        assert member1 == manager.add_member(instance1)

        instance2 = ServiceInstance(Endpoint("host2", 10000))
        manager.add_member(instance2)

        assert len(manager._cluster.members) == 2

        assert (self.storage.paths["/home/my_cluster/slaves/member_0000000000"]
                ["data"] == ServiceInstance.pack(instance1))
        assert (self.storage.paths["/home/my_cluster/slaves/member_0000000001"]
                ["data"] == ServiceInstance.pack(instance2))
Example #5
    def test_remove_cluster(self):
        manager = ClusterManager(self.client, "/home/my_cluster")

        instance1 = ServiceInstance(Endpoint("host1", 10000))
        member1 = manager.add_member(instance1)
        instance2 = ServiceInstance(Endpoint("host2", 10000))
        member2 = manager.add_member(instance2)

        manager.promote_member(member1)

        with pytest.raises(ClusterManager.Error):
            manager.delete_cluster()

        manager.remove_member(member1)
        manager.remove_member(member2)
        manager.delete_cluster()

        assert "/home/my_cluster" not in self.storage.paths
Example #6
    def test_remove_member(self):
        manager = ClusterManager(self.client, "/home/my_cluster")
        instance = ServiceInstance(Endpoint("host", 10000))
        member = manager.add_member(instance)

        assert manager.remove_member(member)
        # The second deletion is ignored.
        assert not manager.remove_member(member)

        assert "/home/my_cluster/master/member_0000000000" not in self.storage.paths
Example #7
    def test_promote_member(self):
        manager = ClusterManager(self.client, "/home/my_cluster")
        instance = ServiceInstance(Endpoint("host", 10000))
        member = manager.add_member(instance)

        assert manager.promote_member(member)
        # The second promotion is a no-op.
        assert not manager.promote_member(member)

        assert (self.storage.paths["/home/my_cluster/master/member_0000000000"]
                ["data"] == ServiceInstance.pack(instance))
Example #8
    def from_task(self, task, sandbox):
        data = json.loads(task.data)
        cluster_name, port, zk_url = data['cluster'], data['port'], data['zk_url']

        _, servers, path = zookeeper.parse(zk_url)

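        # Note: the parsed 'servers' are deliberately unused; FakeClient keeps all
        # state in memory, so only the chroot 'path' matters in this provider.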
        zk_client = FakeClient()
        zk_client.start()
        self_instance = ServiceInstance(
            Endpoint(socket.gethostbyname(socket.gethostname()), port))
        task_control = self._task_control_provider.from_task(task, sandbox)

        return MysosTaskRunner(self_instance, zk_client,
                               posixpath.join(path, cluster_name),
                               NoopPackageInstaller(), task_control, Fake())
Example #9
    def test_invalid_znode(self):
        instance1 = ServiceInstance(Endpoint("host1", 10000))
        handler1 = CallbackHandler()
        listener1 = ClusterListener(self.client, "/home/my_cluster", instance1,
                                    handler1.promotion_callback,
                                    handler1.demotion_callback,
                                    handler1.master_callback)
        listener1.start()

        self.client.ensure_path("/home/my_cluster/master")
        self.client.create("/home/my_cluster/master/member_",
                           "Invalid Data",
                           sequence=True)

        # Invalid ZNode data translates into a 'None' return.
        assert handler1.detected.get(True, 1) is None
Example #10
def test_service_instance_to_json():
    json = """{
    "additionalEndpoints": {
        "aurora": {
            "host": "hostname",
            "inet6": "2001:db8:1234:ffff:ffff:ffff:ffff:ffff",
            "port": 22
        },
        "health": {
            "host": "hostname",
            "inet": "1.2.3.4",
            "port": 23
        },
        "http": {
            "host": "hostname",
            "inet": "1.2.3.4",
            "inet6": "2001:db8:1234:ffff:ffff:ffff:ffff:ffff",
            "port": 23
        }
    },
    "serviceEndpoint": {
        "host": "hostname",
        "port": 24
    },
    "shard": 1,
    "status": "ALIVE"
  }"""
    service_instance = ServiceInstance(
        Endpoint("hostname", 24),
        {
            # These must mirror the JSON above: "aurora" is IPv6-only,
            # "health" is IPv4-only, and "http" carries both addresses.
            "aurora": Endpoint("hostname", 22, None,
                               "2001:db8:1234:ffff:ffff:ffff:ffff:ffff"),
            "health": Endpoint("hostname", 23, "1.2.3.4"),
            "http": Endpoint("hostname", 23, "1.2.3.4",
                             "2001:db8:1234:ffff:ffff:ffff:ffff:ffff"),
        },
        'ALIVE',
        1)

    assert ServiceInstance.unpack(json) == service_instance
    assert ServiceInstance.unpack(
        ServiceInstance.pack(service_instance)) == service_instance
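The pairing above assumes the positional signature Endpoint(host, port, inet=None, inet6=None); that is the only reading under which the "http" entry lines up with its JSON. The assumed mapping, for illustration:

# Assumed signature: Endpoint(host, port, inet=None, inet6=None).
Endpoint("hostname", 22, None, "2001:db8::1")       # IPv6 only -> host, inet6, port
Endpoint("hostname", 23, "1.2.3.4")                 # IPv4 only -> host, inet, port
Endpoint("hostname", 23, "1.2.3.4", "2001:db8::1")  # both      -> host, inet, inet6, port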
Example #11
    def from_task(self, task, sandbox):
        data = json.loads(task.data)
        cluster_name, host, port, zk_url = (
            data['cluster'], data['host'], data['port'], data['zk_url'])
        _, servers, path = parse(zk_url)
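        # Unlike the FakeClient-based provider above, this connects to the real
        # ZK ensemble parsed from the task's zk_url.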
        kazoo = KazooClient(servers)
        kazoo.start()
        self_instance = ServiceInstance(Endpoint(host, port))

        try:
            task_control = self._task_control_provider.from_task(task, sandbox)
            installer = self._installer_provider.from_task(task, sandbox)
            backup_store = self._backup_store_provider.from_task(task, sandbox)
        except (TaskControl.Error, PackageInstaller.Error) as e:
            raise TaskError(e.message)

        state_manager = StateManager(sandbox, backup_store)

        return MysosTaskRunner(self_instance, kazoo,
                               get_cluster_path(path, cluster_name), installer,
                               task_control, state_manager)
Example #12
  def test_reparent(self):
    task_control = FakeTaskControl()
    runner = MysosTaskRunner(
        self._self_instance,
        self._client,
        "/home/test/my_cluster",
        NoopPackageInstaller(),
        task_control,
        self._state_manager)

    manager = ClusterManager(self._client, "/home/test/my_cluster")
    runner.start()

    # Promote another instance.
    master = ServiceInstance(Endpoint("another_host", 10000))
    another_member = manager.add_member(master)
    manager.promote_member(another_member)

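    # The runner observes the newly promoted master through its 'master' queue.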
    assert runner.master.get(True, 1) == master

    assert runner.stop()
    assert deadline(runner.join, Amount(1, Time.SECONDS))
Example #13
    def status_update(self, status):
        """
      Handle the status update for a task of this cluster.

      NOTE:
        Duplicate status updates may be handled by either the same scheduler instance or a new
        instance with the restored state.
    """
        with self._lock:
            task_id = status.task_id.value

            if task_id not in self._cluster.tasks:
                log.warn("Ignoring status update for unknown task %s" %
                         task_id)
                return

            task = self._cluster.tasks[task_id]
            previous_state = task.state

            # We don't want to ignore a duplicate update if the previous one was not successfully handled.
            # Therefore, we should not checkpoint the status change until we have finished all operations.
            if previous_state == status.state:
                log.info('Ignoring duplicate status update %s for task %s' %
                         (mesos_pb2.TaskState.Name(status.state), task_id))
                return

            if is_terminal(previous_state):
                log.info(
                    'Ignoring status update %s for task %s as it is in terminal state %s'
                    % (mesos_pb2.TaskState.Name(status.state), task_id,
                       mesos_pb2.TaskState.Name(previous_state)))
                return

            log.info('Updating state of task %s of cluster %s from %s to %s' %
                     (status.task_id.value, self.cluster_name,
                      mesos_pb2.TaskState.Name(previous_state),
                      mesos_pb2.TaskState.Name(status.state)))
            task.state = status.state

            if status.state == mesos_pb2.TASK_RUNNING:
                # Register this cluster member.
                endpoint = Endpoint(self._cluster.tasks[task_id].hostname,
                                    self._cluster.tasks[task_id].port)

                # If the scheduler fails over after ZK is updated but before the state change is
                # checkpointed, it will receive the same status update again and try to publish a duplicate
                # member to ZK. ClusterManager.add_member() is idempotent and doesn't update ZK in this
                # case.
                member_id = self._cluster_manager.add_member(
                    ServiceInstance(endpoint))
                log.info('Added %s (member id=%s) to cluster %s' %
                         (endpoint, member_id, self.cluster_name))
                self._cluster.members[task_id] = member_id

                # Checkpoint the status update here. It's OK if the elector fails to launch later because
                # the new scheduler instance will retry based on the fact that there are running instances
                # of the cluster but no master.
                self._state_provider.dump_cluster_state(self._cluster)

                # If MySQL master is already elected for this cluster don't bother adding it to the elector.
                if self._cluster.master_id:
                    log.info(
                        "MySQL slave task %s on %s started after a master is already elected for cluster %s"
                        % (task_id, endpoint.host, self.cluster_name))
                    return

                if not self._elector:
                    self._elector = self._new_elector()
                    # Add current slaves.
                    for t in self._cluster.running_tasks:
                        self._elector.add_slave(t.task_id, t.mesos_slave_id)
                    self._elector.start()
                else:
                    self._elector.add_slave(task_id, status.slave_id.value)
            elif status.state == mesos_pb2.TASK_FINISHED:
                raise self.Error(
                    "Task %s is in unexpected state %s with message '%s'" %
                    (status.task_id.value,
                     mesos_pb2.TaskState.Name(status.state), status.message))
            elif is_terminal(status.state):
                if status.state == mesos_pb2.TASK_KILLED:
                    log.info("Task %s was successfully killed" %
                             status.task_id.value)
                else:
                    log.error(
                        "Task %s is now in terminal state %s with message '%s'"
                        % (status.task_id.value,
                           mesos_pb2.TaskState.Name(
                               status.state), status.message))
                del self._cluster.tasks[task_id]

                if task_id in self._cluster.members:
                    member_id = self._cluster.members[task_id]
                    del self._cluster.members[task_id]

                    # If the scheduler fails over after ZK is updated but before its result is persisted, it
                    # will receive the same status update and try to remove the non-existent member.
                    # Removing a non-existent member is a no-op for ClusterManager.remove_member().
                    # Note that if the order is reversed, the scheduler will fail to clean up the orphan ZK
                    # entry.
                    self._cluster_manager.remove_member(member_id)

                    if member_id == self._cluster.master_id:
                        self._cluster.master_id = None
                        log.info(
                            "Master of cluster %s has terminated. Restarting election"
                            % self.cluster_name)

                        assert not self._elector, "Election must not be running since there is a current master"
                        self._elector = self._new_elector()

                        # Add current slaves after removing the terminated task.
                        for t in self._cluster.running_tasks:
                            self._elector.add_slave(t.task_id,
                                                    t.mesos_slave_id)
                        self._elector.start()
                    else:
                        # It will be rescheduled next time the launcher is given an offer.
                        log.info("Slave %s of cluster %s has terminated" %
                                 (task_id, self.cluster_name))
                else:
                    assert previous_state != mesos_pb2.TASK_RUNNING, (
                        "Task must exist in ClusterManager if it was running")
                    log.warn("Slave %s of cluster %s failed to start running" %
                             (task_id, self.cluster_name))

                if self.terminated:
                    log.info("Shutting down launcher for cluster %s" %
                             self.cluster_name)
                    self._shutdown()
                    return

                # Finally, checkpoint the status update.
                self._state_provider.dump_cluster_state(self._cluster)
                log.info(
                    "Checkpointed the status update for task %s of cluster %s"
                    % (task_id, self.cluster_name))
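The idempotency these comments rely on is exactly what the ClusterManager tests earlier on this page assert; condensed below, reusing the manager and instance names from that fixture:

member = manager.add_member(instance)
assert manager.add_member(instance) == member  # Duplicate publish: no extra ZK write.
assert manager.promote_member(member)
assert not manager.promote_member(member)      # Re-promotion is a no-op.
assert manager.remove_member(member)
assert not manager.remove_member(member)       # Removing a non-existent member is safe.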
Example #14
  def setUp(self):
    self._storage = FakeStorage(SequentialThreadingHandler())
    self._client = FakeClient(storage=self._storage)
    self._client.start()
    self._self_instance = ServiceInstance(Endpoint("host", 10000))
    self._state_manager = FakeStateManager()
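FakeClient and FakeStorage look like the in-memory Kazoo doubles from the zake library (an assumption; the names and constructor arguments match zake's API). Under that assumption, this fixture's imports would be roughly:

from kazoo.handlers.threading import SequentialThreadingHandler
from zake.fake_client import FakeClient
from zake.fake_storage import FakeStorage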