Example #1
    def setUp(self):
        self._driver = FakeDriver()
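        # zake's FakeStorage + FakeClient stand in for a real ZooKeeper ensemble, entirely in memory.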
        self._storage = FakeStorage(SequentialThreadingHandler())
        self._zk_client = FakeClient(storage=self._storage)
        self._zk_client.start()

        self._framework_id = mesos_pb2.FrameworkID()
        self._framework_id.value = "framework_id_0"

        self._offer = mesos_pb2.Offer()
        self._offer.id.value = "offer_id_0"
        self._offer.framework_id.value = self._framework_id.value
        self._offer.slave_id.value = "slave_id_0"
        self._offer.hostname = "localhost"

        resources = create_resources(cpus=4,
                                     mem=512 * 3,
                                     ports=set([10000, 10001, 10002]))
        self._offer.resources.extend(resources)

        self._framework_user = "******"

        self._zk_url = "zk://host/mysos/test"
        self._cluster = MySQLCluster("cluster0", "user", "pass", 3)

        self._tmpdir = tempfile.mkdtemp()
        self._state_provider = LocalStateProvider(self._tmpdir)

        framework_info = mesos_pb2.FrameworkInfo(user=getpass.getuser(),
                                                 name="mysos",
                                                 checkpoint=False)
        self._state = Scheduler(framework_info)
Example #2
def test_scheduler_runs():
  """
    Verifies that the scheduler successfully launches 3 "no-op" MySQL tasks.
    NOTE: Due to the limitation of zake the scheduler's ZK operations are not propagated to
    executors in separate processes but they are unit-tested separately.
  """
  import mesos.native

  # Make sure fake_mysos_executor.pex is available to be fetched by Mesos slave.
  assert os.path.isfile('dist/fake_mysos_executor.pex')

  storage = FakeStorage(SequentialThreadingHandler())
  zk_client = FakeClient(storage=storage)
  zk_client.start()

  zk_url = "zk://fake_host/home/mysos/clusters"
  cluster_name = "test_cluster"
  num_nodes = 3

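  # Scheduler state is persisted under a throwaway temp directory for the duration of the test.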
  state_provider = LocalStateProvider(safe_mkdtemp())

  framework_info = FrameworkInfo(
      user=getpass.getuser(),
      name="mysos",
      checkpoint=False)

  state = Scheduler(framework_info)

  scheduler = MysosScheduler(
      state,
      state_provider,
      getpass.getuser(),
      os.path.abspath("dist/fake_mysos_executor.pex"),
      "./fake_mysos_executor.pex",
      zk_client,
      zk_url,
      Amount(40, Time.SECONDS),
      "/fakepath",
      gen_encryption_key())

  scheduler_driver = mesos.native.MesosSchedulerDriver(
      scheduler,
      framework_info,
      "local")
  scheduler_driver.start()

  # Wait until the scheduler is connected and becomes available.
  assert scheduler.connected.wait(30)

  scheduler.create_cluster(cluster_name, "mysql_user", num_nodes)

  # A slave is promoted to be the master.
  deadline(
      lambda: wait_for_master(
          get_cluster_path(posixpath.join(zk_url, 'discover'), cluster_name),
          zk_client),
      Amount(40, Time.SECONDS))

  assert scheduler_driver.stop() == DRIVER_STOPPED
Example #3
    def test_invalid_arguments(self):
        client = FakeClient()
        client.start()
        manager = ClusterManager(client, "/home/my_cluster")

        with pytest.raises(ValueError) as e:
            manager.promote_member("123")
        assert e.value.message == "Invalid member_id: 123"
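The examples above all follow the same pattern, so here is a minimal, self-contained sketch of it (the paths and payloads are illustrative, not taken from any of the projects above): zake's FakeClient mimics kazoo's KazooClient API against purely in-memory storage, which lets ZooKeeper-backed code run in unit tests without a real ensemble.

from kazoo.handlers.threading import SequentialThreadingHandler
from zake.fake_client import FakeClient
from zake.fake_storage import FakeStorage

# A shared in-memory "ensemble": clients created with the same storage
# observe each other's znodes, like real clients on one ZooKeeper cluster.
storage = FakeStorage(SequentialThreadingHandler())
client = FakeClient(storage=storage)
client.start()

client.ensure_path("/home/my_cluster")
# Sequence nodes get a zero-padded counter appended, e.g. member_0000000000.
client.create("/home/my_cluster/member_", b"host1:10000", sequence=True)

assert client.get_children("/home/my_cluster") == ["member_0000000000"]

client.stop()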
Example #4
    def setup(self, request):
        self._driver = FakeDriver()
        self._storage = FakeStorage(SequentialThreadingHandler())
        self._zk_client = FakeClient(storage=self._storage)
        self._zk_client.start()

        self._offer = mesos_pb2.Offer()
        self._offer.id.value = "offer_id_0"
        self._offer.framework_id.value = "framework_id_0"
        self._offer.slave_id.value = "slave_id_0"
        self._offer.hostname = "localhost"

        # Enough memory and ports to fit three tasks.
        resources = create_resources(cpus=4,
                                     mem=512 * 3,
                                     ports=set([10000, 10001, 10002]))
        self._offer.resources.extend(resources)

        self._framework_user = "******"

        # Some tests use the default launcher; some don't.
        self._zk_url = "zk://host/mysos/test"
        self._cluster = MySQLCluster("cluster0", "user", "pass", 3)

        # Construct the state provider based on the test parameter.
        if request.param == LocalStateProvider:
            tmpdir = tempfile.mkdtemp()
            self._state_provider = LocalStateProvider(tmpdir)
            request.addfinalizer(lambda: shutil.rmtree(tmpdir, True)
                                 )  # Clean up after ourselves.
        elif request.param == ZooKeeperStateProvider:
            self._state_provider = ZooKeeperStateProvider(
                self._zk_client, "/mysos/test")

        self._launcher = MySQLClusterLauncher(
            self._driver,
            self._cluster,
            self._state_provider,
            self._zk_url,
            self._zk_client,
            self._framework_user,
            "./executor.pex",
            "cmd.sh",
            Amount(5, Time.SECONDS),
            "/etc/mysos/admin_keyfile.yml",
            query_interval=Amount(150, Time.MILLISECONDS))  # Short interval.

        self._elected = threading.Event()
        self._launchers = [self._launcher]  # See teardown().

        request.addfinalizer(self.teardown)
Example #5
def test_scheduler_runs():
    """
    Verifies that the scheduler successfully launches 3 "no-op" MySQL tasks.
    NOTE: Due to the limitation of zake the scheduler's ZK operations are not propagated to
    executors in separate processes but they are unit-tested separately.
  """
    import mesos.native

    # Make sure fake_mysos_executor.pex is available to be fetched by Mesos slave.
    assert os.path.isfile('dist/fake_mysos_executor.pex')

    storage = FakeStorage(SequentialThreadingHandler())
    zk_client = FakeClient(storage=storage)
    zk_client.start()

    zk_url = "zk://fake_host/home/mysos/clusters"
    cluster_name = "test_cluster"
    num_nodes = 3

    state_provider = LocalStateProvider(safe_mkdtemp())

    framework_info = FrameworkInfo(user=getpass.getuser(),
                                   name="mysos",
                                   checkpoint=False)

    state = Scheduler(framework_info)

    scheduler = MysosScheduler(state, state_provider, getpass.getuser(),
                               os.path.abspath("dist/fake_mysos_executor.pex"),
                               "./fake_mysos_executor.pex", zk_client, zk_url,
                               Amount(40, Time.SECONDS), "/fakepath",
                               gen_encryption_key())

    scheduler_driver = mesos.native.MesosSchedulerDriver(
        scheduler, framework_info, "local")
    scheduler_driver.start()

    # Wait until the scheduler is connected and becomes available.
    assert scheduler.connected.wait(30)

    scheduler.create_cluster(cluster_name, "mysql_user", num_nodes)

    # A slave is promoted to be the master.
    deadline(
        lambda: wait_for_master(
            get_cluster_path(posixpath.join(zk_url, 'discover'), cluster_name),
            zk_client), Amount(40, Time.SECONDS))

    assert scheduler_driver.stop() == DRIVER_STOPPED
Example #6
  def setUp(self):
    self._driver = FakeDriver()
    self._storage = FakeStorage(SequentialThreadingHandler())
    self._zk_client = FakeClient(storage=self._storage)
    self._zk_client.start()

    self._framework_id = mesos_pb2.FrameworkID()
    self._framework_id.value = "framework_id_0"

    self._offer = mesos_pb2.Offer()
    self._offer.id.value = "offer_id_0"
    self._offer.framework_id.value = self._framework_id.value
    self._offer.slave_id.value = "slave_id_0"
    self._offer.hostname = "localhost"

    resources = create_resources(cpus=4, mem=512 * 3, ports=set([10000, 10001, 10002]))
    self._offer.resources.extend(resources)

    self._framework_user = "******"

    self._zk_url = "zk://host/mysos/test"
    self._cluster = MySQLCluster("cluster0", "user", "pass", 3)

    self._tmpdir = tempfile.mkdtemp()
    self._state_provider = LocalStateProvider(self._tmpdir)

    framework_info = mesos_pb2.FrameworkInfo(
        user=getpass.getuser(),
        name="mysos",
        checkpoint=False)
    self._state = Scheduler(framework_info)
Example #7
    def from_task(self, task, sandbox):
        data = json.loads(task.data)
        cluster_name, port, zk_url = data['cluster'], data['port'], data['zk_url']

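        # Split the zk:// URL into its credentials, server list and chroot path; only the path matters here.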
        _, servers, path = zookeeper.parse(zk_url)

        zk_client = FakeClient()
        zk_client.start()
        self_instance = ServiceInstance(
            Endpoint(socket.gethostbyname(socket.gethostname()), port))
        task_control = self._task_control_provider.from_task(task, sandbox)

        return MysosTaskRunner(self_instance, zk_client,
                               posixpath.join(path, cluster_name),
                               NoopPackageInstaller(), task_control, Fake())
Example #8
  def from_task(self, task, sandbox):
    data = json.loads(task.data)
    cluster_name, port, zk_url = data['cluster'], data['port'], data['zk_url']

    _, servers, path = zookeeper.parse(zk_url)

    zk_client = FakeClient()
    zk_client.start()
    self_instance = ServiceInstance(Endpoint(socket.gethostbyname(socket.gethostname()), port))
    task_control = self._task_control_provider.from_task(task, sandbox)

    return MysosTaskRunner(
        self_instance,
        zk_client,
        posixpath.join(path, cluster_name),
        NoopPackageInstaller(),
        task_control,
        Fake())
Example #9
  def setup(self, request):
    self._driver = FakeDriver()
    self._storage = FakeStorage(SequentialThreadingHandler())
    self._zk_client = FakeClient(storage=self._storage)
    self._zk_client.start()

    self._offer = mesos_pb2.Offer()
    self._offer.id.value = "offer_id_0"
    self._offer.framework_id.value = "framework_id_0"
    self._offer.slave_id.value = "slave_id_0"
    self._offer.hostname = "localhost"

    # Enough memory and ports to fit three tasks.
    resources = create_resources(cpus=4, mem=512 * 3, ports=set([10000, 10001, 10002]))
    self._offer.resources.extend(resources)

    self._framework_user = "******"

    # Some tests use the default launcher; some don't.
    self._zk_url = "zk://host/mysos/test"

    self._scheduler_key = gen_encryption_key()
    self._password_box = PasswordBox(self._scheduler_key)

    self._cluster = MySQLCluster("cluster0", "user", self._password_box.encrypt("pass"), 3)

    # Construct the state provider based on the test parameter.
    if request.param == LocalStateProvider:
      tmpdir = tempfile.mkdtemp()
      self._state_provider = LocalStateProvider(tmpdir)
      request.addfinalizer(lambda: shutil.rmtree(tmpdir, True))  # Clean up after ourselves.
    elif request.param == ZooKeeperStateProvider:
      self._state_provider = ZooKeeperStateProvider(self._zk_client, "/mysos/test")

    self._launcher = MySQLClusterLauncher(
        self._driver,
        self._cluster,
        self._state_provider,
        self._zk_url,
        self._zk_client,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        self._scheduler_key,
        query_interval=Amount(150, Time.MILLISECONDS))  # Short interval.

    self._elected = threading.Event()
    self._launchers = [self._launcher]  # See teardown().

    request.addfinalizer(self.teardown)
Example #10
    def test_return_immediately_when_blocking_on_empty_queue_and_available_task_comes_in(
        self
    ):
        client = FakeClient()
        client.start()
        queue = ZKDelayDeadlineQueue(client, "/")

        """
        Set up several threads waiting for work; insert several pieces of work; make sure each thread finishes.
        """
        tpe = ThreadPoolExecutor()

        def time_get():
            queue = ZKDelayDeadlineQueue(client, "/")
            start_time = time.time()
            with queue.get(timeout=1.0) as si:
                pass
            return time.time() - start_time, si

        fut1 = tpe.submit(time_get)
        fut2 = tpe.submit(time_get)
        fut3 = tpe.submit(time_get)

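        # Three work items become available 0ms, 10ms and 20ms from now; each
        # waiting thread should receive exactly one, in order.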
        begin = time.time()
        si1 = make_si(wait_until=begin, bounce_by=begin)
        queue.put(si1)
        si2 = make_si(wait_until=begin + 0.01, bounce_by=begin + 0.01)
        queue.put(si2)
        si3 = make_si(wait_until=begin + 0.02, bounce_by=begin + 0.02)
        queue.put(si3)

        times = sorted([x.result(timeout=2.0) for x in [fut1, fut2, fut3]])
        assert times[0][0] < 0.011
        assert times[0][1] == si1
        assert 0.009 < times[1][0] < 0.021
        assert times[1][1] == si2
        assert 0.019 < times[2][0] < 0.031
        assert times[2][1] == si3
Example #11
async def test_functional():
    """Test as much of the whole stack as we can."""
    config = {
        'deadman': {
            'plugins':
            'zgres#zookeeper\nzgres#apt\nzgres#ec2-snapshot\nzgres#ec2\nzgres#follow-the-leader\nzgres#select-furthest-ahead-replica',
        },
        'apt': {
            'postgresql_cluster_name': 'main',
            'postgresql_version': '9.5',
        },
    }
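    # Patch KazooClient so the app is wired to the in-memory fake instead of a real ensemble.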
    zk = FakeClient()
    with mock.patch('zgres.zookeeper.KazooClient') as KazooClient, \
            mock.patch('zgres.ec2.boto.utils.get_instance_metadata'):
        KazooClient.return_value = zk
        app = deadman.App(config)
Example #12
class TestScheduler(unittest.TestCase):
    def setUp(self):
        self._driver = FakeDriver()
        self._storage = FakeStorage(SequentialThreadingHandler())
        self._zk_client = FakeClient(storage=self._storage)
        self._zk_client.start()

        self._framework_id = mesos_pb2.FrameworkID()
        self._framework_id.value = "framework_id_0"

        self._offer = mesos_pb2.Offer()
        self._offer.id.value = "offer_id_0"
        self._offer.framework_id.value = self._framework_id.value
        self._offer.slave_id.value = "slave_id_0"
        self._offer.hostname = "localhost"

        resources = create_resources(cpus=DEFAULT_TASK_CPUS * 3,
                                     mem=DEFAULT_TASK_MEM * 3,
                                     disk=DEFAULT_TASK_DISK * 3,
                                     ports=set([10000, 10001, 10002]))
        self._offer.resources.extend(resources)

        self._framework_user = "******"

        self._zk_url = "zk://host/mysos/test"
        self._cluster = MySQLCluster("cluster0", "user", "pass", 3,
                                     DEFAULT_TASK_CPUS, DEFAULT_TASK_MEM,
                                     DEFAULT_TASK_DISK)

        self._tmpdir = tempfile.mkdtemp()
        self._state_provider = LocalStateProvider(self._tmpdir)

        framework_info = mesos_pb2.FrameworkInfo(user=getpass.getuser(),
                                                 name="mysos",
                                                 checkpoint=False)
        self._state = Scheduler(framework_info)

    def tearDown(self):
        shutil.rmtree(self._tmpdir, True)  # Clean up after ourselves.

    def test_scheduler_recovery(self):
        scheduler_key = gen_encryption_key()
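        # The same key is reused after the simulated restart so recovered cluster passwords still decrypt.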

        scheduler1 = MysosScheduler(self._state, self._state_provider,
                                    self._framework_user, "./executor.pex",
                                    "cmd.sh", self._zk_client, self._zk_url,
                                    Amount(5, Time.SECONDS),
                                    "/etc/mysos/admin_keyfile.yml",
                                    scheduler_key)
        scheduler1.registered(self._driver, self._framework_id, object())
        scheduler1.create_cluster("cluster1", "mysql_user", 3)
        scheduler1.resourceOffers(self._driver, [self._offer])

        # One task is launched for one offer.
        assert len(scheduler1._launchers["cluster1"]._cluster.tasks) == 1

        with pytest.raises(MysosScheduler.ClusterExists):
            scheduler1.create_cluster("cluster1", "mysql_user", 3)

        # FrameworkID should have been persisted.
        self._state = self._state_provider.load_scheduler_state()
        assert self._state.framework_info.id.value == self._framework_id.value

        # Simulate restart.
        scheduler2 = MysosScheduler(self._state, self._state_provider,
                                    self._framework_user, "./executor.pex",
                                    "cmd.sh", self._zk_client, self._zk_url,
                                    Amount(5, Time.SECONDS),
                                    "/etc/mysos/admin_keyfile.yml",
                                    scheduler_key)

        # Scheduler always receives registered() with the same FrameworkID after failover.
        scheduler2.registered(self._driver, self._framework_id, object())

        assert len(scheduler2._launchers) == 1
        assert scheduler2._launchers["cluster1"].cluster_name == "cluster1"

        # Scheduler has recovered the cluster so it doesn't accept another of the same name.
        with pytest.raises(MysosScheduler.ClusterExists):
            scheduler2.create_cluster("cluster1", "mysql_user", 3)

    def test_scheduler_recovery_failure_before_launch(self):
        scheduler_key = gen_encryption_key()

        scheduler1 = MysosScheduler(self._state, self._state_provider,
                                    self._framework_user, "./executor.pex",
                                    "cmd.sh", self._zk_client, self._zk_url,
                                    Amount(5, Time.SECONDS),
                                    "/etc/mysos/admin_keyfile.yml",
                                    scheduler_key)
        scheduler1.registered(self._driver, self._framework_id, object())
        _, password = scheduler1.create_cluster("cluster1", "mysql_user", 3)

        # Simulate restart before the task is successfully launched.
        scheduler2 = MysosScheduler(self._state, self._state_provider,
                                    self._framework_user, "./executor.pex",
                                    "cmd.sh", self._zk_client, self._zk_url,
                                    Amount(5, Time.SECONDS),
                                    "/etc/mysos/admin_keyfile.yml",
                                    scheduler_key)

        assert len(scheduler2._launchers) == 0  # No launchers are recovered.

        # Scheduler always receives registered() with the same FrameworkID after failover.
        scheduler2.registered(self._driver, self._framework_id, object())

        assert len(scheduler2._launchers) == 1
        assert scheduler2._launchers["cluster1"].cluster_name == "cluster1"

        password_box = PasswordBox(scheduler_key)

        assert password_box.match(
            password,
            scheduler2._launchers["cluster1"]._cluster.encrypted_password)

        # Now offer the resources for this task.
        scheduler2.resourceOffers(self._driver, [self._offer])

        # One task is launched for the offer.
        assert len(
            scheduler2._launchers["cluster1"]._cluster.active_tasks) == 1

        # Scheduler has recovered the cluster so it doesn't accept another of the same name.
        with pytest.raises(MysosScheduler.ClusterExists):
            scheduler2.create_cluster("cluster1", "mysql_user", 3)

    def test_incompatible_resource_role(self):
        scheduler1 = MysosScheduler(
            self._state,
            self._state_provider,
            self._framework_user,
            "./executor.pex",
            "cmd.sh",
            self._zk_client,
            self._zk_url,
            Amount(5, Time.SECONDS),
            "/etc/mysos/admin_keyfile.yml",
            gen_encryption_key(),
            framework_role='mysos'
        )  # Require 'mysos' but the resources are in '*'.
        scheduler1.registered(self._driver, self._framework_id, object())
        scheduler1.create_cluster("cluster1", "mysql_user", 3)
        scheduler1.resourceOffers(self._driver, [self._offer])

        assert "declineOffer" in self._driver.method_calls
        assert len(self._driver.method_calls["declineOffer"]) == 1
        # [0][0][1]: [First declineOffer call][The positional args][The first positional arg], which is
        # a 'Filters' object.
        assert (self._driver.method_calls["declineOffer"][0][0][1].
                refuse_seconds == INCOMPATIBLE_ROLE_OFFER_REFUSE_DURATION.as_(
                    Time.SECONDS))

    def test_scheduler_metrics(self):
        scheduler_key = gen_encryption_key()

        scheduler = MysosScheduler(self._state, self._state_provider,
                                   self._framework_user, "./executor.pex",
                                   "cmd.sh", self._zk_client, self._zk_url,
                                   Amount(5, Time.SECONDS),
                                   "/etc/mysos/admin_keyfile.yml",
                                   scheduler_key)

        RootMetrics().register_observable('scheduler', scheduler)

        scheduler.registered(self._driver, self._framework_id, object())
        scheduler.create_cluster("cluster1",
                                 "mysql_user",
                                 3,
                                 cluster_password='test_password')

        sample = RootMetrics().sample()
        assert sample['scheduler.cluster_count'] == 1
        assert sample['scheduler.total_requested_mem_mb'] == (
            DEFAULT_TASK_MEM.as_(Data.MB) * 3)
        assert sample['scheduler.total_requested_disk_mb'] == (
            DEFAULT_TASK_DISK.as_(Data.MB) * 3)
        assert sample['scheduler.total_requested_cpus'] == DEFAULT_TASK_CPUS * 3

        scheduler.delete_cluster("cluster1", 'test_password')

        sample = RootMetrics().sample()
        assert sample['scheduler.cluster_count'] == 0
        assert sample['scheduler.total_requested_mem_mb'] == 0
        assert sample['scheduler.total_requested_disk_mb'] == 0
        assert sample['scheduler.total_requested_cpus'] == 0

    def test_scheduler_delete_empty_cluster(self):
        scheduler_key = gen_encryption_key()

        scheduler = MysosScheduler(self._state, self._state_provider,
                                   self._framework_user, "./executor.pex",
                                   "cmd.sh", self._zk_client, self._zk_url,
                                   Amount(5, Time.SECONDS),
                                   "/etc/mysos/admin_keyfile.yml",
                                   scheduler_key)

        scheduler.registered(self._driver, self._framework_id, object())
        _, password = scheduler.create_cluster("cluster1", "mysql_user", 3)

        assert len(scheduler._launchers) == 1

        # Deleting the cluster before any offer comes in for launching any task.
        scheduler.delete_cluster("cluster1", password)

        assert len(scheduler._launchers) == 0
Example #13
class TestLauncher(object):
  @pytest.fixture(params=[LocalStateProvider, ZooKeeperStateProvider], autouse=True)
  def setup(self, request):
    self._driver = FakeDriver()
    self._storage = FakeStorage(SequentialThreadingHandler())
    self._zk_client = FakeClient(storage=self._storage)
    self._zk_client.start()

    self._offer = mesos_pb2.Offer()
    self._offer.id.value = "offer_id_0"
    self._offer.framework_id.value = "framework_id_0"
    self._offer.slave_id.value = "slave_id_0"
    self._offer.hostname = "localhost"

    # Enough memory and ports to fit three tasks.
    resources = create_resources(cpus=4, mem=512 * 3, ports=set([10000, 10001, 10002]))
    self._offer.resources.extend(resources)

    self._framework_user = "******"

    # Some tests use the default launcher; some don't.
    self._zk_url = "zk://host/mysos/test"

    self._scheduler_key = gen_encryption_key()
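    # PasswordBox performs symmetric encryption of cluster passwords using the scheduler key.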
    self._password_box = PasswordBox(self._scheduler_key)

    self._cluster = MySQLCluster("cluster0", "user", self._password_box.encrypt("pass"), 3)

    # Construct the state provider based on the test parameter.
    if request.param == LocalStateProvider:
      tmpdir = tempfile.mkdtemp()
      self._state_provider = LocalStateProvider(tmpdir)
      request.addfinalizer(lambda: shutil.rmtree(tmpdir, True))  # Clean up after ourselves.
    elif request.param == ZooKeeperStateProvider:
      self._state_provider = ZooKeeperStateProvider(self._zk_client, "/mysos/test")

    self._launcher = MySQLClusterLauncher(
        self._driver,
        self._cluster,
        self._state_provider,
        self._zk_url,
        self._zk_client,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        self._scheduler_key,
        query_interval=Amount(150, Time.MILLISECONDS))  # Short interval.

    self._elected = threading.Event()
    self._launchers = [self._launcher]  # See teardown().

    request.addfinalizer(self.teardown)

  def teardown(self):
    for launcher in self._launchers:
      if launcher._elector:
        launcher._elector.abort()  # Abort the thread even if the election is pending.
        launcher._elector.join()

  def test_launch_cluster_all_nodes_successful(self):
    for i in range(self._cluster.num_nodes):
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes

    # No new tasks are launched.
    assert self._launcher.launch(self._offer)[0] is None
    assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes

    # All 3 nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING  # Valid state.
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
    # Two slaves.
    assert len([x for x in self._storage.paths.keys() if x.startswith(
        "/mysos/test/cluster0/slaves/member_")]) == 2

  def test_launch_cluster_insufficient_resources(self):
    """All but one slave in the slave are launched successfully."""
    del self._offer.resources[:]
    resources = create_resources(cpus=4, mem=512 * 3, ports=set([10000, 10001]))
    self._offer.resources.extend(resources)

    # There is one fewer port than required to launch the entire cluster.
    for i in range(self._cluster.num_nodes - 1):
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes - 1

    # The final task cannot get launched.
    assert self._launcher.launch(self._offer)[0] is None
    assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes - 1

    # The two nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING  # Valid state.
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes - 1):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
    # One slave.
    assert len([x for x in self._storage.paths.keys() if x.startswith(
      "/mysos/test/cluster0/slaves/member_")]) == 1

  def test_two_launchers(self):
    """Two launchers share resources and launch their clusters successfully."""
    launchers = [
      MySQLClusterLauncher(
          self._driver,
          MySQLCluster("cluster0", "user0", self._password_box.encrypt("pass0"), 1),
          self._state_provider,
          self._zk_url,
          self._zk_client,
          self._framework_user,
          "./executor.pex",
          "cmd.sh",
          Amount(5, Time.SECONDS),
          "/etc/mysos/admin_keyfile.yml",
          self._scheduler_key),
      MySQLClusterLauncher(
          self._driver,
          MySQLCluster("cluster1", "user1", self._password_box.encrypt("pass1"), 2),
          self._state_provider,
          self._zk_url,
          self._zk_client,
          self._framework_user,
          "./executor.pex",
          "cmd.sh",
          Amount(5, Time.SECONDS),
          "/etc/mysos/admin_keyfile.yml",
          self._scheduler_key)]
    self._launchers.extend(launchers)

    resources = create_resources(cpus=4, mem=512 * 3, ports=set([10000, 10001, 10002]))
    self._offer.resources.extend(resources)

    # Three nodes in total across two clusters.
    # Simulate the scheduler.
    for i in range(3):
      for launcher in launchers:
        task_id, remaining = launcher.launch(self._offer)
        if task_id:
          # Update the offer so other launchers will use its remaining resources.
          del self._offer.resources[:]
          self._offer.resources.extend(remaining)
          break

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == 3

  def test_invalid_status_update(self):
    """Launcher raises an exception when an invalid status is received."""
    self._cluster.num_nodes = 1
    launcher = MySQLClusterLauncher(
        self._driver,
        self._cluster,
        self._state_provider,
        self._zk_url,
        self._zk_client,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        self._scheduler_key)
    self._launchers.append(launcher)

    resources = create_resources(cpus=4, mem=512 * 3, ports=set([10000]))
    self._offer.resources.extend(resources)

    task_id, _ = launcher.launch(self._offer)
    assert task_id == "mysos-cluster0-0"

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes

    status = mesos_pb2.TaskStatus()
    status.task_id.value = task_id
    status.state = mesos_pb2.TASK_RUNNING  # Valid state.
    launcher.status_update(status)

    status.state = mesos_pb2.TASK_FINISHED  # An invalid state.

    with pytest.raises(MySQLClusterLauncher.Error):
      launcher.status_update(status)

  def test_terminal_status_update(self):
    """Launcher reacts to terminated task by launching a new one."""
    self._cluster.num_nodes = 1
    launcher = MySQLClusterLauncher(
        self._driver,
        self._cluster,
        self._state_provider,
        self._zk_url,
        self._zk_client,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        Amount(1, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        self._scheduler_key)
    self._launchers.append(launcher)

    resources = create_resources(cpus=4, mem=512 * 3, ports=set([10000]))
    self._offer.resources.extend(resources)

    task_id, _ = launcher.launch(self._offer)
    assert task_id == "mysos-cluster0-0"

    launched = self._driver.method_calls["launchTasks"]
    assert len(launched) == self._cluster.num_nodes

    status = mesos_pb2.TaskStatus()
    status.task_id.value = task_id
    status.state = mesos_pb2.TASK_RUNNING
    launcher.status_update(status)

    assert len(launcher._cluster.running_tasks) == 1

    status.state = mesos_pb2.TASK_LOST
    launcher.status_update(status)

    assert len(launcher._cluster.running_tasks) == 0

    task_id, _ = launcher.launch(self._offer)
    assert task_id == "mysos-cluster0-1"

    launched = self._driver.method_calls["launchTasks"]

    # One task is relaunched to make up for the lost one.
    assert len(launched) == self._cluster.num_nodes + 1

  def test_master_failover(self):
    for i in range(self._cluster.num_nodes):
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes

    # All 3 nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING
    status.slave_id.value = self._offer.slave_id.value

    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    # No log position queries are sent for the first epoch.
    assert "sendFrameworkMessage" not in self._driver.method_calls

    # Wait for the election to complete.
    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths

    # Now fail the master task.
    status.task_id.value = "mysos-cluster0-0"
    status.state = mesos_pb2.TASK_FAILED
    self._launcher.status_update(status)

    assert len(self._launcher._cluster.running_tasks) == 2

    # Log position queries are sent.
    self._launcher._elector._elect()
    assert len(self._driver.method_calls["sendFrameworkMessage"]) >= 2

    for i in range(1, self._cluster.num_nodes):
      self._launcher.framework_message(
          "mysos-cluster0-%s" % i,
          self._offer.slave_id.value,
          json.dumps(dict(epoch=1, position=str(i))))

    # Wait for the election to complete.
    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The slave with the highest position is elected.
    assert "/mysos/test/cluster0/master/member_0000000002" in self._storage.paths

    assert len(self._launcher._cluster.running_tasks) == 2

    # When a new offer comes in, a new task is launched.
    del self._offer.resources[:]
    resources = create_resources(cpus=1, mem=512, ports=set([10000]))
    self._offer.resources.extend(resources)
    task_id, _ = self._launcher.launch(self._offer)
    assert task_id == "mysos-cluster0-3"

    launched = self._driver.method_calls["launchTasks"]
    # One task is relaunched to make up for the failed one.
    assert len(launched) == self._cluster.num_nodes + 1

  def test_launcher_recovery_after_election_completed(self):
    # 1. Launch a cluster on the running launcher.
    for i in range(self._cluster.num_nodes):
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes

    # No new tasks are launched.
    assert self._launcher.launch(self._offer)[0] is None
    assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes

    # All 3 nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
    # Two slaves.
    assert len([x for x in self._storage.paths.keys() if x.startswith(
        "/mysos/test/cluster0/slaves/member_")]) == 2

    # 2. Recover the launcher.
    self._cluster = self._state_provider.load_cluster_state(self._cluster.name)
    self._launcher = MySQLClusterLauncher(
        self._driver,
        self._cluster,
        self._state_provider,
        self._zk_url,
        self._zk_client,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        self._scheduler_key,
        query_interval=Amount(150, Time.MILLISECONDS))

    # Now fail the master task.
    status.task_id.value = "mysos-cluster0-0"
    status.state = mesos_pb2.TASK_FAILED
    self._launcher.status_update(status)

    for i in range(1, self._cluster.num_nodes):
      self._launcher.framework_message(
          "mysos-cluster0-%s" % i,
          self._offer.slave_id.value,
          json.dumps(dict(epoch=1, position=str(i))))

    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The second slave has the larger position and is elected.
    assert "/mysos/test/cluster0/master/member_0000000002" in self._storage.paths

  def test_launcher_recovery_before_election_completed(self):
    # 1. Launch a cluster on the running launcher.
    for i in range(self._cluster.num_nodes):
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes

    # No new tasks are launched.
    assert self._launcher.launch(self._offer)[0] is None
    assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes

    # All 3 nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
    # Two slaves.
    assert len([x for x in self._storage.paths.keys() if x.startswith(
        "/mysos/test/cluster0/slaves/member_")]) == 2

    # Now fail the master task which leads to re-election.
    status.task_id.value = "mysos-cluster0-0"
    status.state = mesos_pb2.TASK_FAILED
    self._launcher.status_update(status)

    # 2. Recover the launcher.
    self._cluster = self._state_provider.load_cluster_state(self._cluster.name)
    self._launcher = MySQLClusterLauncher(
        self._driver,
        self._cluster,
        self._state_provider,
        self._zk_url,
        self._zk_client,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        self._scheduler_key,
        query_interval=Amount(150, Time.MILLISECONDS))

    for i in range(1, self._cluster.num_nodes):
      self._launcher.framework_message(
          "mysos-cluster0-%s" % i,
          self._offer.slave_id.value,
          json.dumps(dict(epoch=2, position=str(i))))

    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The second slave has the larger position and is elected.
    assert "/mysos/test/cluster0/master/member_0000000002" in self._storage.paths

  def test_launcher_kill(self):
    for i in range(self._cluster.num_nodes):
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes

    # No new tasks are launched.
    assert self._launcher.launch(self._offer)[0] is None
    assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes

    # All 3 nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING  # Valid state.
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
    # Two slaves.
    assert len([x for x in self._storage.paths.keys() if x.startswith(
        "/mysos/test/cluster0/slaves/member_")]) == 2

    # Kill the cluster.
    with pytest.raises(MySQLClusterLauncher.PermissionError):
      self._launcher.kill("wrong_password")

    # Correct password.
    self._launcher.kill(self._password_box.decrypt(self._cluster.encrypted_password))

    # All 3 nodes are successfully killed.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_KILLED
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    assert "/mysos/test/cluster0" not in self._storage.paths  # ServerSets removed.
    assert not self._state_provider.load_cluster_state("cluster0")  # State removed.

  def test_launcher_recovery_corrupted_password(self):
    # 1. Launch a single instance for a cluster on the running launcher.
    task_id, remaining = self._launcher.launch(self._offer)
    del self._offer.resources[:]
    self._offer.resources.extend(remaining)
    assert task_id == "mysos-cluster0-0"

    # The task has successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING
    status.slave_id.value = self._offer.slave_id.value
    status.task_id.value = "mysos-cluster0-0"
    self._launcher.status_update(status)

    # 2. Recover the launcher.
    self._cluster = self._state_provider.load_cluster_state(self._cluster.name)
    self._cluster.encrypted_password = "******"

    # The corrupted password causes the launcher constructor to fail.
    with pytest.raises(ValueError):
      self._launcher = MySQLClusterLauncher(
          self._driver,
          self._cluster,
          self._state_provider,
          self._zk_url,
          self._zk_client,
          self._framework_user,
          "./executor.pex",
          "cmd.sh",
          Amount(5, Time.SECONDS),
          "/etc/mysos/admin_keyfile.yml",
          self._scheduler_key,
          query_interval=Amount(150, Time.MILLISECONDS))
Example #14
class TestCluster(unittest.TestCase):
    def setUp(self):
        self.storage = FakeStorage(SequentialThreadingHandler())
        self.client = FakeClient(storage=self.storage)
        self.client.start()

    def tearDown(self):
        self.client.stop()

    def test_add_member(self):
        manager = ClusterManager(self.client, "/home/my_cluster")

        instance1 = ServiceInstance(Endpoint("host1", 10000))
        member1 = manager.add_member(instance1)
        assert member1 == manager.add_member(
            instance1)  # Second insertion is ignored.

        instance2 = ServiceInstance(Endpoint("host2", 10000))
        manager.add_member(instance2)

        assert len(manager._cluster.members) == 2

        assert (self.storage.paths["/home/my_cluster/slaves/member_0000000000"]
                ["data"] == ServiceInstance.pack(instance1))
        assert (self.storage.paths["/home/my_cluster/slaves/member_0000000001"]
                ["data"] == ServiceInstance.pack(instance2))

    def test_promote_member(self):
        manager = ClusterManager(self.client, "/home/my_cluster")
        instance = ServiceInstance(Endpoint("host", 10000))
        member = manager.add_member(instance)

        assert manager.promote_member(member)
        assert not manager.promote_member(
            member)  # The 2nd promotion is a no-op.

        assert (self.storage.paths["/home/my_cluster/master/member_0000000000"]
                ["data"] == ServiceInstance.pack(instance))

    def test_remove_member(self):
        manager = ClusterManager(self.client, "/home/my_cluster")
        instance = ServiceInstance(Endpoint("host", 10000))
        member = manager.add_member(instance)

        assert manager.remove_member(member)
        assert not manager.remove_member(
            member)  # The second deletion is ignored.

        assert "/home/my_cluster/master/member_0000000000" not in self.storage.paths

    def test_callbacks(self):
        manager = ClusterManager(self.client, "/home/my_cluster")

        # Set up 2 listeners.
        instance1 = ServiceInstance(Endpoint("host1", 10000))
        handler1 = CallbackHandler()
        listener1 = ClusterListener(self.client, "/home/my_cluster", instance1,
                                    handler1.promotion_callback,
                                    handler1.demotion_callback,
                                    handler1.master_callback,
                                    handler1.termination_callback)
        listener1.start()
        member1 = manager.add_member(instance1)

        instance2 = ServiceInstance(Endpoint("host2", 10000))
        handler2 = CallbackHandler()
        listener2 = ClusterListener(self.client, "/home/my_cluster", instance2,
                                    handler2.promotion_callback,
                                    handler2.demotion_callback,
                                    handler2.master_callback)
        listener2.start()
        member2 = manager.add_member(instance2)

        # Test promotion.
        manager.promote_member(member1)

        assert handler1.promoted.wait(1)
        assert handler2.detected.get(True, 1) == instance1

        assert (self.storage.paths["/home/my_cluster/master/member_0000000000"]
                ["data"] == ServiceInstance.pack(instance1))
        assert (self.storage.paths["/home/my_cluster/slaves/member_0000000001"]
                ["data"] == ServiceInstance.pack(instance2))

        manager.promote_member(member2)

        assert handler1.demoted.wait(1)
        assert handler2.promoted.wait(1)

        assert (self.storage.paths["/home/my_cluster/master/member_0000000001"]
                ["data"] == ServiceInstance.pack(instance2))
        assert "/home/my_cluster/master/member_0000000000" not in self.storage.paths

        manager.remove_member(member2)
        assert handler2.demoted.wait(1)

        # Test removing cluster.
        manager.remove_member(member1)
        manager.delete_cluster()
        assert handler1.terminated.wait(1)

    def test_invalid_arguments(self):
        client = FakeClient()
        client.start()
        manager = ClusterManager(client, "/home/my_cluster")

        with pytest.raises(ValueError) as e:
            manager.promote_member("123")
        assert e.value.message == 'Invalid member_id: 123'

    def test_invalid_znode(self):
        instance1 = ServiceInstance(Endpoint("host1", 10000))
        handler1 = CallbackHandler()
        listener1 = ClusterListener(self.client, "/home/my_cluster", instance1,
                                    handler1.promotion_callback,
                                    handler1.demotion_callback,
                                    handler1.master_callback)
        listener1.start()

        self.client.ensure_path("/home/my_cluster/master")
        self.client.create("/home/my_cluster/master/member_",
                           "Invalid Data",
                           sequence=True)

        # Invalid ZNode data translates into a 'None' return.
        assert handler1.detected.get(True, 1) is None

    def test_existing_zk(self):
        """
      ClusterManager needs to be able to recover from an existing ZK group for scheduler failover.
    """
        manager = ClusterManager(self.client, "/home/my_cluster")

        instance1 = ServiceInstance(Endpoint("host1", 10000))
        member1 = manager.add_member(instance1)
        instance2 = ServiceInstance(Endpoint("host2", 10000))
        member2 = manager.add_member(instance2)

        assert (self.storage.paths["/home/my_cluster/slaves/member_0000000000"]
                ["data"] == ServiceInstance.pack(instance1))
        assert (self.storage.paths["/home/my_cluster/slaves/member_0000000001"]
                ["data"] == ServiceInstance.pack(instance2))

        manager.promote_member(member1)

        # Test the new ClusterManager.
        manager2 = ClusterManager(self.client, "/home/my_cluster")
        assert len(manager2._cluster.members) == 2
        assert member1 in manager2._cluster.members
        assert member2 in manager2._cluster.members
        assert manager2._cluster.members[member1] == ServiceInstance.pack(
            instance1)

    def test_remove_cluster(self):
        manager = ClusterManager(self.client, "/home/my_cluster")

        instance1 = ServiceInstance(Endpoint("host1", 10000))
        member1 = manager.add_member(instance1)
        instance2 = ServiceInstance(Endpoint("host2", 10000))
        member2 = manager.add_member(instance2)

        manager.promote_member(member1)

        with pytest.raises(ClusterManager.Error):
            manager.delete_cluster()

        manager.remove_member(member1)
        manager.remove_member(member2)
        manager.delete_cluster()

        assert "/home/my_cluster" not in self.storage.paths
Example #15
class TestZooKeeperStateProvider(unittest.TestCase):
  def setUp(self):
    self._storage = FakeStorage(SequentialThreadingHandler())
    self._client = FakeClient(storage=self._storage)
    self._client.start()
    self._state_provider = ZooKeeperStateProvider(self._client, '/mysos')

  def tearDown(self):
    self._client.stop()

  def test_scheduler_state(self):
    expected = Scheduler(FrameworkInfo(
        user='******',
        name='test_fw_name',
        checkpoint=True))
    expected.tasks = dict(task1='cluster1', task2='cluster2')

    self._state_provider.dump_scheduler_state(expected)
    actual = self._state_provider.load_scheduler_state()

    assert expected.framework_info == actual.framework_info
    assert expected.tasks == actual.tasks

  def test_scheduler_state_errors(self):
    assert not self._state_provider.load_scheduler_state()  # Not an error for scheduler state to be
                                                            # not found.

    self._client.ensure_path("/mysos/state")
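    # A pickled plain object is not a valid scheduler snapshot, so loading it should raise StateProvider.Error.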
    self._client.create("/mysos/state/scheduler", cPickle.dumps(object()))
    with pytest.raises(StateProvider.Error):
      self._state_provider.load_scheduler_state()

  def test_cluster_state(self):
    expected = MySQLCluster(
        'cluster1',
        'cluster_user',
        'cluster_password',
        3,
        DEFAULT_TASK_CPUS,
        DEFAULT_TASK_MEM,
        DEFAULT_TASK_DISK)

    expected.tasks['task1'] = MySQLTask(
        'cluster1', 'task1', 'slave1', 'host1', 10000)

    self._state_provider.dump_cluster_state(expected)
    actual = self._state_provider.load_cluster_state('cluster1')

    assert expected.user == actual.user
    assert isinstance(actual.num_nodes, int)
    assert expected.num_nodes == actual.num_nodes
    assert len(expected.tasks) == len(actual.tasks)
    assert expected.tasks['task1'].port == actual.tasks['task1'].port

  def test_cluster_state_errors(self):
    assert not self._state_provider.load_cluster_state('nonexistent')

    self._client.ensure_path("/mysos/state/clusters")
    self._client.create("/mysos/state/clusters/cluster1", cPickle.dumps(object()))
    with pytest.raises(StateProvider.Error):
      self._state_provider.load_cluster_state('cluster1')
Example #16
class TestScheduler(unittest.TestCase):
  def setUp(self):
    self._driver = FakeDriver()
    self._storage = FakeStorage(SequentialThreadingHandler())
    self._zk_client = FakeClient(storage=self._storage)
    self._zk_client.start()

    self._framework_id = mesos_pb2.FrameworkID()
    self._framework_id.value = "framework_id_0"

    self._offer = mesos_pb2.Offer()
    self._offer.id.value = "offer_id_0"
    self._offer.framework_id.value = self._framework_id.value
    self._offer.slave_id.value = "slave_id_0"
    self._offer.hostname = "localhost"

    resources = create_resources(
        cpus=DEFAULT_TASK_CPUS * 3,
        mem=DEFAULT_TASK_MEM * 3,
        disk=DEFAULT_TASK_DISK * 3,
        ports=set([10000, 10001, 10002]))
    self._offer.resources.extend(resources)

    self._framework_user = "******"

    self._zk_url = "zk://host/mysos/test"
    self._cluster = MySQLCluster(
        "cluster0", "user", "pass", 3, DEFAULT_TASK_CPUS, DEFAULT_TASK_MEM, DEFAULT_TASK_DISK)

    self._tmpdir = tempfile.mkdtemp()
    self._state_provider = LocalStateProvider(self._tmpdir)

    framework_info = mesos_pb2.FrameworkInfo(
        user=getpass.getuser(),
        name="mysos",
        checkpoint=False)
    self._state = Scheduler(framework_info)

  def tearDown(self):
    shutil.rmtree(self._tmpdir, True)  # Clean up after ourselves.

  def test_scheduler_recovery(self):
    scheduler_key = gen_encryption_key()

    scheduler1 = MysosScheduler(
        self._state,
        self._state_provider,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        self._zk_client,
        self._zk_url,
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        scheduler_key)
    scheduler1.registered(self._driver, self._framework_id, object())
    scheduler1.create_cluster("cluster1", "mysql_user", 3)
    scheduler1.resourceOffers(self._driver, [self._offer])

    # One task is launched for one offer.
    assert len(scheduler1._launchers["cluster1"]._cluster.tasks) == 1

    with pytest.raises(MysosScheduler.ClusterExists):
      scheduler1.create_cluster("cluster1", "mysql_user", 3)

    # FrameworkID should have been persisted.
    self._state = self._state_provider.load_scheduler_state()
    assert self._state.framework_info.id.value == self._framework_id.value

    # Simulate restart.
    scheduler2 = MysosScheduler(
        self._state,
        self._state_provider,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        self._zk_client,
        self._zk_url,
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        scheduler_key)

    # Scheduler always receives registered() with the same FrameworkID after failover.
    scheduler2.registered(self._driver, self._framework_id, object())

    assert len(scheduler2._launchers) == 1
    assert scheduler2._launchers["cluster1"].cluster_name == "cluster1"

    # Scheduler has recovered the cluster so it doesn't accept another of the same name.
    with pytest.raises(MysosScheduler.ClusterExists):
      scheduler2.create_cluster("cluster1", "mysql_user", 3)

  def test_scheduler_recovery_failure_before_launch(self):
    scheduler_key = gen_encryption_key()

    scheduler1 = MysosScheduler(
        self._state,
        self._state_provider,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        self._zk_client,
        self._zk_url,
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        scheduler_key)
    scheduler1.registered(self._driver, self._framework_id, object())
    _, password = scheduler1.create_cluster("cluster1", "mysql_user", 3)

    # Simulate restart before the task is successfully launched.
    scheduler2 = MysosScheduler(
        self._state,
        self._state_provider,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        self._zk_client,
        self._zk_url,
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        scheduler_key)

    assert len(scheduler2._launchers) == 0  # No launchers are recovered.

    # Scheduler always receives registered() with the same FrameworkID after failover.
    scheduler2.registered(self._driver, self._framework_id, object())

    assert len(scheduler2._launchers) == 1
    assert scheduler2._launchers["cluster1"].cluster_name == "cluster1"

    password_box = PasswordBox(scheduler_key)

    assert password_box.match(
        password, scheduler2._launchers["cluster1"]._cluster.encrypted_password)

    # Now offer the resources for this task.
    scheduler2.resourceOffers(self._driver, [self._offer])

    # One task is launched for the offer.
    assert len(scheduler2._launchers["cluster1"]._cluster.active_tasks) == 1

    # Scheduler has recovered the cluster so it doesn't accept another of the same name.
    with pytest.raises(MysosScheduler.ClusterExists):
      scheduler2.create_cluster("cluster1", "mysql_user", 3)

  def test_incompatible_resource_role(self):
    scheduler1 = MysosScheduler(
        self._state,
        self._state_provider,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        self._zk_client,
        self._zk_url,
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        gen_encryption_key(),
        framework_role='mysos')  # Require 'mysos' but the resources are in '*'.

    RootMetrics().register_observable('scheduler', scheduler1)

    scheduler1.registered(self._driver, self._framework_id, object())
    scheduler1.create_cluster("cluster1", "mysql_user", 3)
    scheduler1.resourceOffers(self._driver, [self._offer])

    assert "declineOffer" in self._driver.method_calls
    assert len(self._driver.method_calls["declineOffer"]) == 1
    # [0][0][1]: [First declineOffer call][The positional args][The second
    # positional arg], which is a 'Filters' object.
    assert (self._driver.method_calls["declineOffer"][0][0][1].refuse_seconds ==
        INCOMPATIBLE_ROLE_OFFER_REFUSE_DURATION.as_(Time.SECONDS))

    sample = RootMetrics().sample()
    assert sample['scheduler.offers_incompatible_role'] == 1

  def test_scheduler_metrics(self):
    scheduler_key = gen_encryption_key()

    scheduler = MysosScheduler(
        self._state,
        self._state_provider,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        self._zk_client,
        self._zk_url,
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        scheduler_key)

    RootMetrics().register_observable('scheduler', scheduler)

    scheduler.registered(self._driver, self._framework_id, object())

    sample = RootMetrics().sample()
    assert sample['scheduler.framework_registered'] == 1

    scheduler.create_cluster(
        "cluster1", "mysql_user", 3, cluster_password='test_password')

    sample = RootMetrics().sample()
    assert sample['scheduler.cluster_count'] == 1
    assert sample['scheduler.total_requested_mem_mb'] == DEFAULT_TASK_MEM.as_(Data.MB) * 3
    assert sample['scheduler.total_requested_disk_mb'] == DEFAULT_TASK_DISK.as_(Data.MB) * 3
    assert sample['scheduler.total_requested_cpus'] == DEFAULT_TASK_CPUS * 3

    scheduler.resourceOffers(self._driver, [self._offer])
    sample = RootMetrics().sample()
    assert sample['scheduler.resource_offers'] == 1
    assert sample['scheduler.tasks_launched'] == 1

    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING
    status.slave_id.value = self._offer.slave_id.value
    status.task_id.value = 'mysos-cluster1-0'

    scheduler.statusUpdate(self._driver, status)

    status.state = mesos_pb2.TASK_FAILED
    scheduler.statusUpdate(self._driver, status)

    sample = RootMetrics().sample()
    assert sample['scheduler.tasks_failed'] == 1

    scheduler.delete_cluster("cluster1", 'test_password')

    sample = RootMetrics().sample()
    assert sample['scheduler.cluster_count'] == 0
    assert sample['scheduler.total_requested_mem_mb'] == 0
    assert sample['scheduler.total_requested_disk_mb'] == 0
    assert sample['scheduler.total_requested_cpus'] == 0

  def test_scheduler_delete_empty_cluster(self):
    scheduler_key = gen_encryption_key()

    scheduler = MysosScheduler(
        self._state,
        self._state_provider,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        self._zk_client,
        self._zk_url,
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        scheduler_key)

    scheduler.registered(self._driver, self._framework_id, object())
    _, password = scheduler.create_cluster("cluster1", "mysql_user", 3)

    assert len(scheduler._launchers) == 1

    # Deleting the cluster before any offer comes in for launching any task.
    scheduler.delete_cluster("cluster1", password)

    assert len(scheduler._launchers) == 0
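
# The metrics assertions above follow one pattern: an Observable is registered
# with RootMetrics under a namespace, and RootMetrics().sample() returns a flat
# dict keyed by '<namespace>.<metric>'. Below is a minimal sketch of that
# pattern; the twitter.common.metrics import path and the AtomicGauge API are
# assumptions, not part of the tests above.
from twitter.common.metrics import AtomicGauge, Observable, RootMetrics


class MetricsDemo(Observable):
  def __init__(self):
    # Gauges registered on an Observable are exported once the object itself
    # is registered with RootMetrics.
    self.tasks_launched = self.metrics.register(AtomicGauge('tasks_launched'))


demo = MetricsDemo()
RootMetrics().register_observable('demo', demo)
demo.tasks_launched.increment()

# Samples are flattened under the registration namespace.
assert RootMetrics().sample()['demo.tasks_launched'] == 1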


class TestTaskRunner(unittest.TestCase):
  def setUp(self):
    self._storage = FakeStorage(SequentialThreadingHandler())
    self._client = FakeClient(storage=self._storage)
    self._client.start()
    self._self_instance = ServiceInstance(Endpoint("host", 10000))
    self._state_manager = FakeStateManager()

  def tearDown(self):
    self._client.stop()

  def test_stop(self):
    task_control = FakeTaskControl()
    runner = MysosTaskRunner(
        self._self_instance,
        self._client,
        "/home/test/my_cluster",
        NoopPackageInstaller(),
        task_control,
        self._state_manager)
    runner.start()
    assert runner.stop()

    # Killed by SIGTERM.
    assert deadline(runner.join, Amount(1, Time.SECONDS)) == -signal.SIGTERM

  def test_demote(self):
    task_control = FakeTaskControl()
    runner = MysosTaskRunner(
        self._self_instance,
        self._client,
        "/home/test/my_cluster",
        NoopPackageInstaller(),
        task_control,
        self._state_manager)

    manager = ClusterManager(self._client, "/home/test/my_cluster")
    runner.start()

    self_member = manager.add_member(self._self_instance)

    # 'self_instance' becomes the master.
    manager.promote_member(self_member)

    runner.promoted.wait(1)

    another_member = manager.add_member(ServiceInstance(Endpoint("another_host", 10000)))

    # This demotes 'self_instance', which should cause runner to stop.
    manager.promote_member(another_member)

    assert deadline(runner.join, Amount(1, Time.SECONDS))

  def test_reparent(self):
    task_control = FakeTaskControl()
    runner = MysosTaskRunner(
        self._self_instance,
        self._client,
        "/home/test/my_cluster",
        NoopPackageInstaller(),
        task_control,
        self._state_manager)

    manager = ClusterManager(self._client, "/home/test/my_cluster")
    runner.start()

    # Promote another instance.
    master = ServiceInstance(Endpoint("another_host", 10000))
    another_member = manager.add_member(master)
    manager.promote_member(another_member)

    assert runner.master.get(True, 1) == master

    assert runner.stop()
    assert deadline(runner.join, Amount(1, Time.SECONDS))

  def test_mysqld_error(self):
    task_control = FakeTaskControl(mysqld="exit 123")
    runner = MysosTaskRunner(
        self._self_instance,
        self._client,
        "/home/test/my_cluster",
        NoopPackageInstaller(),
        task_control,
        self._state_manager)

    runner.start()
    assert deadline(runner.join, Amount(1, Time.SECONDS)) == 123

  def test_start_command_error(self):
    task_control = FakeTaskControl(start_cmd="exit 1")
    runner = MysosTaskRunner(
        self._self_instance,
        self._client,
        "/home/test/my_cluster",
        NoopPackageInstaller(),
        task_control,
        self._state_manager)

    with pytest.raises(TaskError) as e:
      runner.start()
    assert e.value.message.startswith("Failed to start MySQL task")

  def test_promote_command_error(self):
    task_control = FakeTaskControl(promote_cmd="exit 1")
    runner = MysosTaskRunner(
        self._self_instance,
        self._client,
        "/home/test/my_cluster",
        NoopPackageInstaller(),
        task_control,
        self._state_manager)

    manager = ClusterManager(self._client, "/home/test/my_cluster")
    runner.start()

    self_member = manager.add_member(self._self_instance)

    # 'self_instance' becomes the master.
    manager.promote_member(self_member)

    runner.promoted.wait(1)

    with pytest.raises(TaskError) as e:
      runner.join()
    assert e.value.message.startswith("Failed to promote the slave")

  def test_get_log_position(self):
    task_control = FakeTaskControl(position=1)
    runner = MysosTaskRunner(
        self._self_instance,
        self._client,
        "/home/test/my_cluster",
        NoopPackageInstaller(),
        task_control,
        self._state_manager)

    runner.start()
    assert runner.get_log_position() == 1

  def test_get_log_position_error(self):
    task_control = FakeTaskControl(get_log_position_cmd="exit 1")
    runner = MysosTaskRunner(
        self._self_instance,
        self._client,
        "/home/test/my_cluster",
        NoopPackageInstaller(),
        task_control,
        self._state_manager)

    with pytest.raises(TaskError) as e:
      runner.get_log_position()
    assert (e.value.message ==
            "Unable to get the slave's log position: " +
            "Command 'exit 1' returned non-zero exit status 1")

  def test_stop_interminable(self):
    cmd = """trap "echo Trapped SIGTERM!" TERM
while :
do
  sleep 60
done
"""
    task_control = FakeTaskControl(mysqld=cmd)
    runner = MysosTaskRunner(
      self._self_instance,
      self._client,
      "/home/test/my_cluster",
      NoopPackageInstaller(),
      task_control,
      self._state_manager)

    runner.start()
    assert runner.stop(timeout=1)
    assert deadline(runner.join, Amount(1, Time.SECONDS)) == -signal.SIGKILL
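
The interminable-command test above relies on stop() escalating from SIGTERM to SIGKILL once the timeout expires. A self-contained sketch of that escalation logic (plain subprocess code, not the MysosTaskRunner implementation):

import signal
import subprocess
import time


def stop_process(proc, timeout=1.0):
  """Terminate 'proc' with SIGTERM, escalating to SIGKILL after 'timeout'."""
  proc.terminate()  # SIGTERM first; a well-behaved process exits here.
  deadline = time.time() + timeout
  while time.time() < deadline:
    if proc.poll() is not None:
      return proc.returncode
    time.sleep(0.05)
  proc.kill()  # The process ignored SIGTERM; force it down.
  return proc.wait()


if __name__ == '__main__':
  # The same shell loop the test uses: traps SIGTERM and spins forever.
  cmd = 'trap "echo Trapped SIGTERM!" TERM\nwhile :\ndo\n  sleep 60\ndone\n'
  proc = subprocess.Popen(['/bin/sh', '-c', cmd])
  # Mirrors the test's assertion: the exit is by SIGKILL, not SIGTERM.
  assert stop_process(proc) == -signal.SIGKILL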
Exemplo n.º 19
0
 def queue(self):
     client = FakeClient()
     client.start()
     yield ZKDelayDeadlineQueue(client, "/")
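
The queue generator above is most likely the body of a pytest yield-fixture whose decorator falls outside the snippet. A sketch of how such a fixture is typically declared and consumed; the paasta_tools.deployd.queue import path for ZKDelayDeadlineQueue is an assumption:

import pytest
from zake.fake_client import FakeClient

from paasta_tools.deployd.queue import ZKDelayDeadlineQueue  # Assumed path.


@pytest.fixture
def queue():
    client = FakeClient()
    client.start()
    yield ZKDelayDeadlineQueue(client, "/")
    client.stop()  # Teardown runs once the consuming test finishes.


def test_queue_is_constructed(queue):
    # pytest injects the yielded queue, already backed by a fake ZK client.
    assert queue is not None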
Exemplo n.º 20
0
 def setUp(self):
     self.storage = FakeStorage(SequentialThreadingHandler())
     self.client = FakeClient(storage=self.storage)
     self.client.start()
Exemplo n.º 21
0
class TestCluster(unittest.TestCase):
    def setUp(self):
        self.storage = FakeStorage(SequentialThreadingHandler())
        self.client = FakeClient(storage=self.storage)
        self.client.start()

    def tearDown(self):
        self.client.stop()

    def test_add_member(self):
        manager = ClusterManager(self.client, "/home/my_cluster")

        instance1 = ServiceInstance(Endpoint("host1", 10000))
        member1 = manager.add_member(instance1)
        assert member1 == manager.add_member(instance1)  # Second insertion is ignored.

        instance2 = ServiceInstance(Endpoint("host2", 10000))
        manager.add_member(instance2)

        assert len(manager._cluster.members) == 2

        assert self.storage.paths["/home/my_cluster/slaves/member_0000000000"]["data"] == ServiceInstance.pack(
            instance1
        )
        assert self.storage.paths["/home/my_cluster/slaves/member_0000000001"]["data"] == ServiceInstance.pack(
            instance2
        )

    def test_promote_member(self):
        manager = ClusterManager(self.client, "/home/my_cluster")
        instance = ServiceInstance(Endpoint("host", 10000))
        member = manager.add_member(instance)

        assert manager.promote_member(member)
        assert not manager.promote_member(member)  # The 2nd promotion is a no-op.

        assert self.storage.paths["/home/my_cluster/master/member_0000000000"]["data"] == ServiceInstance.pack(instance)

    def test_remove_member(self):
        manager = ClusterManager(self.client, "/home/my_cluster")
        instance = ServiceInstance(Endpoint("host", 10000))
        member = manager.add_member(instance)

        assert manager.remove_member(member)
        assert not manager.remove_member(member)  # The second deletion is ignored.

        assert "/home/my_cluster/master/member_0000000000" not in self.storage.paths

    def test_callbacks(self):
        manager = ClusterManager(self.client, "/home/my_cluster")

        # Set up 2 listeners.
        instance1 = ServiceInstance(Endpoint("host1", 10000))
        handler1 = CallbackHandler()
        listener1 = ClusterListener(
            self.client,
            "/home/my_cluster",
            instance1,
            handler1.promotion_callback,
            handler1.demotion_callback,
            handler1.master_callback,
            handler1.termination_callback,
        )
        listener1.start()
        member1 = manager.add_member(instance1)

        instance2 = ServiceInstance(Endpoint("host2", 10000))
        handler2 = CallbackHandler()
        listener2 = ClusterListener(
            self.client,
            "/home/my_cluster",
            instance2,
            handler2.promotion_callback,
            handler2.demotion_callback,
            handler2.master_callback,
        )
        listener2.start()
        member2 = manager.add_member(instance2)

        # Test promotion.
        manager.promote_member(member1)

        assert handler1.promoted.wait(1)
        assert handler2.detected.get(True, 1) == instance1

        assert self.storage.paths["/home/my_cluster/master/member_0000000000"]["data"] == ServiceInstance.pack(
            instance1
        )
        assert self.storage.paths["/home/my_cluster/slaves/member_0000000001"]["data"] == ServiceInstance.pack(
            instance2
        )

        manager.promote_member(member2)

        assert handler1.demoted.wait(1)
        assert handler2.promoted.wait(1)

        assert self.storage.paths["/home/my_cluster/master/member_0000000001"]["data"] == ServiceInstance.pack(
            instance2
        )
        assert "/home/my_cluster/master/member_0000000000" not in self.storage.paths

        manager.remove_member(member2)
        assert handler2.demoted.wait(1)

        # Test removing cluster.
        manager.remove_member(member1)
        manager.delete_cluster()
        assert handler1.terminated.wait(1)

    def test_invalid_arguments(self):
        client = FakeClient()
        client.start()
        manager = ClusterManager(client, "/home/my_cluster")

        with pytest.raises(ValueError) as e:
            manager.promote_member("123")
        assert e.value.message == "Invalid member_id: 123"

    def test_invalid_znode(self):
        instance1 = ServiceInstance(Endpoint("host1", 10000))
        handler1 = CallbackHandler()
        listener1 = ClusterListener(
            self.client,
            "/home/my_cluster",
            instance1,
            handler1.promotion_callback,
            handler1.demotion_callback,
            handler1.master_callback,
        )
        listener1.start()

        self.client.ensure_path("/home/my_cluster/master")
        self.client.create("/home/my_cluster/master/member_", "Invalid Data", sequence=True)

        # Invalid ZNode data translates into a 'None' return.
        assert handler1.detected.get(True, 1) is None

    def test_existing_zk(self):
        """
      ClusterManager needs to be able to recover from an existing ZK group for scheduler failover.
    """
        manager = ClusterManager(self.client, "/home/my_cluster")

        instance1 = ServiceInstance(Endpoint("host1", 10000))
        member1 = manager.add_member(instance1)
        instance2 = ServiceInstance(Endpoint("host2", 10000))
        member2 = manager.add_member(instance2)

        assert self.storage.paths["/home/my_cluster/slaves/member_0000000000"]["data"] == ServiceInstance.pack(
            instance1
        )
        assert self.storage.paths["/home/my_cluster/slaves/member_0000000001"]["data"] == ServiceInstance.pack(
            instance2
        )

        manager.promote_member(member1)

        # Test the new ClusterManager.
        manager2 = ClusterManager(self.client, "/home/my_cluster")
        assert len(manager2._cluster.members) == 2
        assert member1 in manager2._cluster.members
        assert member2 in manager2._cluster.members
        assert manager2._cluster.members[member1] == ServiceInstance.pack(instance1)

    def test_remove_cluster(self):
        manager = ClusterManager(self.client, "/home/my_cluster")

        instance1 = ServiceInstance(Endpoint("host1", 10000))
        member1 = manager.add_member(instance1)
        instance2 = ServiceInstance(Endpoint("host2", 10000))
        member2 = manager.add_member(instance2)

        manager.promote_member(member1)

        with pytest.raises(ClusterManager.Error):
            manager.delete_cluster()

        manager.remove_member(member1)
        manager.remove_member(member2)
        manager.delete_cluster()

        assert "/home/my_cluster" not in self.storage.paths
Exemplo n.º 22
0
 def multiple_queues(self):
     client = FakeClient()
     client.start()
     yield [ZKDelayDeadlineQueue(client, "/") for _ in range(5)]
Exemplo n.º 23
0
 def setUp(self):
     self.storage = FakeStorage(SequentialThreadingHandler())
     self.client = FakeClient(storage=self.storage)
     self.client.start()
Exemplo n.º 24
0
class TestScheduler(unittest.TestCase):
    def setUp(self):
        self._driver = FakeDriver()
        self._storage = FakeStorage(SequentialThreadingHandler())
        self._zk_client = FakeClient(storage=self._storage)
        self._zk_client.start()

        self._framework_id = mesos_pb2.FrameworkID()
        self._framework_id.value = "framework_id_0"

        self._offer = mesos_pb2.Offer()
        self._offer.id.value = "offer_id_0"
        self._offer.framework_id.value = self._framework_id.value
        self._offer.slave_id.value = "slave_id_0"
        self._offer.hostname = "localhost"

        resources = create_resources(cpus=4,
                                     mem=512 * 3,
                                     ports=set([10000, 10001, 10002]))
        self._offer.resources.extend(resources)

        self._framework_user = "******"

        self._zk_url = "zk://host/mysos/test"
        self._cluster = MySQLCluster("cluster0", "user", "pass", 3)

        self._tmpdir = tempfile.mkdtemp()
        self._state_provider = LocalStateProvider(self._tmpdir)

        framework_info = mesos_pb2.FrameworkInfo(user=getpass.getuser(),
                                                 name="mysos",
                                                 checkpoint=False)
        self._state = Scheduler(framework_info)

    def tearDown(self):
        shutil.rmtree(self._tmpdir, True)  # Clean up after ourselves.

    def test_scheduler_recovery(self):
        scheduler1 = MysosScheduler(self._state, self._state_provider,
                                    self._framework_user, "./executor.pex",
                                    "cmd.sh", self._zk_client, self._zk_url,
                                    Amount(5, Time.SECONDS),
                                    "/etc/mysos/admin_keyfile.yml")
        scheduler1.registered(self._driver, self._framework_id, object())
        scheduler1.create_cluster("cluster1", "mysql_user", 3)
        scheduler1.resourceOffers(self._driver, [self._offer])

        # One task is launched for one offer.
        assert len(scheduler1._launchers["cluster1"]._cluster.tasks) == 1

        with pytest.raises(MysosScheduler.ClusterExists):
            scheduler1.create_cluster("cluster1", "mysql_user", 3)

        # FrameworkID should have been persisted.
        self._state = self._state_provider.load_scheduler_state()
        assert self._state.framework_info.id.value == self._framework_id.value

        # Simulate restart.
        scheduler2 = MysosScheduler(self._state, self._state_provider,
                                    self._framework_user, "./executor.pex",
                                    "cmd.sh", self._zk_client, self._zk_url,
                                    Amount(5, Time.SECONDS),
                                    "/etc/mysos/admin_keyfile.yml")

        # Scheduler always receives registered() with the same FrameworkID after failover.
        scheduler2.registered(self._driver, self._framework_id, object())

        assert len(scheduler2._launchers) == 1
        assert scheduler2._launchers["cluster1"].cluster_name == "cluster1"

        # Scheduler has recovered the cluster so it doesn't accept another of the same name.
        with pytest.raises(MysosScheduler.ClusterExists):
            scheduler2.create_cluster("cluster1", "mysql_user", 3)

    def test_scheduler_recovery_failure_before_launch(self):
        scheduler1 = MysosScheduler(self._state, self._state_provider,
                                    self._framework_user, "./executor.pex",
                                    "cmd.sh", self._zk_client, self._zk_url,
                                    Amount(5, Time.SECONDS),
                                    "/etc/mysos/admin_keyfile.yml")
        scheduler1.registered(self._driver, self._framework_id, object())
        scheduler1.create_cluster("cluster1", "mysql_user", 3)

        # Simulate restart before the task is successfully launched.
        scheduler2 = MysosScheduler(self._state, self._state_provider,
                                    self._framework_user, "./executor.pex",
                                    "cmd.sh", self._zk_client, self._zk_url,
                                    Amount(5, Time.SECONDS),
                                    "/etc/mysos/admin_keyfile.yml")

        assert len(scheduler2._launchers) == 0  # No launchers are recovered.

        # Scheduler always receives registered() with the same FrameworkID after failover.
        scheduler2.registered(self._driver, self._framework_id, object())

        assert len(scheduler2._launchers) == 1
        assert scheduler2._launchers["cluster1"].cluster_name == "cluster1"

        # Now offer the resources for this task.
        scheduler2.resourceOffers(self._driver, [self._offer])

        # One task is launched for the offer.
        assert len(
            scheduler2._launchers["cluster1"]._cluster.active_tasks) == 1

        # Scheduler has recovered the cluster so it doesn't accept another of the same name.
        with pytest.raises(MysosScheduler.ClusterExists):
            scheduler2.create_cluster("cluster1", "mysql_user", 3)

    def test_incompatible_resource_role(self):
        scheduler1 = MysosScheduler(
            self._state,
            self._state_provider,
            self._framework_user,
            "./executor.pex",
            "cmd.sh",
            self._zk_client,
            self._zk_url,
            Amount(5, Time.SECONDS),
            "/etc/mysos/admin_keyfile.yml",
            framework_role='mysos'
        )  # Require 'mysos' but the resources are in '*'.
        scheduler1.registered(self._driver, self._framework_id, object())
        scheduler1.create_cluster("cluster1", "mysql_user", 3)
        scheduler1.resourceOffers(self._driver, [self._offer])

        assert "declineOffer" in self._driver.method_calls
        assert len(self._driver.method_calls["declineOffer"]) == 1
        # [0][0][1]: [First declineOffer call][The positional args][The second
        # positional arg], which is a 'Filters' object.
        assert (self._driver.method_calls["declineOffer"][0][0][1].
                refuse_seconds == INCOMPATIBLE_ROLE_OFFER_REFUSE_DURATION.as_(
                    Time.SECONDS))
Exemplo n.º 25
0
class TestScheduler(unittest.TestCase):
  def setUp(self):
    self._driver = FakeDriver()
    self._storage = FakeStorage(SequentialThreadingHandler())
    self._zk_client = FakeClient(storage=self._storage)
    self._zk_client.start()

    self._framework_id = mesos_pb2.FrameworkID()
    self._framework_id.value = "framework_id_0"

    self._offer = mesos_pb2.Offer()
    self._offer.id.value = "offer_id_0"
    self._offer.framework_id.value = self._framework_id.value
    self._offer.slave_id.value = "slave_id_0"
    self._offer.hostname = "localhost"

    resources = create_resources(cpus=4, mem=512 * 3, ports=set([10000, 10001, 10002]))
    self._offer.resources.extend(resources)

    self._framework_user = "******"

    self._zk_url = "zk://host/mysos/test"
    self._cluster = MySQLCluster("cluster0", "user", "pass", 3)

    self._tmpdir = tempfile.mkdtemp()
    self._state_provider = LocalStateProvider(self._tmpdir)

    framework_info = mesos_pb2.FrameworkInfo(
        user=getpass.getuser(),
        name="mysos",
        checkpoint=False)
    self._state = Scheduler(framework_info)

  def tearDown(self):
    shutil.rmtree(self._tmpdir, True)  # Clean up after ourselves.

  def test_scheduler_recovery(self):
    scheduler1 = MysosScheduler(
        self._state,
        self._state_provider,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        self._zk_client,
        self._zk_url,
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml")
    scheduler1.registered(self._driver, self._framework_id, object())
    scheduler1.create_cluster("cluster1", "mysql_user", 3)
    scheduler1.resourceOffers(self._driver, [self._offer])

    # One task is launched for one offer.
    assert len(scheduler1._launchers["cluster1"]._cluster.tasks) == 1

    with pytest.raises(MysosScheduler.ClusterExists):
      scheduler1.create_cluster("cluster1", "mysql_user", 3)

    # FrameworkID should have been persisted.
    self._state = self._state_provider.load_scheduler_state()
    assert self._state.framework_info.id.value == self._framework_id.value

    # Simulate restart.
    scheduler2 = MysosScheduler(
        self._state,
        self._state_provider,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        self._zk_client,
        self._zk_url,
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml")

    # Scheduler always receives registered() with the same FrameworkID after failover.
    scheduler2.registered(self._driver, self._framework_id, object())

    assert len(scheduler2._launchers) == 1
    assert scheduler2._launchers["cluster1"].cluster_name == "cluster1"

    # Scheduler has recovered the cluster so it doesn't accept another of the same name.
    with pytest.raises(MysosScheduler.ClusterExists):
      scheduler2.create_cluster("cluster1", "mysql_user", 3)

  def test_scheduler_recovery_failure_before_launch(self):
    scheduler1 = MysosScheduler(
        self._state,
        self._state_provider,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        self._zk_client,
        self._zk_url,
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml")
    scheduler1.registered(self._driver, self._framework_id, object())
    scheduler1.create_cluster("cluster1", "mysql_user", 3)

    # Simulate restart before the task is successfully launched.
    scheduler2 = MysosScheduler(
        self._state,
        self._state_provider,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        self._zk_client,
        self._zk_url,
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml")

    assert len(scheduler2._launchers) == 0  # No launchers are recovered.

    # Scheduler always receives registered() with the same FrameworkID after failover.
    scheduler2.registered(self._driver, self._framework_id, object())

    assert len(scheduler2._launchers) == 1
    assert scheduler2._launchers["cluster1"].cluster_name == "cluster1"

    # Now offer the resources for this task.
    scheduler2.resourceOffers(self._driver, [self._offer])

    # One task is launched for the offer.
    assert len(scheduler2._launchers["cluster1"]._cluster.active_tasks) == 1

    # Scheduler has recovered the cluster so it doesn't accept another of the same name.
    with pytest.raises(MysosScheduler.ClusterExists):
      scheduler2.create_cluster("cluster1", "mysql_user", 3)

  def test_incompatible_resource_role(self):
    scheduler1 = MysosScheduler(
        self._state,
        self._state_provider,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        self._zk_client,
        self._zk_url,
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        framework_role='mysos')  # Require 'mysos' but the resources are in '*'.
    scheduler1.registered(self._driver, self._framework_id, object())
    scheduler1.create_cluster("cluster1", "mysql_user", 3)
    scheduler1.resourceOffers(self._driver, [self._offer])

    assert "declineOffer" in self._driver.method_calls
    assert len(self._driver.method_calls["declineOffer"]) == 1
    # [0][0][1]: [First declineOffer call][The positional args][The second
    # positional arg], which is a 'Filters' object.
    assert (self._driver.method_calls["declineOffer"][0][0][1].refuse_seconds ==
        INCOMPATIBLE_ROLE_OFFER_REFUSE_DURATION.as_(Time.SECONDS))
Exemplo n.º 26
0
 def setUp(self):
   self._storage = FakeStorage(SequentialThreadingHandler())
   self._client = FakeClient(storage=self._storage)
   self._client.start()
   self._self_instance = ServiceInstance(Endpoint("host", 10000))
   self._state_manager = FakeStateManager()
Exemplo n.º 27
0
def test_list_deploy_queue(mock_delay_deadline_queue_class, mock_kazoo_client):
    mock_request = mock.Mock()
    settings.system_paasta_config = mock.create_autospec(SystemPaastaConfig)
    mock_kazoo_client.return_value = FakeClient()

    available_service_instance = ServiceInstance(
        service="fake_service1",
        instance="fake_instance1",
        watcher="worker0",
        bounce_by=1577952000,
        wait_until=1577952000,
        enqueue_time=1577952000,
        bounce_start_time=1577952000,
        failures=1,
        processed_count=2,
    )
    unavailable_service_instance = ServiceInstance(
        service="fake_service2",
        instance="fake_instance2",
        watcher="worker1",
        bounce_by=1577952100,
        wait_until=1577952200,
        enqueue_time=1577952100,
        bounce_start_time=1577952100,
        failures=2,
        processed_count=3,
    )

    mock_delay_deadline_queue = mock_delay_deadline_queue_class.return_value
    mock_delay_deadline_queue.get_available_service_instances.return_value = [
        (mock.Mock(), available_service_instance)
    ]
    mock_delay_deadline_queue.get_unavailable_service_instances.return_value = [
        (mock.Mock(), mock.Mock(), unavailable_service_instance)
    ]

    output = deploy_queue.list_deploy_queue(mock_request)
    assert output == {
        "available_service_instances": [{
            "service": "fake_service1",
            "instance": "fake_instance1",
            "watcher": "worker0",
            "bounce_by": 1577952000,
            "wait_until": 1577952000,
            "enqueue_time": 1577952000,
            "bounce_start_time": 1577952000,
            "failures": 1,
            "processed_count": 2,
        }],
        "unavailable_service_instances": [{
            "service": "fake_service2",
            "instance": "fake_instance2",
            "watcher": "worker1",
            "bounce_by": 1577952100,
            "wait_until": 1577952200,
            "enqueue_time": 1577952100,
            "bounce_start_time": 1577952100,
            "failures": 2,
            "processed_count": 3,
        }],
    }
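
The two mock parameters above are supplied by @mock.patch decorators that fall outside the snippet (the bottom-most decorator binds to the first parameter). A sketch of the likely wiring; the patch targets inside paasta_tools.api.views.deploy_queue are assumptions:

from unittest import mock

from zake.fake_client import FakeClient


@mock.patch("paasta_tools.api.views.deploy_queue.KazooClient", autospec=True)
@mock.patch("paasta_tools.api.views.deploy_queue.ZKDelayDeadlineQueue", autospec=True)
def test_patch_wiring(mock_delay_deadline_queue_class, mock_kazoo_client):
    # As in the example above, the ZK client factory is stubbed with zake.
    mock_kazoo_client.return_value = FakeClient()
    assert isinstance(mock_kazoo_client(), FakeClient)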
Exemplo n.º 28
0
class TestLauncher(object):
  @pytest.fixture(params=[LocalStateProvider, ZooKeeperStateProvider], autouse=True)
  def setup(self, request):
    self._driver = FakeDriver()
    self._storage = FakeStorage(SequentialThreadingHandler())
    self._zk_client = FakeClient(storage=self._storage)
    self._zk_client.start()

    self._offer = mesos_pb2.Offer()
    self._offer.id.value = "offer_id_0"
    self._offer.framework_id.value = "framework_id_0"
    self._offer.slave_id.value = "slave_id_0"
    self._offer.hostname = "localhost"

    # Enough resources to fit three tasks.
    resources = create_resources(
        cpus=DEFAULT_TASK_CPUS * 3,
        mem=DEFAULT_TASK_MEM * 3,
        disk=DEFAULT_TASK_DISK * 3,
        ports=set([10000, 10001, 10002]))
    self._offer.resources.extend(resources)

    self._framework_user = "******"

    # Some tests use the default launcher; some don't.
    self._zk_url = "zk://host/mysos/test"

    self._scheduler_key = gen_encryption_key()
    self._password_box = PasswordBox(self._scheduler_key)

    self._cluster = MySQLCluster(
        "cluster0",
        "user",
        self._password_box.encrypt("pass"),
        3,
        DEFAULT_TASK_CPUS,
        DEFAULT_TASK_MEM,
        DEFAULT_TASK_DISK)

    # Construct the state provider based on the test parameter.
    if request.param == LocalStateProvider:
      tmpdir = tempfile.mkdtemp()
      self._state_provider = LocalStateProvider(tmpdir)
      request.addfinalizer(lambda: shutil.rmtree(tmpdir, True))  # Clean up after ourselves.
    elif request.param == ZooKeeperStateProvider:
      self._state_provider = ZooKeeperStateProvider(self._zk_client, "/mysos/test")

    self._launcher = MySQLClusterLauncher(
        self._driver,
        self._cluster,
        self._state_provider,
        self._zk_url,
        self._zk_client,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        self._scheduler_key,
        query_interval=Amount(150, Time.MILLISECONDS))  # Short interval.

    self._elected = threading.Event()
    self._launchers = [self._launcher]  # See teardown().

    request.addfinalizer(self.teardown)

  def teardown(self):
    for launcher in self._launchers:
      if launcher._elector:
        launcher._elector.abort()  # Abort the thread even if the election is pending.
        launcher._elector.join()

  def test_launch_cluster_all_nodes_successful(self):
    for i in range(self._cluster.num_nodes):
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes

    # No new tasks are launched.
    assert self._launcher.launch(self._offer)[0] is None
    assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes

    # All 3 nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING  # Valid state.
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
    # Two slaves.
    assert len([x for x in self._storage.paths.keys() if x.startswith(
        "/mysos/test/cluster0/slaves/member_")]) == 2

  def test_launch_cluster_insufficient_resources(self):
    """All but one slave in the slave are launched successfully."""
    del self._offer.resources[:]
    resources = create_resources(
        cpus=DEFAULT_TASK_CPUS * 3,
        mem=DEFAULT_TASK_MEM * 3,
        disk=DEFAULT_TASK_DISK * 3 - Amount(1, Data.MB),  # 1mb less than required disk space.
        ports=set([10000, 10001, 10002]))
    self._offer.resources.extend(resources)

    # The offer is 1 MB short on disk, so the final task cannot be launched.
    for i in range(self._cluster.num_nodes - 1):
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes - 1

    # The final task cannot get launched.
    assert self._launcher.launch(self._offer)[0] is None
    assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes - 1

    # The two nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING  # Valid state.
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes - 1):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
    # One slave.
    assert len([x for x in self._storage.paths.keys() if x.startswith(
      "/mysos/test/cluster0/slaves/member_")]) == 1

  def test_two_launchers(self):
    """Two launchers share resources and launch their clusters successfully."""
    launchers = [
      MySQLClusterLauncher(
          self._driver,
          MySQLCluster(
              "cluster0",
              "user0",
              self._password_box.encrypt("pass0"),
              1,
              DEFAULT_TASK_CPUS,
              DEFAULT_TASK_MEM,
              DEFAULT_TASK_DISK),
          self._state_provider,
          self._zk_url,
          self._zk_client,
          self._framework_user,
          "./executor.pex",
          "cmd.sh",
          Amount(5, Time.SECONDS),
          "/etc/mysos/admin_keyfile.yml",
          self._scheduler_key),
      MySQLClusterLauncher(
          self._driver,
          MySQLCluster(
              "cluster1",
              "user1",
              self._password_box.encrypt("pass1"),
              2,
              DEFAULT_TASK_CPUS,
              DEFAULT_TASK_MEM,
              DEFAULT_TASK_DISK),
          self._state_provider,
          self._zk_url,
          self._zk_client,
          self._framework_user,
          "./executor.pex",
          "cmd.sh",
          Amount(5, Time.SECONDS),
          "/etc/mysos/admin_keyfile.yml",
          self._scheduler_key)]
    self._launchers.extend(launchers)

    resources = create_resources(
        cpus=DEFAULT_TASK_CPUS * 3,
        mem=DEFAULT_TASK_MEM * 3,
        disk=DEFAULT_TASK_DISK * 3,
        ports=set([10000, 10001, 10002]))
    self._offer.resources.extend(resources)

    # Three nodes in total across two clusters.
    # Simulate the scheduler.
    for i in range(3):
      for launcher in launchers:
        task_id, remaining = launcher.launch(self._offer)
        if task_id:
          # Update the offer so other launchers will use its remaining resources.
          del self._offer.resources[:]
          self._offer.resources.extend(remaining)
          break

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == 3

  def test_invalid_status_update(self):
    """Launcher raises an exception when an invalid status is received."""
    self._cluster.num_nodes = 1
    launcher = MySQLClusterLauncher(
        self._driver,
        self._cluster,
        self._state_provider,
        self._zk_url,
        self._zk_client,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        self._scheduler_key)
    self._launchers.append(launcher)

    resources = create_resources(
        cpus=DEFAULT_TASK_CPUS,
        mem=DEFAULT_TASK_MEM,
        disk=DEFAULT_TASK_DISK,
        ports=set([10000]))
    self._offer.resources.extend(resources)

    task_id, _ = launcher.launch(self._offer)
    assert task_id == "mysos-cluster0-0"

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes

    status = mesos_pb2.TaskStatus()
    status.task_id.value = task_id
    status.state = mesos_pb2.TASK_RUNNING  # Valid state.
    launcher.status_update(status)

    status.state = mesos_pb2.TASK_FINISHED  # An invalid state.

    with pytest.raises(MySQLClusterLauncher.Error):
      launcher.status_update(status)

  def test_terminal_status_update(self):
    """Launcher reacts to terminated task by launching a new one."""
    self._cluster.num_nodes = 1
    launcher = MySQLClusterLauncher(
        self._driver,
        self._cluster,
        self._state_provider,
        self._zk_url,
        self._zk_client,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        Amount(1, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        self._scheduler_key)
    self._launchers.append(launcher)

    resources = create_resources(
        cpus=DEFAULT_TASK_CPUS,
        mem=DEFAULT_TASK_MEM,
        disk=DEFAULT_TASK_DISK,
        ports=set([10000]))
    self._offer.resources.extend(resources)

    task_id, _ = launcher.launch(self._offer)
    assert task_id == "mysos-cluster0-0"

    launched = self._driver.method_calls["launchTasks"]
    assert len(launched) == self._cluster.num_nodes

    status = mesos_pb2.TaskStatus()
    status.task_id.value = task_id
    status.state = mesos_pb2.TASK_RUNNING
    launcher.status_update(status)

    assert len(launcher._cluster.running_tasks) == 1

    status.state = mesos_pb2.TASK_LOST
    launcher.status_update(status)

    assert len(launcher._cluster.running_tasks) == 0

    task_id, _ = launcher.launch(self._offer)
    assert task_id == "mysos-cluster0-1"

    launched = self._driver.method_calls["launchTasks"]

    # One task is relaunched to make up for the lost one.
    assert len(launched) == self._cluster.num_nodes + 1

  def test_master_failover(self):
    for i in range(self._cluster.num_nodes):
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes

    # All 3 nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING
    status.slave_id.value = self._offer.slave_id.value

    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    # No log positions queries are sent for the first epoch.
    assert "sendFrameworkMessage" not in self._driver.method_calls

    # Wait for the election to complete.
    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths

    # Now fail the master task.
    status.task_id.value = "mysos-cluster0-0"
    status.state = mesos_pb2.TASK_FAILED
    self._launcher.status_update(status)

    assert len(self._launcher._cluster.running_tasks) == 2

    # Log-position queries are sent.
    self._launcher._elector._elect()
    assert len(self._driver.method_calls["sendFrameworkMessage"]) >= 2

    for i in range(1, self._cluster.num_nodes):
      self._launcher.framework_message(
          "mysos-cluster0-%s" % i,
          self._offer.slave_id.value,
          json.dumps(dict(epoch=1, position=str(i))))

    # Wait for the election to complete.
    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The slave with the highest position is elected.
    assert "/mysos/test/cluster0/master/member_0000000002" in self._storage.paths

    assert len(self._launcher._cluster.running_tasks) == 2

    # When a new offer comes in, a new task is launched.
    del self._offer.resources[:]
    resources = create_resources(
        cpus=DEFAULT_TASK_CPUS,
        mem=DEFAULT_TASK_MEM,
        disk=DEFAULT_TASK_DISK,
        ports=set([10000]))
    self._offer.resources.extend(resources)
    task_id, _ = self._launcher.launch(self._offer)
    assert task_id == "mysos-cluster0-3"

    launched = self._driver.method_calls["launchTasks"]
    # One task is relaunched to make up for the failed one.
    assert len(launched) == self._cluster.num_nodes + 1

  def test_launcher_recovery_after_election_completed(self):
    # 1. Launch a cluster on the running launcher.
    for i in range(self._cluster.num_nodes):
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes

    # No new tasks are launched.
    assert self._launcher.launch(self._offer)[0] is None
    assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes

    # All 3 nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
    # Two slaves.
    assert len([x for x in self._storage.paths.keys() if x.startswith(
        "/mysos/test/cluster0/slaves/member_")]) == 2

    # 2. Recover the launcher.
    self._cluster = self._state_provider.load_cluster_state(self._cluster.name)
    self._launcher = MySQLClusterLauncher(
        self._driver,
        self._cluster,
        self._state_provider,
        self._zk_url,
        self._zk_client,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        self._scheduler_key,
        query_interval=Amount(150, Time.MILLISECONDS))

    # Now fail the master task.
    status.task_id.value = "mysos-cluster0-0"
    status.state = mesos_pb2.TASK_FAILED
    self._launcher.status_update(status)

    for i in range(1, self._cluster.num_nodes):
      self._launcher.framework_message(
          "mysos-cluster0-%s" % i,
          self._offer.slave_id.value,
          json.dumps(dict(epoch=1, position=str(i))))

    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The second slave has the larger position and is elected.
    assert "/mysos/test/cluster0/master/member_0000000002" in self._storage.paths

  def test_launcher_recovery_before_election_completed(self):
    # 1. Launch a cluster on the running launcher.
    for i in range(self._cluster.num_nodes):
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes

    # No new tasks are launched.
    assert self._launcher.launch(self._offer)[0] is None
    assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes

    # All 3 nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
    # Two slaves.
    assert len([x for x in self._storage.paths.keys() if x.startswith(
        "/mysos/test/cluster0/slaves/member_")]) == 2

    # Now fail the master task which leads to re-election.
    status.task_id.value = "mysos-cluster0-0"
    status.state = mesos_pb2.TASK_FAILED
    self._launcher.status_update(status)

    # 2. Recover the launcher.
    self._cluster = self._state_provider.load_cluster_state(self._cluster.name)
    self._launcher = MySQLClusterLauncher(
        self._driver,
        self._cluster,
        self._state_provider,
        self._zk_url,
        self._zk_client,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        self._scheduler_key,
        query_interval=Amount(150, Time.MILLISECONDS))

    for i in range(1, self._cluster.num_nodes):
      self._launcher.framework_message(
          "mysos-cluster0-%s" % i,
          self._offer.slave_id.value,
          json.dumps(dict(epoch=2, position=str(i))))

    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The second slave has the larger position and is elected.
    assert "/mysos/test/cluster0/master/member_0000000002" in self._storage.paths

  def test_launcher_kill(self):
    for i in range(self._cluster.num_nodes):
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes

    # No new tasks are launched.
    assert self._launcher.launch(self._offer)[0] is None
    assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes

    # All 3 nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING  # Valid state.
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
    # Two slaves.
    assert len([x for x in self._storage.paths.keys() if x.startswith(
        "/mysos/test/cluster0/slaves/member_")]) == 2

    # Kill the cluster.
    with pytest.raises(MySQLClusterLauncher.PermissionError):
      self._launcher.kill("wrong_password")

    # Correct password.
    self._launcher.kill(self._password_box.decrypt(self._cluster.encrypted_password))

    # All 3 nodes are successfully killed.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_KILLED
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    assert "/mysos/test/cluster0" not in self._storage.paths  # ServerSets removed.
    assert not self._state_provider.load_cluster_state("cluster0")  # State removed.

  def test_launcher_recovery_corrupted_password(self):
    # 1. Launch a single instance for a cluster on the running launcher.
    task_id, remaining = self._launcher.launch(self._offer)
    del self._offer.resources[:]
    self._offer.resources.extend(remaining)
    assert task_id == "mysos-cluster0-0"

    # The task has successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING
    status.slave_id.value = self._offer.slave_id.value
    status.task_id.value = "mysos-cluster0-0"
    self._launcher.status_update(status)

    # 2. Recover the launcher.
    self._cluster = self._state_provider.load_cluster_state(self._cluster.name)
    self._cluster.encrypted_password = "******"

    # The corrupted password causes the launcher constructor to fail.
    with pytest.raises(ValueError):
      self._launcher = MySQLClusterLauncher(
          self._driver,
          self._cluster,
          self._state_provider,
          self._zk_url,
          self._zk_client,
          self._framework_user,
          "./executor.pex",
          "cmd.sh",
          Amount(5, Time.SECONDS),
          "/etc/mysos/admin_keyfile.yml",
          self._scheduler_key,
          query_interval=Amount(150, Time.MILLISECONDS))
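
Two mechanisms drive the failover and kill tests above. First, re-election: after the master task fails, each surviving slave reports its log position for the current epoch via a framework message, and the slave with the highest position is promoted. A toy sketch of that selection rule (plain Python, not the launcher's actual elector):

import json

# Payloads in the shape the tests send via framework_message().
reports = {
    "mysos-cluster0-1": json.dumps(dict(epoch=1, position="1")),
    "mysos-cluster0-2": json.dumps(dict(epoch=1, position="2")),
}
positions = dict((task, json.loads(body)["position"]) for task, body in reports.items())

# The slave with the highest reported position wins the election.
assert max(positions, key=lambda task: int(positions[task])) == "mysos-cluster0-2"

Second, password handling: the scheduler key encrypts cluster passwords at rest, kill() checks the supplied password against the stored ciphertext, and a corrupted ciphertext makes recovery fail with ValueError. A sketch of that round trip, assuming PasswordBox and gen_encryption_key live in mysos.scheduler.password:

from mysos.scheduler.password import PasswordBox, gen_encryption_key  # Assumed path.

box = PasswordBox(gen_encryption_key())

encrypted = box.encrypt("pass")
assert box.decrypt(encrypted) == "pass"  # Round trip.
assert box.match("pass", encrypted)      # How recovered passwords are verified.

try:
  box.decrypt("not-a-valid-ciphertext")
except ValueError:
  pass  # Mirrors the corrupted-password test above.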
Exemplo n.º 29
0
 def setUp(self):
   self._storage = FakeStorage(SequentialThreadingHandler())
   self._client = FakeClient(storage=self._storage)
   self._client.start()
   self._state_provider = ZooKeeperStateProvider(self._client, '/mysos')
Exemplo n.º 30
0
def panoptes_mock_kazoo_client(**kwargs):
    return FakeClient()