Пример #1
0
    def testFailedWorkerPod(self):
        """
        Launch three worker pods with a command intended to fail and
        restart_policy="Never", to exercise the failed-worker bookkeeping.

        Polls the worker counter until it reports three "Failed" workers
        (or the polling budget is exhausted), removes the workers, waits
        for the counter to become empty, and then checks that the mocked
        ``recover_tasks`` was called with 0, 1 and 2 in any order.
        """
        poll_budget = 20
        dispatcher = _TaskDispatcher({"f": (0, 10)}, {}, {}, 1, 1)
        dispatcher.recover_tasks = MagicMock()
        # Timestamp plus random suffix; presumably to keep job names unique.
        unique_job_name = "test-failed-worker-pod-%d-%d" % (
            int(time.time()),
            random.randint(1, 101),
        )
        manager = InstanceManager(
            dispatcher,
            job_name=unique_job_name,
            image_name="gcr.io/google-samples/hello-app:1.0",
            worker_command=["badcommand"],
            worker_args=["badargs"],
            namespace="default",
            num_workers=3,
            restart_policy="Never",
        )
        manager.start_workers()

        # Wait until all three workers are counted as failed.
        polls_left = poll_budget
        while polls_left > 0:
            polls_left -= 1
            time.sleep(3)
            if manager.get_worker_counter()["Failed"] == 3:
                break

        manager.stop_relaunch_and_remove_workers()

        # Wait until the worker counter is empty after the removal.
        polls_left = poll_budget
        while polls_left > 0:
            polls_left -= 1
            time.sleep(3)
            if not manager.get_worker_counter():
                break

        expected_calls = [call(arg) for arg in range(3)]
        dispatcher.recover_tasks.assert_has_calls(
            expected_calls, any_order=True
        )
Пример #2
0
    def testCreateDeleteWorkerPod(self):
        """
        Start three worker pods running ``echo`` and then delete them.

        Polls the worker counter until three workers are counted as
        "Succeeded" (or the polling budget is used up), removes the
        workers, waits for the counter to become empty, and finally checks
        that the mocked ``recover_tasks`` received calls with 0, 1 and 2,
        in any order.
        """
        worker_total = 3
        dispatcher = _TaskDispatcher({"f": (0, 10)}, {}, {}, 1, 1)
        dispatcher.recover_tasks = MagicMock()
        name_suffix = (int(time.time()), random.randint(1, 101))
        manager = InstanceManager(
            dispatcher,
            job_name="test-create-worker-pod-%d-%d" % name_suffix,
            image_name="gcr.io/google-samples/hello-app:1.0",
            worker_command=["echo"],
            worker_args=[],
            namespace="default",
            num_workers=worker_total,
        )

        manager.start_workers()
        attempts = 20

        # Phase 1: wait for every worker to finish successfully.
        for _attempt in range(attempts):
            time.sleep(3)
            status = manager.get_worker_counter()
            if status["Succeeded"] != worker_total:
                continue
            break

        manager.stop_relaunch_and_remove_workers()

        # Phase 2: wait for the counter to report no workers at all.
        for _attempt in range(attempts):
            time.sleep(3)
            status = manager.get_worker_counter()
            if status:
                continue
            break

        dispatcher.recover_tasks.assert_has_calls(
            [call(0), call(1), call(2)],
            any_order=True,
        )
    def test_create_delete_worker_pod(self):
        """
        Start three ``ubuntu:18.04`` worker pods that run ``echo`` through
        bash, then remove them and assert the worker counter ends up empty.

        Both waiting phases poll every 3 seconds for at most 20 rounds: the
        first until three workers are counted as "Succeeded", the second
        until the counter is falsy. The last counter value read is what the
        final assertion checks.
        """
        rounds = 20
        dispatcher = _TaskDispatcher({"f": (0, 10)}, {}, {}, 1, 1)
        dispatcher.recover_tasks = MagicMock()
        pod_job_name = "test-create-worker-pod-%d-%d" % (
            int(time.time()),
            random.randint(1, 101),
        )
        manager = InstanceManager(
            dispatcher,
            job_name=pod_job_name,
            image_name="ubuntu:18.04",
            worker_command=["/bin/bash"],
            worker_args=["-c", "echo"],
            namespace="default",
            num_workers=3,
        )

        manager.start_workers()

        # Wait for all workers to complete.
        rounds_left = rounds
        while rounds_left:
            rounds_left -= 1
            time.sleep(3)
            snapshot = manager.get_worker_counter()
            if snapshot["Succeeded"] == 3:
                break

        manager.stop_relaunch_and_remove_workers()

        # Wait for the removal to be reflected in the counter.
        rounds_left = rounds
        while rounds_left:
            rounds_left -= 1
            time.sleep(3)
            snapshot = manager.get_worker_counter()
            if not snapshot:
                break

        self.assertFalse(snapshot)
    def test_relaunch_worker_pod(self):
        """
        Remove one live worker and verify that a new worker shows up.

        Steps:
          1. Start ``num_workers`` workers that sleep for 10 seconds.
          2. Poll until at least one worker is "Running" or "Pending".
          3. Snapshot the tracked worker keys and remove one live worker.
          4. Poll until a worker key that was not in the snapshot appears;
             fail the test if none appears within the polling budget.

        Workers are always removed at the end, even when an assertion
        fails, so a failing run does not leave pods behind.
        """
        num_workers = 3
        task_d = _TaskDispatcher({"f": (0, 10)}, {}, {}, 1, 1)
        instance_manager = InstanceManager(
            task_d,
            job_name="test-relaunch-worker-pod-%d-%d" %
            (int(time.time()), random.randint(1, 101)),
            image_name="ubuntu:18.04",
            worker_command=["/bin/bash"],
            worker_args=["-c", "sleep 10 #"],
            namespace="default",
            num_workers=num_workers,
        )

        instance_manager.start_workers()
        try:
            max_check_num = 60
            for _ in range(max_check_num):
                time.sleep(1)
                counters = instance_manager.get_worker_counter()
                if counters["Running"] + counters["Pending"] > 0:
                    break
            # Note: There is a slight chance of race condition: the
            # snapshot below may be taken before every initial worker has
            # been registered, in which case a late initial worker would be
            # mistaken for a relaunched one.
            # Hack to find a worker to remove
            current_workers = set()
            live_workers = set()
            with instance_manager._lock:
                for (
                    k,
                    (_, _, phase),
                ) in instance_manager._worker_pods_ip_phase.items():
                    current_workers.add(k)
                    if phase in ("Running", "Pending"):
                        live_workers.add(k)
            self.assertTrue(live_workers)

            instance_manager._remove_worker(live_workers.pop())
            # Verify a new worker gets launched: look for a key that was
            # not tracked before the removal. Surviving original workers
            # are all in the snapshot, so they cannot satisfy this check.
            for _ in range(max_check_num):
                time.sleep(1)
                with instance_manager._lock:
                    found = any(
                        k not in current_workers
                        for k in instance_manager._worker_pods_ip_phase
                    )
                if found:
                    # Break right away so the for/else below only fires
                    # when the polling budget is truly exhausted.
                    break
            else:
                self.fail("Failed to find newly launched worker.")
        finally:
            instance_manager.stop_relaunch_and_remove_workers()
    def test_get_worker_addrs(self):
        """
        Check that the number of alive worker addresses equals the number
        of workers counted as "Running".

        Starts three workers that sleep for 5 seconds, then polls every
        3 seconds for 20 rounds. On every round where at least one worker
        is "Running", the length of ``_get_alive_worker_addr()`` is compared
        with the "Running" count. Workers are removed in a ``finally``
        clause so a failed assertion does not leave pods behind.
        """
        task_d = _TaskDispatcher({"f": (0, 10)}, {}, {}, 1, 1)
        instance_manager = InstanceManager(
            task_d,
            job_name="test-create-worker-pod-%d-%d" %
            (int(time.time()), random.randint(1, 101)),
            image_name="ubuntu:18.04",
            worker_command=["/bin/bash"],
            worker_args=["-c", "sleep 5 #"],
            namespace="default",
            num_workers=3,
        )

        instance_manager.start_workers()
        try:
            max_check_num = 20
            for _ in range(max_check_num):
                time.sleep(3)
                counters = instance_manager.get_worker_counter()
                if counters["Running"]:
                    worker_addrs = instance_manager._get_alive_worker_addr()
                    self.assertEqual(len(worker_addrs), counters["Running"])
        finally:
            # Clean up even when the assertion above raises.
            instance_manager.stop_relaunch_and_remove_workers()