示例#1
0
  def test_healthy_vm_list_client_cluster(self):
    """Resolve client workers when VM names are passed explicitly via `vms`.

    The instance-group listing map is left empty; only per-instance
    responses for the four named VMs are handed to the mocked compute
    service.
    """
    # Arrange
    vm_ips = [('fake-ig-' + c, '10.0.0.' + ip)
              for c, ip in zip('abcd', '0123')]
    instance_resp_map = {
        name: gen_fake_instances_get_entry(name, 'n1-standard-16', ip,
                                           'RUNNING')
        for name, ip in vm_ips
    }
    list_instances_map = {}
    compute_service = build_mock_compute_service(instance_resp_map,
                                                 list_instances_map)
    noop_tpu_service = build_mock_tpu_service({})
    self.mock_discovery.side_effect = build_mock_services_fn(
        compute_service, noop_tpu_service)

    # Act
    resolver = ClusterResolver(['fake-tpu'],
                               vms=[name for name, _ in vm_ips])
    vm_cluster = resolver.get_client_workers()

    # Assert: one ClientWorker per VM, compared without regard to order.
    expected = [
        ClientWorker(
            internal_ip=ip,
            machine_type='n1-standard-16',
            zone='fake-zone',
            hostname=name) for name, ip in vm_ips
    ]
    self.assertCountEqual(expected, vm_cluster)
示例#2
0
  def test_healthy_sea_service_cluster(self):
    """256 READY/HEALTHY 'v3-8' TPU entries yield 256 service workers.

    Each fake TPU 'fake-tpu-<i>' is registered with the single IP
    '10.0.0.<i>'; the result is compared without regard to order.
    """
    # Arrange
    tpu_names = ['fake-tpu-{}'.format(i) for i in range(256)]
    tpu_ips = ['10.0.0.{}'.format(i) for i in range(256)]
    tpu_resp_map = {
        name: gen_fake_tpu_entry(
            'v3-8', [ip], name, 'READY', 'pytorch-nightly', health='HEALTHY')
        for name, ip in zip(tpu_names, tpu_ips)
    }
    noop_compute_service = build_mock_compute_service({}, {})
    tpu_service = build_mock_tpu_service(tpu_resp_map)
    self.mock_discovery.side_effect = build_mock_services_fn(
        noop_compute_service, tpu_service)

    # Act
    resolver = ClusterResolver(list(tpu_resp_map))
    service_workers = resolver.get_service_workers()

    # Assert
    expected = [
        ServiceWorker(
            internal_ip=ip,
            port='8470',
            machine_type='v3-8',
            zone='fake-zone',
            sw_version='pytorch-nightly') for ip in tpu_ips
    ]
    self.assertCountEqual(expected, service_workers)
示例#3
0
  def test_healthy_cluster(self):
    """Four RUNNING VMs plus a four-IP READY/HEALTHY pod form a Cluster.

    The mocked instance group 'fake-ig' lists 'fake-ig-a'..'fake-ig-d' and
    the mocked TPU 'fake-pod' exposes '10.0.0.0'..'10.0.0.3'; the resolved
    cluster must equal a Cluster built from the matching workers.
    """
    # Arrange: compute side.
    vm_names = ['fake-ig-' + c for c in 'abcd']
    vm_ips = ['10.0.0.' + ip for ip in '0123']
    group_items = [
        gen_fake_ig_list_instances_entry(name, 'RUNNING') for name in vm_names
    ]
    list_instances_map = {
        'fake-ig': {
            'kind': 'compute#instanceGroupsListInstances',
            'items': group_items,
        },
    }
    instance_resp_map = {
        name: gen_fake_instances_get_entry(name, 'n1-standard-16', ip,
                                           'RUNNING')
        for name, ip in zip(vm_names, vm_ips)
    }
    compute_service = build_mock_compute_service(instance_resp_map,
                                                 list_instances_map)

    # Arrange: TPU side.
    pod_entry = gen_fake_tpu_entry(
        'v3-32', ['10.0.0.{}'.format(i) for i in range(4)],
        'fake-pod',
        'READY',
        'pytorch-nightly',
        health='HEALTHY')
    tpu_resp_map = {'fake-pod': pod_entry}
    tpu_service = build_mock_tpu_service(tpu_resp_map)
    self.mock_discovery.side_effect = build_mock_services_fn(
        compute_service, tpu_service)

    # Act
    resolver = ClusterResolver(list(tpu_resp_map))
    cluster = resolver.get_cluster()

    # Assert
    expected_client_workers = [
        ClientWorker(
            internal_ip=ip,
            machine_type='n1-standard-16',
            zone='fake-zone',
            hostname=name) for name, ip in zip(vm_names, vm_ips)
    ]
    expected_service_workers = [
        ServiceWorker(
            internal_ip='10.0.0.{}'.format(i),
            port='8470',
            machine_type='v3-32',
            zone='fake-zone',
            sw_version='pytorch-nightly') for i in range(4)
    ]
    self.assertEqual(
        Cluster(expected_client_workers, expected_service_workers), cluster)
示例#4
0
    def test_empty_instance_group_client_cluster(self):
        """Expect a RuntimeError when the instance group lists no instances.

        The mocked listing for 'fake-ig' has an empty 'items' list, even
        though a get-response for 'fake-ig-a' is registered.
        """
        # Arrange
        empty_group = {
            'kind': 'compute#instanceGroupsListInstances',
            'items': [],
        }
        list_instances_map = {'fake-ig': empty_group}
        lone_instance = gen_fake_instances_get_entry(
            'fake-ig-a', 'n1-standard-16', '10.0.0.0', 'RUNNING')
        instance_resp_map = {'fake-ig-a': lone_instance}
        compute_service = build_mock_compute_service(instance_resp_map,
                                                     list_instances_map)
        noop_tpu_service = build_mock_tpu_service({})
        self.mock_discovery.side_effect = build_mock_services_fn(
            compute_service, noop_tpu_service)

        # Act
        resolver = ClusterResolver(['fake-tpu'])

        # Assert
        with self.assertRaisesRegex(RuntimeError,
                                    '.*vms is empty in instance group.*'):
            resolver._get_client_workers()
示例#5
0
    def test_non_ready_sea_service_cluster(self):
        """Expect a RuntimeError naming the TPU whose state is not READY.

        Four 'v3-8' TPU entries are registered; 'fake-tpu-2' is in state
        'CREATING' and is created without a `health` argument, the others
        are READY and HEALTHY.
        """
        # Arrange: index i maps to TPU 'fake-tpu-<i>' with IP '10.0.0.<i>'.
        states = ['READY', 'READY', 'CREATING', 'READY']
        tpu_resp_map = {}
        for index, state in enumerate(states):
            name = 'fake-tpu-{}'.format(index)
            # Only the READY entries carry an explicit health value.
            health_kwargs = {'health': 'HEALTHY'} if state == 'READY' else {}
            tpu_resp_map[name] = gen_fake_tpu_entry(
                'v3-8', ['10.0.0.{}'.format(index)], name, state,
                'pytorch-nightly', **health_kwargs)
        noop_compute_service = build_mock_compute_service({}, {})
        tpu_service = build_mock_tpu_service(tpu_resp_map)
        self.mock_discovery.side_effect = build_mock_services_fn(
            noop_compute_service, tpu_service)

        # Act
        resolver = ClusterResolver(list(tpu_resp_map))

        # Assert
        with self.assertRaisesRegex(RuntimeError,
                                    'TPU fake-tpu-2 is not READY yet.*'):
            resolver.get_service_workers()
示例#6
0
 def get_tpu_fn(*args, **kwargs):
     """Return a mock request whose `execute()` gives a canned response.

     The node name is parsed out of `kwargs['name']` and used as the key
     into `get_tpu_resp`, which comes from the enclosing scope. Positional
     arguments are accepted but ignored.
     """
     tpu_name = ClusterResolver._parse_resource_url(kwargs['name'], 'nodes')
     canned_response = get_tpu_resp[tpu_name]
     request = mock.MagicMock()
     request.execute.return_value = canned_response
     return request
示例#7
0
    def test_unknown_health_pod_service_cluster(self):
        """Expect a RuntimeError when the pod entry has no explicit health.

        'fake-pod' is registered as a READY 'v3-32' with four IPs, but
        `gen_fake_tpu_entry` is called without a `health` argument.
        """
        # Arrange
        pod_ips = ['10.0.0.{}'.format(i) for i in range(4)]
        pod_entry = gen_fake_tpu_entry('v3-32', pod_ips, 'fake-pod', 'READY',
                                       'pytorch-nightly')
        tpu_resp_map = {'fake-pod': pod_entry}
        noop_compute_service = build_mock_compute_service({}, {})
        tpu_service = build_mock_tpu_service(tpu_resp_map)
        self.mock_discovery.side_effect = build_mock_services_fn(
            noop_compute_service, tpu_service)

        # Act
        resolver = ClusterResolver(list(tpu_resp_map))

        # Assert
        with self.assertRaisesRegex(RuntimeError,
                                    'TPU fake-pod is not HEALTHY yet.*'):
            resolver.get_service_workers()
示例#8
0
    def test_unhealthy_client_cluster(self):
        """Expect a RuntimeError naming the instance that is not RUNNING.

        The mocked instance group lists four instances; 'fake-ig-b' is
        reported as 'PROVISIONING' in both the group listing and its
        per-instance response, the rest are 'RUNNING'.
        """
        # Arrange: (instance name, internal IP, status) per fake VM.
        fake_vms = [
            ('fake-ig-a', '10.0.0.0', 'RUNNING'),
            ('fake-ig-b', '10.0.0.1', 'PROVISIONING'),
            ('fake-ig-c', '10.0.0.2', 'RUNNING'),
            ('fake-ig-d', '10.0.0.3', 'RUNNING'),
        ]
        group_items = [
            gen_fake_ig_list_instances_entry(name, status)
            for name, _, status in fake_vms
        ]
        list_instances_map = {
            'fake-ig': {
                'kind': 'compute#instanceGroupsListInstances',
                'items': group_items,
            },
        }
        instance_resp_map = {
            name: gen_fake_instances_get_entry(name, 'n1-standard-16', ip,
                                               status)
            for name, ip, status in fake_vms
        }
        compute_service = build_mock_compute_service(instance_resp_map,
                                                     list_instances_map)
        noop_tpu_service = build_mock_tpu_service({})
        self.mock_discovery.side_effect = build_mock_services_fn(
            compute_service, noop_tpu_service)

        # Act
        resolver = ClusterResolver(['fake-tpu'])

        # Assert
        with self.assertRaisesRegex(
                RuntimeError, 'Instance fake-ig-b is not running yet.*'):
            resolver._get_client_workers()
示例#9
0
  def test_bad_cluster(self):
    """Expect a RuntimeError when client and service worker counts differ.

    The mocked instance group lists only three instances ('fake-ig-a' to
    'fake-ig-c') while the mocked 'fake-pod' TPU entry carries four IPs.
    Per-instance responses are registered for four VMs.
    """
    # Arrange: compute side.
    listed_items = [
        gen_fake_ig_list_instances_entry('fake-ig-' + c, 'RUNNING')
        for c in 'abc'
    ]
    list_instances_map = {
        'fake-ig': {
            'kind': 'compute#instanceGroupsListInstances',
            'items': listed_items,
        },
    }
    vm_ips = [('fake-ig-' + c, '10.0.0.' + ip)
              for c, ip in zip('abcd', '0123')]
    instance_resp_map = {
        name: gen_fake_instances_get_entry(name, 'n1-standard-16', ip,
                                           'RUNNING')
        for name, ip in vm_ips
    }
    compute_service = build_mock_compute_service(instance_resp_map,
                                                 list_instances_map)

    # Arrange: TPU side.
    pod_entry = gen_fake_tpu_entry(
        'v3-32', ['10.0.0.{}'.format(i) for i in range(4)],
        'fake-pod',
        'READY',
        'pytorch-nightly',
        health='HEALTHY')
    tpu_resp_map = {'fake-pod': pod_entry}
    tpu_service = build_mock_tpu_service(tpu_resp_map)
    self.mock_discovery.side_effect = build_mock_services_fn(
        compute_service, tpu_service)

    # Act
    resolver = ClusterResolver(list(tpu_resp_map))

    # Assert
    with self.assertRaisesRegex(
        RuntimeError,
        'The client_workers and service_workers must have a 1:1 mapping'):
      resolver.get_cluster()