示例#1
0
def test_gpu_e2e(ray_8_cpus_gpus):
    """End-to-end elastic run with GPU slots under a scripted discovery.

    Builds an ``ElasticRayExecutor`` pinned to exactly four processes
    (``min_np == max_np == 4``) with one GPU per slot, runs the training
    function, and checks that four results come back and that the fetched
    log contains exactly four "started" and four "finished" events.
    """
    expected_workers = 4

    with fault_tolerance_patches():
        # Each stage pairs a number (None for the final stage) with the host
        # list to report -- presumably a duration and "host:slots" strings;
        # TODO confirm against SimpleTestDiscovery.
        schedule = [
            (20, ['host-1:1']),
            (60, ['host-1:1', 'host-4:1', 'host-5:1']),
            (None, ['host-1:1', 'host-4:1', 'host-5:1', 'host-6:1']),
        ]
        # Use the first network interface psutil reports.
        first_nic = list(psutil.net_if_addrs())[0]

        settings = ElasticRayExecutor.create_settings(
            min_np=expected_workers,
            max_np=expected_workers,
            nics={first_nic},
        )
        # Swap in the scripted discovery; override_discovery=False below is
        # passed so the executor is asked not to replace it.
        settings.discovery = SimpleTestDiscovery(schedule)
        executor = ElasticRayExecutor(
            settings,
            gpus_per_slot=1,
            use_gpu=True,
            override_discovery=False,
        )

        logger, training_fn = _create_training_function(iterations=100)
        executor.start()
        results = executor.run(training_fn)
        assert len(results) == expected_workers

        events = ray.get(logger.fetch.remote())
        started = [event for event in events if "started" in event]
        assert len(started) == expected_workers, events
        finished = [event for event in events if "finished" in event]
        assert len(finished) == expected_workers, events
示例#2
0
def test_min_np(ray_8_cpus):
    """Elastic run that needs four processes while hosts appear gradually.

    The scripted discovery starts with a single host and grows to four.
    The executor is configured with ``min_np == max_np == 4`` and one CPU
    per slot. The test checks that four results are returned and that the
    ``StatusCallback`` trace holds exactly four "started" and four
    "finished" events.
    """
    required_np = 4

    with fault_tolerance_patches():
        # Use the first network interface psutil reports.
        nic_name = list(psutil.net_if_addrs())[0]

        # Each stage pairs a number (None for the final stage) with the host
        # list to report -- presumably a duration and "host:slots" strings;
        # TODO confirm against SimpleTestDiscovery.
        host_stages = [
            (10, ['host-1:1']),
            (10, ['host-1:1', 'host-4:1', 'host-5:1']),
            (None, ['host-1:1', 'host-4:1', 'host-5:1', 'host-6:1']),
        ]

        settings = ElasticRayExecutor.create_settings(
            min_np=required_np, max_np=required_np, nics={nic_name})
        settings.discovery = SimpleTestDiscovery(host_stages)
        executor = ElasticRayExecutor(
            settings, cpus_per_slot=1, override_discovery=False)

        training_fn = _create_training_function(iterations=30)
        executor.start()
        # The callback collects the events that are inspected below.
        trace = StatusCallback()
        results = executor.run(training_fn, callbacks=[trace])
        assert len(results) == required_np

        events = trace.fetch()
        n_started = sum(1 for entry in events if "started" in entry)
        assert n_started == required_np, events
        n_finished = sum(1 for entry in events if "finished" in entry)
        assert n_finished == required_np, events