Example #1
0
def test_equal_gpu_allocation(num_workers, num_gpus_per_worker):
    """Check that GPU slots are packed onto the minimum number of tasks.

    For each requested slot count, verifies both the task count reported by
    the runner and the per-task GPU counts gathered from inside the barrier
    tasks via ``allGather``.
    """
    def train_fn():
        import os
        from pyspark import BarrierTaskContext
        ctx = BarrierTaskContext.get()
        visible = os.environ['CUDA_VISIBLE_DEVICES']
        # An empty CUDA_VISIBLE_DEVICES string means no GPUs were assigned.
        local_gpu_count = len(os.environ['CUDA_VISIBLE_DEVICES'].split(',')) if visible else 0
        return [int(v) for v in ctx.allGather(str(local_gpu_count))]

    # (num_slots, expected task count, expected GPUs seen by each task)
    cases = [
        (2, 1, [2]),
        (4, 1, [4]),
        (6, 2, [3, 3]),
        (8, 2, [4, 4]),
    ]
    for slots, expected_tasks, expected_gpus in cases:
        runner = MirroredStrategyRunner(num_slots=slots)
        assert runner.get_num_tasks() == expected_tasks
        assert runner.run(train_fn) == expected_gpus
def test_spark_task_cuda_devices_env_support(num_workers, num_gpus_per_worker):
    """Check CUDA_VISIBLE_DEVICES is set to distinct, valid GPU ids.

    For each slot count, the training function reports the raw env value;
    the ids must be unique, match the slot count, and fall in the id range
    the test fixture configures (presumably 10-13 here — set by the
    surrounding Spark test conf, not visible in this file).
    """
    def train_fn():
        import os
        return os.environ['CUDA_VISIBLE_DEVICES']

    for slots in (2, 3, 4):
        runner = MirroredStrategyRunner(num_slots=slots)
        env_value = runner.run(train_fn)
        assigned_ids = {int(part) for part in env_value.split(',')}
        # Unique ids, one per requested slot.
        assert len(assigned_ids) == slots
        # Every id must come from the fixture's configured GPU id pool.
        assert assigned_ids <= {10, 11, 12, 13}
Example #3
0
def test_cpu_training_with_gpus(num_workers, num_gpus_per_worker):
    """Check that CPU-only training (use_gpu=False) assigns no GPUs.

    With ``use_gpu=False`` each slot maps to its own task, and every task
    should observe an empty ``CUDA_VISIBLE_DEVICES``, i.e. zero GPUs.
    """
    def train_fn():
        # Bug fix: `os` must be imported inside train_fn. This function is
        # serialized and executed on remote Spark barrier tasks, where the
        # test module's top-level imports are not in scope — the sibling
        # train_fns in this file import os locally for the same reason.
        import os
        from pyspark import BarrierTaskContext
        context = BarrierTaskContext.get()
        cuda_state = os.environ['CUDA_VISIBLE_DEVICES']
        if cuda_state:
            num_gpus = len(os.environ['CUDA_VISIBLE_DEVICES'].split(','))
        else:
            # Empty string means no GPUs are visible to this task.
            num_gpus = 0
        # Gather every task's GPU count so the driver can assert on all of them.
        return [int(e) for e in context.allGather(str(num_gpus))]

    runner = MirroredStrategyRunner(num_slots=2, use_gpu=False)
    assert runner.get_num_tasks() == 2
    gpus_used_by_each_task = runner.run(train_fn)
    assert gpus_used_by_each_task == [0, 0]
def test_equal_gpu_allocation(num_workers, num_gpus_per_worker):
    """Check GPU slots are spread as evenly as possible across tasks.

    Derives the expected task count from the Spark conf's per-task GPU
    amount, then verifies each task sees its fair share of GPUs (the
    remainder is distributed one extra GPU to the first tasks).
    """
    def train_fn():
        import os
        from pyspark import BarrierTaskContext
        ctx = BarrierTaskContext.get()
        visible = os.environ['CUDA_VISIBLE_DEVICES']
        # Empty env value means this task was assigned no GPUs.
        local_gpu_count = len(os.environ['CUDA_VISIBLE_DEVICES'].split(',')) if visible else 0
        return [int(v) for v in ctx.allGather(str(local_gpu_count))]

    for slots in (2, 4, 6, 8):
        runner = MirroredStrategyRunner(num_slots=slots)
        per_task_limit = int(
            runner.sc.getConf().get('spark.task.resource.gpu.amount'))
        expected_tasks = math.ceil(slots / per_task_limit)
        assert runner.get_num_tasks() == expected_tasks
        # First `extra` tasks get one GPU more than the rest.
        base, extra = divmod(slots, expected_tasks)
        expected_allocation = [base + 1] * extra + [base] * (expected_tasks - extra)
        assert runner.run(train_fn) == expected_allocation