Example #1
def Center_loss_surgery(model):
    # broadcast the center_feature blob from gpu_0 to all other GPUs
    blobs = [
        'gpu_' + str(gpu_id) + '/center_feature'
        for gpu_id in range(cfg.NUM_GPUS)
    ]

    data = workspace.FetchBlob(blobs[0])
    for i, p in enumerate(blobs[1:]):
        with c2_utils.CudaScope(i + 1):
            workspace.FeedBlob(p, data)

    # all-reduce the center_feature gradient blobs so the GPU replicas stay in sync
    with c2_utils.CudaScope(0):
        gradients = [
            'gpu_' + str(gpu_id) + '/center_feature_g'
            for gpu_id in range(cfg.NUM_GPUS)
        ]
        if cfg.USE_NCCL:
            model.net.NCCLAllreduce(gradients, gradients)
        else:
            muji.Allreduce(model.net, gradients, reduced_affix='')

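    # all-reduce the center_feature_n_u blobs across GPUs in the same way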
    with c2_utils.CudaScope(0):
        gradients = [
            'gpu_' + str(gpu_id) + '/center_feature_n_u'
            for gpu_id in range(cfg.NUM_GPUS)
        ]
        if cfg.USE_NCCL:
            model.net.NCCLAllreduce(gradients, gradients)
        else:
            muji.Allreduce(model.net, gradients, reduced_affix='')
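
The surgery above assumes Detectron's cfg and c2_utils helpers. Below is a minimal sketch of the same broadcast step using plain Caffe2 calls, with core.DeviceScope and an explicit CUDA DeviceOption standing in for c2_utils.CudaScope; the blob shape, names, and GPU count are illustrative, and a CUDA build with at least two GPUs is needed to run it.

import numpy as np
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace

NUM_GPUS = 2  # assumed for illustration

# Seed the blob on gpu_0, then copy it to every other GPU so all replicas
# of the center_feature parameter start out identical.
with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
    workspace.FeedBlob('gpu_0/center_feature',
                       np.zeros((10, 256), dtype=np.float32))

data = workspace.FetchBlob('gpu_0/center_feature')
for gpu_id in range(1, NUM_GPUS):
    with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, gpu_id)):
        workspace.FeedBlob('gpu_{}/center_feature'.format(gpu_id), data)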
Example #2
def build_data_parallel_model(model, single_gpu_build_func):
    if model.train:
        all_loss_gradients = {}  # Will include loss gradients from all GPUs
        # Build the model on each GPU with correct name and device scoping
        for gpu_id in range(cfg.NUM_GPUS):
            with core.NameScope('gpu_{}'.format(gpu_id)):
                with core.DeviceScope(muji.OnGPU(gpu_id)):
                    all_loss_gradients.update(
                        single_gpu_build_func(model))
        # Add backward pass on all GPUs
        model.AddGradientOperators(all_loss_gradients)
        if cfg.NUM_GPUS > 1:
            # Need to all-reduce the per-GPU gradients if training with more
            # than 1 GPU
            all_params = model.TrainableParams()
            assert len(all_params) % cfg.NUM_GPUS == 0, \
                'This should not happen.'
            # The model parameters are replicated on each GPU; get the number
            # of distinct parameter blobs (i.e., the number of parameter blobs
            # on each GPU)
            params_per_gpu = int(len(all_params) / cfg.NUM_GPUS)
            with core.DeviceScope(muji.OnGPU(cfg.ROOT_GPU_ID)):
                # Iterate over distinct parameter blobs
                for i in range(params_per_gpu):
                    # Gradients from all GPUs for this parameter blob
                    gradients = [
                        model.param_to_grad[p]
                        for p in all_params[i::params_per_gpu]
                    ]
                    if len(gradients) > 0:
                        if cfg.USE_NCCL:
                            model.net.NCCLAllreduce(gradients, gradients)
                        else:
                            muji.Allreduce(
                                model.net, gradients, reduced_affix='')
        for gpu_id in range(cfg.NUM_GPUS):
            # After all-reduce, all GPUs perform SGD updates on their identical
            # params and gradients in parallel
            add_parameter_update_ops(model, gpu_id)
    else:
        # Testing only supports running on a single GPU
        with core.NameScope('gpu_{}'.format(cfg.ROOT_GPU_ID)):
            with core.DeviceScope(muji.OnGPU(cfg.ROOT_GPU_ID)):
                single_gpu_build_func(model)
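
The per-GPU replication in build_data_parallel_model comes from the nested scopes. Here is a minimal sketch (not from the repository) of how core.NameScope and core.DeviceScope produce the per-GPU 'gpu_<id>/...' blobs; the net name and the ConstantFill op are illustrative only.

from caffe2.python import core, muji

net = core.Net('scope_demo')
for gpu_id in range(2):  # two GPUs assumed
    # NameScope prefixes every blob created inside it with 'gpu_<id>/',
    # while DeviceScope pins the corresponding ops to that GPU.
    with core.NameScope('gpu_{}'.format(gpu_id)):
        with core.DeviceScope(muji.OnGPU(gpu_id)):
            net.ConstantFill([], 'w', shape=[1], value=0.0)

print(net.Proto())  # two ConstantFill ops: gpu_0/w on device 0, gpu_1/w on device 1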
Example #3
def _add_allreduce_graph(model):
    """Construct the graph that performs Allreduce on the gradients."""
    # Need to all-reduce the per-GPU gradients if training with more than 1 GPU
    all_params = model.TrainableParams()
    assert len(all_params) % cfg.NUM_GPUS == 0
    # The model parameters are replicated on each GPU; get the number
    # of distinct parameter blobs (i.e., the number of parameter blobs
    # on each GPU)
    params_per_gpu = int(len(all_params) / cfg.NUM_GPUS)
    with c2_utils.CudaScope(0):
        # Iterate over distinct parameter blobs
        for i in range(params_per_gpu):
            # Gradients from all GPUs for this parameter blob
            gradients = [
                model.param_to_grad[p] for p in all_params[i::params_per_gpu]
            ]
            if len(gradients) > 0:
                if cfg.USE_NCCL:
                    model.net.NCCLAllreduce(gradients, gradients)
                else:
                    muji.Allreduce(model.net, gradients, reduced_affix='')
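
The all_params[i::params_per_gpu] stride is the core of the grouping: TrainableParams() lists the parameter blobs GPU by GPU, so the slice gathers the i-th parameter from every GPU and their gradients are all-reduced together. A toy, pure-Python illustration with made-up blob names:

all_params = [
    'gpu_0/conv_w', 'gpu_0/conv_b',   # parameters built on GPU 0
    'gpu_1/conv_w', 'gpu_1/conv_b',   # their replicas built on GPU 1
]
params_per_gpu = len(all_params) // 2
for i in range(params_per_gpu):
    print(all_params[i::params_per_gpu])
# ['gpu_0/conv_w', 'gpu_1/conv_w']
# ['gpu_0/conv_b', 'gpu_1/conv_b']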
Example #4
    def test_timings(self):
        for n in range(2, workspace.NumCudaDevices()):
            for in_place in [False, True]:
                xs = [np.random.randn(int(1e7)).astype(np.float32)
                      for i in range(n)]
                inputs = [str("x_{}".format(i)) for i in range(n)]
                prefix = "" if in_place else "o"
                outputs = [str("{}x_{}".format(prefix, i)) for i in range(n)]

                net = core.Net("test")
                net.NCCLAllreduce(inputs, outputs)
                net.RunAllOnGPU()
                for i in range(n):
                    self.ws.create_blob(inputs[i]).feed(xs[i], gpu_device(i))
                self.ws.run(net)
                net_time = benchmark(self.ws, net)
                vanilla = core.Net("vanilla")
                muji.Allreduce(vanilla, inputs)
                vanilla_time = benchmark(self.ws, vanilla)
                print("Speedup for NCCL: {:.2f}".format(
                    vanilla_time / net_time))
Example #5
    def test_timings(self):
        for n in range(2, workspace.NumCudaDevices()):
            for in_place in [False, True]:
                xs = [
                    np.random.randn(int(1e7)).astype(np.float32) for i in range(n)
                ]
                inputs = [str("x_{}".format(i)) for i in range(n)]
                prefix = "" if in_place else "o"
                outputs = [str("{}x_{}".format(prefix, i)) for i in range(n)]

                net = core.Net("test")
                net.NCCLAllreduce(inputs, outputs)
                net.RunAllOnGPU()
                for i in range(n):
                    workspace.FeedBlob(inputs[i], xs[i],
                                       gpu_device(i).SerializeToString())
                workspace.RunNetOnce(net.Proto().SerializeToString())
                net_time = benchmark(net)
                vanilla = core.Net("vanilla")
                muji.Allreduce(vanilla, inputs)
                vanilla_time = benchmark(vanilla)
                print("Speedup for NCCL: {:.2f}".format(vanilla_time /
                                                        net_time))
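
Both timing tests rely on two helpers, gpu_device and benchmark, that are defined elsewhere in the test module; the benchmark signature also differs between the two versions (the first passes the test's workspace handle, the second uses the global workspace). A minimal sketch of what compatible helpers for the first version could look like, assumed rather than the original definitions:

import time
from caffe2.proto import caffe2_pb2
from caffe2.python import core

def gpu_device(i):
    # DeviceOption pinning blobs and ops to CUDA device i.
    return core.DeviceOption(caffe2_pb2.CUDA, i)

def benchmark(ws, net, iters=50):
    # Warm up once, then return the average wall-clock time per run.
    ws.run(net)
    start = time.time()
    for _ in range(iters):
        ws.run(net)
    return (time.time() - start) / iters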