def _AllReduce(devices, model, net, param, use_nccl=False, control_input=None):
    """Add ops to `net` that all-reduce the per-device gradient blobs of `param`.

    The per-device blobs are taken from model._device_grouped_blobs[param].
    On CUDA with use_nccl=True, a single NCCLAllreduce op is emitted and the
    function returns early. Otherwise a tree of Sum ops (with peer-to-peer
    copies when direct GPU access is unavailable) reduces into device 0's
    blob, followed by a _Broadcast back to all devices.

    Arguments:
    devices -- indices into model._devices selecting the participating devices
    model -- model helper holding _device_grouped_blobs, _device_type, _devices
    net -- the net to append reduction ops to
    param -- name of the parameter whose gradients are reduced
    use_nccl -- if True (CUDA only), use NCCL instead of the Sum-op tree
    control_input -- optional control dependency for the NCCL op
    """
    blobs_group = list(viewvalues(model._device_grouped_blobs[param]))
    if model._device_type == caffe2_pb2.CUDA and use_nccl:
        # NCCL handles the whole collective in one op; nothing more to do.
        model.NCCLAllreduce(
            blobs_group, blobs_group, control_input=control_input)
        return

    if model._device_type == caffe2_pb2.CUDA:
        # Square boolean matrix: entry [a, b] tells whether GPU a can read
        # GPU b's memory directly (peer access).
        p2p_access_pattern = workspace.GetCudaPeerAccessPattern()
    else:
        # Non-CUDA devices: assume uniform access, no copies needed.
        p2p_access_pattern = None

    def sumN(*dev_indices):
        """Create a Sum op for 2 or more blobs on different devices.
        Saves the result on the first device.

        Arguments:
        dev_indices -- a list of device indices, which can be translated into
                       CUDA identifiers with model._devices
        """
        # NOTE: this local `devices` intentionally shadows the outer
        # parameter; it maps group indices to concrete device ids.
        devices = [model._devices[idx] for idx in dev_indices]
        blobs = [blobs_group[idx] for idx in dev_indices]
        for i, peer in enumerate(devices):
            if i == 0:
                continue  # Skip the first device
            if p2p_access_pattern is not None and not p2p_access_pattern[
                    devices[0], peer]:
                # Copy from peer to d0
                blobs[i] = model.Copy(
                    blobs[i],
                    'gpu_{}/{}_gpu{}_copy'.format(devices[0], param, peer))
        # Sum in place into the first blob, on the first device of the group.
        device_opt = core.DeviceOption(model._device_type, devices[0])
        with core.DeviceScope(device_opt):
            net.Sum(blobs, [blobs[0]], name='dpm')

    if len(devices) == 8:
        # Special tree reduction for 8 gpus, TODO generalize like in muji.py
        # Pairwise sums, then stride-2 sums, then the final root sum on 0.
        for j in range(4):
            sumN(j * 2, j * 2 + 1)
        for j in range(2):
            sumN(j * 4, j * 4 + 2)
        sumN(0, 4)
    elif len(devices) == 4:
        # Two-level tree: (0+1), (2+3), then (0+2).
        sumN(0, 1)
        sumN(2, 3)
        sumN(0, 2)
    else:
        # Fallback: one flat Sum over every device's blob.
        sumN(*range(len(devices)))
    # Result lives on the first device; push it back out to all devices.
    _Broadcast(devices, model, net, param)
def Allreduce(net, blobs, reduced_affix="_reduced", gpu_indices=None):
    """The general Allreduce interface that reroutes the function calls.

    Picks the specialized allreduce implementation (2, 4, 4-grouped, or 8
    GPUs) whose peer-access requirements are satisfied by the machine's
    CUDA peer-access pattern, falling back to AllreduceFallback otherwise.
    """
    if gpu_indices is None:
        gpu_indices = list(range(len(blobs)))
    if len(gpu_indices) != len(blobs):
        raise RuntimeError(
            "gpu_indices length and blobs length mismatch: %d vs %d" %
            (len(gpu_indices), len(blobs)))

    access = workspace.GetCudaPeerAccessPattern()
    count = len(blobs)

    def _fully_connected(lo, hi):
        # True when GPUs lo..hi-1 exist and every pair has peer access.
        return access.shape[0] >= hi and np.all(access[lo:hi, lo:hi])

    if count == 2 and _fully_connected(0, 2):
        return Allreduce2(net, blobs, reduced_affix, gpu_indices)
    if count == 4 and _fully_connected(0, 4):
        return Allreduce4(net, blobs, reduced_affix, gpu_indices)
    if count == 4 and _fully_connected(0, 2) and _fully_connected(2, 4):
        # Two fully-connected halves (0-1 and 2-3) without full 4-way access.
        return Allreduce4Group2(net, blobs, reduced_affix, gpu_indices)
    if count == 8 and _fully_connected(0, 8):
        return Allreduce8(net, blobs, reduced_affix, gpu_indices)
    return AllreduceFallback(net, blobs, reduced_affix, gpu_indices)
def testGetCudaPeerAccessPattern(self):
    """The peer-access pattern is a square 2-D ndarray sized by device count."""
    access = workspace.GetCudaPeerAccessPattern()
    self.assertEqual(type(access), np.ndarray)
    self.assertEqual(access.ndim, 2)
    rows, cols = access.shape
    self.assertEqual(rows, cols)
    self.assertEqual(rows, workspace.NumCudaDevices())
def testAllreduceWithFourGPUs(self):
    """Exercise Allreduce4, but only when GPUs 0-3 are fully peer-connected."""
    access = workspace.GetCudaPeerAccessPattern()
    four_way = access.shape[0] >= 4 and np.all(access[:4, :4])
    if not four_way:
        print('Skipping allreduce with 4 gpus. Not peer access ready.')
        return
    self.RunningAllreduceWithGPUs([0, 1, 2, 3], muji.Allreduce4)
def testAllreduceWithTwoGPUs(self):
    """Exercise Allreduce2, but only when GPUs 0-1 are peer-connected."""
    access = workspace.GetCudaPeerAccessPattern()
    two_way = access.shape[0] >= 2 and np.all(access[:2, :2])
    if not two_way:
        print('Skipping allreduce with 2 gpus. Not peer access ready.')
        return
    self.RunningAllreduceWithGPUs([0, 1], muji.Allreduce2)