def test_paddle_iterator_feed_ndarray():
    """Verify the paddle plugin's feed_ndarray copies a DALI GPU tensor into a
    fluid LoDTensor correctly, both when using DALI's internal CUDA stream
    (cuda_stream=None) and the default stream (cuda_stream=0).
    """
    from nvidia.dali.plugin.paddle import DALIGenericIterator as PaddleIterator
    from nvidia.dali.plugin.paddle import feed_ndarray as feed_ndarray
    from paddle import fluid
    num_gpus = 1
    batch_size = 100
    pipes, _ = create_pipeline(lambda gpu: CustomPipe(batch_size=batch_size,
                                                      num_threads=4,
                                                      device_id=gpu,
                                                      num_gpus=num_gpus,
                                                      data_paths=image_data_set),
                               batch_size, num_gpus)
    for gpu_id in range(num_gpus):
        pipe = pipes[gpu_id]
        pipe.build()
        outs = pipe.run()
        out_data = outs[0].as_tensor()
        # Host-side reference copy of the pipeline output, computed once.
        expected = outs[0].as_cpu().as_array()
        gpu_place = fluid.CUDAPlace(gpu_id)

        # Case 1: copy on DALI's internal stream (cuda_stream=None).
        lod_tensor = fluid.core.LoDTensor()
        lod_tensor._set_dims(out_data.shape())
        ptr = lod_tensor._mutable_data(gpu_place, fluid.core.VarDesc.VarType.FP32)
        feed_ndarray(out_data, ptr, cuda_stream=None)
        np.testing.assert_equal(np.array(lod_tensor), expected)

        # Case 2: copy on the default CUDA stream (cuda_stream=0).
        lod_tensor2 = fluid.core.LoDTensor()
        lod_tensor2._set_dims(out_data.shape())
        ptr2 = lod_tensor2._mutable_data(gpu_place, fluid.core.VarDesc.VarType.FP32)
        feed_ndarray(out_data, ptr2, cuda_stream=0)
        np.testing.assert_equal(np.array(lod_tensor2), expected)
def __iter__(self):
    """Iterate over the dataset, yielding one preprocessed batch per step.

    Yields ``(data, pyt_targets)`` in training mode, or
    ``(data, (pyt_targets, img_ids, ratios))`` in evaluation mode, where
    ``pyt_targets`` is a ``[batch, max_detections, 5]`` tensor holding
    ``[x1, y1, x2, y2, label]`` rows padded with -1.
    """
    for _ in range(self.__len__()):
        data, ratios = [], []
        dali_data, dali_boxes, dali_labels, dali_before_pad, dali_img_ids, dali_attr = self.pipe.run(
        )
        # convert images from dali tensors to pytorch
        data = feed_ndarray(
            dali_data.as_tensor(),
            torch.zeros(dali_data.as_tensor().shape(),
                        dtype=torch.float,
                        device=torch.device("cuda")),
        )
        # Pad targets to the largest per-image detection count in the batch
        # (at least 1 so the tensor is never zero-sized along that axis).
        max_detections = max(
            *(dali_boxes[i].shape()[0] for i in range(len(dali_boxes))), 1)
        # -1 marks empty/padding rows in the target tensor.
        pyt_targets = -1 * torch.ones([len(dali_boxes), max_detections, 5])
        # get image ids. only needed for evaluation
        img_ids = torch.tensor(dali_img_ids.as_array())
        prior_size = dali_attr.as_cpu().as_array()
        # target has different size for each image so need to treat them separately
        for batch in range(self.batch_size):
            # Calculate image resize ratio to rescale boxes
            resized_size = dali_before_pad[batch].shape()[:2]
            # in this formulation to get true bbox you need to **multiply** prediction by ratio
            ratios.append(max(prior_size[batch]) / max(resized_size))
            # Rescale boxes: DALI boxes are normalized, so scale x coords by
            # the resized width and y coords by the resized height.
            pyt_bbox = feed_ndarray(dali_boxes[batch],
                                    torch.zeros(dali_boxes[batch].shape()))
            num_dets = pyt_bbox.size(0)
            if num_dets > 0:
                pyt_bbox[:, 0::2] *= float(resized_size[1])
                pyt_bbox[:, 1::2] *= float(resized_size[0])
                pyt_targets[batch, :num_dets, :4] = pyt_bbox
            # Arrange labels in target tensor
            np_label = feed_ndarray(
                dali_labels[batch],
                torch.empty(dali_labels[batch].shape(), dtype=torch.int32))
            # DALI CocoReader maps existing 90 classes to 80 unique classes. Need to map to 90 again
            # this is done by indexing numpy array of new (90) labels with old (80) labels
            pyt_label = torch.tensor(
                COCO_80_TO_90_ARR[np_label.squeeze().numpy()])
            if num_dets > 0:
                pyt_label -= 1  # [0, 90] => [-1, 89]. Removes background
                pyt_targets[batch, :num_dets, 4] = pyt_label
        pyt_targets = pyt_targets.cuda(non_blocking=True)
        ratios = torch.tensor(ratios)
        if self.train:
            yield data, pyt_targets
        else:
            yield data, (pyt_targets, img_ids, ratios)
def __call__(self, input):
    """Run the pipeline on *input* and return the image batch as an NCHW
    uint8 torch tensor on ``self._device``.
    """
    # Feed the external source and execute one pipeline iteration.
    self._pipe.set_data(input)
    pipe_out = self._pipe.run()
    # First pipeline output is the image batch (a DALI TensorGPU).
    dali_images: nvidia.dali.backend_impl.TensorGPU = pipe_out[0].as_tensor()
    # Allocate a destination of matching shape, then copy DALI -> torch.
    torch_images = torch.empty(
        dali_images.shape(), dtype=torch.uint8, device=self._device)
    to_pytorch.feed_ndarray(dali_images, torch_images)
    # Reorder NHWC -> NCHW for downstream torch consumers.
    return torch_images.permute([0, 3, 1, 2])
def test_pytorch_iterator_feed_ndarray():
    """Verify the pytorch plugin's feed_ndarray copies a DALI GPU tensor into
    a torch tensor on the device's current CUDA stream.
    """
    from nvidia.dali.plugin.pytorch import DALIGenericIterator as PyTorchIterator
    from nvidia.dali.plugin.pytorch import feed_ndarray as feed_ndarray
    import torch
    num_gpus = 1
    batch_size = 100
    pipes, _ = create_pipeline(lambda gpu: CustomPipe(batch_size=batch_size,
                                                      num_threads=4,
                                                      device_id=gpu,
                                                      num_gpus=num_gpus,
                                                      data_paths=image_data_set),
                               batch_size, num_gpus)
    for gpu_id in range(num_gpus):
        pipe = pipes[gpu_id]
        pipe.build()
        outs = pipe.run()
        out_data = outs[0].as_tensor()
        torch_device = torch.device('cuda', gpu_id)
        # Destination tensor on the same GPU, copied to on the current stream.
        dst = torch.zeros(out_data.shape(), dtype=torch.float32,
                          device=torch_device)
        feed_ndarray(out_data, dst,
                     cuda_stream=torch.cuda.current_stream(device=torch_device))
        np.testing.assert_equal(dst.cpu().numpy(), outs[0].as_cpu().as_array())
def preprocess(self, pipe_out):
    """Convert one DALI pipeline output into a list of (image, target) pairs.

    Each target is a dict with ``image_id``, ``boxes`` and ``labels`` CUDA
    tensors; images come out as CHW float tensors scaled to [0, 1].
    """
    image_list, boxes_list, labels_list, id_list = pipe_out
    batch = []
    for idx in range(len(image_list)):
        # Copy the DALI GPU image into a torch tensor on self.stream, then
        # wait for the copy to finish before touching the data.
        image = torch.empty(image_list[idx].shape(),
                            device="cuda", dtype=torch.uint8)
        feed_ndarray(image_list[idx], image, self.stream)
        self.stream.synchronize()
        # HWC uint8 -> CHW float in [0, 1].
        image = image.permute(2, 0, 1) / 255.
        # Labels are shifted down by one and flattened from shape (n, 1).
        target = {
            "image_id": torch.from_numpy(id_list.at(idx)).cuda(),
            "boxes": torch.from_numpy(boxes_list.at(idx)).cuda(),
            "labels": torch.from_numpy(labels_list.at(idx) - 1)
                           .squeeze(1).cuda().long(),
        }
        batch.append((image, target))
    return batch
def _run_one_step(self, pipeline, data_batches, current_data_batch):
    """Copy one pipeline iteration's outputs into torch tensors and schedule
    the next run.

    Destination tensors for ``data_batches[current_data_batch]`` are allocated
    lazily on first use (matching each output's shape, dtype and device) and
    reused on subsequent calls. Returns the list of populated torch tensors.
    """
    # Hoisted out of the per-output loop where the original re-imported it
    # on every iteration.
    from nvidia.dali.backend import TensorGPU

    p = pipeline
    outputs = p.share_outputs()
    device_id = p.device_id
    tensors = [out.as_tensor() for out in outputs]
    if data_batches[current_data_batch] is None:
        torch_gpu_device = torch.device('cuda', device_id)
        torch_cpu_device = torch.device('cpu')
        # GPU-backed DALI tensors land on the pipeline's GPU, others on CPU.
        pyt_tensors = [
            torch.zeros(t.shape(),
                        dtype=to_torch_type[np.dtype(t.dtype())],
                        device=(torch_gpu_device if isinstance(t, TensorGPU)
                                else torch_cpu_device))
            for t in tensors
        ]
        data_batches[current_data_batch] = pyt_tensors
    else:
        pyt_tensors = data_batches[current_data_batch]
    for tensor, pyt_tensor in zip(tensors, pyt_tensors):
        feed_ndarray(tensor, pyt_tensor)
    p.release_outputs()
    p.schedule_run()
    return pyt_tensors
def test_mxnet_iterator_feed_ndarray():
    """Verify the mxnet plugin's feed_ndarray copies a DALI GPU tensor into an
    MXNet ndarray, both on DALI's internal stream and on the default stream.
    """
    from nvidia.dali.plugin.mxnet import DALIGenericIterator as MXNetIterator
    from nvidia.dali.plugin.mxnet import feed_ndarray as feed_ndarray
    import mxnet as mx
    num_gpus = 1
    batch_size = 100
    pipes, _ = create_pipeline(lambda gpu: CustomPipe(batch_size=batch_size,
                                                      num_threads=4,
                                                      device_id=gpu,
                                                      num_gpus=num_gpus,
                                                      data_paths=image_data_set),
                               batch_size, num_gpus)
    for gpu_id in range(num_gpus):
        pipe = pipes[gpu_id]
        pipe.build()
        outs = pipe.run()
        out_data = outs[0].as_tensor()
        with mx.Context(mx.gpu(gpu_id)):
            # cuda_stream=None -> DALI's internal stream; 0 -> default stream.
            for stream in (None, 0):
                dst = mx.nd.zeros(out_data.shape(), dtype=np.float32)
                # Block until the ndarray is safe to write into.
                mx.base._LIB.MXNDArrayWaitToWrite(dst.handle)
                feed_ndarray(out_data, dst, cuda_stream=stream)
                np.testing.assert_equal(dst.asnumpy(),
                                        outs[0].as_cpu().as_array())