def make_pipe():
    """Build a graph that pads external images along externally supplied axes.

    The image comes from ``image_gen`` and is moved to the GPU when the
    enclosing ``device`` is ``'gpu'``. ``get_dynamic_axes`` yields two
    outputs, used as the ``axes`` and ``shape`` arguments of ``fn.pad``.
    The fill value is a per-sample random number drawn on the CPU.

    Returns
    -------
    tuple
        ``(image, axes, shape, pad1, pad2, fill_value)`` where ``pad1`` is
        padded without an explicit ``shape`` and ``pad2`` with one.
    """
    source_image = fn.external_source(source=image_gen)
    image = source_image.gpu() if device == 'gpu' else source_image
    axes, shape = fn.external_source(source=get_dynamic_axes, num_outputs=2)
    fill_value = fn.random.uniform(device='cpu', range=[0.0, 255.0])
    # Same input, axes and fill value; only the explicit target shape differs.
    padded_without_shape = fn.pad(image, axes=axes, fill_value=fill_value)
    padded_with_shape = fn.pad(image, axes=axes, shape=shape, fill_value=fill_value)
    return image, axes, shape, padded_without_shape, padded_with_shape, fill_value
def check_pad_per_sample_shapes_and_alignment(device='cpu', batch_size=3, ndim=2, num_iter=3):
    """Check ``fn.pad`` when ``shape`` and ``align`` are given per sample.

    Random input shapes (values drawn from 10..20), requested shapes (21..30)
    and alignments (3..5) are generated inside the pipeline. Three pad
    variants are produced: shape only, alignment only, and both. Every
    sample of every iteration is then checked on the host.

    Parameters
    ----------
    device : str
        ``'gpu'`` moves the input data to the GPU before padding.
    batch_size : int
        Batch size of the pipeline and number of samples checked per run.
    ndim : int
        Number of elements in the random shape / alignment vectors.
    num_iter : int
        Number of pipeline runs to verify.
    """
    pipe = Pipeline(batch_size=batch_size, num_threads=3, device_id=0, seed=1234)
    axes = (0, 1)

    def random_int_vector(value_range):
        # ndim-element INT32 vector with values drawn from value_range.
        return fn.cast(fn.random.uniform(range=value_range, shape=(ndim, )),
                       dtype=types.INT32)

    with pipe:
        in_shape = random_int_vector((10, 20))
        in_data = fn.random.uniform(range=(0., 1.), shape=in_shape)
        if device == 'gpu':
            in_data = in_data.gpu()
        req_shape = random_int_vector((21, 30))
        req_align = random_int_vector((3, 5))
        out_pad_shape = fn.pad(in_data, axes=axes, align=None, shape=req_shape)
        out_pad_align = fn.pad(in_data, axes=axes, align=req_align, shape=None)
        out_pad_both = fn.pad(in_data, axes=axes, align=req_align, shape=req_shape)
        pipe.set_outputs(in_shape, in_data, req_shape, req_align,
                         out_pad_shape, out_pad_align, out_pad_both)
    pipe.build()

    for _ in range(num_iter):
        # Bring GPU outputs to the host so individual samples can be inspected.
        outs = [out.as_cpu() if isinstance(out, TensorListGPU) else out
                for out in pipe.run()]
        for sample_idx in range(batch_size):
            (src_shape, src, want_shape, want_align,
             padded_shape, padded_align, padded_both) = [out.at(sample_idx) for out in outs]
            assert (src_shape == src.shape).all()

            # Pad to explicit shape
            assert (padded_shape.shape >= src_shape).all()
            assert (want_shape == padded_shape.shape).all()

            # Alignment only
            assert (padded_align.shape >= src_shape).all()
            assert is_aligned(padded_align.shape, want_align, axes)

            # Explicit shape + alignment
            assert (padded_both.shape >= src_shape).all()
            assert (want_shape <= padded_both.shape).all()
            assert is_aligned(padded_both.shape, want_align, axes)
def many_input_pipeline(def_for_dataset, device, sources, input_names, batches):
    """Pipeline accepting multiple inputs via external source.

    Each input is passed through as-is and also as ``input + 10`` cast to
    INT32; all of them are padded together with a single ``fn.pad`` call.

    Parameters
    ----------
    def_for_dataset : bool
        True if this pipeline will be converted to TF Dataset
    device : str
        device that the Dataset will be placed ("cpu" or "gpu")
    sources : list of callables
        callbacks for the external sources in baseline pipeline otherwise None
    input_names : list of str
        Names of inputs placeholder for TF
    batches : list
        Per-input ``batch`` argument for the named external sources; only
        used when `def_for_dataset` is True. The special value "dataset"
        is replaced with None.

    Returns
    -------
    tuple
        The padded inputs followed by the padded processed values.
    """
    def place(node):
        # Move the node to the GPU unless the pipeline targets the CPU.
        return node if device == 'cpu' else node.gpu()

    if def_for_dataset:
        inputs = []
        for input_name, batch in zip(input_names, batches):
            # "dataset" is a special value used in tests, reroute it to the default
            batch_arg = None if batch == "dataset" else batch
            inputs.append(place(fn.external_source(name=input_name, batch=batch_arg)))
    else:
        inputs = [place(fn.external_source(source=source, batch=False))
                  for source in sources]

    processed = [fn.cast(node + 10, dtype=dali.types.INT32) for node in inputs]
    return tuple(fn.pad(inputs + processed))
def test_tf_experimental_source_disabled():
    """Wrap a pipeline that has a ``source``-driven external source in DALIDataset.

    The pipeline pads 4x4 zero-filled samples produced by a Python callback
    and is handed to the non-experimental ``dali_tf.DALIDataset``.
    NOTE(review): the name suggests this is expected to be rejected; the
    expected-failure handling is not visible in this block.
    """
    def zero_sample():
        return np.full((4, 4), 0)

    pipe = Pipeline(10, 4, 0)
    with pipe:
        source_node = fn.external_source(source=zero_sample, batch=False)
        padded = fn.pad(source_node)
        pipe.set_outputs(padded)
    dali_tf.DALIDataset(pipe, output_dtypes=tf.int32)
def make_pipe():
    """Pad a constant 10x10x3 float tensor along randomly drawn axes.

    The axes are two INT32 values drawn from ``wrong_axes_range``, which is
    taken from the enclosing scope together with ``device``.

    Returns
    -------
    The padded data node.
    """
    constant_input = fn.constant(idata=0, shape=[10, 10, 3], dtype=types.FLOAT,
                                 device=device)
    random_axes = fn.random.uniform(range=wrong_axes_range, shape=(2, ),
                                    dtype=types.INT32)
    return fn.pad(constant_input, axes=random_axes)
def get_pipeline_desc(batch_size, num_threads, device, device_id, shard_id, num_shards,
                      def_for_dataset):
    """Create a pipeline that pads the output of a single external source.

    ``es_device``, ``es_args`` and ``dtype`` come from the enclosing scope.
    ``shard_id``, ``num_shards`` and ``def_for_dataset`` are accepted but
    not used by this body.

    Returns
    -------
    tuple
        ``(pipe, None, dtype)``.
    """
    pipe = Pipeline(batch_size, num_threads, device_id)
    with pipe:
        # Our callbacks may have state, to be able to run it twice, once in Dataset and once
        # with baseline test, we need to make a copy to preserve that state.
        source_kwargs = copy.deepcopy(es_args)
        es = fn.external_source(device=es_device, **source_kwargs)
        # A CPU source feeding a GPU pad needs an explicit transfer.
        needs_transfer = device == "gpu" and es_device == "cpu"
        pad_input = es.gpu() if needs_transfer else es
        pipe.set_outputs(fn.pad(pad_input, device=device))
    return pipe, None, dtype
def setup_dali(
    image_file='/mnt/data/DATASETS/samples/images/image_110.jpg',
    image_dim=[800, 1600],
    batch_size=1,
    num_threads=4,
    device='mixed',
    device_id=0,
    output_dir='./out/',
):
    """Run a one-image preprocessing pipeline and save the result as a JPEG.

    The image is decoded, resized (mode "not_larger"), padded with zeros,
    transposed with ``perm=[2, 0, 1]``, cast to float and divided by 255.
    The first sample of the first output is then written to
    ``<output_dir>/dali_image.jpg``.

    Parameters
    ----------
    image_file : str
        Path of the image to read.
    image_dim : sequence of two ints
        Target size used for resize and padding. Only read, never modified.
    batch_size, num_threads, device_id
        Forwarded to the DALI ``Pipeline`` constructor.
    device : str
        Device of the image decoder; anything other than ``'cpu'`` makes
        the output be copied to the host before it is inspected.
    output_dir : str
        Directory receiving ``dali_image.jpg``; created if missing.
    """
    # Create output_dir itself. os.path.dirname(output_dir) only matches it when
    # the path ends with a separator; otherwise it yields the parent (or '').
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    pipeline = dali.pipeline.Pipeline(batch_size=batch_size, num_threads=num_threads,
                                      device_id=device_id)
    with pipeline:
        data, _ = fn.file_reader(files=[image_file])

        # image preprocess
        images = fn.image_decoder(data, device=device)
        images = fn.resize(images, size=image_dim, mode="not_larger", max_size=image_dim)
        images = fn.pad(images, fill_value=0, shape=[image_dim[0], image_dim[1], 1])
        images = fn.transpose(images, perm=[2, 0, 1])
        images = fn.cast(images, dtype=dali.types.FLOAT)
        images = images / 255.

        # input shape
        input_shape = np.float32((image_dim[0], image_dim[1], 1))

        # original shape
        shapes = fn.peek_image_shape(data)
        shapes = fn.cast(shapes, dtype=dali.types.FLOAT)

        # gather outputs
        out = [images, input_shape, shapes]
        pipeline.set_outputs(*out)

    pipeline.build()
    output = pipeline.run()

    img = output[0].at(0) if device == 'cpu' else output[0].as_cpu().at(0)
    img = img.transpose(1, 2, 0)  # HWC
    img = img[:, :, ::-1]  # BGR
    print(img)

    # A leftover quit() used to stop here, so the image was never written.
    # The pipeline output is float in [0, 1]; JPEG needs 8-bit data, so scale
    # back to [0, 255] before saving.
    img_u8 = np.clip(np.rint(img * 255.), 0, 255).astype(np.uint8)
    cv2.imwrite(os.path.join(output_dir, 'dali_image.jpg'), img_u8)
def test_pad_cpu():
    """Run ``fn.pad`` three times in a pipeline created with ``device_id=None``.

    Random uint8 samples of shape 5x4x3 with layout "HWC" are padded along
    axis 0 to a requested extent of 10, using -1 as the fill value. The test
    passes if building and running the pipeline does not raise.
    """
    test_data_shape = [5, 4, 3]

    def get_data():
        batch = []
        for _ in range(batch_size):
            batch.append(np.random.randint(0, 255, size=test_data_shape, dtype=np.uint8))
        return batch

    pipe = Pipeline(batch_size=batch_size, num_threads=4, device_id=None)
    source_node = fn.external_source(source=get_data, layout="HWC")
    padded = fn.pad(source_node, fill_value=-1, axes=(0, ), shape=(10, ))
    pipe.set_outputs(padded)
    pipe.build()
    for _ in range(3):
        pipe.run()
def check_layout(kwargs, input_datasets, layout):
    """Run a DALI TF dataset whose external source must carry `layout`.

    Parameters
    ----------
    kwargs : dict
        Keyword arguments forwarded to ``fn.external_source``.
    input_datasets
        Passed as ``input_datasets`` to ``DALIDatasetWithInputs``.
    layout : str
        Axis names handed to ``fn.pad`` as ``axis_names``.
    """
    pipe = Pipeline(10, 4, 0)
    with pipe:
        source_node = fn.external_source(**kwargs)
        # Rely on the Pad internal check to ensure that External Source set layout
        padded = fn.pad(source_node, axis_names=layout)
        pipe.set_outputs(padded)

    with tf.device('/cpu:0'):
        dali_dataset = dali_tf.experimental.DALIDatasetWithInputs(
            input_datasets=input_datasets,
            pipeline=pipe,
            batch_size=pipe.max_batch_size,
            output_shapes=None,
            output_dtypes=tf.int64,
            num_threads=pipe.num_threads,
            device_id=pipe.device_id,
        )

    run_dataset_eager_mode(dali_dataset, 10)
def setup_dali(
    input_name='DALI_INPUT_0',
    image_dim=[896, 1536],
    batch_size=1,
    num_threads=4,
    device='cpu',
    device_id=0,
    output_dir='./out/',
):
    """Define an image preprocessing pipeline and serialize it to disk.

    Encoded images arrive through an external source named `input_name`.
    They are decoded, resized (mode "not_larger"), padded with zeros,
    transposed with ``perm=[2, 0, 1]``, cast to float and divided by 255.
    The pipeline outputs the images, the constant input shape and the
    original image shape, and is written to ``<output_dir>/model.dali``.

    Parameters
    ----------
    input_name : str
        Name of the external source input.
    image_dim : sequence of two ints
        Target size used for resize and padding. Only read, never modified.
    batch_size, num_threads, device_id
        Forwarded to the DALI ``Pipeline`` constructor.
    device : str
        Device of the image decoder.
    output_dir : str
        Directory receiving ``model.dali``; created if missing.
    """
    pipeline = dali.pipeline.Pipeline(batch_size=batch_size, num_threads=num_threads,
                                      device_id=device_id)
    with pipeline:
        data = fn.external_source(name=input_name, device="cpu")

        # image preprocess
        images = fn.image_decoder(data, device=device)
        images = fn.resize(images, size=image_dim, mode="not_larger", max_size=image_dim)
        images = fn.pad(images, fill_value=0, shape=[image_dim[0], image_dim[1], 1])
        images = fn.transpose(images, perm=[2, 0, 1])
        images = fn.cast(images, dtype=dali.types.FLOAT)
        images = images / 255.

        # input shape
        input_shape = np.float32((image_dim[0], image_dim[1], 1))

        # original shape
        shapes = fn.peek_image_shape(data)
        shapes = fn.cast(shapes, dtype=dali.types.FLOAT)

        # gather outputs
        out = [images, input_shape, shapes]
        pipeline.set_outputs(*out)

    # Create output_dir itself. os.path.dirname(output_dir) only matches it when
    # the path ends with a separator; otherwise it yields the parent (or '').
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    pipeline.serialize(filename=os.path.join(output_dir, 'model.dali'))
def one_input_pipeline(def_for_dataset, device, source, external_source_device, no_copy, batch):
    """Pipeline accepting single input via external source.

    The input is passed through as-is and also as ``input + 10`` cast to
    INT32; both are padded together with a single ``fn.pad`` call.

    Parameters
    ----------
    def_for_dataset : bool
        True if this pipeline will be converted to TF Dataset
    device : str
        device that the Dataset will be placed ("cpu" or "gpu")
    source : callable
        callback for the external source in baseline pipeline otherwise None
    external_source_device : str
        Device that we want the external source in TF dataset to be placed
    no_copy : bool or None
        ``no_copy`` argument of the dataset external source; None means
        "infer from the devices" (see below).
    batch
        ``batch`` argument of the dataset external source. The special value
        "dataset" is replaced with None.

    Returns
    -------
    tuple
        ``(input_padded, processed_padded)``.
    """
    if not def_for_dataset:
        source_node = fn.external_source(name="actual_input", source=source, batch=False,
                                         device=external_source_device)
    else:
        if no_copy is None:
            # If no_copy is None, we infer it automatically and we use no_copy=True when
            # the input memory is matching the external source placement,
            # so the Dataset's placement is the same as external source's device,
            # otherwise for cross-backend we use False.
            no_copy = (device == external_source_device)
        # "dataset" is a special value used in tests, reroute it to the default
        batch_arg = None if batch == "dataset" else batch
        source_node = fn.external_source(name="input_placeholder", no_copy=no_copy,
                                         device=external_source_device, batch=batch_arg)

    if device != 'cpu':
        source_node = source_node.gpu()
    processed = fn.cast(source_node + 10, dtype=dali.types.INT32)
    input_padded, processed_padded = fn.pad([source_node, processed])
    return input_padded, processed_padded
def check_pad_to_square(device='cpu', batch_size=3, ndim=2, num_iter=3):
    """Check that ``fn.pad`` can pad every sample to a square.

    Random "HW" samples are padded to ``max(h, w)`` on both axes, with the
    target shape computed inside the pipeline. On the host each output is
    checked to be square, to keep the input in its top-left corner, and to
    be zero everywhere else.

    Parameters
    ----------
    device : str
        ``'gpu'`` moves the input data to the GPU before padding.
    batch_size : int
        Batch size of the pipeline and number of samples checked per run.
    ndim : int
        Number of elements in the random input shape.
    num_iter : int
        Number of pipeline runs to verify.
    """
    pipe = Pipeline(batch_size=batch_size, num_threads=3, device_id=0, seed=1234)
    with pipe:
        in_shape = fn.cast(fn.random.uniform(range=(10, 20), shape=(ndim, )),
                           dtype=types.INT32)
        in_data = fn.reshape(fn.random.uniform(range=(0., 1.), shape=in_shape), layout="HW")
        shape = fn.shapes(in_data, dtype=types.INT32)
        h = fn.slice(shape, 0, 1, axes=[0])
        w = fn.slice(shape, 1, 1, axes=[0])
        # `math` must provide an element-wise `max` for data nodes here.
        side = math.max(h, w)
        if device == 'gpu':
            in_data = in_data.gpu()
        square_shape = fn.cat(side, side, axis=0)
        out_data = fn.pad(in_data, axis_names="HW", shape=square_shape)
        pipe.set_outputs(in_data, out_data)
    pipe.build()

    for _ in range(num_iter):
        # Bring GPU outputs to the host so individual samples can be inspected.
        outs = [out.as_cpu() if isinstance(out, TensorListGPU) else out
                for out in pipe.run()]
        for sample_idx in range(batch_size):
            src, padded = [out.at(sample_idx) for out in outs]
            src_shape = src.shape
            max_side = max(src_shape)
            for extent in padded.shape:
                assert extent == max_side
            rows, cols = src_shape[0], src_shape[1]
            np.testing.assert_equal(padded[:rows, :cols], src)
            np.testing.assert_equal(padded[rows:, :], 0)
            np.testing.assert_equal(padded[:, cols:], 0)
def dali_asr_pipeline(train_pipeline,  # True if training, False if validation
                      file_root,
                      file_list,
                      sample_rate,
                      silence_threshold,
                      resample_range,
                      discrete_resample_range,
                      window_size,
                      window_stride,
                      nfeatures,
                      nfft,
                      frame_splicing_factor,
                      dither_coeff,
                      pad_align,
                      preemph_coeff,
                      do_spectrogram_masking=False,
                      cutouts_generator=None,
                      shard_id=0,
                      n_shards=1,
                      preprocessing_device="gpu"):
    """Define the audio feature-extraction graph for speech recognition.

    Stages, in order: file reader, audio decoder (optionally resampling),
    optional silence trimming, optional dither, pre-emphasis, spectrogram,
    mel filter bank, conversion to decibels, normalization along axis 1,
    padding along axis 1 and, for training only, optional erasing of
    regions supplied by `cutouts_generator`.

    Parameters
    ----------
    train_pipeline : bool
        True if training, False if validation. Enables shuffling after each
        epoch, speed perturbation and spectrogram masking.
    file_root, file_list, shard_id, n_shards
        Forwarded to ``fn.readers.file``.
    sample_rate
        Sample rate used for the spectrogram window sizes and the mel filter
        bank, and (scaled or as-is) as the decoder's target rate.
    silence_threshold
        ``cutoff_db`` for silence detection; None disables silence removal.
    resample_range
        Two-element range of speed perturbation coefficients, or None.
    discrete_resample_range : bool
        If true, coefficients are drawn from the two range ends and 1.0
        instead of the continuous range.
    window_size, window_stride
        Multiplied by `sample_rate` to get the spectrogram window length
        and step.
    nfeatures, nfft
        Number of mel filters and FFT size.
    frame_splicing_factor
        When not 1, the reported feature shape is ceil-divided by it.
    dither_coeff
        Scale of the normal noise added to the audio; 0 disables dither.
    pad_align
        ``align`` argument of the final padding.
    preemph_coeff
        Pre-emphasis filter coefficient.
    do_spectrogram_masking : bool
        Enables erasing (training only).
    cutouts_generator
        Source yielding ``(anchors, shapes)`` for erasing.
    preprocessing_device : str
        "gpu" moves the audio to the GPU before feature extraction.

    Returns
    -------
    tuple
        ``(log_features, label, log_features_len)``, each moved to the GPU.
    """
    encoded, label = fn.readers.file(
        device="cpu", name="file_reader", file_root=file_root, file_list=file_list,
        shard_id=shard_id, num_shards=n_shards, shuffle_after_epoch=train_pipeline)

    # Speed perturbation coefficients (only created when a range is given).
    if resample_range is None:
        speed_perturbation_coeffs = None
    elif discrete_resample_range:
        discrete_values = [resample_range[0], 1.0, resample_range[1]]
        speed_perturbation_coeffs = fn.random.uniform(device="cpu", values=discrete_values)
    else:
        speed_perturbation_coeffs = fn.random.uniform(device="cpu", range=resample_range)

    # Target sample rate for the decoder; None leaves it unspecified.
    dec_sample_rate_arg = None
    if train_pipeline and speed_perturbation_coeffs is not None:
        dec_sample_rate_arg = speed_perturbation_coeffs * sample_rate
    elif resample_range is None:
        dec_sample_rate_arg = sample_rate

    audio, _ = fn.decoders.audio(encoded, sample_rate=dec_sample_rate_arg,
                                 dtype=types.FLOAT, downmix=True)

    if silence_threshold is not None:
        begin, length = fn.nonsilent_region(audio, cutoff_db=silence_threshold)
        audio = fn.slice(audio, begin, length, axes=[0])

    # Max duration drop is performed at DataLayer stage

    if preprocessing_device == "gpu":
        audio = audio.gpu()

    if dither_coeff != 0.:
        # Noise is generated with the audio passed as the operator input.
        noise = fn.random.normal(audio)
        audio = audio + noise * dither_coeff

    audio = fn.preemphasis_filter(audio, preemph_coeff=preemph_coeff)

    spec = fn.spectrogram(audio, nfft=nfft,
                          window_length=window_size * sample_rate,
                          window_step=window_stride * sample_rate)
    mel_spec = fn.mel_filter_bank(spec, sample_rate=sample_rate, nfilter=nfeatures,
                                  normalize=True)
    log_features = fn.to_decibels(mel_spec, multiplier=np.log(10), reference=1.0,
                                  cutoff_db=math.log(1e-20))

    # Shape is taken before normalization and padding.
    log_features_len = fn.shapes(log_features)
    if frame_splicing_factor != 1:
        # Ceiling division by the splicing factor.
        log_features_len = (
            (log_features_len + (frame_splicing_factor - 1)) // frame_splicing_factor
        )

    log_features = fn.normalize(log_features, axes=[1])
    log_features = fn.pad(log_features, axes=[1], fill_value=0, align=pad_align)

    if train_pipeline and do_spectrogram_masking:
        anchors, shapes = fn.external_source(source=cutouts_generator, num_outputs=2,
                                             cycle=True)
        log_features = fn.erase(log_features, anchor=anchors, shape=shapes, axes=[0, 1],
                                fill_value=0, normalized_anchor=True)

    # When modifying DALI pipeline returns, make sure you update `output_map`
    # in DALIGenericIterator invocation
    return log_features.gpu(), label.gpu(), log_features_len.gpu()
def dali_jasper_pipe():
    """Define the audio feature-extraction graph (Jasper-style).

    All configuration is read from the enclosing scope (``self`` and the
    surrounding function's variables). Stages, in order: input (external
    source for Triton, file reader otherwise), audio decoder (optionally
    resampling), optional silence trimming, optional dither, pre-emphasis,
    spectrogram, mel filter bank, conversion to decibels, normalization
    along axis 1, padding along axis 1 and, for training only, optional
    erasing of regions from ``self._cutouts_generator``.

    Returns
    -------
    tuple
        ``(log_features, label, log_features_len)``, each moved to the GPU.

    Raises
    ------
    RuntimeError
        If a Triton pipeline is requested while ``torch.distributed`` is
        initialized.
    """
    if is_triton_pipeline:
        assert not self.train, "Pipeline for Triton shall be a validation pipeline"
        if torch.distributed.is_initialized():
            raise RuntimeError(
                "You're creating Triton pipeline, using multi-process mode. Please use single-process mode."
            )
        # NOTE(review): external_source is called without num_outputs but its
        # result is unpacked into two nodes - confirm this is intended.
        encoded, label = fn.external_source(device="cpu", name="DALI_INPUT_0", no_copy=True)
    else:
        encoded, label = fn.readers.file(
            device="cpu", name="file_reader", file_root=file_root, file_list=file_list,
            shard_id=shard_id, num_shards=n_shards, shuffle_after_epoch=train_pipeline)

    speed_perturbation_coeffs = None
    if resample_range is not None:
        if discrete_resample_range:
            values = [
                self.resample_range[0], 1.0, self.resample_range[1]
            ]
            speed_perturbation_coeffs = fn.random.uniform(
                device="cpu", values=values)
        else:
            speed_perturbation_coeffs = fn.random.uniform(
                device="cpu", range=resample_range)

    # Target sample rate for the decoder; None leaves it unspecified.
    if self.train and speed_perturbation_coeffs is not None:
        dec_sample_rate_arg = speed_perturbation_coeffs * self.sample_rate
    elif resample_range is None:
        dec_sample_rate_arg = self.sample_rate
    else:
        dec_sample_rate_arg = None

    audio, _ = fn.decoders.audio(encoded, sample_rate=dec_sample_rate_arg,
                                 dtype=types.FLOAT, downmix=True)

    if self.do_remove_silence:
        begin, length = fn.nonsilent_region(
            audio, cutoff_db=silence_threshold)
        audio = fn.slice(audio, begin, length, axes=[0])

    # Max duration drop is performed at DataLayer stage

    if self.preprocessing_device == "gpu":
        audio = audio.gpu()

    if self.dither_coeff != 0.:
        # Pass the audio as the shape-like input so the noise has one value per
        # audio element. Without it a single scalar is drawn per sample, which
        # only shifts the whole signal by a constant instead of dithering it.
        audio = audio + fn.random.normal(audio) * self.dither_coeff

    audio = fn.preemphasis_filter(audio, preemph_coeff=preemph_coeff)

    spec = fn.spectrogram(audio, nfft=nfft,
                          window_length=window_size * sample_rate,
                          window_step=window_stride * sample_rate)
    mel_spec = fn.mel_filter_bank(spec, sample_rate=sample_rate,
                                  nfilter=self.nfeatures, normalize=True)
    log_features = fn.to_decibels(mel_spec, multiplier=np.log(10), reference=1.0,
                                  cutoff_db=math.log(1e-20))

    # Shape is taken before normalization and padding.
    log_features_len = fn.shapes(log_features)
    if self.frame_splicing_factor != 1:
        log_features_len = self._div_ceil(log_features_len,
                                          self.frame_splicing_factor)

    log_features = fn.normalize(log_features, axes=[1])
    log_features = fn.pad(log_features, axes=[1], fill_value=0, align=pad_align)

    if self.train and self._do_spectrogram_masking():
        anchors, shapes = fn.external_source(
            source=self._cutouts_generator, num_outputs=2, cycle=True)
        log_features = fn.erase(log_features, anchor=anchors, shape=shapes,
                                axes=[0, 1], fill_value=0, normalized_anchor=True)

    # When modifying DALI pipeline returns, make sure you update `output_map` in DALIGenericIterator invocation
    return log_features.gpu(), label.gpu(), log_features_len.gpu()