def _test_vs_scipy(batch_size, num_dims, in_type, out_type):
    """Compare CPU ``fn.laplacian`` output with the ``laplacian_sp`` baseline.

    Random inputs of at most 30 elements per dimension are fed through a
    CPU-only pipeline for ``test_iters`` iterations; each resulting batch is
    checked against the baseline with a maximal allowed error of 1e-6.
    """
    # scipy supports only windows of size 3 and does not use smoothing
    window_size, smoothing_size = 3, 1
    max_shape = (30, ) * num_dims
    data = RandomlyShapedDataIterator(batch_size, max_shape=max_shape, dtype=in_type)
    # An explicit output type is requested only for float32 output.
    dtype_args = {'dtype': types.FLOAT} if out_type == np.float32 else {}

    @pipeline_def
    def pipeline():
        samples = fn.external_source(data)
        laplacian = fn.laplacian(samples, window_size=window_size,
                                 smoothing_size=smoothing_size, **dtype_args)
        return laplacian, samples

    pipe = pipeline(device_id=types.CPU_ONLY_DEVICE_ID, num_threads=4,
                    batch_size=batch_size)
    pipe.build()
    max_error = 1e-6
    for _ in range(test_iters):
        out_edges, out_samples = pipe.run()
        out_edges = to_batch(out_edges, batch_size)
        out_samples = to_batch(out_samples, batch_size)
        reference = laplacian_sp(out_samples, out_type)
        check_batch(out_edges, reference, batch_size, max_allowed_error=max_error)
def compare_eager_with_pipeline(path, batch_size=batch_size, N_iterations=5, fn_op=None,
                                eager_op=None, **kwargs):
    """Compare a single-operator pipeline with its eager counterpart.

    When ``fn_op`` / ``eager_op`` are not given, they are looked up by walking
    the dot-separated ``path`` from the ``fn`` / ``eager`` modules. The outputs
    of both variants are compared with ``check_batch`` for ``N_iterations``.
    """
    attr_chain = path.split('.')
    if fn_op is None:
        fn_op = reduce(reduce_getattr, [fn] + attr_chain)
    if eager_op is None:
        eager_op = reduce(reduce_getattr, [eager] + attr_chain)

    def on_host(tensor_list):
        # GPU tensor lists are copied to the CPU before the comparison.
        if isinstance(tensor_list, tensors.TensorListGPU):
            return tensor_list.as_cpu()
        return tensor_list

    pipe = single_op_pipe(fn_op, kwargs)
    pipe.build()
    for iteration in range(N_iterations):
        eager_input = tensors.TensorListCPU(np.array(get_data(iteration)), layout="HWC")
        (pipe_out,) = pipe.run()
        eager_out = eager_op(eager_input, **kwargs)
        check_batch(on_host(pipe_out), on_host(eager_out), batch_size)
def check_one_hot_operator(source, device='cpu', axis=-1, expected_output_dim=None,
                           axis_name=None, initial_layout=None):
    """Run ``OneHotPipeline`` once and compare its output with a reference.

    The reference is ``one_hot_3_axes`` when the expected output has 4
    dimensions and ``one_hot`` otherwise. When ``expected_output_dim`` is not
    provided, it defaults to the input sample dimensionality plus one.
    """
    pipeline = OneHotPipeline(num_classes=num_classes, source=source, axis=axis,
                              layout=initial_layout, axis_name=axis_name, device=device)
    pipeline.build()
    outputs, raw_inputs = pipeline.run()
    if device == 'gpu':
        raw_inputs = raw_inputs.as_cpu()
    samples = [np.array(sample) for sample in raw_inputs]

    if not expected_output_dim:
        expected_output_dim = len(samples[0].shape) + 1
    if expected_output_dim == 4:
        reference = one_hot_3_axes(samples, axis)
    else:
        reference = one_hot(samples)

    expected_layout = modify_layout(initial_layout, expected_output_dim, axis, axis_name)
    check_batch(outputs, reference, batch_size, max_allowed_error=0,
                expected_layout=expected_layout)
def compare_eager_with_pipeline(pipe, eager_op, *, eager_source=get_data_eager, layout='HWC',
                                batch_size=batch_size, N_iterations=5, **kwargs):
    """Compare outputs of pipeline `pipe` and eager operator `eager_op`.

    For each of `N_iterations`, `eager_source(i, layout)` provides the eager
    input: a tuple/list is unpacked into positional arguments (an empty one
    means the operator takes only `batch_size`), anything else is passed as a
    single argument. Boolean outputs are compared sample by sample with
    `np.array_equal`; other outputs go through `check_batch`.
    """
    pipe.build()
    for iteration in range(N_iterations):
        eager_input = eager_source(iteration, layout)
        pipe_outputs = pipe.run()

        if not isinstance(eager_input, (tuple, list)):
            eager_outputs = eager_op(eager_input, **kwargs)
        elif len(eager_input):
            eager_outputs = eager_op(*eager_input, **kwargs)
        else:
            eager_outputs = eager_op(batch_size=batch_size, **kwargs)

        if not isinstance(eager_outputs, (tuple, list)):
            eager_outputs = (eager_outputs,)

        assert len(pipe_outputs) == len(eager_outputs)
        for pipe_tl, eager_tl in zip(pipe_outputs, eager_outputs):
            assert type(pipe_tl) == type(eager_tl)
            if pipe_tl.dtype != types.BOOL:
                check_batch(pipe_tl, eager_tl, batch_size)
                continue
            for pipe_sample, eager_sample in zip(pipe_tl, eager_tl):
                assert np.array_equal(pipe_sample, eager_sample)
def check_generic_gaussian_blur(batch_size, sigma, window_size, shape, layout, axes,
                                op_type="cpu", in_dtype=np.uint8, out_dtype=types.NO_TYPE,
                                random_shape=True):
    """Compare ``fn.gaussian_blur`` with ``gaussian_baseline`` on generated data.

    Inputs come from ``RandomlyShapedDataIterator``; when ``random_shape`` is
    false the minimal shape equals ``shape``. The allowed error is 1e-4 for a
    float32 result and 1 otherwise.
    """
    pipe = Pipeline(batch_size=batch_size, num_threads=4, device_id=0)
    data = RandomlyShapedDataIterator(batch_size,
                                      min_shape=None if random_shape else shape,
                                      max_shape=shape, dtype=in_dtype)

    # Extract the numpy type from DALI, we can have float32 or the same as input
    keeps_input_type = out_dtype == types.NO_TYPE or dali_type(in_dtype) == out_dtype
    result_type = in_dtype if keeps_input_type else np.float32
    max_error = 1 if result_type != np.float32 else 1e-04

    with pipe:
        images = fn.external_source(data, layout=layout)
        if op_type == "gpu":
            images = images.gpu()
        blurred = fn.gaussian_blur(images, device=op_type, sigma=sigma,
                                   window_size=window_size, dtype=out_dtype)
        pipe.set_outputs(blurred, images)
    pipe.build()

    for _ in range(test_iters):
        result, images_out = pipe.run()
        if op_type == "gpu":
            result = result.as_cpu()
            images_out = images_out.as_cpu()
        images_out = to_batch(images_out, batch_size)
        skip_axes = count_skip_axes(layout)
        baseline = []
        for img in images_out:
            baseline.append(gaussian_baseline(img, sigma, window_size, axes, skip_axes,
                                              dtype=result_type))
        check_batch(result, baseline, batch_size, max_allowed_error=max_error,
                    expected_layout=layout)
def check_gaussian_blur(batch_size, sigma, window_size, op_type="cpu"):
    """Compare ``fn.gaussian_blur`` on decoded images with ``gaussian_cv``.

    Images are read from ``images_dir`` and decoded on the CPU for the CPU
    operator, or with the "mixed" decoder otherwise. Results may differ from
    the baseline by at most 1.
    """
    if op_type == "cpu":
        decoder_device = "cpu"
    else:
        decoder_device = "mixed"

    pipe = Pipeline(batch_size=batch_size, num_threads=4, device_id=0)
    with pipe:
        encoded, _ = fn.file_reader(file_root=images_dir, shard_id=0, num_shards=1)
        decoded = fn.image_decoder(encoded, device=decoder_device, output_type=types.RGB)
        blurred = fn.gaussian_blur(decoded, device=op_type, sigma=sigma,
                                   window_size=window_size)
        pipe.set_outputs(blurred, decoded)
    pipe.build()

    for _ in range(test_iters):
        result, images = pipe.run()
        if op_type == "gpu":
            result = result.as_cpu()
            images = images.as_cpu()
        images = to_batch(images, batch_size)
        baseline_cv = []
        for img in images:
            baseline_cv.append(gaussian_cv(img, sigma, window_size))
        check_batch(result, baseline_cv, batch_size, max_allowed_error=1)
def _test_scalar(device, as_tensors):
    """Test propagation of scalars from external source

    A source pipeline produces batches of ``np.float32`` scalars; each batch
    is fed (as a whole, or as a list of individual tensors when ``as_tensors``
    is true) into a second pipeline through ``feed_input`` and the output of
    both pipelines is compared.
    """
    batch_size = 4
    src_pipe = Pipeline(batch_size, 1, 0)
    # ``i`` is the iteration index passed by external_source and ``j`` is the
    # sample index. Previously the comprehension variable was also named ``i``
    # and shadowed the iteration index, so every batch had the same contents.
    src_ext = fn.external_source(
        source=lambda i: [np.float32(i * 10 + j + 1) for j in range(batch_size)],
        device=device)
    src_pipe.set_outputs(src_ext)
    src_pipe.build()

    dst_pipe = Pipeline(batch_size, 1, 0, exec_async=False, exec_pipelined=False)
    dst_pipe.set_outputs(fn.external_source(name="ext", device=device))
    dst_pipe.build()

    for _ in range(3):
        src = src_pipe.run()
        data = src[0]
        if as_tensors:
            data = [data[idx] for idx in range(len(data))]
        dst_pipe.feed_input("ext", data)
        dst = dst_pipe.run()
        check_batch(src[0], dst[0], batch_size, 0, 0, "")
def check_fixed_param_laplacian(device, batch_size, in_type, out_type, shape, layout, axes,
                                window_size, smoothing_size, scales, normalize):
    """Compare ``fn.laplacian`` run with fixed arguments against ``laplacian_baseline``.

    The operator receives ``window_size``, ``smoothing_size`` and ``scales``
    exactly as passed. For the baseline, the sizes are converted to int32
    arrays (an empty array stands for ``None``) and, when ``normalize`` is
    set, the scales are derived from the window sizes.

    The allowed error is 1e-3 for float32 output and 1 otherwise.
    """
    iterator = RandomlyShapedDataIterator(batch_size, max_shape=shape, dtype=in_type)

    @pipeline_def
    def pipeline():
        data = fn.external_source(iterator, layout=layout)
        if out_type != np.float32:
            dtype_arg = {}
        else:
            dtype_arg = {"dtype": types.FLOAT}
        if device == "gpu":
            data = data.gpu()
        edges = fn.laplacian(data, window_size=window_size, smoothing_size=smoothing_size,
                             scale=scales, normalized_kernel=normalize, **dtype_arg)
        return edges, data

    pipe = pipeline(device_id=0, num_threads=4, batch_size=batch_size, seed=42)
    pipe.build()

    # The baseline arguments do not depend on the sample or the iteration, so
    # they are computed once, in separate locals, instead of rebinding the
    # function parameters inside the per-sample loop.
    skip_axes = count_skip_axes(layout)
    if window_size is None:
        ref_window_size = np.array([])
    else:
        ref_window_size = np.array(window_size, dtype=np.int32)
    if smoothing_size is None:
        ref_smoothing_size = np.array([])
    else:
        ref_smoothing_size = np.array(smoothing_size, dtype=np.int32)
    if normalize:
        all_sizes = get_window_sizes(ref_window_size, ref_smoothing_size, axes)
        ref_scales = [2.**(-sum(sizes) + axes + 2) for sizes in all_sizes]
    else:
        ref_scales = scales
    ref_scales = np.array(ref_scales, dtype=np.float32)
    max_error = 1e-3 if out_type == np.float32 else 1

    for _ in range(test_iters):
        edges, data = pipe.run()
        if device == "gpu":
            edges = edges.as_cpu()
            data = data.as_cpu()
        edges = to_batch(edges, batch_size)
        data = to_batch(data, batch_size)
        baseline = [
            laplacian_baseline(data[i], out_type or in_type, ref_window_size,
                               ref_smoothing_size, ref_scales, axes, skip_axes)
            for i in range(batch_size)
        ]
        check_batch(edges, baseline, batch_size, max_allowed_error=max_error,
                    expected_layout=layout)
def _test_vs_open_cv(batch_size, window_size, in_type, out_type, normalize, grayscale):
    """Compare the output of ``laplacian_pipe`` with the ``laplacian_cv`` baseline.

    The baseline is scaled by ``normalization_factor(window_size)`` when
    ``normalize`` is set and by 1 otherwise. For float output the allowed
    error is 1e-7 (1e-4 for windows bigger than 11), for other types it is 1.
    """
    pipe = laplacian_pipe(device_id=types.CPU_ONLY_DEVICE_ID, num_threads=4,
                          batch_size=batch_size, window_size=window_size, in_type=in_type,
                          out_type=out_type, normalize=normalize, grayscale=grayscale)
    pipe.build()

    norm_factor = normalization_factor(window_size)
    scale = norm_factor if normalize else 1

    actual_out_type = in_type if out_type is None else out_type
    if actual_out_type != types.FLOAT:
        max_error = 1
    elif window_size <= 11:
        max_error = 1e-7
    else:
        max_error = 1e-4

    for _ in range(test_iters):
        edges, imgs = pipe.run()
        imgs = to_batch(imgs, batch_size)
        baseline_cv = laplacian_cv(imgs, window_size, in_type, out_type, scale, grayscale)
        edges = to_batch(edges, batch_size)
        assert len(edges) == len(baseline_cv)
        # values in the array raise exponentially with the window_size, so without
        # normalization the absolute error will also be big - normalize the values
        # before the comparison
        if not normalize:
            edges = [sample * norm_factor for sample in edges]
            baseline_cv = [sample * norm_factor for sample in baseline_cv]
        check_batch(edges, baseline_cv, batch_size, max_allowed_error=max_error,
                    expected_layout="HWC")
def _run_test_cat(num_inputs, layout, ndim, axis, axis_name):
    """Check CPU and GPU ``fn.cat`` against the ``ref_cat`` reference.

    The reference axis is the position of ``axis_name`` in ``layout`` when a
    name is given, otherwise ``axis``, otherwise 0. The operator receives
    ``axis`` only when no axis name is used.
    """
    num_iter = 3
    batch_size = 4
    if ndim is None:
        ndim = len(layout)

    if axis_name is not None:
        ref_axis = layout.find(axis_name)
    elif axis is not None:
        ref_axis = axis
    else:
        ref_axis = 0
    assert ref_axis >= 0

    axis_arg = None if axis_name else axis

    pipe = dali.pipeline.Pipeline(batch_size=batch_size, num_threads=3, device_id=0)
    with pipe:
        inputs = fn.external_source(
            input_generator(num_inputs, batch_size, ndim, ref_axis),
            num_outputs=num_inputs, layout=layout)
        out_cpu = fn.cat(*inputs, axis=axis_arg, axis_name=axis_name)
        out_gpu = fn.cat(*(x.gpu() for x in inputs), axis=axis_arg, axis_name=axis_name)
        pipe.set_outputs(out_cpu, out_gpu, *inputs)
    pipe.build()

    for _ in range(num_iter):
        o_cpu, o_gpu, *in_batches = pipe.run()
        ref = ref_cat(in_batches, ref_axis)
        for result in (o_cpu, o_gpu):
            check_batch(result, ref, batch_size, eps=0, expected_layout=layout)
def _run_test_stack(num_inputs, layout, ndim, axis, axis_name):
    """Check CPU and GPU ``fn.stack`` against the ``ref_stack`` reference.

    When ``axis_name`` is given, the expected layout is the input layout with
    the new axis name inserted at ``axis`` (or just the axis name for an empty
    layout); otherwise an empty layout is expected.
    """
    num_iter = 3
    batch_size = 4
    if ndim is None:
        ndim = len(layout)

    ref_axis = 0 if axis is None else axis

    if not axis_name:
        ref_layout = ""
    elif layout:
        ref_layout = layout[:axis] + axis_name + layout[axis:]
    else:
        ref_layout = axis_name

    pipe = dali.pipeline.Pipeline(batch_size=batch_size, num_threads=3, device_id=0)
    with pipe:
        inputs = fn.external_source(input_generator(num_inputs, batch_size, ndim),
                                    num_outputs=num_inputs, layout=layout)
        out_cpu = fn.stack(*inputs, axis=axis, axis_name=axis_name)
        out_gpu = fn.stack(*(x.gpu() for x in inputs), axis=axis, axis_name=axis_name)
        pipe.set_outputs(out_cpu, out_gpu, *inputs)
    pipe.build()

    for _ in range(num_iter):
        o_cpu, o_gpu, *in_batches = pipe.run()
        ref = ref_stack(in_batches, ref_axis)
        for result in (o_cpu, o_gpu):
            check_batch(result, ref, batch_size, eps=0, expected_layout=ref_layout)
def check_per_sample_gaussian_blur(batch_size, sigma_dim, window_size_dim, shape, layout, axes,
                                   op_type="cpu"):
    """Compare ``fn.gaussian_blur`` with per-sample arguments against ``gaussian_baseline``.

    ``sigma_dim`` / ``window_size_dim`` give the number of elements of the
    randomly generated per-sample sigma / window size; ``None`` means the
    respective argument is not passed to the operator at all. In that case a
    placeholder output is still returned from the pipeline so that the number
    of outputs stays fixed, but it must not be used by the baseline.
    """
    pipe = Pipeline(batch_size=batch_size, num_threads=4, device_id=0)
    data = RandomlyShapedDataIterator(batch_size, max_shape=shape)
    with pipe:
        if sigma_dim is not None:
            sigma = fn.random.uniform(range=[0.5, 3], shape=[sigma_dim])
            sigma_arg = sigma
        else:
            # placeholder, so we can return something
            sigma = fn.random.coin_flip(probability=0)
            sigma_arg = None

        if window_size_dim is not None:
            window_radius = fn.random.uniform(range=[5, 10], shape=[window_size_dim])
            window_size = fn.cast(window_radius, dtype=types.INT32) * 2 + 1
            window_arg = window_size
        else:
            window_size = fn.random.coin_flip(probability=0)
            window_arg = None

        images = fn.external_source(data, layout=layout)
        if op_type == "gpu":
            images = images.gpu()
        blurred = fn.gaussian_blur(images, device=op_type, sigma=sigma_arg,
                                   window_size=window_arg)
        pipe.set_outputs(blurred, images, sigma, window_size)
    pipe.build()

    skip_axes = count_skip_axes(layout)
    for _ in range(test_iters):
        result, images, sigma, window_size = pipe.run()
        if op_type == "gpu":
            result = result.as_cpu()
            images = images.as_cpu()
        images = to_batch(images, batch_size)
        sigma = to_batch(sigma, batch_size)
        window_size = to_batch(window_size, batch_size)
        baseline = []
        for i in range(batch_size):
            # The decision must be based on `sigma_dim`: `sigma` here is the
            # converted pipeline output, which is never None, so the previous
            # `sigma is not None` check passed the placeholder to the baseline.
            sample_sigma = sigma[i] if sigma_dim is not None else None
            sample_window = window_size[i] if window_size_dim is not None else None
            baseline.append(
                gaussian_baseline(images[i], sample_sigma, sample_window, axes, skip_axes))
        check_batch(result, baseline, batch_size, max_allowed_error=1,
                    expected_layout=layout)
def check_gaussian_blur(batch_size, sigma, window_size, op_type="cpu"):
    """Compare the pipeline from ``get_gaussian_pipe`` with the ``gaussian_cv`` baseline.

    Runs ``test_iters`` iterations; results may differ from the baseline by at
    most 1 and are expected to have "HWC" layout.
    """
    pipe = get_gaussian_pipe(batch_size, sigma, window_size, op_type)
    pipe.build()
    on_gpu = op_type == "gpu"
    for _ in range(test_iters):
        result, images = pipe.run()
        if on_gpu:
            result = result.as_cpu()
            images = images.as_cpu()
        images = to_batch(images, batch_size)
        baseline_cv = []
        for img in images:
            baseline_cv.append(gaussian_cv(img, sigma, window_size))
        check_batch(result, baseline_cv, batch_size, max_allowed_error=1,
                    expected_layout="HWC")
def _test_feed_input(device):
    """Feed the output of a source pipeline to another pipeline via ``feed_input``.

    The destination pipeline is synchronous (``exec_async`` and
    ``exec_pipelined`` disabled) and its output must match the source output
    exactly, with layout "XY".
    """
    src_pipe, batch_size = build_src_pipe(device)

    dst_pipe = Pipeline(batch_size, 1, 0, exec_async=False, exec_pipelined=False)
    ext_input = fn.external_source(name="ext", device=device)
    dst_pipe.set_outputs(ext_input)
    dst_pipe.build()

    for _ in range(3):
        src_out = src_pipe.run()
        dst_pipe.feed_input("ext", src_out[0])
        dst_out = dst_pipe.run()
        check_batch(dst_out[0], src_out[0], batch_size, 0, 0, "XY")
def _test_seq_input(num_iters, operator_fn, fixed_params, input_params, input_data: ArgData,
                    rng):
    """Compare an operator run on sequence input with a run on unfolded input.

    Two instances of the same pipeline are built: one gets the original input
    with the parameters computed by the params provider, the other gets the
    unfolded input with expanded parameters. In every iteration the unfolded
    output of the first one must match the output of the second one.
    """

    @pipeline_def
    def pipeline(args_data: List[ArgData]):
        # Positional arguments use `desc.name` as their index in the call.
        positional = [arg for arg in args_data if arg.desc.is_positional_arg]
        pos_nodes = [None] * len(positional)
        for arg in positional:
            slot = arg.desc.name
            assert 0 <= slot < len(pos_nodes)
            assert pos_nodes[slot] is None
            pos_nodes[slot] = arg_data_node(arg)
        named = [arg for arg in args_data if not arg.desc.is_positional_arg]
        kw_nodes = {arg.desc.name: arg_data_node(arg) for arg in named}
        return operator_fn(*pos_nodes, **fixed_params, **kw_nodes)

    assert num_iters >= len(input_data.data)
    max_batch_size = max(len(batch) for batch in input_data.data)

    if isinstance(input_params, ParamsProviderBase):
        params_provider = input_params
    else:
        params_provider = ParamsProvider(input_params)
    params_provider.setup(input_data, fixed_params, rng)

    args_data = params_provider.compute_params()
    seq_pipe = pipeline(args_data=[input_data, *args_data], batch_size=max_batch_size,
                        num_threads=4, device_id=0)

    unfolded_input = params_provider.unfold_input()
    expanded_args_data = params_provider.expand_params()
    max_uf_batch_size = max(len(batch) for batch in unfolded_input.data)
    baseline_pipe = pipeline(args_data=[unfolded_input, *expanded_args_data],
                             batch_size=max_uf_batch_size, num_threads=4, device_id=0)

    seq_pipe.build()
    baseline_pipe.build()

    for _ in range(num_iters):
        (seq_out,) = seq_pipe.run()
        (baseline_out,) = baseline_pipe.run()
        unfolded_layout = params_provider.unfold_output_layout(seq_out.layout())
        assert unfolded_layout == baseline_out.layout()
        unfolded_batch = params_provider.unfold_output(as_batch(seq_out))
        baseline_batch = as_batch(baseline_out)
        assert len(unfolded_batch) == len(baseline_batch)
        check_batch(unfolded_batch, baseline_batch, len(unfolded_batch))
def test_constant_promotion_mixed():
    """Decode one JPEG with the "mixed" decoder from two kinds of input.

    The same file is decoded once from the output of a file reader and once
    from a numpy array with the file contents passed directly to the
    operator; both results are compared with ``check_batch``.
    """
    filename = os.path.join(jpeg_folder, "241", "cute-4074304_1280.jpg")
    file_contents = np.fromfile(filename, dtype=np.uint8)

    pipe = Pipeline(1, 3, 0)
    with pipe:
        jpegs, _ = fn.readers.file(files=[filename])
        decoded_reader = fn.image_decoder(jpegs, device="mixed")
        decoded_constant = fn.image_decoder(file_contents, device="mixed")
        pipe.set_outputs(decoded_constant, decoded_reader)
    pipe.build()

    # Outputs come back in the order given to `set_outputs`.
    out_constant, out_reader = pipe.run()
    check_batch(out_constant, out_reader, 1)
def _testimpl_operator_noise_gaussian_vs_add_normal_dist(device, mean, stddev,
                                                         variable_dist_params, batch_size,
                                                         niter):
    """Compare the two outputs of ``pipe_gaussian_noise`` over ``niter`` iterations.

    Both outputs of the pipeline are passed to ``check_batch`` with
    ``eps=0.1``.
    """
    pipe = pipe_gaussian_noise(mean, stddev, variable_dist_params, device=device,
                               batch_size=batch_size, num_threads=3, device_id=0)
    pipe.build()
    iteration = 0
    while iteration < niter:
        first, second = pipe.run()
        check_batch(first, second, batch_size=batch_size, eps=0.1)
        iteration += 1
def check_per_sample_laplacian(device, batch_size, window_dim, smoothing_dim, normalize, shape,
                               layout, axes, in_type, out_type):
    """Compare ``laplacian_per_sample_pipeline`` output with ``laplacian_baseline``.

    The pipeline returns the result together with the input data and the
    window size, smoothing size and scale of each sample; these per-sample
    values are passed to the baseline.
    """
    iterator = RandomlyShapedDataIterator(batch_size, max_shape=shape, dtype=in_type)
    pipe = laplacian_per_sample_pipeline(
        device_id=0, device=device, num_threads=4, batch_size=batch_size, seed=42,
        iterator=iterator, layout=layout, window_dim=window_dim,
        smoothing_dim=smoothing_dim, axes=axes, normalize=normalize, out_type=out_type)
    pipe.build()

    if out_type == np.float32:
        # Normalized abs values are up to 2 * `axes` * 255 so it still gives
        # over 5 decimal digits of precision
        max_error = 1e-3
    else:
        max_error = 1

    for _ in range(test_iters):
        edges, data, window_size, smoothing_size, scale = pipe.run()
        if device == "gpu":
            edges = edges.as_cpu()
            data = data.as_cpu()
        edges = to_batch(edges, batch_size)
        data = to_batch(data, batch_size)
        window_size = to_batch(window_size, batch_size)
        smoothing_size = to_batch(smoothing_size, batch_size)
        scale = to_batch(scale, batch_size)

        skip_axes = count_skip_axes(layout)
        baseline = [
            laplacian_baseline(data[i], out_type or in_type, window_size[i],
                               smoothing_size[i], scale[i], axes, skip_axes)
            for i in range(batch_size)
        ]
        check_batch(edges, baseline, batch_size, max_allowed_error=max_error,
                    expected_layout=layout)
def run_pipeline(device, num_dim, replace=False, layout=None):
    """Check that ``fn.per_frame`` returns the input data with the expected layout.

    Without an input layout, the expected layout is "F" followed by "*" for
    each remaining dimension; otherwise it is the input layout with the first
    character replaced with "F".
    """

    @pipeline_def
    def pipeline():
        source = fn.external_source(input_batch(num_dim), layout=layout)
        if device == "gpu":
            source = source.gpu()
        return fn.per_frame(source, replace=replace, device=device)

    pipe = pipeline(num_threads=4, batch_size=max_batch_size, device_id=0)
    pipe.build()

    if layout is None:
        expected_layout = "F" + "*" * (num_dim - 1)
    else:
        expected_layout = "F" + layout[1:]

    for reference in input_batch(num_dim):
        (result,) = pipe.run()
        check_batch(result, reference, len(reference), expected_layout=expected_layout)
def check_one_hot_operator(premade_batch, axis=-1):
    """Run ``OneHotPipeline`` on ``premade_batch`` and compare with a reference.

    ``one_hot_3_axes`` is the reference for 3-dimensional samples and
    ``one_hot`` for all others. The output layout is not verified yet.
    """
    pipeline = OneHotPipeline(num_classes=num_classes, input=premade_batch, axis=axis)
    pipeline.build()
    outputs = pipeline.run()

    sample_dim = len(premade_batch[0].shape)
    if sample_dim == 3:
        reference = one_hot_3_axes(premade_batch, axis)
    else:
        reference = one_hot(premade_batch)

    new_layout = None  # TODO(klecki): add layout handling
    check_batch(outputs[0], reference, batch_size, max_allowed_error=0,
                expected_layout=new_layout)
def test_compose_change_device():
    """Check ``ops.Compose`` of a CPU image decoder followed by a GPU resize.

    The composed operator must produce a GPU output equal to the result of
    the equivalent chain built explicitly with the ``fn`` API.
    """
    batch_size = 3
    pipe = Pipeline(batch_size, 1, 0)

    size = fn.random.uniform(shape=2, range=(300, 500))
    decode_cpu = ops.decoders.Image(device="cpu")
    resize_gpu = ops.Resize(size=size, device="gpu")
    composed = ops.Compose([decode_cpu, resize_gpu])

    files, _labels = fn.readers.caffe(path=caffe_db_folder, seed=1)
    composed_out = composed(files)
    explicit_out = fn.resize(fn.decoders.image(files).gpu(), size=size)
    pipe.set_outputs(composed_out, explicit_out)
    pipe.build()

    results = pipe.run()
    assert isinstance(results[0], dali.backend.TensorListGPU)
    test_utils.check_batch(results[0], results[1], batch_size=batch_size)
def _test_permute_batch_fixed(device):
    """Check ``fn.permute_batch`` with a fixed list of indices.

    The reference is built by picking samples of the original batch in the
    order given by the indices; the output must match it exactly and keep the
    "abc" layout.
    """
    batch_size = 10
    pipe = Pipeline(batch_size, 4, 0)
    data = fn.external_source(source=lambda: gen_data(batch_size, np.int16),
                              device=device, layout="abc")
    idxs = [4, 8, 0, 6, 3, 5, 2, 9, 7, 1]
    permuted_node = fn.permute_batch(data, indices=idxs)
    pipe.set_outputs(data, permuted_node)
    pipe.build()

    for _ in range(10):
        orig, permuted = pipe.run()
        if isinstance(orig, dali.backend.TensorListGPU):
            orig = orig.as_cpu()
        ref = []
        for idx in idxs:
            ref.append(orig.at(idx))
        check_batch(permuted, ref, len(ref), 0, 0, "abc")
def _test_callback(device, as_tensors, change_layout_to=None):
    """Use the output of one pipeline as a callback source of another one.

    The callback returns the source output as a whole or, when ``as_tensors``
    is true, as a list of individual tensors. The result is compared with a
    reference pipeline built with ``layout=change_layout_to``.
    """
    src_pipe, batch_size = build_src_pipe(device)
    ref_pipe, batch_size = build_src_pipe(device, layout=change_layout_to)
    dst_pipe = Pipeline(batch_size, 1, 0)

    def get_from_src():
        tensor_list = src_pipe.run()[0]
        if as_tensors:
            return [tensor_list[idx] for idx in range(len(tensor_list))]
        return tensor_list

    ext = fn.external_source(source=get_from_src, device=device, layout=change_layout_to)
    dst_pipe.set_outputs(ext)
    dst_pipe.build()

    for _ in range(3):
        ref = ref_pipe.run()
        out = dst_pipe.run()
        check_batch(out[0], ref[0], batch_size, 0, 0)
def check_stop_iteration_resume(pipe, batch_size, layout):
    """Run two epochs of ``pipe`` and check that they produce the same data.

    An epoch is run until ``StopIteration`` is raised, after which the
    pipeline is reset. Both epochs must have the same number of iterations
    and the corresponding batches must be equal, layouts included.
    """
    pipe.build()
    capture_processes(pipe._py_pool)

    def run_epoch():
        batches = []
        try:
            while True:
                (tensor_list,) = pipe.run()
                # Copy the samples, so they do not refer to pipeline outputs.
                copies = [np.copy(tensor_list.at(idx)) for idx in range(len(tensor_list))]
                batches.append(copies)
        except StopIteration:
            pipe.reset()
        return batches

    outputs_epoch_1 = run_epoch()
    outputs_epoch_2 = run_epoch()

    assert len(outputs_epoch_1) == len(outputs_epoch_2), (
        "Epochs must have same number of iterations, "
        "but they have {} {} respectively".format(len(outputs_epoch_1),
                                                  len(outputs_epoch_2)))
    for batch_1, batch_2 in zip(outputs_epoch_1, outputs_epoch_2):
        check_batch(batch_1, batch_2, batch_size, 0, None, expected_layout=layout,
                    compare_layouts=True)
def _test_permute_batch(device, type):
    """Check ``fn.permute_batch`` with indices from ``fn.batch_permutation``.

    The permutation is returned from the pipeline as an extra output and used
    to build the reference from the original batch; the output must match it
    exactly and keep the "abc" layout.
    """
    batch_size = 10
    pipe = Pipeline(batch_size, 4, 0)
    data = fn.external_source(source=lambda: gen_data(batch_size, type), device=device,
                              layout="abc")
    perm = fn.batch_permutation()
    permuted_node = fn.permute_batch(data, indices=perm)
    pipe.set_outputs(data, permuted_node, perm)
    pipe.build()

    for _ in range(10):
        orig, permuted, perm_out = pipe.run()
        indices = [int(perm_out.at(sample)) for sample in range(batch_size)]
        if isinstance(orig, dali.backend.TensorListGPU):
            orig = orig.as_cpu()
        ref = []
        for idx in indices:
            ref.append(orig.at(idx))
        check_batch(permuted, ref, len(ref), 0, 0, "abc")
def check_reader(op_path, *, fn_op=None, eager_op=None, batch_size=batch_size, N_iterations=2,
                 **kwargs):
    """Compare a reader pipeline with the iterator returned by the eager reader.

    The eager iterator is traversed ``N_iterations`` times. For the last batch
    of a traversal, the pipeline output is sliced to the length of the eager
    output before the comparison.
    """
    fn_op, eager_op = get_ops(op_path, fn_op, eager_op)
    pipe = reader_pipeline(fn_op, kwargs)
    pipe.build()
    iter_eager = eager_op(batch_size=batch_size, **kwargs)

    for _ in range(N_iterations):
        for batch_idx, eager_outputs in enumerate(iter_eager):
            pipe_outputs = pipe.run()
            if not isinstance(eager_outputs, (tuple, list)):
                eager_outputs = (eager_outputs,)
            assert len(pipe_outputs) == len(eager_outputs)

            is_last_batch = batch_idx == len(iter_eager) - 1
            for pipe_tl, eager_tl in zip(pipe_outputs, eager_outputs):
                if is_last_batch:
                    pipe_tl = _slice_tensorlist(pipe_tl, len(eager_tl))
                assert type(pipe_tl) == type(eager_tl)
                check_batch(pipe_tl, eager_tl, len(eager_tl))
def _test_resize(layout, interp, dtype, w, h):
    """Compare the DALI resize pipeline (CPU and GPU outputs) with a reference pipeline.

    A one-element border is cropped from both spatial dimensions of every
    sample before the comparison. The pipeline also returns the externally
    provided size and the sizes seen by the CPU and GPU variants, which must
    all agree.
    """
    channel_first = (layout == "FCHW")
    pipe_dali = create_dali_pipe(channel_first, 8, interp, dtype, w, h)
    pipe_dali.build()
    pipe_ref = create_ref_pipe(channel_first, 8, interp, dtype, w, h)
    pipe_ref.build()
    eps = 1e-2
    max_err = 6

    everything, inner = slice(None), slice(1, -1)
    if channel_first:
        crop = (everything, everything, inner, inner)
    else:
        crop = (everything, inner, inner, everything)

    clip_overshoot = interp == types.INTERP_LANCZOS3

    def as_arrays(batch):
        arrays = [np.array(sample) for sample in batch]
        if clip_overshoot:
            # PIL can't resize float data. Lanczos resampling generates overshoot
            # which we have to get rid of for the comparison to succeed.
            arrays = [arr.clip(0, 255) for arr in arrays]
        return arrays

    for _ in range(4):
        out_dali = pipe_dali.run()
        out_ref = pipe_ref.run()[0]

        dali_cpu = [arr[crop] for arr in as_arrays(out_dali[0])]
        dali_gpu = [arr[crop] for arr in as_arrays(out_dali[1].as_cpu())]
        out_ref = [np.array(sample)[crop] for sample in out_ref]

        check_batch(dali_cpu, out_ref, 2, eps=eps, max_allowed_error=max_err)
        check_batch(dali_gpu, out_ref, 2, eps=eps, max_allowed_error=max_err)

        ext_size = out_dali[2]
        for size_out in (out_dali[3], out_dali[4]):
            check_batch(ext_size, size_out, 2)
def _test_standalone_vs_fused(device):
    """Compare the first output of ``audio_decoder_pipe`` with the remaining three.

    Each of the outputs 1-3 is checked against output 0 with its own
    tolerances, listed below.
    """
    pipe = audio_decoder_pipe(device=device, batch_size=2, num_threads=1, device_id=0)
    pipe.build()
    is_gpu = device == 'gpu'

    # (output index, eps, max_allowed_error)
    comparisons = (
        # two sampling rates - should be bit-exact
        (1, 1e-6 if is_gpu else 0, 1e-4 if is_gpu else 0),
        # numerical round-off error in rate
        (2, 1e-6, 1e-4),
        # here, the sampling rate is slightly different, so we can tolerate larger errors
        (3, 1e-4, 1),
    )

    for _ in range(2):
        outs = pipe.run()
        for idx, eps, max_err in comparisons:
            check_batch(outs[0], outs[idx], eps=eps, max_allowed_error=max_err)
def _test_kernels(device, num_dims, smoothing, normalize):
    """Compare kernels used by ``fn.laplacian`` with kernels obtained from OpenCV.

    The operator is run on arrays that are zero except for a single 1 in the
    center, one array per odd window size between ``min_window_size`` and
    ``max_window_size``. The outputs, cropped by one element on each side,
    are compared with sums of outer products of the windows returned by
    ``cv2.getDerivKernels``.
    """
    win_sizes = range(min_window_size, max_window_size + 2, 2)
    batch_size = (max_window_size + 2 - min_window_size) // 2

    def get_inputs():
        padding = 2
        impulses, window_sizes, smoothing_sizes, scales = [], [], [], []
        for win_size in range(min_window_size, max_window_size + 2, 2):
            side = win_size + padding
            impulse = np.zeros((side, ) * num_dims, dtype=np.float32)
            impulse[(side // 2, ) * num_dims] = 1
            impulses.append(impulse)
            window_sizes.append(np.array(win_size, dtype=np.int32))
            smoothing_win = win_size if smoothing else 1
            smoothing_sizes.append(np.array(smoothing_win, dtype=np.int32))
            if smoothing:
                exponent = num_dims * win_size - 2 - num_dims
            else:
                exponent = win_size - 3
            scales.append(np.array(2.**(-exponent), dtype=np.float32))
        return impulses, window_sizes, smoothing_sizes, scales

    @pipeline_def
    def pipeline():
        impulses, window_sizes, smoothing_sizes, scales = fn.external_source(
            get_inputs, num_outputs=4)
        if device == "gpu":
            impulses = impulses.gpu()
        kernels = fn.laplacian(impulses, window_size=window_sizes,
                               smoothing_size=smoothing_sizes, dtype=types.FLOAT,
                               normalized_kernel=normalize, device=device)
        return kernels, scales

    def outer(*vectors):
        # Outer product of all the vectors, shaped (len(v0), len(v1), ...).
        product = np.array([1.])
        for vec in vectors:
            product = np.outer(product, vec)
        return product.reshape(tuple(len(vec) for vec in vectors))

    def get_cv2_kernel(win_size, smoothing):
        deriv_win, smooth_win = cv2.getDerivKernels(2, 0, win_size)
        if not smoothing:
            # no smoothing - a window with a single 1 in the middle
            smooth_win = np.zeros(win_size)
            smooth_win[win_size // 2] = 1.
        # i-th partial derivative: derivative window along the i-th axis,
        # smoothing window along all the other axes
        windows = [[deriv_win if i == j else smooth_win for j in range(num_dims)]
                   for i in range(num_dims)]
        return sum(outer(*axis_windows) for axis_windows in windows)

    pipe = pipeline(num_threads=4, batch_size=batch_size, device_id=0)
    pipe.build()
    kernels, scales = pipe.run()
    if device == "gpu":
        kernels = kernels.as_cpu()

    inner = (slice(1, -1), ) * num_dims
    kernels = [np.array(kernel)[inner] for kernel in kernels]
    scales = [np.array(scale).item() for scale in scales]
    assert len(kernels) == len(win_sizes) == len(scales)

    baseline_kernels = []
    for win_size, scale in zip(win_sizes, scales):
        baseline_kernels.append(get_cv2_kernel(win_size, smoothing) * scale)

    if not normalize:
        # output was not normalized by the op
        kernels = [kernel * scale for kernel, scale in zip(kernels, scales)]

    check_batch(kernels, baseline_kernels, batch_size, max_allowed_error=1e-5,
                expected_layout="HWC")
def _test_stitching(device, dim, channel_first, dtype, interp):
    """Check that an image resized in parts matches the image resized as a whole.

    The input is resized to the full output size and, separately, each of its
    halves along every spatial axis (4 parts for 2D, 8 for 3D) is resized to
    half of the output size using a relative ROI. The parts are then put
    together with ``np.block`` and compared with the full-size result.

    The maximal allowed error is 1e-3 for ``types.FLOAT`` output and 1 for
    other output types.
    """
    batch_size = 1 if dim == 3 else 10
    pipe = dali.pipeline.Pipeline(batch_size=batch_size, num_threads=1, device_id=0,
                                  seed=1234, prefetch_queue_depth=1)
    with pipe:
        if dim == 2:
            files, _labels = dali.fn.readers.caffe(path=db_2d_folder, random_shuffle=True)
            images_cpu = dali.fn.decoders.image(files, device="cpu")
        else:
            images_cpu = dali.fn.external_source(source=random_3d_loader(batch_size),
                                                 layout="DHWC")

        images_hwc = images_cpu if device == "cpu" else images_cpu.gpu()

        if channel_first:
            images = dali.fn.transpose(images_hwc,
                                       perm=[3, 0, 1, 2] if dim == 3 else [2, 0, 1],
                                       transpose_layout=True)
        else:
            images = images_hwc

        out_size_full = [32, 32, 32] if dim == 3 else [160, 160]
        out_size_half = [x // 2 for x in out_size_full]

        roi_start = [0] * dim
        roi_end = [1] * dim

        resized = fn.resize(images, dtype=dtype, min_filter=interp, mag_filter=interp,
                            size=out_size_full)

        # Output 0 is the whole image; it is followed by the parts, 4 per
        # z-slice, ordered by (z, y, x).
        outputs = [resized]
        for z in range(dim - 1):
            if dim == 3:
                roi_start[0] = z * 0.5
                roi_end[0] = (z + 1) * 0.5
            for y in [0, 1]:
                roi_start[-2] = y * 0.5
                roi_end[-2] = (y + 1) * 0.5
                for x in [0, 1]:
                    roi_start[-1] = x * 0.5
                    roi_end[-1] = (x + 1) * 0.5
                    part = fn.resize(images, dtype=dtype, interp_type=interp,
                                     size=out_size_half, roi_start=roi_start,
                                     roi_end=roi_end, roi_relative=True)
                    outputs.append(part)
        pipe.set_outputs(*outputs)
    pipe.build()

    for _ in range(1):
        out = pipe.run()
        if device == "gpu":
            out = [x.as_cpu() for x in out]
        whole = out[0]
        tiled = []
        for i in range(batch_size):
            slices = []
            for z in range(dim - 1):
                q00 = out[1 + z * 4 + 0].at(i)
                q01 = out[1 + z * 4 + 1].at(i)
                q10 = out[1 + z * 4 + 2].at(i)
                q11 = out[1 + z * 4 + 3].at(i)
                if channel_first:
                    slices.append(np.block([[q00, q01], [q10, q11]]))
                else:
                    slices.append(np.block([[[q00], [q01]], [[q10], [q11]]]))
            if dim == 3:
                if channel_first:
                    tiled.append(np.block([[[slices[0]]], [[slices[1]]]]))
                else:
                    tiled.append(np.block([[[[slices[0]]]], [[[slices[1]]]]]))
            else:
                tiled.append(slices[0])
        # Compare the `dtype` argument - the previous code compared the builtin
        # `type`, which never equals types.FLOAT, so the tolerance was always 1.
        max_err = 1e-3 if dtype == types.FLOAT else 1
        check_batch(tiled, whole, batch_size, 1e-4, max_err, compare_layouts=False)