Example #1
def run_decode(data_path, out_type):
    batch_size = 4
    pipe = Pipeline(batch_size=batch_size, num_threads=4, device_id=0)
    input, _ = fn.file_reader(file_root=data_path,
                              shard_id=0,
                              num_shards=1,
                              name="reader")
    decoded = fn.image_decoder(input, output_type=types.RGB)
    decoded_shape = fn.shapes(decoded)
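    # peek_image_shape reads the shape from the encoded image header
    # without decoding the pixel data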
    raw_shape = fn.peek_image_shape(input, type=out_type)
    pipe.set_outputs(decoded, decoded_shape, raw_shape)
    pipe.build()
    samples = 0
    length = pipe.reader_meta(name="reader")['epoch_size']
    while samples < length:
        samples += batch_size
        (images, decoded_shape, raw_shape) = pipe.run()
        for i in range(batch_size):
            # the requested color space may differ from the source image's,
            # so don't compare the channel dimension
            image = images.at(i)
            shape_type = dali_types_to_np(out_type)
            for d in range(len(image.shape) - 1):
                assert image.shape[d] == decoded_shape.at(i)[d], \
                    "{} vs {}".format(image.shape[d], decoded_shape.at(i)[d])
                assert image.shape[d] == raw_shape.at(i)[d], \
                    "{} vs {}".format(image.shape[d], raw_shape.at(i)[d])
                assert raw_shape.at(i)[d].dtype == shape_type, \
                    "{} vs {}".format(raw_shape.at(i)[d].dtype, shape_type)
Example #2
def _test_rrc(device, max_frames, layout, aspect_ratio_range, area_range,
              output_size, input_type, output_type):
    batch_size = 4
    pipe = dali.pipeline.Pipeline(batch_size, 4, 0)
    channel_dim = layout.find('C')
    value_range = type_range(test_utils.dali_type_to_np(input_type))
    if channel_dim == len(layout) - 1:
        channel_dim = -1
    input = fn.external_source(source=generator(batch_size, max_frames,
                                                channel_dim, input_type),
                               layout=layout)
    shape = fn.shapes(input)
    if device == "gpu":
        input = input.gpu()
    out = fn.random_resized_crop(input,
                                 random_aspect_ratio=aspect_ratio_range,
                                 random_area=area_range,
                                 size=output_size,
                                 interp_type=dali.types.INTERP_LINEAR,
                                 seed=12321,
                                 dtype=output_type)
    pipe.set_outputs(out, shape)
    pipe.build()
    for _ in range(3):
        outputs, input_shapes = pipe.run()
        if device == "gpu":
            outputs = outputs.as_cpu()
        assert outputs.layout() == layout
        for i in range(batch_size):
            out = outputs.at(i)
            input_shape = input_shapes.at(i).tolist()
            check_output(out, channel_dim, input_shape, aspect_ratio_range,
                         area_range, value_range)
Example #3
 def crop_fn(self, img, lbl):
     center = fn.segmentation.random_mask_pixel(lbl, foreground=fn.coin_flip(probability=self.oversampling))
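     # center the crop window on the sampled pixel, then clamp the anchor
     # so the window stays inside the label extent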
     crop_anchor = self.slice_fn(center, 1, self.dim) - self.crop_shape // 2
     adjusted_anchor = math.max(0, crop_anchor)
     max_anchor = self.slice_fn(fn.shapes(lbl), 1, self.dim) - self.crop_shape
     crop_anchor = math.min(adjusted_anchor, max_anchor)
     img = fn.slice(img.gpu(), crop_anchor, self.crop_shape, axis_names=self.axis_name, out_of_bounds_policy="pad")
     lbl = fn.slice(lbl.gpu(), crop_anchor, self.crop_shape, axis_names=self.axis_name, out_of_bounds_policy="pad")
     return img, lbl
Example #4
 def crop_fn(self, img, lbl):
     center = fn.segmentation.random_mask_pixel(lbl, foreground=fn.coin_flip(probability=self.oversampling, **self.aug_seed_kwargs),
                                                **self.aug_seed_kwargs)
     crop_anchor = self.slice_fn(center) - self.crop_shape // 2
     adjusted_anchor = math.max(0, crop_anchor)
     max_anchor = self.slice_fn(fn.shapes(lbl)) - self.crop_shape
     crop_anchor = math.min(adjusted_anchor, max_anchor)
     img = fn.slice(img, crop_anchor, self.crop_shape, axis_names="DHW", out_of_bounds_policy="pad")
     lbl = fn.slice(lbl, crop_anchor, self.crop_shape, axis_names="DHW", out_of_bounds_policy="pad")
     return img, lbl
Example #5
def audio_decoder_pipe(device):
    encoded, _ = fn.readers.file(files=names)
    audio0, sr0 = fn.decoders.audio(encoded, dtype=types.FLOAT)
    out_sr = 15000
    audio1, sr1 = fn.decoders.audio(encoded,
                                    dtype=types.FLOAT,
                                    sample_rate=out_sr)
    if device == 'gpu':
        audio0 = audio0.gpu()
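    # three equivalent ways to request the same resampling:
    # explicit in/out rates, a scale factor, or a target output length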
    audio2 = fn.experimental.audio_resample(audio0,
                                            in_rate=sr0,
                                            out_rate=out_sr)
    audio3 = fn.experimental.audio_resample(audio0, scale=out_sr / sr0)
    audio4 = fn.experimental.audio_resample(audio0,
                                            out_length=fn.shapes(audio1)[0])
    return audio1, audio2, audio3, audio4
Example #6
def create_dali_pipe(channel_first, seq_len, interp, dtype, w, h, batch_size=2):
    pipe = dali.pipeline.Pipeline(batch_size, 1, 0, 0)
    with pipe:
        layout = "FCHW" if channel_first else "FHWC"
        ext = fn.external_source(GetSequences(channel_first, seq_len, batch_size), layout=layout)
        resize_cpu_out = fn.resize(ext, resize_x=w, resize_y=h, interp_type=interp,
                                   dtype=dtype, save_attrs=True)
        resize_gpu_out = fn.resize(ext.gpu(), resize_x=w, resize_y=h, interp_type=interp,
                                   minibatch_size=4, dtype=dtype, save_attrs=True)
        dali_resized_cpu, size_cpu = resize_cpu_out
        dali_resized_gpu, size_gpu = resize_gpu_out
        # extract just HW part from the input shape
        ext_size = fn.slice(fn.cast(fn.shapes(ext), dtype=types.INT32),
                            2 if channel_first else 1, 2, axes=[0])
        pipe.set_outputs(dali_resized_cpu, dali_resized_gpu, ext_size, size_cpu, size_gpu)
    return pipe
Example #7
def create_dali_pipe(channel_first,
                     seq_len,
                     interp,
                     dtype,
                     w,
                     h,
                     batch_size=2):
    pipe = dali.pipeline.Pipeline(batch_size, 1, 0, 0)
    with pipe:
        layout = "FCHW" if channel_first else "FHWC"
        ext = fn.external_source(GetSequences(channel_first, seq_len,
                                              batch_size),
                                 layout=layout)
        resize_cpu_out = fn.resize(ext,
                                   resize_x=w,
                                   resize_y=h,
                                   interp_type=interp,
                                   dtype=dtype,
                                   save_attrs=True)
        resize_gpu_out = fn.resize(ext.gpu(),
                                   resize_x=w,
                                   resize_y=h,
                                   interp_type=interp,
                                   minibatch_size=4,
                                   dtype=dtype,
                                   save_attrs=True)
        dali_resized_cpu, size_cpu = resize_cpu_out
        dali_resized_gpu, size_gpu = resize_gpu_out
        # extract just HW part from the input shape
        shape_anchor = np.array([2 if channel_first else 1], dtype=np.float32)
        shape_shape = np.array([2], dtype=np.float32)
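        # anchor and shape are absolute indices here, so normalized_anchor
        # and normalized_shape are disabled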
        ext_size = fn.slice(fn.cast(fn.shapes(ext), dtype=types.INT32),
                            types.Constant(shape_anchor, device="cpu"),
                            types.Constant(shape_shape, device="cpu"),
                            normalized_anchor=False,
                            normalized_shape=False,
                            axes=[0])
        pipe.set_outputs(dali_resized_cpu, dali_resized_gpu, ext_size,
                         size_cpu, size_gpu)
    return pipe
Example #8
def check_pad_to_square(device='cpu', batch_size=3, ndim=2, num_iter=3):
    pipe = Pipeline(batch_size=batch_size,
                    num_threads=3,
                    device_id=0,
                    seed=1234)
    axes = (0, 1)
    with pipe:
        in_shape = fn.cast(fn.random.uniform(range=(10, 20), shape=(ndim, )),
                           dtype=types.INT32)
        in_data = fn.reshape(fn.random.uniform(range=(0., 1.), shape=in_shape),
                             layout="HW")
        shape = fn.shapes(in_data, dtype=types.INT32)
        h = fn.slice(shape, 0, 1, axes=[0])
        w = fn.slice(shape, 1, 1, axes=[0])
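        # the target square side is the longer of the two extents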
        side = math.max(h, w)
        if device == 'gpu':
            in_data = in_data.gpu()
        out_data = fn.pad(in_data,
                          axis_names="HW",
                          shape=fn.cat(side, side, axis=0))
        pipe.set_outputs(in_data, out_data)
    pipe.build()
    for _ in range(num_iter):
        outs = [
            out.as_cpu() if isinstance(out, TensorListGPU) else out
            for out in pipe.run()
        ]
        for i in range(batch_size):
            in_data, out_data = \
                [outs[out_idx].at(i) for out_idx in range(len(outs))]
            in_shape = in_data.shape
            max_side = max(in_shape)
            for s in out_data.shape:
                assert s == max_side
            np.testing.assert_equal(out_data[:in_shape[0], :in_shape[1]],
                                    in_data)
            np.testing.assert_equal(out_data[in_shape[0]:, :], 0)
            np.testing.assert_equal(out_data[:, in_shape[1]:], 0)
Example #9
def rnnt_train_pipe(files,
                    sample_rate,
                    pad_amount=0,
                    preemph_coeff=.97,
                    window_size=.02,
                    window_stride=.01,
                    window="hann",
                    nfeatures=64,
                    nfft=512,
                    frame_splicing_stack=1,
                    frame_splicing_subsample=1,
                    lowfreq=0.0,
                    highfreq=None,
                    normalize_type='per_feature',
                    speed_perturb=False,
                    silence_trim=False,
                    device='cpu'):
    assert normalize_type == 'per_feature' or normalize_type == 'all_features'
    norm_axes = [1] if normalize_type == 'per_feature' else [0, 1]
    win_len, win_hop = win_args(sample_rate, window_size, window_stride)
    window_fn = torch_windows.get(window, None)
    window_fn_arg = window_fn(
        win_len, periodic=False).numpy().tolist() if window_fn else None

    data, _ = fn.readers.file(files=files,
                              device="cpu",
                              random_shuffle=False,
                              shard_id=0,
                              num_shards=1)
    audio, _ = fn.decoders.audio(data, dtype=types.FLOAT, downmix=True)

    # splicing with subsampling doesn't work if audio_len is a GPU data node
    if device == 'gpu' and frame_splicing_subsample == 1:
        audio = audio.gpu()

    # Speed perturbation 0.85x - 1.15x
    if speed_perturb:
        target_sr_factor = fn.random.uniform(device="cpu",
                                             range=(1 / 1.15, 1 / 0.85))
        audio = fn.experimental.audio_resample(audio, scale=target_sr_factor)

    # Silence trimming
    if silence_trim:
        begin, length = fn.nonsilent_region(audio, cutoff_db=-80)
        audio = fn.slice(audio, begin, length, axes=[0])

    audio_shape = fn.shapes(audio, dtype=types.INT32)
    orig_audio_len = fn.slice(audio_shape, 0, 1, axes=(0, ))

    # If we couldn't move to GPU earlier, do it now
    if device == 'gpu' and frame_splicing_subsample > 1:
        audio = audio.gpu()

    if pad_amount > 0:
        audio_len = orig_audio_len + 2 * pad_amount
        padded_audio = dali_reflect_pad_graph(audio, orig_audio_len,
                                              pad_amount)
    else:
        audio_len = orig_audio_len
        padded_audio = audio

    # Preemphasis filter
    preemph_audio = fn.preemphasis_filter(padded_audio,
                                          preemph_coeff=preemph_coeff,
                                          border='zero')

    # Spectrogram
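    # one frame per hop, plus one for the initial window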
    spec_len = audio_len // win_hop + 1
    spec = fn.spectrogram(preemph_audio,
                          nfft=nfft,
                          window_fn=window_fn_arg,
                          window_length=win_len,
                          window_step=win_hop,
                          center_windows=True,
                          reflect_padding=True)
    # Mel spectrogram
    mel_spec = fn.mel_filter_bank(spec,
                                  sample_rate=sample_rate,
                                  nfilter=nfeatures,
                                  freq_low=lowfreq,
                                  freq_high=highfreq)

    # Log
    log_features = fn.to_decibels(mel_spec + 1e-20,
                                  multiplier=np.log(10),
                                  reference=1.0,
                                  cutoff_db=-80)

    # Frame splicing
    if frame_splicing_stack > 1 or frame_splicing_subsample > 1:
        log_features_spliced = dali_frame_splicing_graph(
            log_features,
            nfeatures,
            spec_len,
            stacking=frame_splicing_stack,
            subsampling=frame_splicing_subsample)
    else:
        log_features_spliced = log_features

    # Normalization
    if normalize_type:
        norm_log_features = fn.normalize(log_features_spliced,
                                         axes=norm_axes,
                                         device=device,
                                         epsilon=4e-5,
                                         ddof=1)
    else:
        norm_log_features = log_features_spliced

    return norm_log_features, log_features_spliced, log_features, mel_spec, spec, preemph_audio, padded_audio, audio
Example #10
def check_normal_distribution(device,
                              dtype,
                              shape=None,
                              use_shape_like_input=False,
                              variable_shape=False,
                              mean=0.0,
                              stddev=1.0,
                              variable_dist_params=False,
                              shape_gen_f=None,
                              niter=3,
                              batch_size=3,
                              device_id=0,
                              num_threads=3):
    pipe = Pipeline(batch_size=batch_size,
                    device_id=device_id,
                    num_threads=num_threads,
                    seed=123456)
    with pipe:
        shape_like_in = None
        shape_arg = None
        assert shape is None or shape_gen_f is None
        if variable_shape:
            if shape_gen_f is None:

                def shape_gen_f():
                    return random_shape(shape)

            if use_shape_like_input:
                shape_like_in = fn.external_source(
                    lambda: np.zeros(shape_gen_f()),
                    device=device,
                    batch=False)
                shape_out = fn.shapes(shape_like_in)
            else:
                shape_arg = fn.external_source(shape_gen_f, batch=False)
                shape_out = shape_arg
        else:
            if use_shape_like_input:
                shape_like_in = np.zeros(shape)
            else:
                shape_arg = shape
            # Can't make an empty list constant
            shape_out = types.Constant(
                shape if shape is not None and shape != () else (1,),
                dtype=types.INT32)

        mean_arg = None
        stddev_arg = None
        if variable_dist_params:
            mean_arg = fn.external_source(
                lambda: np.array(np.random.uniform(low=-100.0, high=100.0),
                                 dtype=np.float32),
                device='cpu',
                batch=False)
            stddev_arg = fn.external_source(
                lambda: np.array(np.random.uniform(low=1.0, high=100.0),
                                 dtype=np.float32),
                device='cpu',
                batch=False)
        else:
            mean_arg = mean
            stddev_arg = stddev
        inputs = [shape_like_in] if shape_like_in is not None else []
        out = fn.random.normal(*inputs,
                               device=device,
                               shape=shape_arg,
                               mean=mean_arg,
                               stddev=stddev_arg,
                               dtype=dtype)
        pipe.set_outputs(out, shape_out, mean_arg, stddev_arg)
    pipe.build()
    for i in range(niter):
        outputs = pipe.run()
        out, shapes, means, stddevs = tuple(
            o.as_cpu() if isinstance(o, TensorListGPU) else o for o in outputs)
        for sample_idx in range(batch_size):
            sample = np.array(out[sample_idx])
            if sample.shape == ():
                continue
            sample_shape = np.array(shapes[sample_idx])
            mean = np.array(means[sample_idx])
            stddev = np.array(stddevs[sample_idx])
            assert (sample.shape == sample_shape).all(), \
                f"{sample.shape} != {sample_shape}"

            data = sample.flatten()
            data_len = len(data)

            # Checking sanity of the data
            if data_len >= 100 and dtype in [types.FLOAT, types.FLOAT64]:
                # Empirical rule:
                # ~68% of the observations within one standard deviation
                # ~95% of the observations within two standard deviations
                # ~99.7% of the observations within three standard deviations
                within_1stddevs = np.where((data > (mean - 1 * stddev))
                                           & (data < (mean + 1 * stddev)))
                p1 = len(within_1stddevs[0]) / data_len
                within_2stddevs = np.where((data > (mean - 2 * stddev))
                                           & (data < (mean + 2 * stddev)))
                p2 = len(within_2stddevs[0]) / data_len
                within_3stddevs = np.where((data > (mean - 3 * stddev))
                                           & (data < (mean + 3 * stddev)))
                p3 = len(within_3stddevs[0]) / data_len
                assert p3 > 0.9, f"{p3}"  # leave some room
                assert p2 > 0.8, f"{p2}"  # leave some room
                assert p1 > 0.5, f"{p1}"  # leave some room

                # The Anderson-Darling test is not a perfectly rigorous check
                # here, but it makes do for this test: accept normality when
                # the statistic stays below the 5% critical value.
                statistic, critical_values, _ = st.anderson(data, dist='norm')
                assert statistic < critical_values[2]
Example #11
def build_pipes(device, dim, batch_size, channel_first, mode, interp, dtype,
                w_input, h_input, d_input, use_size_arg, use_size_input,
                use_roi):
    dali_pipe = Pipeline(batch_size=batch_size,
                         num_threads=8,
                         device_id=0,
                         seed=1234)
    with dali_pipe:
        if dim == 2:
            files, labels = dali.fn.readers.caffe(path=db_2d_folder,
                                                  random_shuffle=True)
            images_cpu = dali.fn.decoders.image(files, device="cpu")
        else:
            images_cpu = dali.fn.external_source(
                source=random_3d_loader(batch_size), layout="DHWC")

        images_hwc = images_cpu if device == "cpu" else images_cpu.gpu()

        if channel_first:
            images = dali.fn.transpose(
                images_hwc,
                perm=[3, 0, 1, 2] if dim == 3 else [2, 0, 1],
                transpose_layout=True)
        else:
            images = images_hwc

        roi_start = None
        roi_end = None
        w = None
        h = None
        d = None
        size = None

        minibatch_size = 2 if dim == 3 else 8

        if use_roi:
            # Calculate absolute RoI
            in_size = fn.slice(fn.shapes(images_cpu),
                               types.Constant(0,
                                              dtype=types.FLOAT,
                                              device="cpu"),
                               types.Constant(dim,
                                              dtype=types.FLOAT,
                                              device="cpu"),
                               axes=[0],
                               normalized_shape=False)
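            # random RoI: starts within the first 40% and ends within the
            # last 40% of each dimension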
            roi_start = fn.random.uniform(range=(0, 0.4), shape=[dim]) * in_size
            roi_end = fn.random.uniform(range=(0.6, 1.0), shape=[dim]) * in_size

        size_range = (10, 200) if dim == 3 else (10, 1000)

        if use_size_arg:
            if use_size_input:
                mask = fn.cast(fn.random.uniform(range=(0.8, 1.9),
                                                 shape=[dim]),
                               dtype=types.INT32)
                size = fn.random.uniform(range=size_range, shape=[dim]) * mask
            else:
                size = [300, 400] if dim == 2 else [80, 100, 120]

            resized = resize_dali(images,
                                  channel_first,
                                  dtype,
                                  interp,
                                  mode,
                                  size,
                                  None,
                                  None,
                                  None,
                                  roi_start,
                                  roi_end,
                                  minibatch_size=minibatch_size,
                                  max_size=max_size(dim))
        else:
            if w_input:
                has_w = fn.random.coin_flip(probability=0.8)
                w = fn.random.uniform(range=size_range) * has_w
            else:
                w = 320  # some fixed value

            if h_input:
                has_h = fn.random.coin_flip(probability=0.8)
                h = fn.random.uniform(range=size_range) * has_h
            else:
                h = 240  # some other fixed value

            if dim >= 3:
                if d_input:
                    has_d = fn.random.coin_flip(probability=0.8)
                    d = fn.random.uniform(range=size_range) * has_d
                else:
                    d = 31  # some other fixed value

            resized = resize_dali(images,
                                  channel_first,
                                  dtype,
                                  interp,
                                  mode,
                                  None,
                                  w,
                                  h,
                                  d,
                                  roi_start,
                                  roi_end,
                                  minibatch_size=minibatch_size,
                                  max_size=max_size(dim))

        outputs = [images, resized]
        if roi_start is not None and roi_end is not None:
            outputs += [roi_start, roi_end]

        for x in (d, h, w, size):
            if x is not None:
                if isinstance(x, _DataNode):
                    outputs.append(x)
                else:
                    outputs.append(
                        types.Constant(np.array(x, dtype=np.float32)))

        dali_pipe.set_outputs(*outputs)

    pil_pipe = Pipeline(batch_size=batch_size,
                        num_threads=8,
                        device_id=0,
                        exec_async=False,
                        exec_pipelined=False)
    with pil_pipe:
        images = fn.external_source(name="images",
                                    layout=layout_str(dim, channel_first))
        sizes = fn.external_source(name="size")
        roi_start = fn.external_source(name="roi_start")
        roi_end = fn.external_source(name="roi_end")
        resized = resize_PIL(dim, channel_first, dtype, interp, images, sizes,
                             roi_start, roi_end)
        resized = fn.reshape(resized, layout=layout_str(dim, channel_first))
        pil_pipe.set_outputs(resized)
    dali_pipe.build()
    pil_pipe.build()

    return dali_pipe, pil_pipe
Example #12
    def define_graph(self):
        inputs, bboxes, labels, polygons, vertices = fn.readers.coco(
            file_root=self.file_root,
            annotations_file=self.annotation_file,
            skip_empty=True,
            shard_id=self.share_id,
            num_shards=self.num_gpus,
            ratio=True,
            ltrb=True,
            polygon_masks=True,
            random_shuffle=self.random_shuffle,
            shuffle_after_epoch=self.shuffle_after_epoch,
            name="Reader")

        input_shape = fn.slice(fn.cast(fn.peek_image_shape(inputs),
                                       dtype=types.INT32),
                               0,
                               2,
                               axes=[0])
        h = fn.slice(input_shape, 0, 1, axes=[0], dtype=types.FLOAT)
        w = fn.slice(input_shape, 1, 1, axes=[0], dtype=types.FLOAT)
        short_side = math.min(w, h)
        scale = fn.random.uniform(range=[0.3, 1.])
        crop_side = fn.cast(math.ceil(scale * short_side), dtype=types.INT32)
        crop_shape = fn.cat(crop_side, crop_side)
        anchor_rel, shape_rel, bboxes, labels, bbox_indices = fn.random_bbox_crop(
            bboxes,
            labels,
            input_shape=input_shape,
            crop_shape=crop_shape,
            shape_layout="HW",
            thresholds=[0.],  # no minimum intersection-over-union, for demo purposes
            allow_no_crop=False,  # no-crop is disallowed, for demo purposes
            seed=-1,  # seed is derived from the pipeline's seed
            bbox_layout="xyXY",  # left, top, right, bottom
            output_bbox_indices=True,  # output indices of the filtered boxes
            total_num_attempts=1024,
        )
        polygons, vertices = fn.segmentation.select_masks(
            bbox_indices, polygons, vertices)
        images = fn.decoders.image_slice(inputs,
                                         anchor_rel,
                                         shape_rel,
                                         normalized_anchor=False,
                                         normalized_shape=False,
                                         device='mixed')
        images = fn.color_space_conversion(images,
                                           image_type=types.RGB,
                                           output_type=types.BGR)
        MT_1_vertices = fn.transforms.crop(to_start=(0.0, 0.0),
                                           to_end=fn.cat(w, h))
        MT_2_vertices = fn.transforms.crop(from_start=anchor_rel,
                                           from_end=(anchor_rel + shape_rel),
                                           to_start=(0.0, 0.0),
                                           to_end=(1., 1.))
        vertices = fn.coord_transform(fn.coord_transform(vertices,
                                                         MT=MT_1_vertices),
                                      MT=MT_2_vertices)
        box_like_shape = fn.cat(
            fn.slice(fn.shapes(bboxes, dtype=types.INT32), 0, 1, axes=[0]), -1)
        targets = fn.cat(bboxes,
                         fn.reshape(vertices, shape=box_like_shape),
                         axis=1)

        interp_methods = [
            types.INTERP_LINEAR, types.INTERP_CUBIC, types.INTERP_LANCZOS3,
            types.INTERP_GAUSSIAN, types.INTERP_NN, types.INTERP_TRIANGULAR
        ]
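        # pick a random interpolation method per sample; the integer is
        # reinterpreted as the INTERP_TYPE enum below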
        interp_method = fn.random.uniform(
            values=[int(x) for x in interp_methods], dtype=types.INT32)
        interp_method = fn.reinterpret(interp_method, dtype=types.INTERP_TYPE)
        images = fn.resize(images,
                           dtype=types.FLOAT,
                           size=self.input_dim,
                           interp_type=interp_method)

        labels = labels.gpu()
        targets = targets.gpu()
        return (images, targets, labels)
Example #13
def dali_asr_pipeline(train_pipeline,  # True if training, False if validation
                      file_root,
                      file_list,
                      sample_rate,
                      silence_threshold,
                      resample_range,
                      discrete_resample_range,
                      window_size,
                      window_stride,
                      nfeatures,
                      nfft,
                      frame_splicing_factor,
                      dither_coeff,
                      pad_align,
                      preemph_coeff,
                      do_spectrogram_masking=False,
                      cutouts_generator=None,
                      shard_id=0,
                      n_shards=1,
                      preprocessing_device="gpu"):
    do_remove_silence = silence_threshold is not None

    def _div_ceil(dividend, divisor):
        return (dividend + (divisor - 1)) // divisor

    encoded, label = fn.readers.file(
        device="cpu", name="file_reader", file_root=file_root,
        file_list=file_list, shard_id=shard_id, num_shards=n_shards,
        shuffle_after_epoch=train_pipeline)

    speed_perturbation_coeffs = None
    if resample_range is not None:
        if discrete_resample_range:
            values = [resample_range[0], 1.0, resample_range[1]]
            speed_perturbation_coeffs = fn.random.uniform(device="cpu",
                                                          values=values)
        else:
            speed_perturbation_coeffs = fn.random.uniform(device="cpu",
                                                          range=resample_range)

    if train_pipeline and speed_perturbation_coeffs is not None:
        dec_sample_rate_arg = speed_perturbation_coeffs * sample_rate
    elif resample_range is None:
        dec_sample_rate_arg = sample_rate
    else:
        dec_sample_rate_arg = None

    audio, _ = fn.decoders.audio(encoded, sample_rate=dec_sample_rate_arg,
                                 dtype=types.FLOAT, downmix=True)
    if do_remove_silence:
        begin, length = fn.nonsilent_region(audio, cutoff_db=silence_threshold)
        audio = fn.slice(audio, begin, length, axes=[0])

    # Max duration drop is performed at DataLayer stage

    if preprocessing_device == "gpu":
        audio = audio.gpu()

    if dither_coeff != 0.:
        audio = audio + fn.random.normal(audio) * dither_coeff

    audio = fn.preemphasis_filter(audio, preemph_coeff=preemph_coeff)

    spec = fn.spectrogram(audio, nfft=nfft,
                          window_length=window_size * sample_rate,
                          window_step=window_stride * sample_rate)

    mel_spec = fn.mel_filter_bank(spec, sample_rate=sample_rate,
                                  nfilter=nfeatures, normalize=True)

    log_features = fn.to_decibels(mel_spec, multiplier=np.log(10),
                                  reference=1.0, cutoff_db=math.log(1e-20))

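    # length metadata from the feature shape, adjusted below when frame
    # splicing shortens the sequence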
    log_features_len = fn.shapes(log_features)
    if frame_splicing_factor != 1:
        log_features_len = _div_ceil(log_features_len, frame_splicing_factor)

    log_features = fn.normalize(log_features, axes=[1])
    log_features = fn.pad(log_features, axes=[1], fill_value=0, align=pad_align)

    if train_pipeline and do_spectrogram_masking:
        anchors, shapes = fn.external_source(source=cutouts_generator,
                                             num_outputs=2, cycle=True)
        log_features = fn.erase(log_features, anchor=anchors, shape=shapes,
                                axes=[0, 1], fill_value=0,
                                normalized_anchor=True)

    # When modifying DALI pipeline returns, make sure you update `output_map`
    # in DALIGenericIterator invocation
    return log_features.gpu(), label.gpu(), log_features_len.gpu()
Example #14
def rnnt_train_pipe(files,
                    sample_rate,
                    pad_amount=0,
                    preemph_coeff=.97,
                    window_size=.02,
                    window_stride=.01,
                    window="hann",
                    nfeatures=64,
                    nfft=512,
                    frame_splicing_stack=1,
                    frame_splicing_subsample=1,
                    lowfreq=0.0,
                    highfreq=None,
                    normalize_type='per_feature',
                    device='cpu'):
    assert normalize_type == 'per_feature' or normalize_type == 'all_features'
    norm_axes = [1] if normalize_type == 'per_feature' else [0, 1]
    win_len, win_hop = win_args(sample_rate, window_size, window_stride)
    window_fn = torch_windows.get(window, None)
    window_fn_arg = window_fn(
        win_len, periodic=False).numpy().tolist() if window_fn else None

    data, _ = fn.readers.file(files=files,
                              device="cpu",
                              random_shuffle=False,
                              shard_id=0,
                              num_shards=1)
    audio, _ = fn.decoders.audio(data, dtype=types.FLOAT, downmix=True)

    audio_shape = fn.shapes(audio, dtype=types.INT32)
    orig_audio_len = fn.slice(audio_shape, 0, 1, axes=(0, ))

    if pad_amount > 0:
        audio_len = orig_audio_len + 2 * pad_amount
    else:
        audio_len = orig_audio_len

    spec_len = audio_len // win_hop + 1

    if device == 'gpu':
        audio = audio.gpu()

    if pad_amount > 0:
        padded_audio = dali_reflect_pad_graph(audio, orig_audio_len,
                                              pad_amount)
    else:
        padded_audio = audio

    preemph_audio = fn.preemphasis_filter(padded_audio,
                                          preemph_coeff=preemph_coeff,
                                          border='zero')
    spec = fn.spectrogram(preemph_audio,
                          nfft=nfft,
                          window_fn=window_fn_arg,
                          window_length=win_len,
                          window_step=win_hop,
                          center_windows=True,
                          reflect_padding=True)
    mel_spec = fn.mel_filter_bank(spec,
                                  sample_rate=sample_rate,
                                  nfilter=nfeatures,
                                  freq_low=lowfreq,
                                  freq_high=highfreq)
    log_features = fn.to_decibels(mel_spec + 1e-20,
                                  multiplier=np.log(10),
                                  reference=1.0,
                                  cutoff_db=-80)

    if frame_splicing_stack > 1 or frame_splicing_subsample > 1:
        log_features_spliced = dali_frame_splicing_graph(
            log_features,
            nfeatures,
            spec_len,
            stacking=frame_splicing_stack,
            subsampling=frame_splicing_subsample)
    else:
        log_features_spliced = log_features

    if normalize_type:
        norm_log_features = fn.normalize(log_features_spliced,
                                         axes=norm_axes,
                                         device=device,
                                         epsilon=4e-5,
                                         ddof=1)
    else:
        norm_log_features = log_features_spliced

    return norm_log_features, log_features_spliced, log_features, mel_spec, spec, preemph_audio, padded_audio, audio
Example #15
        def dali_jasper_pipe():
            if is_triton_pipeline:
                assert not self.train, "Pipeline for Triton shall be a validation pipeline"
                if torch.distributed.is_initialized():
                raise RuntimeError(
                    "You're creating a Triton pipeline in multi-process mode; "
                    "please use single-process mode.")
                encoded, label = fn.external_source(device="cpu",
                                                    name="DALI_INPUT_0",
                                                    no_copy=True)
            else:
                encoded, label = fn.readers.file(
                    device="cpu",
                    name="file_reader",
                    file_root=file_root,
                    file_list=file_list,
                    shard_id=shard_id,
                    num_shards=n_shards,
                    shuffle_after_epoch=train_pipeline)

            speed_perturbation_coeffs = None
            if resample_range is not None:
                if discrete_resample_range:
                    values = [
                        self.resample_range[0], 1.0, self.resample_range[1]
                    ]
                    speed_perturbation_coeffs = fn.random.uniform(
                        device="cpu", values=values)
                else:
                    speed_perturbation_coeffs = fn.random.uniform(
                        device="cpu", range=resample_range)

            if self.train and speed_perturbation_coeffs is not None:
                dec_sample_rate_arg = speed_perturbation_coeffs * self.sample_rate
            elif resample_range is None:
                dec_sample_rate_arg = self.sample_rate
            else:
                dec_sample_rate_arg = None

            audio, _ = fn.decoders.audio(encoded,
                                         sample_rate=dec_sample_rate_arg,
                                         dtype=types.FLOAT,
                                         downmix=True)

            if self.do_remove_silence:
                begin, length = fn.nonsilent_region(
                    audio, cutoff_db=silence_threshold)
                audio = fn.slice(audio, begin, length, axes=[0])

            # Max duration drop is performed at DataLayer stage

            if self.preprocessing_device == "gpu":
                audio = audio.gpu()

            if self.dither_coeff != 0.:
                audio = audio + fn.random.normal(
                    device=preprocessing_device) * self.dither_coeff

            audio = fn.preemphasis_filter(audio, preemph_coeff=preemph_coeff)

            spec = fn.spectrogram(audio,
                                  nfft=nfft,
                                  window_length=window_size * sample_rate,
                                  window_step=window_stride * sample_rate)

            mel_spec = fn.mel_filter_bank(spec,
                                          sample_rate=sample_rate,
                                          nfilter=self.nfeatures,
                                          normalize=True)

            log_features = fn.to_decibels(mel_spec,
                                          multiplier=np.log(10),
                                          reference=1.0,
                                          cutoff_db=math.log(1e-20))

            log_features_len = fn.shapes(log_features)
            if self.frame_splicing_factor != 1:
                log_features_len = self._div_ceil(log_features_len,
                                                  self.frame_splicing_factor)

            log_features = fn.normalize(log_features, axes=[1])
            log_features = fn.pad(log_features,
                                  axes=[1],
                                  fill_value=0,
                                  align=pad_align)

            if self.train and self._do_spectrogram_masking():
                anchors, shapes = fn.external_source(
                    source=self._cutouts_generator, num_outputs=2, cycle=True)
                log_features = fn.erase(log_features,
                                        anchor=anchors,
                                        shape=shapes,
                                        axes=[0, 1],
                                        fill_value=0,
                                        normalized_anchor=True)

            # When modifying DALI pipeline returns, make sure you update `output_map` in DALIGenericIterator invocation
            return log_features.gpu(), label.gpu(), log_features_len.gpu()