Exemplo n.º 1
0
def random_stretch_squeeze(inputs,
                           resample_offset,
                           seed=None):
  """Randomly stretches or squeezes audio data along the time dimension.

  Every example in the batch is resized in time by its own random factor and
  then cropped or padded back to the original length, so the output has the
  same shape as the input.

  It can be useful for augmenting training data with random stretches and
  squeezes in time dim, making a model more robust to input audio sampling
  frequency and human speech frequency.

  Args:
    inputs: input tensor [batch_size, time]; batch_size and time are read
      from the static shape.
      NOTE(review): the per-example results are written into a TensorArray of
      inputs.dtype, so inputs are presumably float32 - confirm with callers.
    resample_offset: defines the stretch/squeeze range; the factor is drawn
      uniformly between 1 - resample_offset and 1 + resample_offset
    seed: random seed
  Returns:
    stretched or squeezed tensor [batch_size, time], same shape as inputs
  Raises:
    ValueError: if inputs.shape.rank != 2 (including an unknown rank)
  """
  rank = inputs.shape.rank
  if rank != 2:
    # %s rather than %d: an unknown rank is None, and %d would turn the
    # documented ValueError into a TypeError raised by string formatting.
    raise ValueError('inputs.shape.rank:%s must be 2' % rank)

  inputs_shape = inputs.shape.as_list()
  batch_size = inputs_shape[0]
  sequence_length = inputs_shape[1]

  # tf.image ops below work on [height, width, channels] images, so every
  # example is viewed as a [time, 1, 1] image.
  image = tf.expand_dims(inputs, 2)  # feature
  image = tf.expand_dims(image, 3)  # channels

  resample = 1.0  # when it is equal to 1 - no stretching or squeezing
  time_stretch_squeeze = tf.random.uniform(
      shape=[batch_size],
      minval=resample - resample_offset,
      maxval=resample + resample_offset,
      dtype=tf.float32,
      seed=seed)

  # Loop invariant: dynamic time size as float, scaled per example below.
  time_size = tf.cast(tf.shape(inputs)[1], tf.float32)

  outputs = tf.TensorArray(inputs.dtype, 0, dynamic_size=True)
  for i in tf.range(batch_size):
    resized_length = tf.cast(time_size * time_stretch_squeeze[i], tf.int32)
    image_resized = tf.image.resize(
        images=image[i],
        size=(resized_length, 1),
        preserve_aspect_ratio=False)
    # Bring the resized example back to the original number of time steps.
    image_resized_cropped = tf.image.resize_with_crop_or_pad(
        image_resized,
        target_height=sequence_length,
        target_width=1,
    )

    outputs = outputs.write(i, image_resized_cropped)

  # Drop the dummy feature and channel dims: back to [batch, time].
  outputs = tf.squeeze(outputs.stack(), axis=[2, 3])
  outputs.set_shape(inputs_shape)
  return outputs
Exemplo n.º 2
0
def random_shift(inputs, time_shift, seed=None):
    """Shifts input data randomly in time dim.

    Every example in the batch is shifted by its own random number of
    samples. A positive amount pads the beginning and drops the tail; any
    other amount drops the head and pads the end. The output keeps the
    shape of the input.

    It can be useful for augmenting training data with random shifts in time
    dim for making model more robust to input audio shifts.

    Args:
      inputs: input tensor [batch_size, time]; batch_size and time are read
        from the static shape
      time_shift: defines time shift range in samples. Amounts are drawn as
        int32 with minval=-time_shift and maxval=time_shift; for integers
        tf.random.uniform excludes maxval, so the effective range is
        -time_shift...time_shift-1
      seed: random seed
    Returns:
      shifted tensor [batch_size, time], same shape as inputs
    Raises:
      ValueError: if inputs.shape.rank != 2 (including an unknown rank)
    """
    rank = inputs.shape.rank
    if rank != 2:
        # %s rather than %d: an unknown rank is None, and %d would turn the
        # documented ValueError into a TypeError raised by string formatting.
        raise ValueError('inputs.shape.rank:%s must be 2' % rank)

    inputs_shape = inputs.shape.as_list()
    batch_size = inputs_shape[0]
    sequence_length = inputs_shape[1]

    # below function will process 2D arrays, convert it to [batch, time, dummy]
    inputs = tf.expand_dims(inputs, 2)

    time_shift_amounts = tf.random.uniform(shape=[batch_size],
                                           minval=-time_shift,
                                           maxval=time_shift,
                                           dtype=tf.int32,
                                           seed=seed)

    outputs = tf.TensorArray(inputs.dtype, 0, dynamic_size=True)
    for i in tf.range(batch_size):
        time_shift_amount = time_shift_amounts[i]

        # Positive shift: pad in front and slice from 0.
        # Otherwise: pad at the end and slice from -time_shift_amount.
        # pylint: disable=cell-var-from-loop
        time_shift_padding = tf.cond(time_shift_amount > 0,
                                     lambda: [[time_shift_amount, 0], [0, 0]],
                                     lambda: [[0, -time_shift_amount], [0, 0]])
        time_shift_offset = tf.cond(time_shift_amount > 0, lambda: [0, 0],
                                    lambda: [-time_shift_amount, 0])
        # pylint: enable=cell-var-from-loop

        padded = tf.pad(tensor=inputs[i],
                        paddings=time_shift_padding,
                        mode='CONSTANT')
        # Keep exactly sequence_length samples so the length is unchanged.
        padded_sliced = tf.slice(padded, time_shift_offset,
                                 [sequence_length, -1])

        outputs = outputs.write(i, padded_sliced)

    # convert it back to [batch, time]
    outputs = tf.squeeze(outputs.stack(), axis=[2])
    outputs.set_shape(inputs_shape)
    return outputs
Exemplo n.º 3
0
def random_cutout(
    inputs,
    mask_size,
    mask_value=0,
    seed=None,
    data_format='channels_last',
):
  """Applies cutout (https://arxiv.org/abs/1708.04552) to inputs.

  For every example in the batch a random center is drawn in the
  (time, feature) plane and a rectangle around it, clipped to the input
  bounds, is replaced by mask_value across all channels.

  It is based on addons/tensorflow_addons/image/cutout_ops.py
  kept here for backward compatibility

  Args:
    inputs: input tensor [batch_size, time, feature, channels] for
      'channels_last', otherwise [batch_size, channels, time, feature]
    mask_size: mask size (time, feature); a scalar is used for both
      dimensions. The rectangle extends mask_size // 2 on each side of the
      center.
    mask_value: mask will be filled with this value
    seed: random seed
    data_format: dimensions order; any value other than 'channels_last' is
      handled as channels-first
  Returns:
    masked image, same shape as inputs
  Raises:
    ValueError: if inputs.shape.rank != 4 (including an unknown rank)
  """

  rank = inputs.shape.rank
  if rank != 4:
    # %s rather than %d: an unknown rank is None, and %d would turn the
    # documented ValueError into a TypeError raised by string formatting.
    raise ValueError('inputs.shape.rank:%s must be 4' % rank)

  mask_size = tf.convert_to_tensor(mask_size)
  if tf.rank(mask_size) == 0:
    # Scalar mask size: use the same extent for time and feature.
    mask_size = tf.stack([mask_size, mask_size])

  if data_format == 'channels_last':
    time_size, feature_size = tf.shape(inputs)[1], tf.shape(inputs)[2]
  else:
    time_size, feature_size = tf.shape(inputs)[2], tf.shape(inputs)[3]

  batch_size = tf.shape(inputs)[0]
  origin_shape = inputs.shape

  # One random cutout center per example.
  cutout_center_time = tf.random.uniform(
      shape=[batch_size], minval=0, maxval=time_size, dtype=tf.int32, seed=seed
  )
  cutout_center_feature = tf.random.uniform(
      shape=[batch_size],
      minval=0,
      maxval=feature_size,
      dtype=tf.int32,
      seed=seed)

  # Half extents: the cutout reaches this far on each side of the center.
  mask_size = mask_size // 2

  # Sizes of the untouched borders around the cutout, clipped at 0 so a
  # cutout overlapping an edge is truncated to the input bounds.
  lower_pads = tf.maximum(0, cutout_center_time - mask_size[0])
  upper_pads = tf.maximum(0, time_size - cutout_center_time - mask_size[0])
  left_pads = tf.maximum(0, cutout_center_feature - mask_size[1])
  right_pads = tf.maximum(0,
                          feature_size - cutout_center_feature - mask_size[1])

  # Per-example [time, feature] size of the region to cut out.
  cutout_shape = tf.transpose(
      [
          time_size - (lower_pads + upper_pads),
          feature_size - (left_pads + right_pads),
      ],
      [1, 0],
  )
  masks = tf.TensorArray(inputs.dtype, 0, dynamic_size=True)
  for i in tf.range(tf.shape(cutout_shape)[0]):
    padding_dims = [
        [lower_pads[i], upper_pads[i]],
        [left_pads[i], right_pads[i]],
    ]
    # Zeros mark the cutout; the surrounding border is padded with ones.
    mask = tf.pad(
        tf.zeros(cutout_shape[i], dtype=inputs.dtype),
        padding_dims,
        constant_values=1,
    )
    masks = masks.write(i, mask)

  # Broadcast the [batch, time, feature] mask over the channel dimension.
  if data_format == 'channels_last':
    mask = tf.expand_dims(masks.stack(), -1)
    mask = tf.tile(mask, [1, 1, 1, tf.shape(inputs)[-1]])
  else:
    mask = tf.expand_dims(masks.stack(), 1)
    mask = tf.tile(mask, [1, tf.shape(inputs)[1], 1, 1])

  inputs = tf.where(
      tf.equal(mask, 0),
      tf.ones_like(inputs, dtype=inputs.dtype) * mask_value,
      inputs,
  )
  inputs.set_shape(origin_shape)
  return inputs