Exemplo n.º 1
0
def sparse_warp(mel_spectrogram, time_warping_para=80):
    """Time-warping step of 'SpecAugment' for a spectrogram tensor.

    A column of control points is placed at one random position on axis 1
    and every point of that column is shifted along axis 1 by the same
    random integer distance. The image is then deformed with
    `sparse_image_warp`. No frequency or time masking is applied here.

    # Arguments:
      mel_spectrogram(tensor): spectrogram to warp. Axis 1 is used as the
        time axis and axis 2 as the frequency axis (presumably shaped
        (batch, time, freq, channels) -- confirm against the caller).
      time_warping_para(int): Augmentation parameter, "time warp parameter W".
        Bounds both the position of the control column (between W and
        n - W) and the shift distance (between -W and W). Default = 80.

    # Returns
      tensor: the first output of `sparse_image_warp` (the warped image).
    """
    dims = tf.shape(mel_spectrogram)
    num_frames = dims[1]
    num_bins = dims[2]

    # Random position on the time axis where the control column sits.
    # (Sampled before the shift distance, same as the op order below.)
    anchor = tf.random_uniform(
        [], time_warping_para, num_frames - time_warping_para, tf.int32)

    # Source control points: one per frequency index in [0, num_bins // 2),
    # all sharing the same time coordinate.
    freq_coords = tf.range(num_bins // 2)
    src_time_coords = tf.ones_like(freq_coords) * anchor
    src_points = tf.to_float(tf.stack((src_time_coords, freq_coords), -1))

    # Destination control points: same frequencies, time shifted by `shift`.
    shift = tf.random_uniform(
        [], -time_warping_para, time_warping_para, tf.int32)
    dst_time_coords = src_time_coords + shift
    dst_points = tf.to_float(tf.stack((dst_time_coords, freq_coords), -1))

    # Add the leading batch dimension: (1, num_bins // 2, 2).
    src_batched = tf.expand_dims(src_points, 0)
    dst_batched = tf.expand_dims(dst_points, 0)

    warped, _ = sparse_image_warp(mel_spectrogram, src_batched, dst_batched)
    return warped
def time_warp(spec, W=3):
    """Warp a spectrogram along its time axis around one random control point.

    A single control point is placed on the horizontal centre line of the
    spectrogram (row ``num_rows // 2``) at a random time index, and is moved
    along that line by a random integer distance using ``sparse_image_warp``.

    Args:
        spec: tensor whose ``shape[1]`` is the number of rows and
            ``shape[2]`` the number of time steps (presumably
            ``(1, n_mels, frames)`` -- confirm against the caller).
        W: time-warp parameter. The control point's time index is drawn from
            ``range(W, spec_len - W)`` and the displacement from
            ``range(-W, W)``.

    Returns:
        The warped image returned by ``sparse_image_warp`` with dimension 3
        and then dimension 0 squeezed out.

    Raises:
        ValueError: if ``W`` is not positive or the spectrogram has too few
            time steps (``spec_len <= 2 * W``), i.e. a sampling range would
            be empty.
    """
    num_rows = spec.shape[1]
    spec_len = spec.shape[2]
    device = spec.device

    # random.randrange would raise ValueError on these inputs anyway; raise
    # it up front with a message that names the actual problem.
    if W <= 0 or spec_len <= 2 * W:
        raise ValueError(
            "time_warp needs W > 0 and spec_len > 2 * W "
            "(got W=%r, spec_len=%r)" % (W, spec_len))

    y = num_rows // 2

    # The control point is a *position* on the time axis. The previous code
    # indexed the spectrogram here and used the magnitude stored at that
    # position as the coordinate, which put the control point at an
    # arbitrary (often out-of-image) location.
    point_to_warp = random.randrange(W, spec_len - W)

    # Integer displacement in [-W, W - 1].
    dist_to_warp = random.randrange(-W, W)

    # Control points must be floating point for the spline interpolation;
    # follow the spectrogram's dtype when it is already a float type.
    pts_dtype = spec.dtype if spec.is_floating_point() else torch.float32
    src_pts = torch.tensor([[[y, point_to_warp]]],
                           dtype=pts_dtype, device=device)
    dest_pts = torch.tensor([[[y, point_to_warp + dist_to_warp]]],
                            dtype=pts_dtype, device=device)

    warped_spectro, _ = sparse_image_warp(spec, src_pts, dest_pts)
    return warped_spectro.squeeze(3).squeeze(0)
Exemplo n.º 3
0
def spec_augment(mel_spectrogram, time_warping_para, frequency_masking_para, time_masking_para, num_mask):
    """Spec augmentation Calculation Function.

    'SpecAugment' has 3 steps for audio data augmentation.
    First step is an image warp using `sparse_image_warp`, second step is
    frequency masking, last step is time masking.

    # Arguments:
      mel_spectrogram(numpy array): 2-D mel-spectrogram; axis 0 is treated as
        frequency (v bins) and axis 1 as time (tau frames).
      time_warping_para(int): Augmentation parameter, "time warp parameter W".
        Accepted for interface compatibility but currently unused: the warp
        step still uses fixed control points (see Step 1 below).
      frequency_masking_para(float): Augmentation parameter, "frequency mask
        parameter F". Each frequency mask width is int(uniform(0, F)),
        limited to the number of frequency bins.
      time_masking_para(float): Augmentation parameter, "time mask
        parameter T". Each time mask width is int(uniform(0, T)), limited to
        the number of time frames.
      num_mask(int): number of (frequency mask, time mask) pairs to apply.

    # Returns
      mel_spectrogram(numpy array): warped and masked mel spectrogram with
        the same 2-D shape as the input.
    """
    v = mel_spectrogram.shape[0]    # number of frequency bins
    tau = mel_spectrogram.shape[1]  # number of time frames

    # mel spectrogram reshaped to (batch, height, width, channels) for
    # sparse_image_warp.
    image = np.float32(mel_spectrogram.reshape([1, v, tau, 1]))

    # Step 1 : Time warping (TO DO)
    # The warp is not randomised yet: two fixed control points, (64, 64) and
    # (64, 80), are each displaced by +1 in both coordinates.
    # NOTE(review): these points lie outside spectrograms smaller than
    # 65 x 81 -- behaviour of sparse_image_warp in that case is not verified.
    #
    # Ops are built in a throwaway graph so that repeated calls do not keep
    # adding constants (including a full copy of every spectrogram) to the
    # process-wide default graph.
    with tf.Graph().as_default():
        control_point_locations = constant_op.constant(
            np.float32(np.expand_dims(np.asarray([[64, 64], [64, 80]]), 0)))
        control_point_displacements = constant_op.constant(
            np.ones(control_point_locations.shape.as_list(), dtype=np.float32))

        warped_mel_spectrogram_op, _ = sparse_image_warp(
            constant_op.constant(image),
            source_control_point_locations=control_point_locations,
            dest_control_point_locations=control_point_locations + control_point_displacements,
            interpolation_order=2,
            regularization_weight=0,
            num_boundary_points=0)

        # Change data type of warp result to numpy array for masking step
        with tf.Session() as sess:
            warped_mel_spectrogram = sess.run(warped_mel_spectrogram_op)

    warped_mel_spectrogram = warped_mel_spectrogram.reshape(
        [warped_mel_spectrogram.shape[1], warped_mel_spectrogram.shape[2]])

    # loop Masking line number
    for _ in range(num_mask):
        # Step 2 : Frequency masking
        # Width is clamped to v so the start index range is never negative.
        f = min(int(np.random.uniform(low=0.0, high=frequency_masking_para)), v)
        f0 = random.randint(0, v - f)
        warped_mel_spectrogram[f0:f0 + f, :] = 0

        # Step 3 : Time masking
        t = min(int(np.random.uniform(low=0.0, high=time_masking_para)), tau)
        t0 = random.randint(0, tau - t)
        warped_mel_spectrogram[:, t0:t0 + t] = 0

    return warped_mel_spectrogram
Exemplo n.º 4
0
def spec_augment(mel_spectrogram,
                 time_warping_para=80,
                 frequency_masking_para=27,
                 time_masking_para=100,
                 frequency_mask_num=1,
                 time_mask_num=1):
    """Spec augmentation Calculation Function.
    'SpecAugment' has 3 steps for audio data augmentation.
    First step is time warping using `sparse_image_warp`, second step is
    frequency masking, last step is time masking.
    # Arguments:
      mel_spectrogram(numpy array): 2-D mel-spectrogram; axis 0 is frequency
        (v bins) and axis 1 is time (tau frames). The input is not modified.
      time_warping_para(float): Augmentation parameter, "time warp parameter W".
        If none, default = 80 for LibriSpeech. Warping is skipped when the
        spectrogram is too short to place a control point (tau <= 2 * W).
      frequency_masking_para(float): Augmentation parameter, "frequency mask parameter F"
        If none, default = 27 for LibriSpeech.
      time_masking_para(float): Augmentation parameter, "time mask parameter T"
        If none, default = 100 for LibriSpeech.
      frequency_mask_num(int): number of frequency masking lines, "m_F".
        If none, default = 1 for LibriSpeech.
      time_mask_num(int): number of time masking lines, "m_T".
        If none, default = 1 for LibriSpeech.
    # Returns
      mel_spectrogram(numpy array): warped and masked mel spectrogram
        (float32, same 2-D shape as the input).
    """
    v = mel_spectrogram.shape[0]
    tau = mel_spectrogram.shape[1]

    # Step 1 : Time warping
    if tau > 2 * time_warping_para:
        # One control point on the centre frequency row, at a random time
        # index in [W, tau - W), moved forward in time by w in [0, W).
        center_position = v / 2
        random_point = np.random.randint(low=time_warping_para,
                                         high=tau - time_warping_para)
        w = np.random.uniform(low=0, high=time_warping_para)

        control_point_locations = np.float32(
            [[[center_position, random_point]]])
        control_point_destination = np.float32(
            [[[center_position, random_point + w]]])

        # (batch, height, width, channels) layout for sparse_image_warp.
        image = np.float32(mel_spectrogram.reshape([1, v, tau, 1]))

        # Build the ops in a throwaway graph: creating new placeholders and
        # warp ops in the default graph on every call makes that graph grow
        # without bound when this function is used per training example.
        with tf.Graph().as_default():
            mel_spectrogram_holder = tf.placeholder(tf.float32,
                                                    shape=[1, v, tau, 1])
            location_holder = tf.placeholder(tf.float32, shape=[1, 1, 2])
            destination_holder = tf.placeholder(tf.float32, shape=[1, 1, 2])

            warped_mel_spectrogram_op, _ = sparse_image_warp(
                mel_spectrogram_holder,
                source_control_point_locations=location_holder,
                dest_control_point_locations=destination_holder,
                interpolation_order=2,
                regularization_weight=0,
                num_boundary_points=1)

            feed_dict = {
                mel_spectrogram_holder: image,
                location_holder: control_point_locations,
                destination_holder: control_point_destination
            }

            # Change warp result's data type to numpy array for masking step.
            with tf.Session() as sess:
                warped_mel_spectrogram = sess.run(warped_mel_spectrogram_op,
                                                  feed_dict=feed_dict)

        warped_mel_spectrogram = warped_mel_spectrogram.reshape(
            [warped_mel_spectrogram.shape[1], warped_mel_spectrogram.shape[2]])
    else:
        # Too few frames for the requested W: np.random.randint would raise
        # on the empty range, so skip warping and mask a float32 copy.
        warped_mel_spectrogram = np.array(mel_spectrogram, dtype=np.float32)

    # Step 2 : Frequency masking
    for _ in range(frequency_mask_num):
        # Width is clamped to v so the start index range is never negative.
        f = min(int(np.random.uniform(low=0.0, high=frequency_masking_para)), v)
        f0 = random.randint(0, v - f)
        warped_mel_spectrogram[f0:f0 + f, :] = 0

    # Step 3 : Time masking
    for _ in range(time_mask_num):
        t = min(int(np.random.uniform(low=0.0, high=time_masking_para)), tau)
        t0 = random.randint(0, tau - t)
        warped_mel_spectrogram[:, t0:t0 + t] = 0

    return warped_mel_spectrogram