def sparse_warp(mel_spectrogram, time_warping_para=80):
    """Apply SpecAugment's time-warping step to a mel spectrogram tensor.

    'SpecAugment' has 3 steps for audio data augmentation; this function
    performs only the first one, time warping, using TensorFlow's
    sparse_image_warp. Frequency and time masking are handled elsewhere.

    # Arguments:
      mel_spectrogram(tensor): batched mel spectrogram to warp,
        assumed shape (1, time, freq, 1) — TODO confirm against callers.
      time_warping_para(float): Augmentation parameter, "time warp
        parameter W". If none, default = 80 for LibriSpeech.

    # Returns
      tensor: the time-warped mel spectrogram.
    """
    shape = tf.shape(mel_spectrogram)
    n, v = shape[1], shape[2]

    # Step 1 : Time warping
    # Pick a random anchor along the time axis, at least W away from both edges.
    pt = tf.random_uniform([], time_warping_para, n - time_warping_para,
                           tf.int32)
    freq_anchors = tf.range(v // 2)  # control points on the freq axis
    # Every control point shares the same (random) time coordinate.
    time_anchors = tf.ones_like(freq_anchors) * pt
    src_pts = tf.to_float(tf.stack((time_anchors, freq_anchors), -1))

    # Move each control point by a common random distance w along time.
    w = tf.random_uniform([], -time_warping_para, time_warping_para, tf.int32)
    dest_pts = tf.to_float(tf.stack((time_anchors + w, freq_anchors), -1))

    # sparse_image_warp expects control points with a leading batch dim:
    # (1, v // 2, 2).
    warped_image, _ = sparse_image_warp(mel_spectrogram,
                                        tf.expand_dims(src_pts, 0),
                                        tf.expand_dims(dest_pts, 0))
    return warped_image
def time_warp(spec, W=3):
    """Warp *spec* along the time axis by up to ``W`` steps (SpecAugment step 1).

    # Arguments:
      spec(torch.Tensor): batched spectrogram, shape (1, num_rows, spec_len)
        — assumed from the indexing below; TODO confirm against callers.
      W(int): time warp parameter. The warp anchor is drawn from
        [W, spec_len - W) and displaced by a distance drawn from [-W, W).

    # Returns
      torch.Tensor: warped spectrogram with the trailing channel and
      leading batch dimensions squeezed out.
    """
    num_rows = spec.shape[1]
    spec_len = spec.shape[2]
    device = spec.device

    # Anchor the single control point on the middle frequency row.
    y = num_rows // 2
    horizontal_line_at_ctr = spec[0][y]
    assert len(horizontal_line_at_ctr) == spec_len

    # BUG FIX: the control point must be the time *index* itself. The
    # original code indexed into horizontal_line_at_ctr, feeding a
    # spectrogram amplitude to sparse_image_warp as a coordinate.
    point_to_warp = random.randrange(W, spec_len - W)

    # Uniform distribution from (0,W) with chance to be up to W negative
    dist_to_warp = random.randrange(-W, W)
    # Coordinates are passed as floats since the flow field is continuous.
    src_pts = torch.tensor([[[y, point_to_warp]]],
                           dtype=torch.float, device=device)
    dest_pts = torch.tensor([[[y, point_to_warp + dist_to_warp]]],
                            dtype=torch.float, device=device)
    warped_spectro, dense_flows = sparse_image_warp(spec, src_pts, dest_pts)
    return warped_spectro.squeeze(3).squeeze(0)
def spec_augment(mel_spectrogram, time_warping_para, frequency_masking_para,
                 time_masking_para, num_mask):
    """Spec augmentation Calculation Function.

    'SpecAugment' have 3 steps for audio data augmentation. first step is
    time warping using Tensorflow's image_sparse_warp function. Second step
    is frequency masking, last step is time masking.

    # Arguments:
      mel_spectrogram(numpy array): Extracted mel-spectrogram,
        shape (freq bins v, time steps tau).
      time_warping_para(float): Augmentation parameter, "time warp
        parameter W". If none, default = 80
      frequency_masking_para(float): Augmentation parameter, "frequency
        mask parameter F". If none, default = 100
      time_masking_para(float): Augmentation parameter, "time mask
        parameter T". If none, default = 27
      num_mask(float): number of masking lines.

    # Returns
      mel_spectrogram(numpy array): warped and masked mel spectrogram.
    """
    # Step 1 : Time warping (TO DO: warp distance is still a fixed unit
    # displacement instead of a random draw from [0, time_warping_para]).
    # FIX: infer the number of mel channels from the input instead of the
    # previous hard-coded v = 128; also dropped the dead, unused
    # `w = random.randint(0, time_warping_para)`.
    v = mel_spectrogram.shape[0]
    tau = mel_spectrogram.shape[1]

    # Image warping control point setting (fixed points, see TO DO above).
    control_point_locations = np.asarray([[64, 64], [64, 80]])
    control_point_locations = constant_op.constant(
        np.float32(np.expand_dims(control_point_locations, 0)))

    control_point_displacements = np.ones(
        control_point_locations.shape.as_list())
    control_point_displacements = constant_op.constant(
        np.float32(control_point_displacements))

    # mel spectrogram data type convert to tensor constant for
    # sparse_image_warp (expects a batched single-channel image).
    mel_spectrogram = mel_spectrogram.reshape(
        [1, mel_spectrogram.shape[0], mel_spectrogram.shape[1], 1])
    mel_spectrogram_op = constant_op.constant(np.float32(mel_spectrogram))

    warped_mel_spectrogram_op, _ = sparse_image_warp(
        mel_spectrogram_op,
        source_control_point_locations=control_point_locations,
        dest_control_point_locations=(control_point_locations
                                      + control_point_displacements),
        interpolation_order=2,
        regularization_weight=0,
        num_boundary_points=0)

    # Change data type of warp result to numpy array for masking step.
    with tf.Session() as sess:
        warped_mel_spectrogram = sess.run(warped_mel_spectrogram_op)

    warped_mel_spectrogram = warped_mel_spectrogram.reshape(
        [warped_mel_spectrogram.shape[1], warped_mel_spectrogram.shape[2]])

    # loop Masking line number
    for i in range(num_mask):
        # Step 2 : Frequency masking — zero a random band of f channels.
        f = int(np.random.uniform(low=0.0, high=frequency_masking_para))
        f0 = random.randint(0, v - f)
        warped_mel_spectrogram[f0:f0 + f, :] = 0

        # Step 3 : Time masking — zero a random span of t frames.
        t = int(np.random.uniform(low=0.0, high=time_masking_para))
        t0 = random.randint(0, tau - t)
        warped_mel_spectrogram[:, t0:t0 + t] = 0

    return warped_mel_spectrogram
def spec_augment(mel_spectrogram, time_warping_para=80,
                 frequency_masking_para=27, time_masking_para=100,
                 frequency_mask_num=1, time_mask_num=1):
    """Spec augmentation Calculation Function.

    'SpecAugment' have 3 steps for audio data augmentation. first step is
    time warping using Tensorflow's image_sparse_warp function. Second step
    is frequency masking, last step is time masking.

    # Arguments:
      mel_spectrogram(numpy array): mel spectrogram you want to warp and
        mask, shape (freq bins v, time steps tau).
      time_warping_para(float): Augmentation parameter, "time warp
        parameter W". If none, default = 80 for LibriSpeech.
      frequency_masking_para(float): Augmentation parameter, "frequency
        mask parameter F". If none, default = 27 for LibriSpeech.
      time_masking_para(float): Augmentation parameter, "time mask
        parameter T". If none, default = 100 for LibriSpeech.
      frequency_mask_num(float): number of frequency masking lines, "m_F".
        If none, default = 1 for LibriSpeech.
      time_mask_num(float): number of time masking lines, "m_T".
        If none, default = 1 for LibriSpeech.

    # Returns
      mel_spectrogram(numpy array): warped and masked mel spectrogram.

    # Raises
      ValueError: if the spectrogram has fewer than 2 * time_warping_para
        time steps (the warp anchor cannot be placed).
    """
    v = mel_spectrogram.shape[0]
    tau = mel_spectrogram.shape[1]

    # Guard: the anchor is drawn from [W, tau - W), which is empty when
    # tau <= 2W. Raise the same exception type numpy would, but clearer.
    if tau <= 2 * time_warping_para:
        raise ValueError(
            'mel_spectrogram too short for time_warping_para=%d: need more '
            'than %d time steps, got %d'
            % (time_warping_para, 2 * time_warping_para, tau))

    # Step 1 : Time warping
    # Image warping control point setting.
    mel_spectrogram_holder = tf.placeholder(tf.float32, shape=[1, v, tau, 1])
    location_holder = tf.placeholder(tf.float32, shape=[1, 1, 2])
    destination_holder = tf.placeholder(tf.float32, shape=[1, 1, 2])

    # Single control point on the middle frequency row, at a random time
    # at least W away from both edges.
    center_position = v / 2
    random_point = np.random.randint(low=time_warping_para,
                                     high=tau - time_warping_para)
    # warping distance chose.
    w = np.random.uniform(low=0, high=time_warping_para)

    control_point_locations = np.float32(
        np.expand_dims([[center_position, random_point]], 0))
    control_point_destination = np.float32(
        np.expand_dims([[center_position, random_point + w]], 0))

    # mel spectrogram data type convert to tensor constant for
    # sparse_image_warp (batched, single channel).
    mel_spectrogram = np.float32(mel_spectrogram.reshape([1, v, tau, 1]))

    warped_mel_spectrogram_op, _ = sparse_image_warp(
        mel_spectrogram_holder,
        source_control_point_locations=location_holder,
        dest_control_point_locations=destination_holder,
        interpolation_order=2,
        regularization_weight=0,
        num_boundary_points=1)

    # Change warp result's data type to numpy array for masking step.
    feed_dict = {
        mel_spectrogram_holder: mel_spectrogram,
        location_holder: control_point_locations,
        destination_holder: control_point_destination,
    }
    with tf.Session() as sess:
        warped_mel_spectrogram = sess.run(warped_mel_spectrogram_op,
                                          feed_dict=feed_dict)

    warped_mel_spectrogram = warped_mel_spectrogram.reshape(
        [warped_mel_spectrogram.shape[1], warped_mel_spectrogram.shape[2]])

    # Step 2 : Frequency masking — zero a random band of f channels.
    for i in range(frequency_mask_num):
        f = int(np.random.uniform(low=0.0, high=frequency_masking_para))
        f0 = random.randint(0, v - f)
        warped_mel_spectrogram[f0:f0 + f, :] = 0

    # Step 3 : Time masking — zero a random span of t frames.
    for i in range(time_mask_num):
        t = int(np.random.uniform(low=0.0, high=time_masking_para))
        t0 = random.randint(0, tau - t)
        warped_mel_spectrogram[:, t0:t0 + t] = 0

    return warped_mel_spectrogram