def samples_to_mfccs_orig(samples, sample_rate, train_phase=False):
    """Convert raw audio samples into MFCC features (legacy variant).

    Builds a power spectrogram, optionally applies flag-gated
    spectrogram-domain augmentations during training, then computes MFCCs.

    Args:
        samples: audio samples tensor accepted by contrib_audio.audio_spectrogram.
        sample_rate: sample rate passed through to contrib_audio.mfcc.
        train_phase: when True, apply the enabled data augmentations.

    Returns:
        Tuple of (mfccs, n_frames): mfccs reshaped to [-1, Config.n_input]
        and the number of feature frames.
    """
    spec = contrib_audio.audio_spectrogram(
        samples,
        window_size=Config.audio_window_samples,
        stride=Config.audio_step_samples,
        magnitude_squared=True)

    if train_phase:
        # Each augmentation is independently gated by its command-line flag.
        if FLAGS.augmentation_spec_dropout_keeprate < 1:
            spec = augment_dropout(
                spec,
                keep_prob=FLAGS.augmentation_spec_dropout_keeprate)
        if FLAGS.augmentation_freq_and_time_masking:
            spec = augment_freq_time_mask(
                spec,
                frequency_masking_para=FLAGS.augmentation_freq_and_time_masking_freq_mask_range,
                time_masking_para=FLAGS.augmentation_freq_and_time_masking_time_mask_range,
                frequency_mask_num=FLAGS.augmentation_freq_and_time_masking_number_freq_masks,
                time_mask_num=FLAGS.augmentation_freq_and_time_masking_number_time_masks)
        if FLAGS.augmentation_pitch_and_tempo_scaling:
            spec = augment_pitch_and_tempo(
                spec,
                max_tempo=FLAGS.augmentation_pitch_and_tempo_scaling_max_tempo,
                max_pitch=FLAGS.augmentation_pitch_and_tempo_scaling_max_pitch,
                min_pitch=FLAGS.augmentation_pitch_and_tempo_scaling_min_pitch)
        if FLAGS.augmentation_speed_up_std > 0:
            spec = augment_speed_up(
                spec,
                speed_std=FLAGS.augmentation_speed_up_std)

    features = contrib_audio.mfcc(
        spec, sample_rate, dct_coefficient_count=Config.n_input)
    features = tf.reshape(features, [-1, Config.n_input])
    return features, tf.shape(input=features)[0]
def samples_to_mfccs(samples, sample_rate, train_phase=False, sample_id=None):
    """Convert raw audio samples into MFCC features.

    During training, emits a graph-level warning when the sample rate of the
    input does not match FLAGS.audio_sample_rate, then builds a power
    spectrogram, applies the flag-gated augmentations, and computes MFCCs
    capped at the Nyquist frequency of the configured sample rate.

    Args:
        samples: audio samples tensor accepted by contrib_audio.audio_spectrogram.
        sample_rate: tensor/scalar sample rate of this particular sample.
        train_phase: when True, check the sample rate and apply augmentations.
        sample_id: identifier included in the mismatch warning message.

    Returns:
        Tuple of (mfccs, n_frames): mfccs reshaped to [-1, Config.n_input]
        and the number of feature frames.
    """
    if train_phase:
        # We need the lambdas to make TensorFlow happy.
        # pylint: disable=unnecessary-lambda
        tf.cond(
            tf.math.not_equal(sample_rate, FLAGS.audio_sample_rate),
            lambda: tf.print('WARNING: sample rate of sample', sample_id,
                             '(', sample_rate, ') '
                             'does not match FLAGS.audio_sample_rate. This can lead to incorrect results.'),
            lambda: tf.no_op(),
            name='matching_sample_rate')

    spec = contrib_audio.audio_spectrogram(
        samples,
        window_size=Config.audio_window_samples,
        stride=Config.audio_step_samples,
        magnitude_squared=True)

    if train_phase:
        # Flag-gated spectrogram-domain augmentations.
        if FLAGS.augmentation_spec_dropout_keeprate < 1:
            spec = augment_dropout(
                spec,
                keep_prob=FLAGS.augmentation_spec_dropout_keeprate)
        # sparse warp must before freq/time masking
        if FLAGS.augmentation_sparse_warp:
            spec = augment_sparse_warp(
                spec,
                time_warping_para=FLAGS.augmentation_sparse_warp_time_warping_para,
                interpolation_order=FLAGS.augmentation_sparse_warp_interpolation_order,
                regularization_weight=FLAGS.augmentation_sparse_warp_regularization_weight,
                num_boundary_points=FLAGS.augmentation_sparse_warp_num_boundary_points,
                num_control_points=FLAGS.augmentation_sparse_warp_num_control_points)
        if FLAGS.augmentation_freq_and_time_masking:
            spec = augment_freq_time_mask(
                spec,
                frequency_masking_para=FLAGS.augmentation_freq_and_time_masking_freq_mask_range,
                time_masking_para=FLAGS.augmentation_freq_and_time_masking_time_mask_range,
                frequency_mask_num=FLAGS.augmentation_freq_and_time_masking_number_freq_masks,
                time_mask_num=FLAGS.augmentation_freq_and_time_masking_number_time_masks)
        if FLAGS.augmentation_pitch_and_tempo_scaling:
            spec = augment_pitch_and_tempo(
                spec,
                max_tempo=FLAGS.augmentation_pitch_and_tempo_scaling_max_tempo,
                max_pitch=FLAGS.augmentation_pitch_and_tempo_scaling_max_pitch,
                min_pitch=FLAGS.augmentation_pitch_and_tempo_scaling_min_pitch)
        if FLAGS.augmentation_speed_up_std > 0:
            spec = augment_speed_up(
                spec,
                speed_std=FLAGS.augmentation_speed_up_std)

    # Limit MFCC extraction to the Nyquist frequency of the configured rate.
    features = contrib_audio.mfcc(
        spectrogram=spec,
        sample_rate=sample_rate,
        dct_coefficient_count=Config.n_input,
        upper_frequency_limit=FLAGS.audio_sample_rate / 2)
    features = tf.reshape(features, [-1, Config.n_input])
    return features, tf.shape(input=features)[0]
# NOTE(review): this redefines samples_to_mfccs from earlier in the file
# (the variant taking sample_id); being later, this definition wins at
# import time. Confirm which variant is intended and remove the other.
def samples_to_mfccs(samples, sample_rate, train_phase=False):
    """Convert raw audio samples into MFCC features.

    Builds a power spectrogram, applies the flag-gated augmentations during
    training, and computes MFCCs capped at the Nyquist frequency of the
    configured sample rate.

    Args:
        samples: audio samples tensor accepted by contrib_audio.audio_spectrogram.
        sample_rate: sample rate passed through to contrib_audio.mfcc.
        train_phase: when True, apply the enabled data augmentations.

    Returns:
        Tuple of (mfccs, n_frames): mfccs reshaped to [-1, Config.n_input]
        and the number of feature frames.
    """
    spec = contrib_audio.audio_spectrogram(
        samples,
        window_size=Config.audio_window_samples,
        stride=Config.audio_step_samples,
        magnitude_squared=True)

    if train_phase:
        # Flag-gated spectrogram-domain augmentations.
        if FLAGS.augmentation_spec_dropout_keeprate < 1:
            spec = augment_dropout(
                spec,
                keep_prob=FLAGS.augmentation_spec_dropout_keeprate)
        # sparse warp must before freq/time masking
        if FLAGS.augmentation_sparse_warp:
            spec = augment_sparse_warp(
                spec,
                time_warping_para=FLAGS.augmentation_sparse_warp_time_warping_para,
                interpolation_order=FLAGS.augmentation_sparse_warp_interpolation_order,
                regularization_weight=FLAGS.augmentation_sparse_warp_regularization_weight,
                num_boundary_points=FLAGS.augmentation_sparse_warp_num_boundary_points,
                num_control_points=FLAGS.augmentation_sparse_warp_num_control_points)
        if FLAGS.augmentation_freq_and_time_masking:
            spec = augment_freq_time_mask(
                spec,
                frequency_masking_para=FLAGS.augmentation_freq_and_time_masking_freq_mask_range,
                time_masking_para=FLAGS.augmentation_freq_and_time_masking_time_mask_range,
                frequency_mask_num=FLAGS.augmentation_freq_and_time_masking_number_freq_masks,
                time_mask_num=FLAGS.augmentation_freq_and_time_masking_number_time_masks)
        if FLAGS.augmentation_pitch_and_tempo_scaling:
            spec = augment_pitch_and_tempo(
                spec,
                max_tempo=FLAGS.augmentation_pitch_and_tempo_scaling_max_tempo,
                max_pitch=FLAGS.augmentation_pitch_and_tempo_scaling_max_pitch,
                min_pitch=FLAGS.augmentation_pitch_and_tempo_scaling_min_pitch)
        if FLAGS.augmentation_speed_up_std > 0:
            spec = augment_speed_up(
                spec,
                speed_std=FLAGS.augmentation_speed_up_std)

    # Limit MFCC extraction to the Nyquist frequency of the configured rate.
    features = contrib_audio.mfcc(
        spectrogram=spec,
        sample_rate=sample_rate,
        dct_coefficient_count=Config.n_input,
        upper_frequency_limit=FLAGS.audio_sample_rate / 2)
    features = tf.reshape(features, [-1, Config.n_input])
    return features, tf.shape(input=features)[0]