def test_audio_dataset():
  """Test Audio Dataset"""
  with open(audio_path, 'rb') as f:
    wav_contents = f.read()
  audio_p = audio.decode_wav(wav_contents)
  with tf.compat.v1.Session() as sess:
    audio_v = sess.run(audio_p).audio

  # Convert raw int16 samples to floats in [-1.0, 1.0].
  f = lambda x: float(x) / (1 << 15)

  dataset = audio_io.WAVDataset([audio_path])
  iterator = dataset.make_initializable_iterator()
  init_op = iterator.initializer
  get_next = iterator.get_next()
  with tf.compat.v1.Session() as sess:
    sess.run(init_op)
    for i in range(audio_v.shape[0]):
      v = sess.run(get_next)
      assert audio_v[i] == f(v)
    with pytest.raises(errors.OutOfRangeError):
      sess.run(get_next)

  dataset = audio_io.WAVDataset([audio_path], batch=2)
  iterator = dataset.make_initializable_iterator()
  init_op = iterator.initializer
  get_next = iterator.get_next()
  with tf.compat.v1.Session() as sess:
    sess.run(init_op)
    for i in range(0, audio_v.shape[0], 2):
      v = sess.run(get_next)
      assert audio_v[i] == f(v[0])
      assert audio_v[i + 1] == f(v[1])
    with pytest.raises(errors.OutOfRangeError):
      sess.run(get_next)
def get_unprocessed_data(self, how_many, model_settings, mode):
  """Retrieve sample data for the given partition, with no transformations.

  Args:
    how_many: Desired number of samples to return. -1 means the entire
      contents of this partition.
    model_settings: Information about the current model being trained.
    mode: Which partition to use, must be 'training', 'validation',
      'testing' or 'pseudo'.

  Returns:
    List of sample data for the samples, and list of label strings.
  """
  candidates = self.data_index[mode]
  if how_many == -1:
    sample_count = len(candidates)
  else:
    sample_count = how_many
  desired_samples = model_settings['desired_samples']
  words_list = self.words_list
  data = np.zeros((sample_count, desired_samples))
  labels = []
  with tf.Session(graph=tf.Graph()) as sess:
    wav_filename_placeholder = tf.placeholder(tf.string, [], name='filename')
    wav_loader = io_ops.read_file(wav_filename_placeholder)
    wav_decoder = contrib_audio.decode_wav(
        wav_loader, desired_channels=1, desired_samples=desired_samples)
    foreground_volume_placeholder = tf.placeholder(
        tf.float32, [], name='foreground_volume')
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    foreground_volume_placeholder)
    for i in range(sample_count):
      if how_many == -1:
        sample_index = i
      else:
        sample_index = np.random.randint(len(candidates))
      sample = candidates[sample_index]
      input_dict = {wav_filename_placeholder: sample['file']}
      # Silence clips are loaded at zero volume; everything else at full volume.
      if sample['label'] == SILENCE_LABEL:
        input_dict[foreground_volume_placeholder] = 0
      else:
        input_dict[foreground_volume_placeholder] = 1
      data[i, :] = sess.run(scaled_foreground, feed_dict=input_dict).flatten()
      label_index = self.word_to_index[sample['label']]
      labels.append(words_list[label_index])
  return data, labels
def load_wav_file(filename):
  """Loads an audio file and returns a float PCM-encoded array of samples.

  Args:
    filename: Path to the .wav file to load.

  Returns:
    Numpy array holding the sample data as floats between -1.0 and 1.0.
  """
  with tf.Session(graph=tf.Graph()) as sess:
    wav_filename_placeholder = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(wav_filename_placeholder)
    wav_decoder = audio.decode_wav(wav_loader, desired_channels=1)
    return sess.run(
        wav_decoder,
        feed_dict={wav_filename_placeholder: filename}).audio.flatten()
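# A minimal usage sketch for load_wav_file above, assuming the same TF 1.x
# graph-mode imports (tf, io_ops, audio) are in scope; the wav path is an
# illustrative placeholder, not a file from the original code.
samples = load_wav_file('clip.wav')
print(samples.shape)                  # (num_samples,) mono samples
print(samples.min(), samples.max())   # values lie in [-1.0, 1.0]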
def __init__(
    self, desired_samples=16000, window_size_samples=480,
    window_stride_samples=160):
  self.wav_filename_placeholder = tf.placeholder(tf.string, [])
  wav_loader = io_ops.read_file(self.wav_filename_placeholder)
  # decode_wav already pads or crops to desired_samples.
  wav_decoder = contrib_audio.decode_wav(
      wav_loader, desired_channels=1, desired_samples=desired_samples)
  spectrogram = contrib_audio.audio_spectrogram(
      wav_decoder.audio,
      window_size=window_size_samples,
      stride=window_stride_samples,
      magnitude_squared=True)
  self.mfcc = contrib_audio.mfcc(
      spectrogram,
      wav_decoder.sample_rate,
      dct_coefficient_count=40)
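# A minimal usage sketch for the constructor above, assuming it belongs to a
# class named MfccGraph (hypothetical name, not from the original code) and a
# TF 1.x session; the wav path is an illustrative placeholder.
extractor = MfccGraph(desired_samples=16000,
                      window_size_samples=480,
                      window_stride_samples=160)
with tf.Session() as sess:
  mfcc_features = sess.run(
      extractor.mfcc,
      feed_dict={extractor.wav_filename_placeholder: 'clip.wav'})
  # Shape is [1, num_frames, 40]: one clip, 40 DCT coefficients per window.
  print(mfcc_features.shape)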
def prepare_background_data(self):
  """Searches a folder for background noise audio, and loads it into memory.

  It's expected that the background audio samples will be in a subdirectory
  named '_background_noise_' inside the 'data_dir' folder, as .wavs that match
  the sample rate of the training data, but can be much longer in duration.

  If the '_background_noise_' folder doesn't exist at all, this isn't an
  error, it's just taken to mean that no background noise augmentation should
  be used. If the folder does exist, but it's empty, that's treated as an
  error.

  Returns:
    List of raw PCM-encoded audio samples of background noise.

  Raises:
    Exception: If files aren't found in the folder.
  """
  self.background_data = []
  background_dir = os.path.join(self.data_dirs[0], BACKGROUND_NOISE_DIR_NAME)
  if not os.path.exists(background_dir):
    return self.background_data
  with tf.Session(graph=tf.Graph()) as sess:
    wav_filename_placeholder = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(wav_filename_placeholder)
    wav_decoder = contrib_audio.decode_wav(wav_loader, desired_channels=1)
    search_path = os.path.join(self.data_dirs[0], BACKGROUND_NOISE_DIR_NAME,
                               '*.wav')
    for wav_path in gfile.Glob(search_path):
      wav_data = sess.run(
          wav_decoder,
          feed_dict={wav_filename_placeholder: wav_path}).audio.flatten()
      self.background_data.append(wav_data)
    if not self.background_data:
      raise Exception('No background wav files were found in ' + search_path)
def song_vectors(song_dir):
  dir_name = '../tensors/'
  with open('song_vectors.txt', 'w') as f:
    f_index = 0
    for file_name in os.listdir(song_dir):
      raw_audio = io.read_file(os.path.join(song_dir, file_name))
      # decode_wav returns (audio, sample_rate); pad/crop to 100000 samples.
      song_vector, sample_rate = audio.decode_wav(
          raw_audio, desired_samples=100000)
      # Pickle the decoded audio and sample rate so they can be reloaded
      # later without re-decoding the wav.
      with open(dir_name + 'song_tensor' + str(f_index), 'wb') as song_pickle:
        pickle.dump(song_vector, song_pickle)
      with open(dir_name + 'rate_tensor' + str(f_index), 'wb') as rate_pickle:
        pickle.dump(sample_rate, rate_pickle)
      f.write(str(sample_rate) + ':')
      for tensor in song_vector:
        f.write(str(tensor))
      f.write('\n')
      f_index += 1
def get_next_batch(curr_batch, songs_per_batch, sess, verbose=False):
  wav_arr_ch1 = []
  wav_arr_ch2 = []
  # Wrap around to the start once we run off the end of the file list.
  if curr_batch >= len(file_arr):
    curr_batch = 0
  start_position = curr_batch * songs_per_batch
  end_position = start_position + songs_per_batch
  for idx in range(start_position, end_position):
    audio_binary = tf.io.read_file(file_arr[idx])
    wav_decoder = audio_ops.decode_wav(audio_binary, desired_channels=2)
    sample_rate, audio = sess.run(
        [wav_decoder.sample_rate, wav_decoder.audio])
    audio = np.array(audio)
    # Skip any clip that doesn't have exactly 5292000 samples (presumably
    # two minutes at 44.1 kHz) so every FFT in the batch has the same length.
    if len(audio[:, 0]) != 5292000:
      continue
    wav_arr_ch1.append(rfft(audio[:, 0]))
    wav_arr_ch2.append(rfft(audio[:, 1]))
    if verbose:
      print("Returning File: " + file_arr[idx])
  return wav_arr_ch1, wav_arr_ch2, sample_rate
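# A minimal sketch of the forward/inverse transform used above, assuming the
# bare rfft comes from numpy.fft (scipy.fftpack.rfft would also work but packs
# its output differently). It shows how a frequency-domain channel like the
# ones returned by get_next_batch can be turned back into time-domain samples.
import numpy as np

samples = np.random.uniform(-1.0, 1.0, size=5292000).astype(np.float32)
spectrum = np.fft.rfft(samples)                        # complex half-spectrum
reconstructed = np.fft.irfft(spectrum, n=len(samples))
print(np.allclose(samples, reconstructed, atol=1e-4))  # round-trip check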
def get_next_batch(curr_batch, songs_per_batch, sess):
  wav_arr = []
  # Wrap around to the start once we run off the end of the file list.
  if curr_batch >= len(file_arr):
    curr_batch = 0
  start_position = curr_batch * songs_per_batch
  end_position = start_position + songs_per_batch
  for idx in range(start_position, end_position):
    # Convert the idx-th source file into a temporary mono wav, decode it, and
    # only delete the temp file after the read op has actually run.
    os.system('bash mktrainwav.sh %d trainsample.wav' % idx)
    audio_binary = tf.read_file('trainsample.wav')
    wav_decoder = decode_wav(audio_binary, desired_channels=1)
    sample_rate, audio = sess.run([
        wav_decoder.sample_rate, wav_decoder.audio
    ])
    os.remove('trainsample.wav')
    audio = np.array(audio)
    # We want to ensure that every song we look at has the same number of
    # samples!
    if len(audio[:, 0]) != SAMPLES_CNT:
      continue
    wav_arr.append(rfft(audio[:, 0]))
    print("Returning File: " + file_arr[idx])
  return wav_arr, sample_rate
def prepare_processing_graph(self, model_settings, summaries_dir):
  """Builds a TensorFlow graph to apply the input distortions.

  Creates a graph that loads a WAVE file, decodes it, scales the volume,
  shifts it in time, adds in background noise, calculates a spectrogram, and
  then builds an MFCC fingerprint from that.

  This must be called with an active TensorFlow session running, and it
  creates multiple placeholder inputs, and one output:

    - wav_filename_placeholder_: Filename of the WAV to load.
    - foreground_volume_placeholder_: How loud the main clip should be.
    - time_shift_padding_placeholder_: Where to pad the clip.
    - time_shift_offset_placeholder_: How much to move the clip in time.
    - background_data_placeholder_: PCM sample data for background noise.
    - background_volume_placeholder_: Loudness of mixed-in background.
    - output_: Output 2D fingerprint of processed audio.

  Args:
    model_settings: Information about the current model being trained.
    summaries_dir: Path to save training summary information to.

  Raises:
    ValueError: If the preprocessing mode isn't recognized.
    Exception: If the preprocessor wasn't compiled in.
  """
  with tf.get_default_graph().name_scope('data'):
    desired_samples = model_settings['desired_samples']
    self.wav_filename_placeholder_ = tf.placeholder(
        tf.string, [], name='wav_filename')
    wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
    wav_decoder = audio.decode_wav(
        wav_loader, desired_channels=1, desired_samples=desired_samples)
    # Allow the audio sample's volume to be adjusted.
    self.foreground_volume_placeholder_ = tf.placeholder(
        tf.float32, [], name='foreground_volume')
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    self.foreground_volume_placeholder_)
    # Shift the sample's start position, and pad any gaps with zeros.
    self.time_shift_padding_placeholder_ = tf.placeholder(
        tf.int32, [2, 2], name='time_shift_padding')
    self.time_shift_offset_placeholder_ = tf.placeholder(
        tf.int32, [2], name='time_shift_offset')
    padded_foreground = tf.pad(
        tensor=scaled_foreground,
        paddings=self.time_shift_padding_placeholder_,
        mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground,
                                 self.time_shift_offset_placeholder_,
                                 [desired_samples, -1])
    # Mix in background noise.
    self.background_data_placeholder_ = tf.placeholder(
        tf.float32, [desired_samples, 1], name='background_data')
    self.background_volume_placeholder_ = tf.placeholder(
        tf.float32, [], name='background_volume')
    background_mul = tf.multiply(self.background_data_placeholder_,
                                 self.background_volume_placeholder_)
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    spectrogram = audio_ops.audio_spectrogram(
        background_clamp,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    tf.summary.image(
        'spectrogram', tf.expand_dims(spectrogram, -1), max_outputs=1)
    # The number of buckets in each FFT row in the spectrogram will depend on
    # how many input samples there are in each window. This can be quite
    # large, with a 160 sample window producing 127 buckets for example. We
    # don't need this level of detail for classification, so we often want to
    # shrink them down to produce a smaller result. That's what this section
    # implements. One method is to use average pooling to merge adjacent
    # buckets, but a more sophisticated approach is to apply the MFCC
    # algorithm to shrink the representation.
    if model_settings['preprocess'] == 'average':
      self.output_ = tf.nn.pool(
          input=tf.expand_dims(spectrogram, -1),
          window_shape=[1, model_settings['average_window_width']],
          strides=[1, model_settings['average_window_width']],
          pooling_type='AVG',
          padding='SAME')
      tf.summary.image('shrunk_spectrogram', self.output_, max_outputs=1)
    elif model_settings['preprocess'] == 'mfcc':
      self.output_ = audio_ops.mfcc(
          spectrogram,
          wav_decoder.sample_rate,
          dct_coefficient_count=model_settings['fingerprint_width'])
      tf.summary.image(
          'mfcc', tf.expand_dims(self.output_, -1), max_outputs=1)
    elif model_settings['preprocess'] == 'micro':
      if not frontend_op:
        raise Exception(
            'Micro frontend op is currently not available when running'
            ' TensorFlow directly from Python, you need to build and run'
            ' through Bazel')
      sample_rate = model_settings['sample_rate']
      window_size_ms = (model_settings['window_size_samples'] *
                        1000) / sample_rate
      window_step_ms = (model_settings['window_stride_samples'] *
                        1000) / sample_rate
      int16_input = tf.cast(tf.multiply(background_clamp, 32768), tf.int16)
      micro_frontend = frontend_op.audio_microfrontend(
          int16_input,
          sample_rate=sample_rate,
          window_size=window_size_ms,
          window_step=window_step_ms,
          num_channels=model_settings['fingerprint_width'],
          out_scale=1,
          out_type=tf.float32)
      self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
      tf.summary.image(
          'micro',
          tf.expand_dims(tf.expand_dims(self.output_, -1), 0),
          max_outputs=1)
    else:
      raise ValueError('Unknown preprocess mode "%s" (should be "mfcc", '
                       '"average", or "micro")' %
                       (model_settings['preprocess']))
    # Merge all the summaries and write them out to /tmp/retrain_logs (by
    # default).
    self.merged_summaries_ = tf.summary.merge_all(scope='data')
    if summaries_dir:
      self.summary_writer_ = tf.summary.FileWriter(
          summaries_dir + '/data', tf.get_default_graph())
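# A minimal sketch of driving the distortion graph built above (TF 1.x),
# assuming an AudioProcessor-style object named processor has already called
# prepare_processing_graph; the wav path, zero time shift, and silent
# background are illustrative placeholders rather than values from the
# original code.
desired_samples = 16000
feed = {
    processor.wav_filename_placeholder_: 'clip.wav',
    processor.foreground_volume_placeholder_: 1.0,
    # No time shift: zero padding and a zero slice offset.
    processor.time_shift_padding_placeholder_: [[0, 0], [0, 0]],
    processor.time_shift_offset_placeholder_: [0, 0],
    # Silence as "background": zeros mixed in at zero volume.
    processor.background_data_placeholder_: np.zeros((desired_samples, 1)),
    processor.background_volume_placeholder_: 0.0,
}
with tf.Session() as sess:
  fingerprint = sess.run(processor.output_, feed_dict=feed)
  print(fingerprint.shape)  # 2D fingerprint of the processed clip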
def prepare_processing_graph(self, model_settings):
  """Builds a TensorFlow graph to apply the input distortions.

  Creates a graph that loads a WAVE file, decodes it, scales the volume,
  shifts it in time, adds in background noise, calculates a spectrogram, and
  then builds an MFCC fingerprint from that.

  This must be called with an active TensorFlow session running, and it
  creates multiple placeholder inputs, and one output:

    - wav_filename_placeholder_: Filename of the WAV to load.
    - foreground_volume_placeholder_: How loud the main clip should be.
    - time_shift_placeholder_: How much the clip is shifted.
    - background_data_placeholder_: PCM sample data for background noise.
    - background_volume_placeholder_: Loudness of mixed-in background.
    - mfcc_: Output 2D fingerprint of processed audio.

  Args:
    model_settings: Information about the current model being trained.
  """
  desired_samples = model_settings['desired_samples']
  self.wav_filename_placeholder_ = tf.placeholder(
      tf.string, [], name='filename')
  wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
  wav_decoder = contrib_audio.decode_wav(
      wav_loader, desired_channels=1, desired_samples=desired_samples)
  # Allow the audio sample's volume to be adjusted.
  self.foreground_volume_placeholder_ = tf.placeholder(
      tf.float32, [], name='foreground_volume')
  scaled_foreground = tf.multiply(wav_decoder.audio,
                                  self.foreground_volume_placeholder_)
  # Shift the sample's start position, and pad any gaps with zeros.
  self.time_shift_placeholder_ = tf.placeholder(tf.int32, name='timeshift')
  # TODO(see--): Write test with np.roll
  shifted_foreground = tf_roll(scaled_foreground,
                               self.time_shift_placeholder_)
  # Mix in background noise.
  self.background_data_placeholder_ = tf.placeholder(
      tf.float32, [desired_samples, 1], name='background_data')
  self.background_volume_placeholder_ = tf.placeholder(
      tf.float32, [], name='background_volume')
  background_mul = tf.multiply(self.background_data_placeholder_,
                               self.background_volume_placeholder_)
  background_add = tf.add(background_mul, shifted_foreground)
  # Clipping removed: tf.clip_by_value(background_add, -1.0, 1.0)
  self.background_clamp_ = background_add
  self.background_clamp_ = tf.reshape(
      self.background_clamp_, (1, model_settings['desired_samples']))
  # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
  stfts = tf.signal.stft(
      self.background_clamp_,
      frame_length=model_settings['window_size_samples'],
      frame_step=model_settings['window_stride_samples'],
      fft_length=None)
  self.spectrogram_ = tf.abs(stfts)
  num_spectrogram_bins = self.spectrogram_.shape[-1].value
  lower_edge_hertz, upper_edge_hertz = 80.0, 7600.0
  linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
      model_settings['dct_coefficient_count'], num_spectrogram_bins,
      model_settings['sample_rate'], lower_edge_hertz, upper_edge_hertz)
  mel_spectrograms = tf.tensordot(self.spectrogram_,
                                  linear_to_mel_weight_matrix, 1)
  mel_spectrograms.set_shape(self.spectrogram_.shape[:-1].concatenate(
      linear_to_mel_weight_matrix.shape[-1:]))
  log_mel_spectrograms = tf.log(mel_spectrograms + 1e-6)
  self.mfcc_ = tf.signal.mfccs_from_log_mel_spectrograms(
      log_mel_spectrograms)[:, :, :model_settings['num_log_mel_features']]  # :13
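# A minimal self-contained sketch of the same tf.signal log-mel/MFCC pipeline
# in TF 2.x eager mode, assuming a 16 kHz mono waveform; the frame sizes, mel
# range, and 13-coefficient cut mirror the settings used above but are
# illustrative, not taken from the original model_settings.
import tensorflow as tf

waveform = tf.random.uniform([1, 16000], minval=-1.0, maxval=1.0)
stfts = tf.signal.stft(waveform, frame_length=480, frame_step=160)
spectrogram = tf.abs(stfts)
num_spectrogram_bins = spectrogram.shape[-1]
mel_matrix = tf.signal.linear_to_mel_weight_matrix(
    num_mel_bins=40, num_spectrogram_bins=num_spectrogram_bins,
    sample_rate=16000, lower_edge_hertz=80.0, upper_edge_hertz=7600.0)
mel_spectrogram = tf.tensordot(spectrogram, mel_matrix, 1)
log_mel_spectrogram = tf.math.log(mel_spectrogram + 1e-6)
mfccs = tf.signal.mfccs_from_log_mel_spectrograms(log_mel_spectrogram)[..., :13]
print(mfccs.shape)  # [1, num_frames, 13]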