Example #1
def test_audio_dataset():
  """Test Audio Dataset"""
  with open(audio_path, 'rb') as f:
    wav_contents = f.read()
  audio_p = audio.decode_wav(wav_contents)
  with tf.compat.v1.Session() as sess:
    audio_v = sess.run(audio_p).audio

  # WAVDataset yields raw int16 samples; rescale them to match decode_wav's
  # float output in [-1.0, 1.0].
  f = lambda x: float(x) / (1 << 15)

  dataset = audio_io.WAVDataset([audio_path])
  iterator = dataset.make_initializable_iterator()
  init_op = iterator.initializer
  get_next = iterator.get_next()
  with tf.compat.v1.Session() as sess:
    sess.run(init_op)
    for i in range(audio_v.shape[0]):
      v = sess.run(get_next)
      assert audio_v[i] == f(v)
    with pytest.raises(errors.OutOfRangeError):
      sess.run(get_next)

  dataset = audio_io.WAVDataset([audio_path], batch=2)
  iterator = dataset.make_initializable_iterator()
  init_op = iterator.initializer
  get_next = iterator.get_next()
  with tf.compat.v1.Session() as sess:
    sess.run(init_op)
    for i in range(0, audio_v.shape[0], 2):
      v = sess.run(get_next)
      assert audio_v[i] == f(v[0])
      assert audio_v[i + 1] == f(v[1])
    with pytest.raises(errors.OutOfRangeError):
      sess.run(get_next)
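
The assertions above lean on the fact that decode_wav scales 16-bit PCM samples by 1 / 2**15. Below is a minimal, self-contained sketch of that relationship, assuming TensorFlow 2.x eager execution and a tiny WAV built in memory with the standard-library wave module:

import io as std_io
import wave

import numpy as np
import tensorflow as tf

# Build a tiny mono 16-bit WAV entirely in memory.
samples = np.array([0, 1000, -1000, 32767, -32768], dtype=np.int16)
buf = std_io.BytesIO()
with wave.open(buf, 'wb') as w:
  w.setnchannels(1)
  w.setsampwidth(2)       # 16-bit PCM
  w.setframerate(16000)
  w.writeframes(samples.astype('<i2').tobytes())

decoded = tf.audio.decode_wav(buf.getvalue())
# Each float sample equals the raw int16 value divided by 2**15.
np.testing.assert_allclose(decoded.audio.numpy().flatten(),
                           samples.astype(np.float32) / (1 << 15))
print(int(decoded.sample_rate))  # 16000
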
    def get_unprocessed_data(self, how_many, model_settings, mode):
        """Retrieve sample data for the given partition, with no transformations.

    Args:
      how_many: Desired number of samples to return. -1 means the entire
        contents of this partition.
      model_settings: Information about the current model being trained.
      mode: Which partition to use, must be 'training', 'validation',
        'testing' or 'pseudo'.

    Returns:
      List of sample data for the samples, and list of labels in one-hot form.
    """
        candidates = self.data_index[mode]
        if how_many == -1:
            sample_count = len(candidates)
        else:
            sample_count = how_many
        desired_samples = model_settings['desired_samples']
        words_list = self.words_list
        data = np.zeros((sample_count, desired_samples))
        labels = []
        with tf.Session(graph=tf.Graph()) as sess:
            wav_filename_placeholder = tf.placeholder(tf.string, [],
                                                      name='filename')
            wav_loader = io_ops.read_file(wav_filename_placeholder)
            wav_decoder = contrib_audio.decode_wav(
                wav_loader,
                desired_channels=1,
                desired_samples=desired_samples)
            foreground_volume_placeholder = tf.placeholder(
                tf.float32, [], name='foreground_volume')
            scaled_foreground = tf.multiply(wav_decoder.audio,
                                            foreground_volume_placeholder)
            for i in range(sample_count):
                if how_many == -1:
                    sample_index = i
                else:
                    sample_index = np.random.randint(len(candidates))
                sample = candidates[sample_index]
                input_dict = {wav_filename_placeholder: sample['file']}
                if sample['label'] == SILENCE_LABEL:
                    input_dict[foreground_volume_placeholder] = 0
                else:
                    input_dict[foreground_volume_placeholder] = 1
                data[i, :] = sess.run(scaled_foreground,
                                      feed_dict=input_dict).flatten()
                label_index = self.word_to_index[sample['label']]
                labels.append(words_list[label_index])
        return data, labels
def load_wav_file(filename):
    """Loads an audio file and returns a float PCM-encoded array of samples.

  Args:
    filename: Path to the .wav file to load.

  Returns:
    Numpy array holding the sample data as floats between -1.0 and 1.0.
  """
    with tf.Session(graph=tf.Graph()) as sess:
        wav_filename_placeholder = tf.placeholder(tf.string, [])
        wav_loader = io_ops.read_file(wav_filename_placeholder)
        wav_decoder = audio.decode_wav(wav_loader, desired_channels=1)
        return sess.run(wav_decoder,
                        feed_dict={
                            wav_filename_placeholder: filename
                        }).audio.flatten()
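
For comparison, here is a session-free sketch of the same loader, assuming TensorFlow 2.x eager execution, where tf.audio.decode_wav (the successor to contrib_audio.decode_wav) already returns floats between -1.0 and 1.0; load_wav_file_v2 is just an illustrative name:

import tensorflow as tf

def load_wav_file_v2(filename):
    """Loads a .wav file and returns its samples as floats between -1.0 and 1.0."""
    wav_data = tf.io.read_file(filename)
    decoded = tf.audio.decode_wav(wav_data, desired_channels=1)
    return decoded.audio.numpy().flatten()
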
Example #4
  def __init__(
      self, desired_samples=16000,
      window_size_samples=480, window_stride_samples=160):
    self.wav_filename_placeholder = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(self.wav_filename_placeholder)
    # decode_wav already pads or crops the clip to desired_samples.
    wav_decoder = contrib_audio.decode_wav(
        wav_loader, desired_channels=1, desired_samples=desired_samples)
    spectrogram = contrib_audio.audio_spectrogram(
        wav_decoder.audio,
        window_size=window_size_samples,
        stride=window_stride_samples,
        magnitude_squared=True)
    self.mfcc = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=40)
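
The spectrogram-plus-MFCC graph above can also be sketched without a class or a session, assuming TensorFlow 2.x, where the ops behind contrib_audio are exposed as tf.raw_ops.AudioSpectrogram and tf.raw_ops.Mfcc; wav_to_mfcc is an illustrative name:

import tensorflow as tf

def wav_to_mfcc(filename, desired_samples=16000,
                window_size_samples=480, window_stride_samples=160):
  wav_data = tf.io.read_file(filename)
  # decode_wav pads or crops the clip to desired_samples.
  decoded = tf.audio.decode_wav(
      wav_data, desired_channels=1, desired_samples=desired_samples)
  spectrogram = tf.raw_ops.AudioSpectrogram(
      input=decoded.audio,
      window_size=window_size_samples,
      stride=window_stride_samples,
      magnitude_squared=True)
  return tf.raw_ops.Mfcc(
      spectrogram=spectrogram,
      sample_rate=decoded.sample_rate,
      dct_coefficient_count=40)
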
    def prepare_background_data(self):
        """Searches a folder for background noise audio, and loads it into memory.

    It's expected that the background audio samples will be in a subdirectory
    named '_background_noise_' inside the 'data_dir' folder, as .wavs that
    match the sample rate of the training data, but can be much longer in
    duration.

    If the '_background_noise_' folder doesn't exist at all, this isn't an
    error, it's just taken to mean that no background noise augmentation should
    be used. If the folder does exist, but it's empty, that's treated as an
    error.

    Returns:
      List of raw PCM-encoded audio samples of background noise.

    Raises:
      Exception: If files aren't found in the folder.
    """
        self.background_data = []
        background_dir = os.path.join(self.data_dirs[0],
                                      BACKGROUND_NOISE_DIR_NAME)
        if not os.path.exists(background_dir):
            return self.background_data
        with tf.Session(graph=tf.Graph()) as sess:
            wav_filename_placeholder = tf.placeholder(tf.string, [])
            wav_loader = io_ops.read_file(wav_filename_placeholder)
            wav_decoder = contrib_audio.decode_wav(wav_loader,
                                                   desired_channels=1)
            search_path = os.path.join(self.data_dirs[0],
                                       BACKGROUND_NOISE_DIR_NAME, '*.wav')
            for wav_path in gfile.Glob(search_path):
                wav_data = sess.run(wav_decoder,
                                    feed_dict={
                                        wav_filename_placeholder: wav_path
                                    }).audio.flatten()
                self.background_data.append(wav_data)
            if not self.background_data:
                raise Exception('No background wav files were found in ' +
                                search_path)
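
A session-free sketch of the same directory scan, assuming TensorFlow 2.x eager execution and the usual '_background_noise_' layout of the speech_commands data; load_background_data is an illustrative name:

import glob
import os

import tensorflow as tf

def load_background_data(data_dir):
    background_data = []
    background_dir = os.path.join(data_dir, '_background_noise_')
    if not os.path.exists(background_dir):
        # A missing folder just means no background augmentation.
        return background_data
    for wav_path in glob.glob(os.path.join(background_dir, '*.wav')):
        wav_data = tf.io.read_file(wav_path)
        decoded = tf.audio.decode_wav(wav_data, desired_channels=1)
        background_data.append(decoded.audio.numpy().flatten())
    if not background_data:
        raise Exception('No background wav files were found in ' + background_dir)
    return background_data
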
Example #6
def song_vectors(song_dir):
    """Decodes every .wav in song_dir, pickles each clip and its sample rate,
    and writes a plain-text summary to song_vectors.txt."""
    dir_name = '../tensors/'
    with open('song_vectors.txt', 'w') as f:
        f_index = 0
        for file_name in os.listdir(song_dir):
            raw_audio = io.read_file(os.path.join(song_dir, file_name))
            song_vector, sample_rate = audio.decode_wav(raw_audio,
                                                        desired_samples=100000)

            # Pickle plain numpy values rather than tensor objects
            # (assumes eager execution, where .numpy() is available).
            with open(dir_name + 'song_tensor' + str(f_index), 'wb') as song_pickle:
                pickle.dump(song_vector.numpy(), song_pickle)
            with open(dir_name + 'rate_tensor' + str(f_index), 'wb') as rate_pickle:
                pickle.dump(sample_rate.numpy(), rate_pickle)

            f.write(str(sample_rate) + ':')
            for tensor in song_vector:
                f.write(str(tensor))
            f.write('\n')
            f_index += 1
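
A hypothetical companion sketch for reading one of the pickled pairs back; it assumes the .numpy() conversion above, so pickle.load returns plain arrays rather than tensor objects:

import pickle

with open('../tensors/song_tensor0', 'rb') as song_pickle:
    song = pickle.load(song_pickle)
with open('../tensors/rate_tensor0', 'rb') as rate_pickle:
    rate = pickle.load(rate_pickle)
print(rate, song.shape)  # e.g. 44100 (100000, 2)
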
Example #7
def get_next_batch(curr_batch, songs_per_batch, sess, verbose=False):
    wav_arr_ch1 = []
    wav_arr_ch2 = []
    # Wrap around once the batch index runs past the end of the file list.
    if curr_batch * songs_per_batch >= len(file_arr):
        curr_batch = 0

    start_position = curr_batch * songs_per_batch
    end_position = min(start_position + songs_per_batch, len(file_arr))
    for idx in range(start_position, end_position):
        audio_binary = tf.io.read_file(file_arr[idx])
        wav_decoder = audio_ops.decode_wav(audio_binary, desired_channels=2)
        sample_rate, audio = sess.run(
            [wav_decoder.sample_rate, wav_decoder.audio])
        audio = np.array(audio)

        # Skip clips that aren't exactly 5,292,000 samples (120 s of 44.1 kHz audio).
        if len(audio[:, 0]) != 5292000:
            continue

        wav_arr_ch1.append(rfft(audio[:, 0]))
        wav_arr_ch2.append(rfft(audio[:, 1]))
        if verbose:
            print("Returning File: " + file_arr[idx])

    return wav_arr_ch1, wav_arr_ch2, sample_rate
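
The batches above hold rfft spectra rather than raw samples. A small self-contained check of the round trip (assuming rfft here is SciPy's real-valued scipy.fftpack.rfft) shows that the waveform can be recovered later with irfft:

import numpy as np
from scipy.fftpack import rfft, irfft

waveform = np.random.uniform(-1.0, 1.0, size=44100)
spectrum = rfft(waveform)
recovered = irfft(spectrum)
np.testing.assert_allclose(recovered, waveform, atol=1e-9)
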
Example #8
def get_next_batch(curr_batch, songs_per_batch, sess):
  # Only one channel is decoded below, so a single sample list is enough.
  wav_arr = []
  # Wrap around once the batch index runs past the end of the file list.
  if curr_batch * songs_per_batch >= len(file_arr):
    curr_batch = 0
  start_position = curr_batch * songs_per_batch
  end_position = min(start_position + songs_per_batch, len(file_arr))
  for idx in range(start_position, end_position):
    os.system('bash mktrainwav.sh %d trainsample.wav' % idx)
    audio_binary = tf.read_file('trainsample.wav')
    os.remove('trainsample.wav')
    wav_decoder = decode_wav(
      audio_binary, desired_channels=1
    )
    sample_rate, audio = sess.run([
      wav_decoder.sample_rate, 
      wav_decoder.audio
    ])
    audio = np.array(audio)
    # Ensure that every song we look at has the same number of samples.
    if len(audio[:, 0]) != SAMPLES_CNT:
      continue
    wav_arr.append(rfft(audio[:, 0]))
  print("Returning File: " + file_arr[idx])
  return wav_arr, sample_rate
    def prepare_processing_graph(self, model_settings, summaries_dir):
        """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
      summaries_dir: Path to save training summary information to.

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
      Exception: If the preprocessor wasn't compiled in.
    """
        with tf.get_default_graph().name_scope('data'):
            desired_samples = model_settings['desired_samples']
            self.wav_filename_placeholder_ = tf.placeholder(
                tf.string, [], name='wav_filename')
            wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
            wav_decoder = audio.decode_wav(wav_loader,
                                           desired_channels=1,
                                           desired_samples=desired_samples)
            # Allow the audio sample's volume to be adjusted.
            self.foreground_volume_placeholder_ = tf.placeholder(
                tf.float32, [], name='foreground_volume')
            scaled_foreground = tf.multiply(
                wav_decoder.audio, self.foreground_volume_placeholder_)
            # Shift the sample's start position, and pad any gaps with zeros.
            self.time_shift_padding_placeholder_ = tf.placeholder(
                tf.int32, [2, 2], name='time_shift_padding')
            self.time_shift_offset_placeholder_ = tf.placeholder(
                tf.int32, [2], name='time_shift_offset')
            padded_foreground = tf.pad(
                tensor=scaled_foreground,
                paddings=self.time_shift_padding_placeholder_,
                mode='CONSTANT')
            sliced_foreground = tf.slice(padded_foreground,
                                         self.time_shift_offset_placeholder_,
                                         [desired_samples, -1])
            # Mix in background noise.
            self.background_data_placeholder_ = tf.placeholder(
                tf.float32, [desired_samples, 1], name='background_data')
            self.background_volume_placeholder_ = tf.placeholder(
                tf.float32, [], name='background_volume')
            background_mul = tf.multiply(self.background_data_placeholder_,
                                         self.background_volume_placeholder_)
            background_add = tf.add(background_mul, sliced_foreground)
            background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
            # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
            spectrogram = audio_ops.audio_spectrogram(
                background_clamp,
                window_size=model_settings['window_size_samples'],
                stride=model_settings['window_stride_samples'],
                magnitude_squared=True)
            tf.summary.image('spectrogram',
                             tf.expand_dims(spectrogram, -1),
                             max_outputs=1)
            # The number of buckets in each FFT row in the spectrogram will depend on
            # how many input samples there are in each window. This can be quite
            # large, with a 160 sample window producing 127 buckets for example. We
            # don't need this level of detail for classification, so we often want to
            # shrink them down to produce a smaller result. That's what this section
            # implements. One method is to use average pooling to merge adjacent
            # buckets, but a more sophisticated approach is to apply the MFCC
            # algorithm to shrink the representation.
            if model_settings['preprocess'] == 'average':
                self.output_ = tf.nn.pool(
                    input=tf.expand_dims(spectrogram, -1),
                    window_shape=[1, model_settings['average_window_width']],
                    strides=[1, model_settings['average_window_width']],
                    pooling_type='AVG',
                    padding='SAME')
                tf.summary.image('shrunk_spectrogram',
                                 self.output_,
                                 max_outputs=1)
            elif model_settings['preprocess'] == 'mfcc':
                self.output_ = audio_ops.mfcc(
                    spectrogram,
                    wav_decoder.sample_rate,
                    dct_coefficient_count=model_settings['fingerprint_width'])
                tf.summary.image('mfcc',
                                 tf.expand_dims(self.output_, -1),
                                 max_outputs=1)
            elif model_settings['preprocess'] == 'micro':
                if not frontend_op:
                    raise Exception(
                        'Micro frontend op is currently not available when running'
                        ' TensorFlow directly from Python, you need to build and run'
                        ' through Bazel')
                sample_rate = model_settings['sample_rate']
                window_size_ms = (model_settings['window_size_samples'] *
                                  1000) / sample_rate
                window_step_ms = (model_settings['window_stride_samples'] *
                                  1000) / sample_rate
                int16_input = tf.cast(tf.multiply(background_clamp, 32768),
                                      tf.int16)
                micro_frontend = frontend_op.audio_microfrontend(
                    int16_input,
                    sample_rate=sample_rate,
                    window_size=window_size_ms,
                    window_step=window_step_ms,
                    num_channels=model_settings['fingerprint_width'],
                    out_scale=1,
                    out_type=tf.float32)
                self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
                tf.summary.image('micro',
                                 tf.expand_dims(
                                     tf.expand_dims(self.output_, -1), 0),
                                 max_outputs=1)
            else:
                raise ValueError(
                    'Unknown preprocess mode "%s" (should be "mfcc", '
                    '"average", or "micro")' % (model_settings['preprocess']))

            # Merge all the summaries and write them out to /tmp/retrain_logs (by
            # default)
            self.merged_summaries_ = tf.summary.merge_all(scope='data')
            if summaries_dir:
                self.summary_writer_ = tf.summary.FileWriter(
                    summaries_dir + '/data', tf.get_default_graph())
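
The two time-shift placeholders are normally fed values derived from a single signed shift amount; the small sketch below mirrors how the speech_commands training loop computes those feeds:

def time_shift_feed(time_shift_amount):
    """Returns (padding, offset) feed values for a signed sample shift."""
    if time_shift_amount > 0:
        time_shift_padding = [[time_shift_amount, 0], [0, 0]]
        time_shift_offset = [0, 0]
    else:
        time_shift_padding = [[0, -time_shift_amount], [0, 0]]
        time_shift_offset = [-time_shift_amount, 0]
    return time_shift_padding, time_shift_offset

print(time_shift_feed(100))   # ([[100, 0], [0, 0]], [0, 0])
print(time_shift_feed(-100))  # ([[0, 100], [0, 0]], [100, 0])
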
    def prepare_processing_graph(self, model_settings):
        """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_placeholder_: How much the clip is shifted.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - mfcc_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
    """
        desired_samples = model_settings['desired_samples']
        self.wav_filename_placeholder_ = tf.placeholder(tf.string, [],
                                                        name='filename')
        wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
        wav_decoder = contrib_audio.decode_wav(wav_loader,
                                               desired_channels=1,
                                               desired_samples=desired_samples)
        # Allow the audio sample's volume to be adjusted.
        self.foreground_volume_placeholder_ = tf.placeholder(
            tf.float32, [], name='foreground_volume')
        scaled_foreground = tf.multiply(wav_decoder.audio,
                                        self.foreground_volume_placeholder_)
        # Shift the sample's start position, and pad any gaps with zeros.
        self.time_shift_placeholder_ = tf.placeholder(tf.int32,
                                                      name='timeshift')
        # TODO(see--): Write test with np.roll
        shifted_foreground = tf_roll(scaled_foreground,
                                     self.time_shift_placeholder_)
        # Mix in background noise.
        self.background_data_placeholder_ = tf.placeholder(
            tf.float32, [desired_samples, 1], name='background_data')
        self.background_volume_placeholder_ = tf.placeholder(
            tf.float32, [], name='background_volume')
        background_mul = tf.multiply(self.background_data_placeholder_,
                                     self.background_volume_placeholder_)
        background_add = tf.add(background_mul, shifted_foreground)
        # removed clipping: tf.clip_by_value(background_add, -1.0, 1.0)
        self.background_clamp_ = background_add
        self.background_clamp_ = tf.reshape(
            self.background_clamp_, (1, model_settings['desired_samples']))
        # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
        #stfts = tf.contrib.signal.stft(
        stfts = tf.signal.stft(
            self.background_clamp_,
            frame_length=model_settings['window_size_samples'],
            frame_step=model_settings['window_stride_samples'],
            fft_length=None)
        self.spectrogram_ = tf.abs(stfts)
        num_spectrogram_bins = self.spectrogram_.shape[-1].value
        lower_edge_hertz, upper_edge_hertz = 80.0, 7600.0
        linear_to_mel_weight_matrix = \
            tf.signal.linear_to_mel_weight_matrix(
                model_settings['dct_coefficient_count'],
                num_spectrogram_bins, model_settings['sample_rate'],
                lower_edge_hertz, upper_edge_hertz)
        mel_spectrograms = tf.tensordot(self.spectrogram_,
                                        linear_to_mel_weight_matrix, 1)
        mel_spectrograms.set_shape(self.spectrogram_.shape[:-1].concatenate(
            linear_to_mel_weight_matrix.shape[-1:]))
        log_mel_spectrograms = tf.log(mel_spectrograms + 1e-6)
        self.mfcc_ = tf.signal.mfccs_from_log_mel_spectrograms(
            log_mel_spectrograms
        )[:, :, :model_settings['num_log_mel_features']]  # :13
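
For reference, the same STFT -> log-mel -> MFCC chain can be run eagerly on random audio to see the shapes involved; the numbers below (16 kHz, 480/160-sample windows, 40 mel bins, 13 coefficients kept) are illustrative assumptions rather than values taken from model_settings:

import tensorflow as tf

audio = tf.random.uniform([1, 16000], minval=-1.0, maxval=1.0)
stfts = tf.signal.stft(audio, frame_length=480, frame_step=160, fft_length=None)
spectrograms = tf.abs(stfts)
num_spectrogram_bins = spectrograms.shape[-1]
linear_to_mel = tf.signal.linear_to_mel_weight_matrix(
    num_mel_bins=40, num_spectrogram_bins=num_spectrogram_bins,
    sample_rate=16000, lower_edge_hertz=80.0, upper_edge_hertz=7600.0)
mel_spectrograms = tf.tensordot(spectrograms, linear_to_mel, 1)
log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6)
mfccs = tf.signal.mfccs_from_log_mel_spectrograms(log_mel_spectrograms)[..., :13]
print(spectrograms.shape, mel_spectrograms.shape, mfccs.shape)
# (1, 98, 257) (1, 98, 40) (1, 98, 13)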