Example #1
  def _evalDecodeJpeg(self, image_name, parallelism, num_iters, tile=None):
    """Evaluate DecodeJpegOp for the given image.

    TODO(tanmingxing): add decoding+cropping as well.

    Args:
      image_name: a string of image file name (without suffix).
      parallelism: the number of concurrent decode_jpeg ops to be run.
      num_iters: number of iterations for evaluation.
      tile: if not None, tile the image to composite a larger fake image.

    Returns:
      The duration of the run in seconds.
    """
    ops.reset_default_graph()

    image_file_path = os.path.join(prefix_path, image_name)

    if tile is None:
      image_content = variable_scope.get_variable(
          'image_%s' % image_name,
          initializer=io_ops.read_file(image_file_path))
    else:
      single_image = image_ops.decode_jpeg(
          io_ops.read_file(image_file_path), channels=3, name='single_image')
      # Tile the image to composite a new larger image.
      tiled_image = array_ops.tile(single_image, tile)
      image_content = variable_scope.get_variable(
          'tiled_image_%s' % image_name,
          initializer=image_ops.encode_jpeg(tiled_image))

    with session.Session() as sess:
      sess.run(variables.global_variables_initializer())
      images = []
      for i in xrange(parallelism):
        images.append(
            image_ops.decode_jpeg(
                image_content, channels=3, name='image_%d' % (i)))

      r = control_flow_ops.group(*images)

      for _ in xrange(3):
        # Skip warm up time.
        sess.run(r)

      start_time = time.time()
      for _ in xrange(num_iters):
        sess.run(r)
    return time.time() - start_time
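For context, a benchmark method of the surrounding class would call this helper and report the timing; a minimal sketch (the image name, iteration count, and benchmark name are illustrative assumptions, and the class is assumed to extend tf.test.Benchmark):

  def benchmarkDecodeJpegMedium(self):
    # Sketch only: report_benchmark comes from tf.test.Benchmark; the numbers
    # and names below are illustrative.
    duration = self._evalDecodeJpeg('medium.jpg', parallelism=1, num_iters=10)
    self.report_benchmark(
        name='decode_jpeg_medium_p1', iters=10, wall_time=duration)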
Example #2
 def testCmyk(self):
     # Confirm that CMYK reads in as RGB
     base = "tensorflow/core/lib/jpeg/testdata"
     rgb_path = os.path.join(base, "jpeg_merge_test1.jpg")
     cmyk_path = os.path.join(base, "jpeg_merge_test1_cmyk.jpg")
     shape = 256, 128, 3
     for channels in 3, 0:
         with self.test_session() as sess:
             rgb = image_ops.decode_jpeg(io_ops.read_file(rgb_path), channels=channels)
             cmyk = image_ops.decode_jpeg(io_ops.read_file(cmyk_path), channels=channels)
             rgb, cmyk = sess.run([rgb, cmyk])
             self.assertEqual(rgb.shape, shape)
             self.assertEqual(cmyk.shape, shape)
             error = self.averageError(rgb, cmyk)
             self.assertLess(error, 4)
Example #3
  def testGif(self):
    # Read some real GIFs
    path = os.path.join(prefix_path, "gif", "testdata", "scan.gif")
    WIDTH = 20
    HEIGHT = 40
    STRIDE = 5
    shape = (12, HEIGHT, WIDTH, 3)

    with self.test_session(use_gpu=True) as sess:
      gif0 = io_ops.read_file(path)
      image0 = image_ops.decode_image(gif0)
      image1 = image_ops.decode_gif(gif0)
      gif0, image0, image1 = sess.run([gif0, image0, image1])

      self.assertEqual(image0.shape, shape)
      self.assertAllEqual(image0, image1)

      for frame_idx, frame in enumerate(image0):
        gt = np.zeros(shape[1:], dtype=np.uint8)
        start = frame_idx * STRIDE
        end = (frame_idx + 1) * STRIDE
        if end <= WIDTH:
          gt[:, start:end, :] = 255
        else:
          start -= WIDTH
          end -= WIDTH
          gt[start:end, :, :] = 255

        self.assertAllClose(frame, gt)

        bad_channels = image_ops.decode_image(gif0, channels=1)
        with self.assertRaises(errors_impl.InvalidArgumentError):
          bad_channels.eval()
Example #4
 def get_unprocessed_data(self, how_many, model_settings, mode):
   """Gets sample data without transformations."""
   candidates = self.data_index[mode]
   if how_many == -1:
     sample_count = len(candidates)
   else:
     sample_count = how_many
   desired_samples = model_settings['desired_samples']
   words_list = self.words_list
   data = np.zeros((sample_count, desired_samples))
   labels = []
   with tf.Session(graph=tf.Graph()) as sess:
     wav_filename_placeholder = tf.placeholder(tf.string, [], name='filename')
     wav_loader = io_ops.read_file(wav_filename_placeholder)
     wav_decoder = contrib_audio.decode_wav(
         wav_loader, desired_channels=1, desired_samples=desired_samples)
     foreground_volume_placeholder = tf.placeholder(
         tf.float32, [], name='foreground_volume')
     scaled_foreground = tf.multiply(wav_decoder.audio,
                                     foreground_volume_placeholder)
     for i in range(sample_count):
       if how_many == -1:
         sample_index = i
       else:
         sample_index = np.random.randint(len(candidates))
       sample = candidates[sample_index]
       input_dict = {wav_filename_placeholder: sample['file']}
       if sample['label'] == SILENCE_LABEL:
         input_dict[foreground_volume_placeholder] = 0
       else:
         input_dict[foreground_volume_placeholder] = 1
       data[i, :] = sess.run(scaled_foreground, feed_dict=input_dict).flatten()
       label_index = self.word_to_index[sample['label']]
       labels.append(words_list[label_index])
   return data, labels
Example #5
  def prepare_background_data(self):
    """Searches a folder for background noise audio, and loads it into memory.

    It's expected that the background audio samples will be in a subdirectory
    named '_background_noise_' inside the 'data_dir' folder, as .wavs that match
    the sample rate of the training data, but can be much longer in duration.

    If the '_background_noise_' folder doesn't exist at all, this isn't an
    error, it's just taken to mean that no background noise augmentation should
    be used. If the folder does exist, but it's empty, that's treated as an
    error.

    Returns:
      List of raw PCM-encoded audio samples of background noise.

    Raises:
      Exception: If files aren't found in the folder.
    """
    self.background_data = []
    background_dir = os.path.join(self.data_dir, BACKGROUND_NOISE_DIR_NAME)
    if not os.path.exists(background_dir):
      return self.background_data
    with tf.Session(graph=tf.Graph()) as sess:
      wav_filename_placeholder = tf.placeholder(tf.string, [])
      wav_loader = io_ops.read_file(wav_filename_placeholder)
      wav_decoder = contrib_audio.decode_wav(wav_loader, desired_channels=1)
      search_path = os.path.join(self.data_dir, BACKGROUND_NOISE_DIR_NAME,
                                 '*.wav')
      for wav_path in gfile.Glob(search_path):
        wav_data = sess.run(
            wav_decoder,
            feed_dict={wav_filename_placeholder: wav_path}).audio.flatten()
        self.background_data.append(wav_data)
      if not self.background_data:
        raise Exception('No background wav files were found in ' + search_path)
Example #6
 def _restore_op(self, iterator_resource):
   iterator_state_variant = parsing_ops.parse_tensor(
       io_ops.read_file(self._iterator_checkpoint_prefix_local()),
       dtypes.variant)
   restore_op = gen_dataset_ops.deserialize_iterator(iterator_resource,
                                                     iterator_state_variant)
   return restore_op
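The snippet above only restores; a matching save op is a minimal sketch, assuming the same test-base helpers (`_iterator_checkpoint_prefix_local`) and the TF 1.x internal `gen_dataset_ops.serialize_iterator` op are available:

 def _save_op(self, iterator_resource):
   # Sketch only: serialize the iterator state to a variant tensor, encode it
   # as a string, and write it next to the checkpoint prefix with
   # io_ops.write_file (the inverse of the read_file + parse_tensor above).
   iterator_state_variant = gen_dataset_ops.serialize_iterator(
       iterator_resource)
   save_op = io_ops.write_file(
       self._iterator_checkpoint_prefix_local(),
       parsing_ops.serialize_tensor(iterator_state_variant))
   return save_op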
Example #7
  def testGif(self):
    # Read some real GIFs
    path = os.path.join(prefix_path, "gif", "testdata", "scan.gif")
    width = 20
    height = 40
    stride = 5
    shape = (12, height, width, 3)

    with self.session(use_gpu=True) as sess:
      gif0 = io_ops.read_file(path)
      image0 = image_ops.decode_image(gif0)
      image1 = image_ops.decode_gif(gif0)
      gif0, image0, image1 = self.evaluate([gif0, image0, image1])

      self.assertEqual(image0.shape, shape)
      self.assertAllEqual(image0, image1)

      for frame_idx, frame in enumerate(image0):
        gt = np.zeros(shape[1:], dtype=np.uint8)
        start = frame_idx * stride
        end = (frame_idx + 1) * stride
        if end <= width:
          gt[:, start:end, :] = 255
        else:
          start -= width
          end -= width
          gt[start:end, :, :] = 255

        self.assertAllClose(frame, gt)

        bad_channels = image_ops.decode_image(gif0, channels=1)
        with self.assertRaises(errors_impl.InvalidArgumentError):
          self.evaluate(bad_channels)
Example #8
  def prepare_processing_graph(self, model_settings):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - mfcc_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
    """
    desired_samples = model_settings['desired_samples']
    self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(
        wav_loader, desired_channels=1, desired_samples=desired_samples)
    # Allow the audio sample's volume to be adjusted.
    self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    self.foreground_volume_placeholder_)
    # Shift the sample's start position, and pad any gaps with zeros.
    self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
    self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    padded_foreground = tf.pad(
        scaled_foreground,
        self.time_shift_padding_placeholder_,
        mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground,
                                 self.time_shift_offset_placeholder_,
                                 [desired_samples, -1])
    # Mix in background noise.
    self.background_data_placeholder_ = tf.placeholder(tf.float32,
                                                       [desired_samples, 1])
    self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
    background_mul = tf.multiply(self.background_data_placeholder_,
                                 self.background_volume_placeholder_)
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    spectrogram = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    self.mfcc_ = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
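Once the graph above is built, each placeholder is fed through feed_dict and mfcc_ is evaluated; a minimal sketch (the processor instance, session, model_settings dict, and WAV path are illustrative assumptions standing in for objects created by the surrounding class):

# Sketch only: 'processor', 'sess', 'model_settings', and the path are assumed.
fingerprint = sess.run(
    processor.mfcc_,
    feed_dict={
        processor.wav_filename_placeholder_: '/tmp/example.wav',
        processor.foreground_volume_placeholder_: 1.0,
        processor.time_shift_padding_placeholder_: [[0, 0], [0, 0]],
        processor.time_shift_offset_placeholder_: [0, 0],
        processor.background_data_placeholder_: np.zeros(
            (model_settings['desired_samples'], 1)),
        processor.background_volume_placeholder_: 0.0,
    })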
Example #9
def load_wav_file(filename):
  """Loads an audio file and returns a float PCM-encoded array of samples."""
  with tf.Session(graph=tf.Graph()) as sess:
    wav_filename_placeholder = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(wav_filename_placeholder)
    wav_decoder = contrib_audio.decode_wav(wav_loader, desired_channels=1)
    return sess.run(
        wav_decoder, feed_dict={
            wav_filename_placeholder: filename
        }).audio.flatten()
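The inverse helper writes samples back to disk; a minimal sketch using contrib_audio.encode_wav together with io_ops.write_file (the imports mirror the snippet above and are assumptions about the surrounding module):

import numpy as np
import tensorflow as tf
from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
from tensorflow.python.ops import io_ops


def save_wav_file(filename, wav_data, sample_rate):
  """Saves float PCM samples (range -1.0 to 1.0) as a 16-bit WAV file."""
  with tf.Session(graph=tf.Graph()) as sess:
    wav_filename_placeholder = tf.placeholder(tf.string, [])
    sample_rate_placeholder = tf.placeholder(tf.int32, [])
    wav_data_placeholder = tf.placeholder(tf.float32, [None, 1])
    # Encode the float samples back into WAV bytes and write them to disk.
    wav_encoder = contrib_audio.encode_wav(wav_data_placeholder,
                                           sample_rate_placeholder)
    wav_saver = io_ops.write_file(wav_filename_placeholder, wav_encoder)
    sess.run(
        wav_saver,
        feed_dict={
            wav_filename_placeholder: filename,
            sample_rate_placeholder: sample_rate,
            wav_data_placeholder: np.reshape(wav_data, (-1, 1))
        })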
Example #10
 def testBmp(self):
   # Read a real bmp and verify shape
   path = os.path.join(prefix_path, "bmp", "testdata", "lena.bmp")
   with self.session(use_gpu=True) as sess:
     bmp0 = io_ops.read_file(path)
     image0 = image_ops.decode_image(bmp0)
     image1 = image_ops.decode_bmp(bmp0)
     bmp0, image0, image1 = self.evaluate([bmp0, image0, image1])
     self.assertEqual(len(bmp0), 4194)
     self.assertAllEqual(image0, image1)
Example #11
 def testJpeg(self):
   # Read a real jpeg and verify shape
   path = os.path.join(prefix_path, "jpeg", "testdata", "jpeg_merge_test1.jpg")
   with self.test_session(use_gpu=True) as sess:
     jpeg0 = io_ops.read_file(path)
     image0 = image_ops.decode_image(jpeg0)
     image1 = image_ops.decode_jpeg(jpeg0)
     jpeg0, image0, image1 = sess.run([jpeg0, image0, image1])
     self.assertEqual(len(jpeg0), 3771)
     self.assertEqual(image0.shape, (256, 128, 3))
     self.assertAllEqual(image0, image1)
Example #12
 def testExisting(self):
     # Read a real jpeg and verify shape
     path = "tensorflow/core/lib/jpeg/testdata/" "jpeg_merge_test1.jpg"
     with self.test_session() as sess:
         jpeg0 = io_ops.read_file(path)
         image0 = image_ops.decode_jpeg(jpeg0)
         image1 = image_ops.decode_jpeg(image_ops.encode_jpeg(image0))
         jpeg0, image0, image1 = sess.run([jpeg0, image0, image1])
         self.assertEqual(len(jpeg0), 3771)
         self.assertEqual(image0.shape, (256, 128, 3))
         self.assertLess(self.averageError(image0, image1), 0.8)
Example #13
 def testReadFile(self):
   cases = ['', 'Some contents', 'Неки садржаји на српском']
   for contents in cases:
     contents = compat.as_bytes(contents)
     with tempfile.NamedTemporaryFile(
         prefix='ReadFileTest', dir=self.get_temp_dir(), delete=False) as temp:
       temp.write(contents)
     with self.cached_session():
       read = io_ops.read_file(temp.name)
       self.assertEqual([], read.get_shape())
       self.assertEqual(read.eval(), contents)
     os.remove(temp.name)
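read_file also has a write_file counterpart; a minimal round-trip sketch (the temporary path is illustrative):

import tensorflow as tf
from tensorflow.python.ops import io_ops

# Sketch only: write a string with write_file and read it back with read_file.
with tf.Session() as sess:
  path = '/tmp/read_file_roundtrip.txt'
  sess.run(io_ops.write_file(path, b'Some contents'))
  assert sess.run(io_ops.read_file(path)) == b'Some contents'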
Example #14
 def testPng(self):
   # Read some real PNGs, converting to different channel numbers
   inputs = [(1, "lena_gray.png")]
   for channels_in, filename in inputs:
     for channels in 0, 1, 3, 4:
       with self.cached_session(use_gpu=True) as sess:
         path = os.path.join(prefix_path, "png", "testdata", filename)
         png0 = io_ops.read_file(path)
         image0 = image_ops.decode_image(png0, channels=channels)
         image1 = image_ops.decode_png(png0, channels=channels)
         png0, image0, image1 = self.evaluate([png0, image0, image1])
         self.assertEqual(image0.shape, (26, 51, channels or channels_in))
         self.assertAllEqual(image0, image1)
Example #15
 def prepare_processing_graph(self, model_settings):
   """Builds a TensorFlow graph to apply the input distortions"""
   desired_samples = model_settings['desired_samples']
   self.wav_filename_placeholder_ = tf.placeholder(
       tf.string, [], name='filename')
   wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
   wav_decoder = contrib_audio.decode_wav(
       wav_loader, desired_channels=1, desired_samples=desired_samples)
   # Allow the audio sample's volume to be adjusted.
   self.foreground_volume_placeholder_ = tf.placeholder(
       tf.float32, [], name='foreground_volme')
   scaled_foreground = tf.multiply(wav_decoder.audio,
                                   self.foreground_volume_placeholder_)
   # Shift the sample's start position, and pad any gaps with zeros.
   self.time_shift_placeholder_ = tf.placeholder(tf.int32, name='timeshift')
   shifted_foreground = tf_roll(scaled_foreground,
                                self.time_shift_placeholder_)
   # Mix in background noise.
   self.background_data_placeholder_ = tf.placeholder(
       tf.float32, [desired_samples, 1], name='background_data')
   self.background_volume_placeholder_ = tf.placeholder(
       tf.float32, [], name='background_volume')
   background_mul = tf.multiply(self.background_data_placeholder_,
                                self.background_volume_placeholder_)
   background_add = tf.add(background_mul, shifted_foreground)
   # removed clipping: tf.clip_by_value(background_add, -1.0, 1.0)
   self.background_clamp_ = background_add
   self.background_clamp_ = tf.reshape(self.background_clamp_,
                                       (1, model_settings['desired_samples']))
   # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
   stfts = tf.contrib.signal.stft(
       self.background_clamp_,
       frame_length=model_settings['window_size_samples'],
       frame_step=model_settings['window_stride_samples'],
       fft_length=None)
   self.spectrogram_ = tf.abs(stfts)
   num_spectrogram_bins = self.spectrogram_.shape[-1].value
   lower_edge_hertz, upper_edge_hertz = 80.0, 7600.0
   linear_to_mel_weight_matrix = \
       tf.contrib.signal.linear_to_mel_weight_matrix(
           model_settings['dct_coefficient_count'],
           num_spectrogram_bins, model_settings['sample_rate'],
           lower_edge_hertz, upper_edge_hertz)
   mel_spectrograms = tf.tensordot(self.spectrogram_,
                                   linear_to_mel_weight_matrix, 1)
   mel_spectrograms.set_shape(self.spectrogram_.shape[:-1].concatenate(
       linear_to_mel_weight_matrix.shape[-1:]))
   log_mel_spectrograms = tf.log(mel_spectrograms + 1e-6)
   self.mfcc_ = tf.contrib.signal.mfccs_from_log_mel_spectrograms(
       log_mel_spectrograms)[:, :, :
                             model_settings['num_log_mel_features']]  # :13
Example #16
 def testExisting(self):
   # Read some real PNGs, converting to different channel numbers
   prefix = 'tensorflow/core/lib/png/testdata/'
   inputs = (1, 'lena_gray.png'), (4, 'lena_rgba.png')
   for channels_in, filename in inputs:
     for channels in 0, 1, 3, 4:
       with self.test_session() as sess:
         png0 = io_ops.read_file(prefix + filename)
         image0 = image_ops.decode_png(png0, channels=channels)
         png0, image0 = sess.run([png0, image0])
         self.assertEqual(image0.shape, (26, 51, channels or channels_in))
         if channels == channels_in:
           image1 = image_ops.decode_png(image_ops.encode_png(image0))
           self.assertAllEqual(image0, image1.eval())
Example #17
  def testJpeg(self):
    # Read a real jpeg and verify shape
    path = os.path.join(prefix_path, "jpeg", "testdata", "jpeg_merge_test1.jpg")
    with self.session(use_gpu=True) as sess:
      jpeg0 = io_ops.read_file(path)
      image0 = image_ops.decode_image(jpeg0)
      image1 = image_ops.decode_jpeg(jpeg0)
      jpeg0, image0, image1 = self.evaluate([jpeg0, image0, image1])
      self.assertEqual(len(jpeg0), 3771)
      self.assertEqual(image0.shape, (256, 128, 3))
      self.assertAllEqual(image0, image1)

      bad_channels = image_ops.decode_image(jpeg0, channels=4)
      with self.assertRaises(errors_impl.InvalidArgumentError):
        self.evaluate(bad_channels)
Example #18
def load_wav_file(filename):
  """Loads an audio file and returns a float PCM-encoded array of samples.

  Args:
    filename: Path to the .wav file to load.

  Returns:
    Numpy array holding the sample data as floats between -1.0 and 1.0.
  """
  with tf.Session(graph=tf.Graph()) as sess:
    wav_filename_placeholder = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(wav_filename_placeholder)
    wav_decoder = contrib_audio.decode_wav(wav_loader, desired_channels=1)
    return sess.run(
        wav_decoder,
        feed_dict={wav_filename_placeholder: filename}).audio.flatten()
Example #19
  def get_unprocessed_data(self, how_many, model_settings, mode):
    """Retrieve sample data for the given partition, with no transformations.

    Args:
      how_many: Desired number of samples to return. -1 means the entire
        contents of this partition.
      model_settings: Information about the current model being trained.
      mode: Which partition to use, must be 'training', 'validation', or
        'testing'.

    Returns:
      List of sample data for the samples, and list of labels in one-hot form.
    """
    candidates = self.data_index[mode]
    if how_many == -1:
      sample_count = len(candidates)
    else:
      sample_count = how_many
    desired_samples = model_settings['desired_samples']
    words_list = self.words_list
    data = np.zeros((sample_count, desired_samples))
    labels = []
    with tf.Session(graph=tf.Graph()) as sess:
      wav_filename_placeholder = tf.placeholder(tf.string, [])
      wav_loader = io_ops.read_file(wav_filename_placeholder)
      wav_decoder = contrib_audio.decode_wav(
          wav_loader, desired_channels=1, desired_samples=desired_samples)
      foreground_volume_placeholder = tf.placeholder(tf.float32, [])
      scaled_foreground = tf.multiply(wav_decoder.audio,
                                      foreground_volume_placeholder)
      for i in range(sample_count):
        if how_many == -1:
          sample_index = i
        else:
          sample_index = np.random.randint(len(candidates))
        sample = candidates[sample_index]
        input_dict = {wav_filename_placeholder: sample['file']}
        if sample['label'] == SILENCE_LABEL:
          input_dict[foreground_volume_placeholder] = 0
        else:
          input_dict[foreground_volume_placeholder] = 1
        data[i, :] = sess.run(scaled_foreground, feed_dict=input_dict).flatten()
        label_index = self.word_to_index[sample['label']]
        labels.append(words_list[label_index])
    return data, labels
Example #20
 def prepare_background_data(self):
   """Searches a folder for background noise audio, and loads it into memory"""
   self.background_data = []
   background_dir = os.path.join(self.data_dirs[0], BACKGROUND_NOISE_DIR_NAME)
   if not os.path.exists(background_dir):
     return self.background_data
   with tf.Session(graph=tf.Graph()) as sess:
     wav_filename_placeholder = tf.placeholder(tf.string, [])
     wav_loader = io_ops.read_file(wav_filename_placeholder)
     wav_decoder = contrib_audio.decode_wav(wav_loader, desired_channels=1)
     search_path = os.path.join(self.data_dirs[0], BACKGROUND_NOISE_DIR_NAME,
                                '*.wav')
     for wav_path in gfile.Glob(search_path):
       wav_data = sess.run(
           wav_decoder, feed_dict={
               wav_filename_placeholder: wav_path
           }).audio.flatten()
       self.background_data.append(wav_data)
     if not self.background_data:
       raise Exception('No background wav files were found in ' + search_path)
Example #21
  def Solve(self, input_image_path, checkpoint_path):
    raw_images = tf.image.decode_jpeg(
        io_ops.read_file(input_image_path),
        channels=1)
    resized_images = tf.image.resize_images(
        raw_images,
        HEIGHT, WIDTH,
        method=tf.image.ResizeMethod.AREA)
    flattened_images = tf.reshape(resized_images, [1, HEIGHT * WIDTH])
    normalized_images = (255.0 - tf.to_float(flattened_images)) / 255.0
    with tf.Session() as sess:
      normalized_images = sess.run(normalized_images)
    DisplaySketch(normalized_images[0])

    inputs = {self.x: normalized_images}
    if hasattr(self, 'keep_prob'):
      inputs[self.keep_prob] = 1.0  # keep_prob of 1.0 disables dropout at inference time.
    with tf.Session() as sess:
      saver = tf.train.Saver()
      saver.restore(sess, checkpoint_path)
      digits = sess.run(self.recognize_digits, feed_dict=inputs)
      assert len(digits) == 1
      return digits[0]
Example #22
import sys

import numpy as np
import tensorflow as tf
# Assumed import: gen_audio_ops provides decode_wav and audio_spectrogram.
from tensorflow.python.ops import gen_audio_ops as audio_ops
from tensorflow.python.ops import io_ops

if len(sys.argv) < 3:
    raise ValueError("give me a path to model and to a file float32 .dat")
else:
    model_path = sys.argv[1]
    file_path = sys.argv[2]

FRAME_SIZE = 640
FRAME_STRIDE = 320
SAMPLE_RATE = DESIRED_SAMPLES = 16000
NUM_CEP = 10

wav_loader = io_ops.read_file(file_path)
wav_decoder = audio_ops.decode_wav(wav_loader,
                                   desired_channels=1,
                                   desired_samples=DESIRED_SAMPLES)
# Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
spectrograms_power = audio_ops.audio_spectrogram(wav_decoder.audio,
                                                 window_size=FRAME_SIZE,
                                                 stride=FRAME_STRIDE,
                                                 magnitude_squared=True)
USE_POWER = True
if USE_POWER:
    # Warp the linear scale spectrograms into the mel-scale.
    num_spectrogram_bins = spectrograms_power.shape[-1].value
    lower_edge_hertz, upper_edge_hertz, num_mel_bins = 20.0, 4000.0, 40
    linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins, num_spectrogram_bins, 16000.0, lower_edge_hertz,
Example #23
model = load_model('checkpoints_186/ep-053-vl-0.2915.hdf5',
                   custom_objects={'relu6': relu6,
                                   'DepthwiseConv2D': DepthwiseConv2D,
                                   'overlapping_time_slice_stack':
                                   overlapping_time_slice_stack,
                                   'softmax': softmax,
                                   '<lambda>':
                                   smooth_categorical_crossentropy})

# rename placeholders for special prize:
# https://www.kaggle.com/c/tensorflow-speech-recognition-challenge#Prizes
# decoded_sample_data:0, taking a [16000, 1] float tensor as input,
# representing the audio PCM-encoded data.
wav_filename_placeholder_ = tf.placeholder(tf.string, [], name='wav_fn')
wav_loader = io_ops.read_file(wav_filename_placeholder_)
wav_decoder = contrib_audio.decode_wav(
    wav_loader, desired_channels=1, desired_samples=16000,
    name='decoded_sample_data')
# add batch dimension and remove last one
# keras model wants (None, 16000)
data_reshaped = tf.reshape(wav_decoder.audio, (1, -1))
# call keras model
all_probs = model(data_reshaped)
# remove batch dimension
all_probs = tf.reshape(all_probs, (-1, ))
# map classes to 12 wanted classes:
# 'silence unknown', 'stop down off right up go on yes left no'
# models were trained with 32 classes (including the known unknowns):
# 'silence unknown', 'sheila nine stop bed four six down bird marvin cat off right seven eight up three happy go zero on wow dog yes five one tree house two left no'  # noqa
# Note: This is NOT simply summing up the probabilities for
Example #24
 def testInvalidUTF8ProducesReasonableError(self):
   if sys.version_info[0] < 3:
     self.skipTest("Test is only valid in python3.")
   with self.assertRaises(UnicodeDecodeError):
     io_ops.read_file(b"\xff")
Example #25
 def _restore_op(self, iterator_resource):
   iterator_state_variant = parsing_ops.parse_tensor(
       io_ops.read_file(self._iterator_checkpoint_prefix()), dtypes.variant)
   restore_op = gen_dataset_ops.deserialize_iterator(iterator_resource,
                                                     iterator_state_variant)
   return restore_op
Example #26
  def get_unprocessed_data(self, how_many, model_settings, mode):
    """Retrieve sample data for the given partition, with no transformations.
    Retrieves samples from the given set without noise processing.

    Args:
      how_many: Desired number of samples to return. -1 means the entire
        contents of this partition.
      model_settings: Information about the current model being trained.
      mode: Which partition to use, must be 'training', 'validation', or
        'testing'.

    Returns:
      List of sample data for the samples, and list of labels in one-hot form.
    """
    candidates = self.data_index[mode]
	# self.data_index = {'validation': [], 'testing': [], 'training': []}
    if how_many == -1:
      sample_count = len(candidates)
    else:
      sample_count = how_many
    desired_samples = model_settings['desired_samples']
	# desired_samples =16000
    words_list = self.words_list
    # words_list = ['_silence_', '_unknown_', 'yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']
    data = np.zeros((sample_count, desired_samples))
    # data is an all-zeros matrix with sample_count rows and 16000 columns.
    labels = []
    with tf.Session(graph=tf.Graph()) as sess:
      wav_filename_placeholder = tf.placeholder(tf.string, [])
      wav_loader = io_ops.read_file(wav_filename_placeholder)
      wav_decoder = contrib_audio.decode_wav(
          wav_loader, desired_channels=1, desired_samples=desired_samples)
      foreground_volume_placeholder = tf.placeholder(tf.float32, [])
      # foreground_volume controls the foreground audio volume.
      scaled_foreground = tf.multiply(wav_decoder.audio,
                                      foreground_volume_placeholder)
      # scaled_foreground is the volume-adjusted foreground audio: the sample
      # values multiplied by a scalar volume.
      # wav_decoder is the result of decode_wav; its 'audio' field holds
      # float values between -1 and 1 that represent the clip, and it also
      # carries the sample rate (decode_wav returns (audio, sample_rate)).

      for i in range(sample_count):
        if how_many == -1:
          sample_index = i
        else:
          sample_index = np.random.randint(len(candidates))
        sample = candidates[sample_index]
        # The lines above pick a random sample from the partition.
        # Below, the feed dict for this sample is assembled.
        input_dict = {wav_filename_placeholder: sample['file']}
        if sample['label'] == SILENCE_LABEL:
          # sample is a dict with keys such as 'file' and 'label'.
          input_dict[foreground_volume_placeholder] = 0
        else:
          input_dict[foreground_volume_placeholder] = 1
		  
        data[i, :] = sess.run(scaled_foreground, feed_dict=input_dict).flatten()
        # The line above runs the volume-scaling graph for this sample and
        # stores the flattened result in row i of data.
        label_index = self.word_to_index[sample['label']]
        labels.append(words_list[label_index])
        # words_list = ['_silence_', '_unknown_', 'yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']
    return data, labels
Example #27
def path_to_string_content(path, max_length):
    txt = io_ops.read_file(path)
    if max_length is not None:
        txt = string_ops.substr(txt, 0, max_length)
    return txt
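This helper is typically mapped over a dataset of file paths; a minimal sketch (the file names and max_length value are illustrative assumptions):

import tensorflow as tf

# Sketch only: file names and max_length are illustrative.
paths = tf.data.Dataset.from_tensor_slices(['a.txt', 'b.txt'])
texts = paths.map(lambda path: path_to_string_content(path, max_length=1000))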
Example #28
  def prepare_processing_graph(self, model_settings, summaries_dir):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
      summaries_dir: Path to save training summary information to.
	  
    The method sets three attributes; the most important is self.output_,
    which holds the features that are finally fed to the neural network:
      - self.output_
      - self.merged_summaries_
      - self.summary_writer_

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
    """
    with tf.get_default_graph().name_scope('data'):
      desired_samples = model_settings['desired_samples']
      self.wav_filename_placeholder_ = tf.placeholder(
          tf.string, [], name='wav_filename')
      wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
      wav_decoder = contrib_audio.decode_wav(
          wav_loader, desired_channels=1, desired_samples=desired_samples)
      # For reference, the decode_wav documentation reads:
      #   def decode_wav(contents, desired_channels=-1, desired_samples=-1,
      #                  name=None):
      #   Decode a 16-bit PCM WAV file to a float tensor.
      #   The -32768 to 32767 signed 16-bit values will be scaled to -1.0 to
      #   1.0 in float.
      #   Args:
      #     contents: A `Tensor` of type `string`. The WAV-encoded audio,
      #       usually from a file.
      #     desired_channels: An optional `int`. Defaults to `-1`. Number of
      #       sample channels wanted.
      #     desired_samples: An optional `int`. Defaults to `-1`. Length of
      #       audio requested.
      #     name: A name for the operation (optional).
      #   Returns:
      #     A tuple of `Tensor` objects (audio, sample_rate).
      #       audio: A `Tensor` of type `float32`.
      #       sample_rate: A `Tensor` of type `int32`.
      # So wav_decoder.audio is a float32 tensor of values between -1 and 1
      # that represents the WAV clip.
		  
      # Allow the audio sample's volume to be adjusted.
      self.foreground_volume_placeholder_ = tf.placeholder(
          tf.float32, [], name='foreground_volume')
      scaled_foreground = tf.multiply(wav_decoder.audio,
                                      self.foreground_volume_placeholder_)
Example #29
    def prepare_processing_graph(self, flags):
        """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - foreground_resampling_placeholder_: Controls signal stretching/squeezing
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio or raw audio.

    Args:
      flags: data and model parameters, described at model_train.py

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
      Exception: If the preprocessor wasn't compiled in.
    """
        with tf.get_default_graph().name_scope('data'):
            desired_samples = flags.desired_samples
            self.wav_filename_placeholder_ = tf.placeholder(
                tf.string, [], name='wav_filename')
            wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
            wav_decoder = tf.audio.decode_wav(wav_loader,
                                              desired_channels=1,
                                              desired_samples=desired_samples)

            # Allow the audio sample's volume to be adjusted.
            self.foreground_volume_placeholder_ = tf.placeholder(
                tf.float32, [], name='foreground_volume')
            # signal resampling to generate more training data
            # it will stretch or squeeze input signal proportionally to:
            self.foreground_resampling_placeholder_ = tf.placeholder(
                tf.float32, [])
            wav_resampled = resample(wav_decoder.audio,
                                     self.foreground_resampling_placeholder_,
                                     desired_samples)
            scaled_foreground = tf.multiply(
                wav_resampled, self.foreground_volume_placeholder_)

            # Shift the sample's start position, and pad any gaps with zeros.
            self.time_shift_padding_placeholder_ = tf.placeholder(
                tf.int32, [2, 2], name='time_shift_padding')
            self.time_shift_offset_placeholder_ = tf.placeholder(
                tf.int32, [2], name='time_shift_offset')
            sliced_foreground, padded_foreground = shift_in_time(
                scaled_foreground, self.time_shift_padding_placeholder_,
                self.time_shift_offset_placeholder_, desired_samples)

            # Mix in background noise.
            self.background_data_placeholder_ = tf.placeholder(
                tf.float32, [desired_samples, 1], name='background_data')
            self.background_volume_placeholder_ = tf.placeholder(
                tf.float32, [], name='background_volume')
            background_mul = tf.multiply(self.background_data_placeholder_,
                                         self.background_volume_placeholder_)
            background_add = tf.add(background_mul, sliced_foreground)
            background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)

            # if flags.preprocess == 'raw':
            # background_clamp dims: [time, channels]
            # remove channel dim
            self.output_ = tf.squeeze(background_clamp, axis=1)
Example #30
    def prepare_processing_graph(self, flags):
        """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - foreground_resampling_placeholder_: Controls signal stretching/squeezing
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio or raw audio.

    Args:
      flags: data and model parameters, described at model_train.py

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
      Exception: If the preprocessor wasn't compiled in.
    """
        with tf.get_default_graph().name_scope('data'):
            desired_samples = flags.desired_samples
            self.wav_filename_placeholder_ = tf.placeholder(
                tf.string, [], name='wav_filename')
            wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
            wav_decoder = tf.audio.decode_wav(wav_loader,
                                              desired_channels=1,
                                              desired_samples=desired_samples)

            # Allow the audio sample's volume to be adjusted.
            self.foreground_volume_placeholder_ = tf.placeholder(
                tf.float32, [], name='foreground_volume')
            # signal resampling to generate more training data
            # it will stretch or squeeze input signal proportionally to:
            self.foreground_resampling_placeholder_ = tf.placeholder(
                tf.float32, [])

            if self.foreground_resampling_placeholder_ != 1.0:
                image = tf.expand_dims(wav_decoder.audio, 0)
                image = tf.expand_dims(image, 2)
                shape = tf.shape(wav_decoder.audio)
                image_resized = tf.image.resize(
                    images=image,
                    size=(tf.cast((tf.cast(shape[0], tf.float32) *
                                   self.foreground_resampling_placeholder_),
                                  tf.int32), 1),
                    preserve_aspect_ratio=False)
                image_resized_cropped = tf.image.resize_with_crop_or_pad(
                    image_resized,
                    target_height=desired_samples,
                    target_width=1,
                )
                image_resized_cropped = tf.squeeze(image_resized_cropped,
                                                   axis=[0, 3])
                scaled_foreground = tf.multiply(
                    image_resized_cropped, self.foreground_volume_placeholder_)
            else:
                scaled_foreground = tf.multiply(
                    wav_decoder.audio, self.foreground_volume_placeholder_)
            # Shift the sample's start position, and pad any gaps with zeros.
            self.time_shift_padding_placeholder_ = tf.placeholder(
                tf.int32, [2, 2], name='time_shift_padding')
            self.time_shift_offset_placeholder_ = tf.placeholder(
                tf.int32, [2], name='time_shift_offset')
            padded_foreground = tf.pad(
                tensor=scaled_foreground,
                paddings=self.time_shift_padding_placeholder_,
                mode='CONSTANT')
            sliced_foreground = tf.slice(padded_foreground,
                                         self.time_shift_offset_placeholder_,
                                         [desired_samples, -1])
            # Mix in background noise.
            self.background_data_placeholder_ = tf.placeholder(
                tf.float32, [desired_samples, 1], name='background_data')
            self.background_volume_placeholder_ = tf.placeholder(
                tf.float32, [], name='background_volume')
            background_mul = tf.multiply(self.background_data_placeholder_,
                                         self.background_volume_placeholder_)
            background_add = tf.add(background_mul, sliced_foreground)
            background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)

            if flags.preprocess == 'raw':
                # background_clamp dims: [time, channels]
                # remove channel dim
                self.output_ = tf.squeeze(background_clamp, axis=1)
            # below options are for backward compatibility with previous
            # version of hotword detection on microcontrollers
            # in this case audio feature extraction is done separately from
            # neural net and user will have to manage it.
            elif flags.preprocess == 'mfcc':
                # Run the spectrogram and MFCC ops to get a 2D audio: Short-time FFTs
                # background_clamp dims: [time, channels]
                spectrogram = audio_ops.audio_spectrogram(
                    background_clamp,
                    window_size=flags.window_size_samples,
                    stride=flags.window_stride_samples,
                    magnitude_squared=flags.fft_magnitude_squared)
                # spectrogram: [channels/batch, frames, fft_feature]

                # extract mfcc features from spectrogram by audio_ops.mfcc:
                # 1 Input is spectrogram frames.
                # 2 Weighted spectrogram into bands using a triangular mel filterbank
                # 3 Logarithmic scaling
                # 4 Discrete cosine transform (DCT), return lowest dct_coefficient_count
                mfcc = audio_ops.mfcc(
                    spectrogram=spectrogram,
                    sample_rate=flags.sample_rate,
                    upper_frequency_limit=flags.mel_upper_edge_hertz,
                    lower_frequency_limit=flags.mel_lower_edge_hertz,
                    filterbank_channel_count=flags.mel_num_bins,
                    dct_coefficient_count=flags.dct_num_features)
                # mfcc: [channels/batch, frames, dct_coefficient_count]
                # remove channel dim
                self.output_ = tf.squeeze(mfcc, axis=0)
            elif flags.preprocess == 'micro':
                if not frontend_op:
                    raise Exception(
                        'Micro frontend op is currently not available when running'
                        ' TensorFlow directly from Python, you need to build and run'
                        ' through Bazel')
                int16_input = tf.cast(
                    tf.multiply(background_clamp, MAX_ABS_INT16), tf.int16)
                # audio_microfrontend does:
                # 1. A slicing window function of raw audio
                # 2. Short-time FFTs
                # 3. Filterbank calculations
                # 4. Noise reduction
                # 5. PCAN Auto Gain Control
                # 6. Logarithmic scaling

                # int16_input dims: [time, channels]
                micro_frontend = frontend_op.audio_microfrontend(
                    int16_input,
                    sample_rate=flags.sample_rate,
                    window_size=flags.window_size_ms,
                    window_step=flags.window_stride_ms,
                    num_channels=flags.mel_num_bins,
                    upper_band_limit=flags.mel_upper_edge_hertz,
                    lower_band_limit=flags.mel_lower_edge_hertz,
                    out_scale=1,
                    out_type=tf.float32)
                # int16_input dims: [frames, num_channels]
                self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
            else:
                raise ValueError(
                    'Unknown preprocess mode "%s" (should be "raw", '
                    ' "mfcc", or "micro")' % (flags.preprocess))
Example #31
  def prepare_processing_graph(self, model_settings):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - mfcc_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
    """
    desired_samples = model_settings['desired_samples']
    self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(
        wav_loader, desired_channels=1, desired_samples=desired_samples)
    # Allow the audio sample's volume to be adjusted.
    self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    self.foreground_volume_placeholder_)
    # Shift the sample's start position, and pad any gaps with zeros.
    self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
    self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    padded_foreground = tf.pad(
        scaled_foreground,
        self.time_shift_padding_placeholder_,
        mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground,
                                 self.time_shift_offset_placeholder_,
                                 [desired_samples, -1])
    # Mix in background noise.
    self.background_data_placeholder_ = tf.placeholder(tf.float32,
                                                       [desired_samples, 1])
    self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
    background_mul = tf.multiply(self.background_data_placeholder_,
                                 self.background_volume_placeholder_)
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    spectrograms_power = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)

    if USE_POWER:
      # Warp the linear scale spectrograms into the mel-scale.
      num_spectrogram_bins = spectrograms_power.shape[-1].value
      lower_edge_hertz, upper_edge_hertz, num_mel_bins = 20.0, 4000.0, 40
      linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins, num_spectrogram_bins, 16000.0, lower_edge_hertz,
        upper_edge_hertz)
      mel_spectrograms = tf.tensordot(
        spectrograms_power, linear_to_mel_weight_matrix, 1)
      mel_spectrograms.set_shape(spectrograms_power.shape[:-1].concatenate(
        linear_to_mel_weight_matrix.shape[-1:]))

      # Compute a stabilized log to get log-magnitude mel-scale spectrograms.
      log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6)

      # Compute MFCCs from log_mel_spectrograms and take the first NDCT.
      mfccs = tf.signal.mfccs_from_log_mel_spectrograms(
        log_mel_spectrograms)[..., :model_settings['dct_coefficient_count']]
      self.mfcc_ = tf.expand_dims(mfccs, axis=0)
    else:
      self.mfcc_ = contrib_audio.mfcc(
          spectrograms_power,
          wav_decoder.sample_rate,
          dct_coefficient_count=model_settings['dct_coefficient_count'])

  def set_size(self, mode):
    """Calculates the number of samples in the dataset partition.

    Args:
      mode: Which partition, must be 'training', 'validation', or 'testing'.

    Returns:
      Number of samples in the partition.
    """
    return len(self.data_index[mode])

  def get_data(self, how_many, offset, model_settings, background_frequency,
               background_volume_range, time_shift, mode, sess):
    """Gather samples from the data set, applying transformations as needed.

    When the mode is 'training', a random selection of samples will be returned,
    otherwise the first N clips in the partition will be used. This ensures that
    validation always uses the same samples, reducing noise in the metrics.

    Args:
      how_many: Desired number of samples to return. -1 means the entire
        contents of this partition.
      offset: Where to start when fetching deterministically.
      model_settings: Information about the current model being trained.
      background_frequency: How many clips will have background noise, 0.0 to
        1.0.
      background_volume_range: How loud the background noise will be.
      time_shift: How much to randomly shift the clips by in time.
      mode: Which partition to use, must be 'training', 'validation', or
        'testing'.
      sess: TensorFlow session that was active when processor was created.

    Returns:
      List of sample data for the transformed samples, and list of labels in
      one-hot form.
    """
    # Pick one of the partitions to choose samples from.
    candidates = self.data_index[mode]
    if how_many == -1:
      sample_count = len(candidates)
    else:
      sample_count = max(0, min(how_many, len(candidates) - offset))
    # Data and labels will be populated and returned.
    data = np.zeros((sample_count, model_settings['fingerprint_size']))
    labels = np.zeros((sample_count, model_settings['label_count']))
    desired_samples = model_settings['desired_samples']
    use_background = self.background_data and (mode == 'training')
    pick_deterministically = (mode != 'training')
    # Use the processing graph we created earlier to repeatedly generate the
    # final output sample data we'll use in training.
    for i in xrange(offset, offset + sample_count):
      # Pick which audio sample to use.
      if how_many == -1 or pick_deterministically:
        sample_index = i
      else:
        sample_index = np.random.randint(len(candidates))
      sample = candidates[sample_index]
      # If we're time shifting, set up the offset for this sample.
      if time_shift > 0:
        time_shift_amount = np.random.randint(-time_shift, time_shift)
      else:
        time_shift_amount = 0
      if time_shift_amount > 0:
        time_shift_padding = [[time_shift_amount, 0], [0, 0]]
        time_shift_offset = [0, 0]
      else:
        time_shift_padding = [[0, -time_shift_amount], [0, 0]]
        time_shift_offset = [-time_shift_amount, 0]
      input_dict = {
          self.wav_filename_placeholder_: sample['file'],
          self.time_shift_padding_placeholder_: time_shift_padding,
          self.time_shift_offset_placeholder_: time_shift_offset,
      }
      # Choose a section of background noise to mix in.
      if use_background:
        background_index = np.random.randint(len(self.background_data))
        background_samples = self.background_data[background_index]
        background_offset = np.random.randint(
            0, len(background_samples) - model_settings['desired_samples'])
        background_clipped = background_samples[background_offset:(
            background_offset + desired_samples)]
        background_reshaped = background_clipped.reshape([desired_samples, 1])
        if np.random.uniform(0, 1) < background_frequency:
          background_volume = np.random.uniform(0, background_volume_range)
        else:
          background_volume = 0
      else:
        background_reshaped = np.zeros([desired_samples, 1])
        background_volume = 0
      input_dict[self.background_data_placeholder_] = background_reshaped
      input_dict[self.background_volume_placeholder_] = background_volume
      # If we want silence, mute out the main sample but leave the background.
      if sample['label'] == SILENCE_LABEL:
        input_dict[self.foreground_volume_placeholder_] = 0
      else:
        input_dict[self.foreground_volume_placeholder_] = 1
      # Run the graph to produce the output audio.
      data[i - offset, :] = sess.run(self.mfcc_, feed_dict=input_dict).flatten()
      label_index = self.word_to_index[sample['label']]
      labels[i - offset, label_index] = 1
    return data, labels

  def get_wav_files(self, how_many, offset, model_settings, mode):
    """Return wav_file names and labels from train/val/test sets.
    """
    # Pick one of the partitions to choose samples from.
    candidates = self.data_index[mode]
    if how_many == -1:
      sample_count = len(candidates)
    else:
      sample_count = max(0, min(how_many, len(candidates) - offset))
    pick_deterministically = (mode != 'training')
    wav_files = []
    labels = np.zeros((sample_count, model_settings['label_count']))
    for i in xrange(offset, offset + sample_count):
      # Pick which audio sample to use.
      if how_many == -1 or pick_deterministically:
        sample_index = i
      else:
        sample_index = np.random.randint(len(candidates))
      sample = candidates[sample_index]
      if sample['label'] == SILENCE_LABEL:
        wav_files.append('silence.wav')
      else:
        wav_files.append(sample['file'])
      label_index = self.word_to_index[sample['label']]
      labels[i - offset, label_index] = 1
    return wav_files, labels


  def get_unprocessed_data(self, how_many, model_settings, mode):
    """Retrieve sample data for the given partition, with no transformations.

    Args:
      how_many: Desired number of samples to return. -1 means the entire
        contents of this partition.
      model_settings: Information about the current model being trained.
      mode: Which partition to use, must be 'training', 'validation', or
        'testing'.

    Returns:
      List of sample data for the samples, and list of labels in one-hot form.
    """
    candidates = self.data_index[mode]
    if how_many == -1:
      sample_count = len(candidates)
    else:
      sample_count = how_many
    desired_samples = model_settings['desired_samples']
    words_list = self.words_list
    data = np.zeros((sample_count, desired_samples))
    labels = []
    with tf.Session(graph=tf.Graph()) as sess:
      wav_filename_placeholder = tf.placeholder(tf.string, [])
      wav_loader = io_ops.read_file(wav_filename_placeholder)
      wav_decoder = contrib_audio.decode_wav(
          wav_loader, desired_channels=1, desired_samples=desired_samples)
      foreground_volume_placeholder = tf.placeholder(tf.float32, [])
      scaled_foreground = tf.multiply(wav_decoder.audio,
                                      foreground_volume_placeholder)
      for i in range(sample_count):
        if how_many == -1:
          sample_index = i
        else:
          sample_index = np.random.randint(len(candidates))
        sample = candidates[sample_index]
        input_dict = {wav_filename_placeholder: sample['file']}
        if sample['label'] == SILENCE_LABEL:
          input_dict[foreground_volume_placeholder] = 0
        else:
          input_dict[foreground_volume_placeholder] = 1
        data[i, :] = sess.run(scaled_foreground, feed_dict=input_dict).flatten()
        label_index = self.word_to_index[sample['label']]
        labels.append(words_list[label_index])
    return data, labels
Example #32
  def _evalDecodeJpeg(self,
                      image_name,
                      parallelism,
                      num_iters,
                      crop_during_decode=None,
                      crop_window=None,
                      tile=None):
    """Evaluate DecodeJpegOp for the given image.

    TODO(tanmingxing): add decoding+cropping as well.

    Args:
      image_name: a string of image file name (without suffix).
      parallelism: the number of concurrent decode_jpeg ops to be run.
      num_iters: number of iterations for evaluation.
      crop_during_decode: If true, use fused DecodeAndCropJpeg instead of
          separate decode and crop ops. It is ignored if crop_window is None.
      crop_window: if not None, crop the decoded image. Depending on
          crop_during_decode, cropping could happen during or after decoding.
      tile: if not None, tile the image to composite a larger fake image.

    Returns:
      The duration of the run in seconds.
    """
    ops.reset_default_graph()

    image_file_path = os.path.join(prefix_path, image_name)

    if tile is None:
      image_content = variable_scope.get_variable(
          'image_%s' % image_name,
          initializer=io_ops.read_file(image_file_path))
    else:
      single_image = image_ops.decode_jpeg(
          io_ops.read_file(image_file_path), channels=3, name='single_image')
      # Tile the image to composite a new larger image.
      tiled_image = array_ops.tile(single_image, tile)
      image_content = variable_scope.get_variable(
          'tiled_image_%s' % image_name,
          initializer=image_ops.encode_jpeg(tiled_image))

    with session.Session() as sess:
      self.evaluate(variables.global_variables_initializer())
      images = []
      for _ in xrange(parallelism):
        if crop_window is None:
          # No crop.
          image = image_ops.decode_jpeg(image_content, channels=3)
        elif crop_during_decode:
          # combined decode and crop.
          image = image_ops.decode_and_crop_jpeg(
              image_content, crop_window, channels=3)
        else:
          # separate decode and crop.
          image = image_ops.decode_jpeg(image_content, channels=3)
          image = image_ops.crop_to_bounding_box(
              image,
              offset_height=crop_window[0],
              offset_width=crop_window[1],
              target_height=crop_window[2],
              target_width=crop_window[3])

        images.append(image)
      r = control_flow_ops.group(*images)

      for _ in xrange(3):
        # Skip warm up time.
        self.evaluate(r)

      start_time = time.time()
      for _ in xrange(num_iters):
        self.evaluate(r)
      end_time = time.time()
    return end_time - start_time
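
For context, a hedged sketch of how a helper like this is typically driven from
a tf.test.Benchmark subclass. The image name, crop window, and iteration counts
are illustrative assumptions, not values from the source.

  # Hypothetical benchmark driver on the same class as _evalDecodeJpeg above.
  def benchmarkDecodeJpeg(self):
    num_iters = 50
    for parallelism in [1, 10, 100]:
      duration = self._evalDecodeJpeg(
          'medium.jpg',                    # illustrative file under prefix_path
          parallelism,
          num_iters,
          crop_during_decode=True,
          crop_window=[10, 10, 87, 87])    # illustrative [y, x, height, width]
      self.report_benchmark(
          name='decode_jpeg_parallel_%d' % parallelism,
          iters=num_iters,
          wall_time=duration / num_iters)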
Пример #33
0
def path_to_image(path, image_size, num_channels, interpolation):
  img = io_ops.read_file(path)
  img = image_ops.decode_image(
      img, channels=num_channels, expand_animations=False)
  return image_ops.resize_images_v2(img, image_size, method=interpolation)
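
A small, hedged sketch of how path_to_image is typically mapped over a list of
file paths with tf.data; the paths, image size, and interpolation choice here
are illustrative assumptions.

import tensorflow as tf

# Hypothetical file list; in a real pipeline these come from listing a directory.
paths = ['/tmp/images/a.jpg', '/tmp/images/b.png']
dataset = tf.data.Dataset.from_tensor_slices(paths).map(
    lambda p: path_to_image(p, image_size=(180, 180), num_channels=3,
                            interpolation='bilinear'))
for img in dataset.take(1):
  print(img.shape)  # (180, 180, 3); resize_images_v2 returns float32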
Пример #34
0
    def prepare_processing_graph(self, model_settings):
        """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - mfcc_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
    """
        desired_samples = model_settings['desired_samples']
        self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
        wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
        wav_decoder = contrib_audio.decode_wav(wav_loader,
                                               desired_channels=1,
                                               desired_samples=desired_samples)
        # Allow the audio sample's volume to be adjusted.
        self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
        scaled_foreground = tf.multiply(wav_decoder.audio,
                                        self.foreground_volume_placeholder_)
        # Shift the sample's start position, and pad any gaps with zeros.
        self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
        self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
        padded_foreground = tf.pad(scaled_foreground,
                                   self.time_shift_padding_placeholder_,
                                   mode='CONSTANT')
        sliced_foreground = tf.slice(padded_foreground,
                                     self.time_shift_offset_placeholder_,
                                     [desired_samples, -1])
        # Mix in background noise.
        self.background_data_placeholder_ = tf.placeholder(
            tf.float32, [desired_samples, 1])
        self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
        background_mul = tf.multiply(self.background_data_placeholder_,
                                     self.background_volume_placeholder_)
        background_add = tf.add(background_mul, sliced_foreground)
        background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
        # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
        spectrogram = contrib_audio.audio_spectrogram(
            background_clamp,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True)
        self.spectrogram = spectrogram

        self.mfcc_ = contrib_audio.mfcc(
            spectrogram,
            wav_decoder.sample_rate,
            dct_coefficient_count=model_settings['dct_coefficient_count'])
        print(self.mfcc_)
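
A hedged sketch of feeding the placeholders created above to compute one
fingerprint. The WAV path is illustrative, and an active session sess is
assumed, as the docstring requires; compare the explicit feed dict in
Пример #45 further down this listing.

# Hypothetical single-clip evaluation against the graph built above.
import numpy as np
desired_samples = model_settings['desired_samples']
feed = {
    self.wav_filename_placeholder_: '/tmp/speech/yes/clip.wav',   # illustrative
    self.foreground_volume_placeholder_: 1.0,
    self.time_shift_padding_placeholder_: [[0, 0], [0, 0]],
    self.time_shift_offset_placeholder_: [0, 0],
    self.background_data_placeholder_: np.zeros([desired_samples, 1]),
    self.background_volume_placeholder_: 0.0,
}
fingerprint = sess.run(self.mfcc_, feed_dict=feed)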
Пример #35
0
 def read_file(self):
     return io_ops.read_file(self.asset)
Пример #36
0
    def prepare_processing_graph(self, model_settings, summaries_dir):
        """
    建立张量流图以应用输入失真。
    创建一个图形,加载一个WAVE文件,对其进行解码、缩放体积、平移,
    添加背景噪声,计算一个声谱图,然后从中生成MFCC特征。
    必须在TensorFlow会话运行时调用它,它会创建多个占位符输入和一个输出::

      - wav_filename_placeholder_: 音频文件名
      - foreground_volume_placeholder_: 主剪辑的声音应该有多大
      - time_shift_padding_placeholder_: 在哪个位置剪辑
      - time_shift_offset_placeholder_: 在剪辑上移动多少
      - background_data_placeholder_: 背景噪声的PCM采样数据
      - background_volume_placeholder_: 背景中混音的响度
      - output_: 经过处理后的二维输出

    Args:
      model_settings: 正在训练的当前模型信息
      summaries_dir: 保存训练摘要信息的路径
      
    """
        with tf.get_default_graph().name_scope('data'):
            desired_samples = model_settings['desired_samples']
            self.wav_filename_placeholder_ = tf.placeholder(
                tf.string, [], name='wav_filename')
            wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
            wav_decoder = contrib_audio.decode_wav(
                wav_loader,
                desired_channels=1,
                desired_samples=desired_samples)

            # Allow the audio sample's volume to be adjusted.

            self.foreground_volume_placeholder_ = tf.placeholder(
                tf.float32, [], name='foreground_volume')
            scaled_foreground = tf.multiply(
                wav_decoder.audio, self.foreground_volume_placeholder_)

            # Shift the sample's start position, and pad any gaps with zeros.

            self.time_shift_padding_placeholder_ = tf.placeholder(
                tf.int32, [2, 2], name='time_shift_padding')
            self.time_shift_offset_placeholder_ = tf.placeholder(
                tf.int32, [2], name='time_shift_offset')
            padded_foreground = tf.pad(scaled_foreground,
                                       self.time_shift_padding_placeholder_,
                                       mode='CONSTANT')
            sliced_foreground = tf.slice(padded_foreground,
                                         self.time_shift_offset_placeholder_,
                                         [desired_samples, -1])
            # Mix in background noise.
            self.background_data_placeholder_ = tf.placeholder(
                tf.float32, [desired_samples, 1], name='background_data')
            self.background_volume_placeholder_ = tf.placeholder(
                tf.float32, [], name='background_volume')
            background_mul = tf.multiply(self.background_data_placeholder_,
                                         self.background_volume_placeholder_)
            background_add = tf.add(background_mul, sliced_foreground)
            background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)

            # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.

            spectrogram = contrib_audio.audio_spectrogram(
                background_clamp,
                window_size=model_settings['window_size_samples'],
                stride=model_settings['window_stride_samples'],
                magnitude_squared=True)
            tf.summary.image('spectrogram',
                             tf.expand_dims(spectrogram, -1),
                             max_outputs=1)

            # The number of buckets in each FFT row in the spectrogram depends
            # on how many input samples there are in each window. We don't need
            # that level of detail for classification, so we shrink them down
            # to produce a smaller result. One way is to average over adjacent
            # buckets; a more sophisticated approach is to apply the MFCC
            # algorithm to shrink the representation.

            if model_settings['preprocess'] == 'average':
                self.output_ = tf.nn.pool(
                    tf.expand_dims(spectrogram, -1),
                    window_shape=[1, model_settings['average_window_width']],
                    strides=[1, model_settings['average_window_width']],
                    pooling_type='AVG',
                    padding='SAME')
                tf.summary.image('shrunk_spectrogram',
                                 self.output_,
                                 max_outputs=1)
            elif model_settings['preprocess'] == 'mfcc':
                self.output_ = contrib_audio.mfcc(
                    spectrogram,
                    wav_decoder.sample_rate,
                    dct_coefficient_count=model_settings['fingerprint_width'])
                tf.summary.image('mfcc',
                                 tf.expand_dims(self.output_, -1),
                                 max_outputs=1)
            else:
                raise ValueError(
                    'Unknown preprocess mode "%s" (should be "mfcc" or'
                    ' "average")' % (model_settings['preprocess']))

            # Merge all the summaries and write them out to /tmp/retrain_logs.

            self.merged_summaries_ = tf.summary.merge_all(scope='data')
            self.summary_writer_ = tf.summary.FileWriter(
                summaries_dir + '/data', tf.get_default_graph())
Пример #37
0
    def prepare_processing_graph(self, model_settings):
        """Builds a TensorFlow graph to apply the input distortions.
        Creates a graph that loads a WAVE file, decodes it, scales the volume,
        shifts it in time, adds in background noise, calculates a spectrogram, and
        then builds an MFCC fingerprint from that.
        This must be called with an active TensorFlow session running, and it
        creates multiple placeholder inputs, and one output:
          - wav_filename_placeholder_: Filename of the WAV to load.
          - foreground_volume_placeholder_: How loud the main clip should be.
          - time_shift_padding_placeholder_: Where to pad the clip.
          - time_shift_offset_placeholder_: How much to move the clip in time.
          - background_data_placeholder_: PCM sample data for background noise.
          - background_volume_placeholder_: Loudness of mixed-in background.
          - mfcc_: Output 2D fingerprint of processed audio.
        Args:
          model_settings: Information about the current model being trained.
        """
        desired_samples = model_settings['desired_samples']
        self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
        wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
        wav_decoder = contrib_audio.decode_wav(
            wav_loader, desired_channels=1, desired_samples=desired_samples)
        # Allow the audio sample's volume to be adjusted.
        self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
        scaled_foreground = tf.multiply(wav_decoder.audio,
                                        self.foreground_volume_placeholder_)
        # Shift the sample's start position, and pad any gaps with zeros.
        self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
        self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
        padded_foreground = tf.pad(
            scaled_foreground,
            self.time_shift_padding_placeholder_,
            mode='CONSTANT')
        sliced_foreground = tf.slice(padded_foreground,
                                     self.time_shift_offset_placeholder_,
                                     [desired_samples, -1])
        # Mix in background noise.
        self.background_data_placeholder_ = tf.placeholder(tf.float32,
                                                           [desired_samples, 1])
        self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
        background_mul = tf.multiply(self.background_data_placeholder_,
                                     self.background_volume_placeholder_)
        background_add = tf.add(background_mul, sliced_foreground)
        background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
        # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.

#         mel_bias_ = linear_to_mel_weight_matrix(num_mel_bins=model_settings['dct_coefficient_count'],
#             num_spectrogram_bins=int(2048/2+1),
#             sample_rate=model_settings['sample_rate'],
#             lower_edge_hertz=100,
#             upper_edge_hertz=4800)
            #warp_factor=self.warp_factor_placeholder_)

        # spectrogram = tf.abs(tf.contrib.signal.stft(tf.transpose(background_clamp),
        #     model_settings['window_size_samples'],
        #     model_settings['window_stride_samples'],
        #     fft_length=2048,
        #     window_fn=tf.contrib.signal.hann_window,
        #     pad_end=False))

        # self.mfcc_ = tf.matmul(tf.reshape(tf.pow(spectrogram, 2), [-1, 1025]), mel_bias_)
        # #self.mfcc_ = tf.maximum(self.mfcc_, 1e-7)
        # self.mfcc_ = tf.log(tf.maximum(self.mfcc_, 1e-7))

        # print('/n New feature without DCT and Log by iVip-Tsinghua /n hahahahahahahaha /n')

        spectrogram = contrib_audio.audio_spectrogram(
           background_clamp,
           window_size=model_settings['window_size_samples'],
           stride=model_settings['window_stride_samples'],
           magnitude_squared=True)
        self.mfcc_ = contrib_audio.mfcc(
           spectrogram,
           wav_decoder.sample_rate,
           dct_coefficient_count=model_settings['dct_coefficient_count'])
Пример #38
0
  def prepare_processing_graph(self, model_settings):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - mfcc_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
    """
    desired_samples = model_settings['desired_samples']
    self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(
        wav_loader, desired_channels=1, desired_samples=desired_samples)
    # Allow the audio sample's volume to be adjusted.
    self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    self.foreground_volume_placeholder_)
    # Shift the sample's start position, and pad any gaps with zeros.
    self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
    self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    padded_foreground = tf.pad(
        scaled_foreground,
        self.time_shift_padding_placeholder_,
        mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground,
                                 self.time_shift_offset_placeholder_,
                                 [desired_samples, -1])
    # Mix in background noise.
    self.background_data_placeholder_ = tf.placeholder(tf.float32,
                                                       [desired_samples, 1])
    self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
    background_mul = tf.multiply(self.background_data_placeholder_,
                                 self.background_volume_placeholder_)
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    spectrograms_power = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)

    if model_settings['use_power']:
      # Warp the linear scale spectrograms into the mel-scale.
      num_spectrogram_bins = spectrograms_power.shape[-1].value
      lower_edge_hertz, upper_edge_hertz, num_mel_bins = 20.0, 4000.0, 40
      linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins, num_spectrogram_bins, 16000.0, lower_edge_hertz,
        upper_edge_hertz)
      mel_spectrograms = tf.tensordot(
        spectrograms_power, linear_to_mel_weight_matrix, 1)
      mel_spectrograms.set_shape(spectrograms_power.shape[:-1].concatenate(
        linear_to_mel_weight_matrix.shape[-1:]))

      # Compute a stabilized log to get log-magnitude mel-scale spectrograms.
      log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6)

      # Compute MFCCs from log_mel_spectrograms and take the first NDCT.
      mfccs = tf.signal.mfccs_from_log_mel_spectrograms(
        log_mel_spectrograms)[..., :model_settings['dct_coefficient_count']]
      self.mfcc_ = tf.expand_dims(mfccs, axis=0)

    else:
      self.mfcc_ = contrib_audio.mfcc(
         spectrograms_power,
         wav_decoder.sample_rate,
         dct_coefficient_count=model_settings['dct_coefficient_count'])
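
The mel-warp branch above can be sanity-checked in isolation. A minimal
eager-mode sketch of the same tf.signal chain, using the 480/160 window and
stride and the 20 Hz to 4 kHz mel range that appear in these snippets (the
random input is only a stand-in for real audio):

import tensorflow as tf

# Linear spectrogram -> mel -> log -> MFCC, mirroring the branch above.
audio = tf.random.normal([1, 16000])                      # stand-in for 1 s of 16 kHz audio
spectrogram = tf.abs(tf.signal.stft(audio, frame_length=480, frame_step=160)) ** 2
num_spectrogram_bins = spectrogram.shape[-1]              # 257 (fft_length 512)
mel_matrix = tf.signal.linear_to_mel_weight_matrix(
    num_mel_bins=40, num_spectrogram_bins=num_spectrogram_bins,
    sample_rate=16000.0, lower_edge_hertz=20.0, upper_edge_hertz=4000.0)
mel = tf.tensordot(spectrogram, mel_matrix, 1)
log_mel = tf.math.log(mel + 1e-6)
mfccs = tf.signal.mfccs_from_log_mel_spectrograms(log_mel)[..., :40]
print(mfccs.shape)                                        # (1, 98, 40)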
Пример #39
0
    def prepare_processing_graph(self, model_settings, summaries_dir):
        """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
      summaries_dir: Path to save training summary information to.

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
      Exception: If the preprocessor wasn't compiled in.
    """
        with tf.compat.v1.get_default_graph().name_scope('data'):
            desired_samples = model_settings['desired_samples']
            self.wav_filename_placeholder_ = tf.compat.v1.placeholder(
                tf.string, [], name='wav_filename')
            wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
            wav_decoder = tf.audio.decode_wav(wav_loader,
                                              desired_channels=1,
                                              desired_samples=desired_samples)
            # Allow the audio sample's volume to be adjusted.
            self.foreground_volume_placeholder_ = tf.compat.v1.placeholder(
                tf.float32, [], name='foreground_volume')
            scaled_foreground = tf.multiply(
                wav_decoder.audio, self.foreground_volume_placeholder_)
            # Shift the sample's start position, and pad any gaps with zeros.
            self.time_shift_padding_placeholder_ = tf.compat.v1.placeholder(
                tf.int32, [2, 2], name='time_shift_padding')
            self.time_shift_offset_placeholder_ = tf.compat.v1.placeholder(
                tf.int32, [2], name='time_shift_offset')
            padded_foreground = tf.pad(
                tensor=scaled_foreground,
                paddings=self.time_shift_padding_placeholder_,
                mode='CONSTANT')
            sliced_foreground = tf.slice(padded_foreground,
                                         self.time_shift_offset_placeholder_,
                                         [desired_samples, -1])
            # Mix in background noise.
            self.background_data_placeholder_ = tf.compat.v1.placeholder(
                tf.float32, [desired_samples, 1], name='background_data')
            self.background_volume_placeholder_ = tf.compat.v1.placeholder(
                tf.float32, [], name='background_volume')
            background_mul = tf.multiply(self.background_data_placeholder_,
                                         self.background_volume_placeholder_)
            background_add = tf.add(background_mul, sliced_foreground)
            background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)

            # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.

            # spectrogram = audio_ops.audio_spectrogram(
            #     background_clamp,
            #     window_size=model_settings['window_size_samples'],
            #     stride=model_settings['window_stride_samples'],
            #     magnitude_squared=True)

            def periodic_hann_window(window_length, dtype):
                return 0.5 - 0.5 * tf.math.cos(2.0 * np.pi * tf.range(
                    tf.cast(window_length, dtype=dtype),
                    dtype=dtype) / tf.cast(window_length, dtype=dtype))

            signal_stft = tf.signal.stft(
                tf.transpose(background_clamp, [1, 0]),
                frame_length=model_settings['window_size_samples'],
                frame_step=model_settings['window_stride_samples'],
                window_fn=periodic_hann_window)
            signal_spectrograms = tf.abs(signal_stft)
            spectrogram = signal_spectrograms

            tf.compat.v1.summary.image('spectrogram',
                                       tf.expand_dims(spectrogram, -1),
                                       max_outputs=1)
            # The number of buckets in each FFT row in the spectrogram will depend on
            # how many input samples there are in each window. This can be quite
            # large, with a 160 sample window producing 127 buckets for example. We
            # don't need this level of detail for classification, so we often want to
            # shrink them down to produce a smaller result. That's what this section
            # implements. One method is to use average pooling to merge adjacent
            # buckets, but a more sophisticated approach is to apply the MFCC
            # algorithm to shrink the representation.
            if model_settings['preprocess'] == 'average':
                self.output_ = tf.nn.pool(
                    input=tf.expand_dims(spectrogram, -1),
                    window_shape=[1, model_settings['average_window_width']],
                    strides=[1, model_settings['average_window_width']],
                    pooling_type='AVG',
                    padding='SAME')
                tf.compat.v1.summary.image('shrunk_spectrogram',
                                           self.output_,
                                           max_outputs=1)
            elif model_settings['preprocess'] == 'fbank':
                # We just convert the data back to int16 wav format
                # and the actual filterbank processing is performed outside of tensorflow graph
                # in the get_data function
                int16_input = tf.cast(tf.multiply(background_clamp, 32768),
                                      tf.int16)
                # def compute_fbs(int16_wav_input):
                #     fbs, energy = fbank(int16_wav_input, model_settings['sample_rate'],
                #                         nfilt=model_settings['fingerprint_width'],
                #                         winstep=model_settings['window_stride_samples'] / model_settings['sample_rate'],
                #                         winlen=model_settings['window_size_samples'] / model_settings['sample_rate'],
                #                         nfft=1024,
                #                         lowfreq=64)
                #     fbs = np.log(fbs)
                #     energy = np.log(energy)
                #     return np.concatenate([fbs, energy[:, None]], axis=1)
                #
                # log_fbs_with_energy = compute_fbs(int16_input)
                self.output_ = int16_input
                # tf.compat.v1.summary.image(
                #     'fbank', tf.expand_dims(self.output_, -1), max_outputs=1)
            elif model_settings['preprocess'] == 'mfcc':

                # signal_mfccs = audio_ops.mfcc(
                #     spectrogram,
                #     # tf.expand_dims(signal_spectrograms, 0),
                #     wav_decoder.sample_rate,
                #     dct_coefficient_count=model_settings['fingerprint_width'])
                #
                # self.output_ = signal_mfccs
                # print("OLD", signal_mfccs.shape)

                num_spectrogram_bins = signal_stft.shape[-1]

                num_mel_bins = num_mfccs = model_settings['fingerprint_width']
                lower_edge_hertz = 20.0
                upper_edge_hertz = 4000.0
                log_noise_floor = 1e-12
                linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
                    num_mel_bins,
                    num_spectrogram_bins,
                    model_settings['sample_rate'],
                    # lower_edge_hertz, upper_edge_hertz
                )
                mel_spectrograms = tf.tensordot(spectrogram,
                                                linear_to_mel_weight_matrix, 1)
                mel_spectrograms.set_shape(
                    mel_spectrograms.shape[:-1].concatenate(
                        linear_to_mel_weight_matrix.shape[-1:]))

                log_mel_spectrograms = tf.math.log(mel_spectrograms +
                                                   log_noise_floor)
                signal_mfccs = tf.signal.mfccs_from_log_mel_spectrograms(
                    log_mel_spectrograms)[..., :num_mfccs]
                # print("NEW", signal_mfccs.shape)

                self.output_ = signal_mfccs

                tf.compat.v1.summary.image('mfcc',
                                           tf.expand_dims(self.output_, -1),
                                           max_outputs=1)
            elif model_settings['preprocess'] == 'micro':
                if not frontend_op:
                    raise Exception(
                        'Micro frontend op is currently not available when running'
                        ' TensorFlow directly from Python, you need to build and run'
                        ' through Bazel')
                sample_rate = model_settings['sample_rate']
                window_size_ms = (model_settings['window_size_samples'] *
                                  1000) / sample_rate
                window_step_ms = (model_settings['window_stride_samples'] *
                                  1000) / sample_rate
                int16_input = tf.cast(tf.multiply(background_clamp, 32768),
                                      tf.int16)
                micro_frontend = frontend_op.audio_microfrontend(
                    int16_input,
                    sample_rate=sample_rate,
                    window_size=window_size_ms,
                    window_step=window_step_ms,
                    num_channels=model_settings['fingerprint_width'],
                    out_scale=1,
                    out_type=tf.float32)
                self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
                tf.compat.v1.summary.image(
                    'micro',
                    tf.expand_dims(tf.expand_dims(self.output_, -1), 0),
                    max_outputs=1)
            else:
                raise ValueError(
                    'Unknown preprocess mode "%s" (should be "mfcc", '
                    ' "average", or "micro")' % (model_settings['preprocess']))

            # Merge all the summaries and write them out to /tmp/retrain_logs (by
            # default)
            self.merged_summaries_ = tf.compat.v1.summary.merge_all(
                scope='data')
            if summaries_dir:
                self.summary_writer_ = tf.compat.v1.summary.FileWriter(
                    summaries_dir + '/data', tf.compat.v1.get_default_graph())
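
The hand-rolled periodic_hann_window above is intended to match TensorFlow's
built-in periodic Hann window; a quick hedged check (window length 480 is just
an illustrative choice):

import numpy as np
import tensorflow as tf

def periodic_hann_window(window_length, dtype):
    return 0.5 - 0.5 * tf.math.cos(2.0 * np.pi * tf.range(
        tf.cast(window_length, dtype=dtype),
        dtype=dtype) / tf.cast(window_length, dtype=dtype))

custom = periodic_hann_window(480, tf.float32)
builtin = tf.signal.hann_window(480, periodic=True, dtype=tf.float32)
print(np.max(np.abs(custom.numpy() - builtin.numpy())))   # ~1e-7, float32 rounding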
Пример #40
0
    def prepare_processing_graph(self, model_settings, summaries_dir):
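        """Builds a TensorFlow graph to apply the input distortions.

        Loads a WAV file, scales its volume, shifts it in time, mixes in
        background noise, computes a spectrogram, and stores either an
        average-pooled spectrogram or an MFCC fingerprint in self.output_,
        plus merged summaries in self.merged_summaries_.
        """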

        with tf.get_default_graph().name_scope('data'):
            desired_samples = model_settings['desired_samples']
            self.wav_filename_placeholder_ = tf.placeholder(
                tf.string, [], name='wav_filename')
            wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
            wav_decoder = contrib_audio.decode_wav(
                wav_loader,
                desired_channels=1,
                desired_samples=desired_samples)

            self.foreground_volume_placeholder_ = tf.placeholder(
                tf.float32, [], name='foreground_volume')
            scaled_foreground = tf.multiply(
                wav_decoder.audio, self.foreground_volume_placeholder_)

            self.time_shift_padding_placeholder_ = tf.placeholder(
                tf.int32, [2, 2], name='time_shift_padding')
            self.time_shift_offset_placeholder_ = tf.placeholder(
                tf.int32, [2], name='time_shift_offset')
            padded_foreground = tf.pad(scaled_foreground,
                                       self.time_shift_padding_placeholder_,
                                       mode='CONSTANT')
            sliced_foreground = tf.slice(padded_foreground,
                                         self.time_shift_offset_placeholder_,
                                         [desired_samples, -1])

            self.background_data_placeholder_ = tf.placeholder(
                tf.float32, [desired_samples, 1], name='background_data')
            self.background_volume_placeholder_ = tf.placeholder(
                tf.float32, [], name='background_volume')
            background_mul = tf.multiply(self.background_data_placeholder_,
                                         self.background_volume_placeholder_)
            background_add = tf.add(background_mul, sliced_foreground)
            background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)

            spectrogram = contrib_audio.audio_spectrogram(
                background_clamp,
                window_size=model_settings['window_size_samples'],
                stride=model_settings['window_stride_samples'],
                magnitude_squared=True)
            tf.summary.image('spectrogram',
                             tf.expand_dims(spectrogram, -1),
                             max_outputs=1)

            if model_settings['preprocess'] == 'average':
                self.output_ = tf.nn.pool(
                    tf.expand_dims(spectrogram, -1),
                    window_shape=[1, model_settings['average_window_width']],
                    strides=[1, model_settings['average_window_width']],
                    pooling_type='AVG',
                    padding='SAME')
                tf.summary.image('shrunk_spectrogram',
                                 self.output_,
                                 max_outputs=1)
            elif model_settings['preprocess'] == 'mfcc':
                self.output_ = contrib_audio.mfcc(
                    spectrogram,
                    wav_decoder.sample_rate,
                    dct_coefficient_count=model_settings['fingerprint_width'])
                tf.summary.image('mfcc',
                                 tf.expand_dims(self.output_, -1),
                                 max_outputs=1)
            else:
                raise ValueError(
                    'Unknown preprocess mode "%s" (should be "mfcc" or'
                    ' "average")' % (model_settings['preprocess']))

            self.merged_summaries_ = tf.summary.merge_all(scope='data')
            self.summary_writer_ = tf.summary.FileWriter(
                summaries_dir + '/data', tf.get_default_graph())
Пример #41
0
File: id.py Project: cooledge/nn
def load_wav_file(sess, filename):
  filename_ph = tf.placeholder(tf.string)
  loader = io_ops.read_file(filename_ph)
  decoder = contrib_audio.decode_wav(loader, desired_channels=1)
  return sess.run(decoder, feed_dict={filename_ph: filename})
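
A hedged usage sketch for load_wav_file; the path is illustrative, and the
printed shapes depend on the actual file.

with tf.Session() as sess:
  decoded = load_wav_file(sess, '/tmp/speech/yes/clip.wav')   # illustrative path
  # decode_wav returns a namedtuple-like result with .audio and .sample_rate.
  print(decoded.audio.shape, decoded.sample_rate)             # e.g. (16000, 1) 16000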
Пример #42
0
  def prepare_processing_graph(self, model_settings, input_type,
                               spectrogram_magnitude_squared):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - mfcc_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
      input_type: Feature type to generate, either 'log-mel' or 'MFCC'.
      spectrogram_magnitude_squared: If True, use a power spectrogram;
          otherwise use a magnitude spectrogram.
    """
    desired_samples = model_settings['desired_samples']
    self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(
        wav_loader, desired_channels=1, desired_samples=desired_samples)
    # Allow the audio sample's volume to be adjusted.
    self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    self.foreground_volume_placeholder_)
    # Shift the sample's start position, and pad any gaps with zeros.
    self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
    self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    padded_foreground = tf.pad(
        scaled_foreground,
        self.time_shift_padding_placeholder_,
        mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground,
                                 self.time_shift_offset_placeholder_,
                                 [desired_samples, -1])
    # Mix in background noise.
    self.background_data_placeholder_ = tf.placeholder(tf.float32,
                                                       [desired_samples, 1])
    self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
    background_mul = tf.multiply(self.background_data_placeholder_,
                                 self.background_volume_placeholder_)
    background_add = tf.add(background_mul, sliced_foreground)        # Noise is added to clean speech signal
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    if spectrogram_magnitude_squared:
      print("Using power spectrogram")
    else:
      print("Using magnitude spectrogram")
    spectrogram = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=spectrogram_magnitude_squared)
    if input_type == 'log-mel':
      print("log-mel energies")
      # Warp the linear-scale, magnitude spectrograms into the mel-scale.
      num_spectrogram_bins = spectrogram.shape[-1].value #magnitude_spectrograms.shape[-1].value
      lower_edge_hertz, upper_edge_hertz, num_mel_bins = 20.0, 4000.0, model_settings['dct_coefficient_count']
      linear_to_mel_weight_matrix = tf.contrib.signal.linear_to_mel_weight_matrix(
        num_mel_bins, num_spectrogram_bins, model_settings['sample_rate'], lower_edge_hertz,
        upper_edge_hertz)
      mel_spectrograms = tf.tensordot(
        spectrogram, linear_to_mel_weight_matrix, 1)
      # Note: Shape inference for `tf.tensordot` does not currently handle this case.
      mel_spectrograms.set_shape(spectrogram.shape[:-1].concatenate(
        linear_to_mel_weight_matrix.shape[-1:]))
      log_offset = 1e-6
      log_mel_spectrograms = tf.log(mel_spectrograms + log_offset)
      self.mfcc_ = log_mel_spectrograms
    elif input_type == 'MFCC':
      print('MFCC-features')
      self.mfcc_ = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
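
A hedged call sketch for the two feature types this variant supports; proc and
model_settings are assumed to come from the surrounding training script.

# Hypothetical calls; positional arguments follow the signature above.
proc.prepare_processing_graph(model_settings, 'log-mel', True)   # log-mel energies from a power spectrogram
# ...or, on a fresh instance:
proc.prepare_processing_graph(model_settings, 'MFCC', False)     # MFCCs from a magnitude spectrogram
features = proc.mfcc_                                            # 2D feature tensor either way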
Пример #43
0
  def prepare_processing_graph(self, model_settings, summaries_dir):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
      summaries_dir: Path to save training summary information to.

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
    """
    with tf.get_default_graph().name_scope('data'):
      desired_samples = model_settings['desired_samples']
      self.wav_filename_placeholder_ = tf.placeholder(
          tf.string, [], name='wav_filename')
      wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
      wav_decoder = contrib_audio.decode_wav(
          wav_loader, desired_channels=1, desired_samples=desired_samples)
      # Allow the audio sample's volume to be adjusted.
      self.foreground_volume_placeholder_ = tf.placeholder(
          tf.float32, [], name='foreground_volume')
      scaled_foreground = tf.multiply(wav_decoder.audio,
                                      self.foreground_volume_placeholder_)
      # Shift the sample's start position, and pad any gaps with zeros.
      self.time_shift_padding_placeholder_ = tf.placeholder(
          tf.int32, [2, 2], name='time_shift_padding')
      self.time_shift_offset_placeholder_ = tf.placeholder(
          tf.int32, [2], name='time_shift_offset')
      padded_foreground = tf.pad(
          scaled_foreground,
          self.time_shift_padding_placeholder_,
          mode='CONSTANT')
      sliced_foreground = tf.slice(padded_foreground,
                                   self.time_shift_offset_placeholder_,
                                   [desired_samples, -1])
      # Mix in background noise.
      self.background_data_placeholder_ = tf.placeholder(
          tf.float32, [desired_samples, 1], name='background_data')
      self.background_volume_placeholder_ = tf.placeholder(
          tf.float32, [], name='background_volume')
      background_mul = tf.multiply(self.background_data_placeholder_,
                                   self.background_volume_placeholder_)
      background_add = tf.add(background_mul, sliced_foreground)
      background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
      # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
      spectrogram = contrib_audio.audio_spectrogram(
          background_clamp,
          window_size=model_settings['window_size_samples'],
          stride=model_settings['window_stride_samples'],
          magnitude_squared=True)
      tf.summary.image(
          'spectrogram', tf.expand_dims(spectrogram, -1), max_outputs=1)
      # The number of buckets in each FFT row in the spectrogram will depend on
      # how many input samples there are in each window. This can be quite
      # large, with a 160 sample window producing 127 buckets for example. We
      # don't need this level of detail for classification, so we often want to
      # shrink them down to produce a smaller result. That's what this section
      # implements. One method is to use average pooling to merge adjacent
      # buckets, but a more sophisticated approach is to apply the MFCC
      # algorithm to shrink the representation.
      if model_settings['preprocess'] == 'average':
        self.output_ = tf.nn.pool(
            tf.expand_dims(spectrogram, -1),
            window_shape=[1, model_settings['average_window_width']],
            strides=[1, model_settings['average_window_width']],
            pooling_type='AVG',
            padding='SAME')
        tf.summary.image('shrunk_spectrogram', self.output_, max_outputs=1)
      elif model_settings['preprocess'] == 'mfcc':
        self.output_ = contrib_audio.mfcc(
            spectrogram,
            wav_decoder.sample_rate,
            dct_coefficient_count=model_settings['fingerprint_width'])
        tf.summary.image(
            'mfcc', tf.expand_dims(self.output_, -1), max_outputs=1)
      else:
        raise ValueError('Unknown preprocess mode "%s" (should be "mfcc" or'
                         ' "average")' % (model_settings['preprocess']))

      # Merge all the summaries and write them out to /tmp/retrain_logs (by
      # default)
      self.merged_summaries_ = tf.summary.merge_all(scope='data')
      self.summary_writer_ = tf.summary.FileWriter(summaries_dir + '/data',
                                                   tf.get_default_graph())
Пример #44
0
    def prepare_processing_graph(self, model_settings, summaries_dir):
        """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
      summaries_dir: Path to save training summary information to.

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
      Exception: If the preprocessor wasn't compiled in.
    """
        with tf.compat.v1.get_default_graph().name_scope('data'):
            desired_samples = model_settings['desired_samples']
            self.wav_filename_placeholder_ = tf.compat.v1.placeholder(
                tf.string, [], name='wav_filename')
            wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
            wav_decoder = contrib_audio.decode_wav(
                wav_loader,
                desired_channels=1,
                desired_samples=desired_samples)
            # Allow the audio sample's volume to be adjusted.
            self.foreground_volume_placeholder_ = tf.compat.v1.placeholder(
                tf.float32, [], name='foreground_volume')
            scaled_foreground = tf.multiply(
                wav_decoder.audio, self.foreground_volume_placeholder_)
            # Shift the sample's start position, and pad any gaps with zeros.
            self.time_shift_padding_placeholder_ = tf.compat.v1.placeholder(
                tf.int32, [2, 2], name='time_shift_padding')
            self.time_shift_offset_placeholder_ = tf.compat.v1.placeholder(
                tf.int32, [2], name='time_shift_offset')
            padded_foreground = tf.pad(
                tensor=scaled_foreground,
                paddings=self.time_shift_padding_placeholder_,
                mode='CONSTANT')
            sliced_foreground = tf.slice(padded_foreground,
                                         self.time_shift_offset_placeholder_,
                                         [desired_samples, -1])
            # Mix in background noise.
            self.background_data_placeholder_ = tf.compat.v1.placeholder(
                tf.float32, [desired_samples, 1], name='background_data')
            self.background_volume_placeholder_ = tf.compat.v1.placeholder(
                tf.float32, [], name='background_volume')
            background_mul = tf.multiply(self.background_data_placeholder_,
                                         self.background_volume_placeholder_)
            background_add = tf.add(background_mul, sliced_foreground)
            background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
            # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
            spectrogram = contrib_audio.audio_spectrogram(
                background_clamp,
                window_size=model_settings['window_size_samples'],
                stride=model_settings['window_stride_samples'],
                magnitude_squared=True)
            tf.compat.v1.summary.image('spectrogram',
                                       tf.expand_dims(spectrogram, -1),
                                       max_outputs=1)
            # The number of buckets in each FFT row in the spectrogram will depend on
            # how many input samples there are in each window. This can be quite
            # large, with a 160 sample window producing 127 buckets for example. We
            # don't need this level of detail for classification, so we often want to
            # shrink them down to produce a smaller result. That's what this section
            # implements. One method is to use average pooling to merge adjacent
            # buckets, but a more sophisticated approach is to apply the MFCC
            # algorithm to shrink the representation.
            if model_settings['preprocess'] == 'average':
                self.output_ = tf.nn.pool(
                    input=tf.expand_dims(spectrogram, -1),
                    window_shape=[1, model_settings['average_window_width']],
                    strides=[1, model_settings['average_window_width']],
                    pooling_type='AVG',
                    padding='SAME')
                tf.compat.v1.summary.image('shrunk_spectrogram',
                                           self.output_,
                                           max_outputs=1)
            elif model_settings['preprocess'] == 'mfcc':
                self.output_ = contrib_audio.mfcc(
                    spectrogram,
                    wav_decoder.sample_rate,
                    dct_coefficient_count=model_settings['fingerprint_width'])
                tf.compat.v1.summary.image('mfcc',
                                           tf.expand_dims(self.output_, -1),
                                           max_outputs=1)
            elif model_settings['preprocess'] == 'micro':
                if not frontend_op:
                    raise Exception(
                        'Micro frontend op is currently not available when running'
                        ' TensorFlow directly from Python, you need to build and run'
                        ' through Bazel')
                sample_rate = model_settings['sample_rate']
                window_size_ms = (model_settings['window_size_samples'] *
                                  1000) / sample_rate
                window_step_ms = (model_settings['window_stride_samples'] *
                                  1000) / sample_rate
                int16_input = tf.cast(tf.multiply(background_clamp, 32768),
                                      tf.int16)
                micro_frontend = frontend_op.audio_microfrontend(
                    int16_input,
                    sample_rate=sample_rate,
                    window_size=window_size_ms,
                    window_step=window_step_ms,
                    num_channels=model_settings['fingerprint_width'],
                    out_scale=1,
                    out_type=tf.float32)
                self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
                tf.compat.v1.summary.image(
                    'micro',
                    tf.expand_dims(tf.expand_dims(self.output_, -1), 0),
                    max_outputs=1)
            else:
                raise ValueError(
                    'Unknown preprocess mode "%s" (should be "mfcc", '
                    ' "average", or "micro")' % (model_settings['preprocess']))

            # Merge all the summaries and write them out to /tmp/retrain_logs (by
            # default)
            self.merged_summaries_ = tf.compat.v1.summary.merge_all(
                scope='data')
            if summaries_dir:
                self.summary_writer_ = tf.compat.v1.summary.FileWriter(
                    summaries_dir + '/data', tf.compat.v1.get_default_graph())
Пример #45
0
def wav_to_spectogram(wav, mode="original", plot=False):
    """
    Convert a wav file to a spectrogram and mfcc picture

    param wav (string):     path with name to wav file
    param mode (string):    "original" to display as used by tf code, or "enhanced"
                            for a visualization more human readable
    param plot (bool):      to plot or not
    """

    if mode == "original":
        model_settings = {
            'dct_coefficient_count': 40,
            'window_size_samples': 480,
            'label_count': 12,
            'desired_samples': 16000,
            'window_stride_samples': 160,
            'spectrogram_length': 98,
            'sample_rate': 16000,
            'fingerprint_size': 3920
        }
    else:
        # settings from wav_to_spectrogram script from tensorflow tutorial
        model_settings = {
            'dct_coefficient_count': 40,
            'window_size_samples': 256,
            'label_count': 12,
            'desired_samples': 16000,
            'window_stride_samples': 128,
            'spectrogram_length': 98,
            'sample_rate': 16000,
            'fingerprint_size': 3920
        }

    # load file
    desired_samples = model_settings['desired_samples']
    wav_filename_placeholder_ = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(wav_loader,
                                           desired_channels=1,
                                           desired_samples=desired_samples)

    # Placeholders required by the graph construction below; they are not
    # given interesting values here, they are just placeholders.
    foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
    time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
    time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    background_data_placeholder_ = tf.placeholder(tf.float32,
                                                  [desired_samples, 1])
    background_volume_placeholder_ = tf.placeholder(tf.float32, [])
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    foreground_volume_placeholder_)
    padded_foreground = tf.pad(scaled_foreground,
                               time_shift_padding_placeholder_,
                               mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground,
                                 time_shift_offset_placeholder_,
                                 [desired_samples, -1])
    background_mul = tf.multiply(background_data_placeholder_,
                                 background_volume_placeholder_)
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)

    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    spectrogram = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)

    mfcc = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])

    # Set the parameters / settings for the spectrogram / MFCC run.
    input_dict = {
        wav_filename_placeholder_: wav,  # path to file we want to analyze
        time_shift_padding_placeholder_: [[0, 0], [0, 0]],
        time_shift_offset_placeholder_: [0, 0],
        background_data_placeholder_: np.zeros([desired_samples,
                                                1]),  # no background noise
        background_volume_placeholder_: 0.0,  # no background noise
        foreground_volume_placeholder_: 1.0  # don't silence the wav file
    }

    # Run the spectrogram and MFCC analysis in one pass; outputs are numpy arrays.
    spectrogram_data, mfcc_data = sess.run([spectrogram, mfcc],
                                           feed_dict=input_dict)

    spectrogram_data_plot = spectrogram_data[0]
    mfcc_data_plot = mfcc_data[0]

    # Do some extra preprocessing to make the spectrogram easier to read
    # if the enhanced mode was chosen.
    if mode == "enhanced":
        # normalize the array to the 0-255 range
        spectrogram_data_plot *= 255.0 / spectrogram_data_plot.max()

        # brighten it a bit
        brightness = 3  # brighten by a factor of 3
        spectrogram_data_plot = spectrogram_data_plot * brightness

        # clip back to [0, 255] range
        spectrogram_data_plot = np.clip(spectrogram_data_plot, 0.0, 255.0)

    if plot:
        # init plots
        print("\nSpectrogram data spectrogram: %s" %
              str(np.shape(spectrogram_data[0])))
        print("MFCC data shape: %s" % str(np.shape(mfcc_data[0])))
        print("MFCC has 40 coefficients")

        input_time_size = spectrogram_data.shape[1]
        input_frequency_size = spectrogram_data.shape[2]
        fig2 = plt.figure(figsize=(8, 20))
        fig2.suptitle("Spectrogram of wav file")
        plt.xlabel('Time')
        plt.ylabel('Frequency')
        plt.imshow(np.rot90(spectrogram_data_plot), cmap='binary')
        plt.xticks([i * input_time_size / 10 for i in range(10)],
                   range(0, 1000, 100))
        plt.yticks([i * input_frequency_size / 39.8 for i in range(40)],
                   range(8000, 0, -200))

        # fig3=plt.figure(figsize=(8, 20))
        # fig3.suptitle("MFCC of wav file")
        # plt.xlabel('Time')
        # plt.ylabel('Mel Frequency Cepstrum Coefficients')
        # plt.imshow(np.rot90(mfcc_data_plot), cmap='gray')

    return spectrogram_data, mfcc_data
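The "enhanced" branch above is just a normalize / brighten / clip pipeline applied to the raw spectrogram values. A minimal standalone sketch of that step (the function name and the brightness default are illustrative, not part of the original code):

import numpy as np

def enhance_spectrogram(spectrogram_2d, brightness=3.0):
    """Rescale a 2D spectrogram to [0, 255], brighten it, and clip back."""
    scaled = spectrogram_2d * (255.0 / spectrogram_2d.max())  # normalize to 0-255
    brightened = scaled * brightness                          # lift quieter bins
    return np.clip(brightened, 0.0, 255.0)                    # stay in image range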
Example #46
0
 def _restore_op(iterator_resource):
   iterator_state_variant = parsing_ops.parse_tensor(
       io_ops.read_file(_path()), dtypes.variant)
   restore_op = gen_dataset_ops.deserialize_iterator(iterator_resource,
                                                     iterator_state_variant)
   return restore_op
Example #47
0
 def _restore_op(iterator_resource):
   iterator_state_variant = parsing_ops.parse_tensor(
       io_ops.read_file(_path()), dtypes.variant)
   restore_op = gen_dataset_ops.deserialize_iterator(iterator_resource,
                                                     iterator_state_variant)
   return restore_op
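Examples #46 and #47 only show the restore side of iterator checkpointing. For context, here is a hedged sketch of the matching save op, assuming the same gen_dataset_ops, parsing_ops, and io_ops modules plus the _path() helper used above (which is defined elsewhere and not shown):

def _save_op(iterator_resource):
  # Serialize the iterator state into a variant tensor, then write it to disk
  # as a serialized TensorProto so _restore_op can parse_tensor it back.
  iterator_state_variant = gen_dataset_ops.serialize_iterator(iterator_resource)
  save_op = io_ops.write_file(
      _path(), parsing_ops.serialize_tensor(iterator_state_variant))
  return save_op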
Example #48
0
  def prepare_processing_graph(self, flags):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - foreground_resampling_placeholder_: Controls signal stretching/squeezing.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio or raw audio.

    Args:
      flags: data and model parameters, described in model_train.py

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
      Exception: If the preprocessor wasn't compiled in.
    """
    with tf.compat.v1.get_default_graph().name_scope('data'):
      desired_samples = flags.desired_samples
      self.wav_filename_placeholder_ = tf.compat.v1.placeholder(
          tf.string, [], name='wav_filename')
      wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
      wav_decoder = tf.audio.decode_wav(
          wav_loader, desired_channels=1, desired_samples=desired_samples)
      # Allow the audio sample's volume to be adjusted.
      self.foreground_volume_placeholder_ = tf.compat.v1.placeholder(
          tf.float32, [], name='foreground_volume')
      # Signal resampling to generate more training data: the input signal is
      # stretched or squeezed proportionally to this placeholder's value.
      self.foreground_resampling_placeholder_ = tf.compat.v1.placeholder(
          tf.float32, [])

      if self.foreground_resampling_placeholder_ != 1.0:
        image = tf.expand_dims(wav_decoder.audio, 0)
        image = tf.expand_dims(image, 2)
        shape = tf.shape(wav_decoder.audio)
        image_resized = tf.image.resize(
            images=image,
            size=(tf.cast((tf.cast(shape[0], tf.float32) *
                           self.foreground_resampling_placeholder_),
                          tf.int32), 1),
            preserve_aspect_ratio=False)
        image_resized_cropped = tf.image.resize_with_crop_or_pad(
            image_resized,
            target_height=desired_samples,
            target_width=1,
        )
        image_resized_cropped = tf.squeeze(image_resized_cropped, axis=[0, 3])
        scaled_foreground = tf.multiply(image_resized_cropped,
                                        self.foreground_volume_placeholder_)
      else:
        scaled_foreground = tf.multiply(wav_decoder.audio,
                                        self.foreground_volume_placeholder_)
      # Shift the sample's start position, and pad any gaps with zeros.
      self.time_shift_padding_placeholder_ = tf.compat.v1.placeholder(
          tf.int32, [2, 2], name='time_shift_padding')
      self.time_shift_offset_placeholder_ = tf.compat.v1.placeholder(
          tf.int32, [2], name='time_shift_offset')
      padded_foreground = tf.pad(
          tensor=scaled_foreground,
          paddings=self.time_shift_padding_placeholder_,
          mode='CONSTANT')
      sliced_foreground = tf.slice(padded_foreground,
                                   self.time_shift_offset_placeholder_,
                                   [desired_samples, -1])
      # Mix in background noise.
      self.background_data_placeholder_ = tf.compat.v1.placeholder(
          tf.float32, [desired_samples, 1], name='background_data')
      self.background_volume_placeholder_ = tf.compat.v1.placeholder(
          tf.float32, [], name='background_volume')
      background_mul = tf.multiply(self.background_data_placeholder_,
                                   self.background_volume_placeholder_)
      background_add = tf.add(background_mul, sliced_foreground)
      background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)

      if flags.preprocess == 'raw':
        # return raw audio
        self.output_ = background_clamp
        tf.compat.v1.summary.image(
            'input_audio',
            tf.expand_dims(tf.expand_dims(background_clamp, -1), -1),
            max_outputs=1)
      else:
        # Run the spectrogram and MFCC ops to get a 2D audio 'fingerprint'
        spectrogram = audio_ops.audio_spectrogram(
            background_clamp,
            window_size=flags.window_size_samples,
            stride=flags.window_stride_samples,
            magnitude_squared=True)
        tf.compat.v1.summary.image(
            'spectrogram', tf.expand_dims(spectrogram, -1), max_outputs=1)
        # The number of buckets in each FFT row in the spectrogram will depend
        # on how many input samples there are in each window. This can be quite
        # large, with a 160 sample window producing 127 buckets for example. We
        # don't need this level of detail for classification, so we often want
        # to shrink them down to produce a smaller result. That's what this
        # section implements. One method is to use average pooling to merge
        # adjacent buckets, but a more sophisticated approach is to apply the
        # MFCC algorithm to shrink the representation.
        if flags.preprocess == 'average':
          self.output_ = tf.nn.pool(
              input=tf.expand_dims(spectrogram, -1),
              window_shape=[1, flags.average_window_width],
              strides=[1, flags.average_window_width],
              pooling_type='AVG',
              padding='SAME')
          tf.compat.v1.summary.image('shrunk_spectrogram',
                                     self.output_,
                                     max_outputs=1)
        elif flags.preprocess == 'mfcc':
          self.output_ = audio_ops.mfcc(
              spectrogram,
              wav_decoder.sample_rate,
              dct_coefficient_count=flags.fingerprint_width)
          tf.compat.v1.summary.image(
              'mfcc', tf.expand_dims(self.output_, -1), max_outputs=1)
        elif flags.preprocess == 'micro':
          if not frontend_op:
            raise Exception(
                'Micro frontend op is currently not available when running'
                ' TensorFlow directly from Python, you need to build and run'
                ' through Bazel')
          sample_rate = flags.sample_rate
          window_size_ms = (flags.window_size_samples * 1000) / sample_rate
          window_step_ms = (flags.window_stride_samples * 1000) / sample_rate
          int16_input = tf.cast(tf.multiply(background_clamp, 32768), tf.int16)
          micro_frontend = frontend_op.audio_microfrontend(
              int16_input,
              sample_rate=sample_rate,
              window_size=window_size_ms,
              window_step=window_step_ms,
              num_channels=flags.fingerprint_width,
              out_scale=1,
              out_type=tf.float32)
          self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
          tf.compat.v1.summary.image(
              'micro',
              tf.expand_dims(tf.expand_dims(self.output_, -1), 0),
              max_outputs=1)
        else:
          raise ValueError('Unknown preprocess mode "%s" (should be "mfcc", '
                           '"average", or "micro")' % (flags.preprocess))

      # Merge all the summaries and write them out to /tmp/retrain_logs (by
      # default)
      self.merged_summaries_ = tf.compat.v1.summary.merge_all(scope='data')
      if flags.summaries_dir:
        self.summary_writer_ = tf.compat.v1.summary.FileWriter(
            flags.summaries_dir + '/data', tf.compat.v1.get_default_graph())
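A hedged usage sketch for the graph built in Example #48: after prepare_processing_graph runs, the placeholders listed in its docstring are fed and output_ is fetched. The audio_processor, sess, flags, and wav_path names below are assumptions for illustration, not part of the original code:

import numpy as np

feed = {
    audio_processor.wav_filename_placeholder_: wav_path,
    audio_processor.foreground_volume_placeholder_: 1.0,       # keep clip at full volume
    audio_processor.foreground_resampling_placeholder_: 1.0,   # no stretch/squeeze
    audio_processor.time_shift_padding_placeholder_: [[0, 0], [0, 0]],
    audio_processor.time_shift_offset_placeholder_: [0, 0],
    audio_processor.background_data_placeholder_: np.zeros(
        [flags.desired_samples, 1]),                            # silent background
    audio_processor.background_volume_placeholder_: 0.0,
}
fingerprint = sess.run(audio_processor.output_, feed_dict=feed)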
Example #49
0
    def prepare_processing_graph(self, model_settings):
        """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, computes an STFT-based spectrogram, and then builds a
    log-mel filterbank or MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - mfcc_: Output 2D fingerprint (log-mel filterbank or MFCC features).

    Args:
      model_settings: Information about the current model being trained.
    """
        desired_samples = model_settings['desired_samples']
        self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
        wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
        wav_decoder = contrib_audio.decode_wav(wav_loader,
                                               desired_channels=1,
                                               desired_samples=desired_samples)
        # Allow the audio sample's volume to be adjusted.
        self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
        scaled_foreground = tf.multiply(wav_decoder.audio,
                                        self.foreground_volume_placeholder_)
        # Shift the sample's start position, and pad any gaps with zeros.
        self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
        self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
        padded_foreground = tf.pad(scaled_foreground,
                                   self.time_shift_padding_placeholder_,
                                   mode='CONSTANT')
        sliced_foreground = tf.slice(padded_foreground,
                                     self.time_shift_offset_placeholder_,
                                     [desired_samples, -1])

        mel_bias_ = tf.contrib.signal.linear_to_mel_weight_matrix(
            num_mel_bins=model_settings['dct_coefficient_count'],
            num_spectrogram_bins=int(2048 / 2 + 1),
            sample_rate=model_settings['sample_rate'],
            lower_edge_hertz=125,
            upper_edge_hertz=float(model_settings['sample_rate'] / 2 - 200))
        spectrogram = tf.abs(
            tf.contrib.signal.stft(tf.transpose(sliced_foreground),
                                   model_settings['window_size_samples'],
                                   model_settings['window_stride_samples'],
                                   fft_length=2048,
                                   window_fn=tf.contrib.signal.hann_window,
                                   pad_end=False))
        S = tf.matmul(tf.reshape(tf.pow(spectrogram, 2), [-1, 1025]),
                      mel_bias_)
        log_mel_spectrograms = tf.log(tf.maximum(S, 1e-7))

        if model_settings['feature_type'] == 'fbank':
            self.mfcc_ = log_mel_spectrograms
        elif model_settings['feature_type'] == 'mfcc':
            # Compute MFCCs from log_mel_spectrograms.
            self.mfcc_ = tf.contrib.signal.mfccs_from_log_mel_spectrograms(
                log_mel_spectrograms)
        else:
            raise ValueError("not supported feature_type: {}".format(
                model_settings['feature_type']))
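Example #49 relies on tf.contrib.signal, which moved to tf.signal in TensorFlow 2.x. Below is a rough TF 2.x sketch of the same fbank path; the default parameter values are illustrative stand-ins mirroring the snippet above, and this is an approximation rather than the author's code:

import tensorflow as tf

def log_mel_fbank(waveform, sample_rate=16000, frame_length=480, frame_step=160,
                  fft_length=2048, num_mel_bins=40):
    # waveform: float tensor of shape [..., samples] in [-1, 1].
    stfts = tf.signal.stft(waveform, frame_length, frame_step,
                           fft_length=fft_length)
    power = tf.abs(stfts) ** 2
    mel_matrix = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins=num_mel_bins,
        num_spectrogram_bins=fft_length // 2 + 1,
        sample_rate=sample_rate,
        lower_edge_hertz=125.0,
        upper_edge_hertz=sample_rate / 2.0 - 200.0)
    # Contract the frequency axis against the mel weight matrix.
    mel = tf.tensordot(power, mel_matrix, 1)
    return tf.math.log(tf.maximum(mel, 1e-7))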
Example #50
0
 def testInvalidUTF8ProducesReasonableError(self):
     if sys.version_info[0] < 3:
         self.skipTest("Test is only valid in python3.")
     with self.assertRaises(UnicodeDecodeError):
         io_ops.read_file(b"\xff")
Example #51
0
    def prepare_processing_graph(self, model_settings):
        """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_placeholder_: How much the clip is shifted.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - mfcc_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
    """
        desired_samples = model_settings['desired_samples']
        self.wav_filename_placeholder_ = tf.placeholder(tf.string, [],
                                                        name='filename')
        wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
        wav_decoder = contrib_audio.decode_wav(wav_loader,
                                               desired_channels=1,
                                               desired_samples=desired_samples)
        # Allow the audio sample's volume to be adjusted.
        self.foreground_volume_placeholder_ = tf.placeholder(
            tf.float32, [], name='foreground_volume')
        scaled_foreground = tf.multiply(wav_decoder.audio,
                                        self.foreground_volume_placeholder_)
        # Shift the sample's start position, and pad any gaps with zeros.
        self.time_shift_placeholder_ = tf.placeholder(tf.int32,
                                                      name='timeshift')
        # TODO(see--): Write test with np.roll
        shifted_foreground = tf_roll(scaled_foreground,
                                     self.time_shift_placeholder_)
        # Mix in background noise.
        self.background_data_placeholder_ = tf.placeholder(
            tf.float32, [desired_samples, 1], name='background_data')
        self.background_volume_placeholder_ = tf.placeholder(
            tf.float32, [], name='background_volume')
        background_mul = tf.multiply(self.background_data_placeholder_,
                                     self.background_volume_placeholder_)
        background_add = tf.add(background_mul, shifted_foreground)
        # removed clipping: tf.clip_by_value(background_add, -1.0, 1.0)
        self.background_clamp_ = background_add
        self.background_clamp_ = tf.reshape(
            self.background_clamp_, (1, model_settings['desired_samples']))
        # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
        stfts = tf.contrib.signal.stft(
            self.background_clamp_,
            frame_length=model_settings['window_size_samples'],
            frame_step=model_settings['window_stride_samples'],
            fft_length=None)
        self.spectrogram_ = tf.abs(stfts)
        num_spectrogram_bins = self.spectrogram_.shape[-1].value
        lower_edge_hertz, upper_edge_hertz = 80.0, 7600.0
        linear_to_mel_weight_matrix = \
            tf.contrib.signal.linear_to_mel_weight_matrix(
                model_settings['dct_coefficient_count'],
                num_spectrogram_bins, model_settings['sample_rate'],
                lower_edge_hertz, upper_edge_hertz)
        mel_spectrograms = tf.tensordot(self.spectrogram_,
                                        linear_to_mel_weight_matrix, 1)
        mel_spectrograms.set_shape(self.spectrogram_.shape[:-1].concatenate(
            linear_to_mel_weight_matrix.shape[-1:]))
        log_mel_spectrograms = tf.log(mel_spectrograms + 1e-6)
        self.mfcc_ = tf.contrib.signal.mfccs_from_log_mel_spectrograms(
            log_mel_spectrograms
        )[:, :, :model_settings['num_log_mel_features']]  # :13
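The TODO in Example #51 suggests checking the time shift against np.roll. A hedged sketch of such a check, using tf.roll as a stand-in for the tf_roll helper (which is defined elsewhere and not shown here; older 1.x releases expose it as tf.manip.roll):

import numpy as np
import tensorflow as tf

def check_roll_matches_numpy(samples, shift):
    # Roll a [samples, 1] array along the time axis in both numpy and TF and
    # confirm the two results agree.
    expected = np.roll(samples, shift, axis=0)
    with tf.Session() as sess:
        actual = sess.run(tf.roll(tf.constant(samples), shift=shift, axis=0))
    np.testing.assert_allclose(actual, expected)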
Example #52
0
  def _evalDecodeJpeg(self,
                      image_name,
                      parallelism,
                      num_iters,
                      crop_during_decode=None,
                      crop_window=None,
                      tile=None):
    """Evaluate DecodeJpegOp for the given image.

    TODO(tanmingxing): add decoding+cropping as well.

    Args:
      image_name: a string of image file name (without suffix).
      parallelism: the number of concurrent decode_jpeg ops to be run.
      num_iters: number of iterations for evaluation.
      crop_during_decode: If true, use fused DecodeAndCropJpeg instead of
          separate decode and crop ops. It is ignored if crop_window is None.
      crop_window: if not None, crop the decoded image. Depending on
          crop_during_decode, cropping could happen during or after decoding.
      tile: if not None, tile the image to composite a larger fake image.

    Returns:
      The duration of the run in seconds.
    """
    ops.reset_default_graph()

    image_file_path = os.path.join(prefix_path, image_name)

    if tile is None:
      image_content = variable_scope.get_variable(
          'image_%s' % image_name,
          initializer=io_ops.read_file(image_file_path))
    else:
      single_image = image_ops.decode_jpeg(
          io_ops.read_file(image_file_path), channels=3, name='single_image')
      # Tile the image to composite a new larger image.
      tiled_image = array_ops.tile(single_image, tile)
      image_content = variable_scope.get_variable(
          'tiled_image_%s' % image_name,
          initializer=image_ops.encode_jpeg(tiled_image))

    with session.Session() as sess:
      sess.run(variables.global_variables_initializer())
      images = []
      for _ in xrange(parallelism):
        if crop_window is None:
          # No crop.
          image = image_ops.decode_jpeg(image_content, channels=3)
        elif crop_during_decode:
          # combined decode and crop.
          image = image_ops.decode_and_crop_jpeg(
              image_content, crop_window, channels=3)
        else:
          # separate decode and crop.
          image = image_ops.decode_jpeg(image_content, channels=3)
          image = image_ops.crop_to_bounding_box(
              image,
              offset_height=crop_window[0],
              offset_width=crop_window[1],
              target_height=crop_window[2],
              target_width=crop_window[3])

        images.append(image)
      r = control_flow_ops.group(*images)

      for _ in xrange(3):
        # Skip warm up time.
        sess.run(r)

      start_time = time.time()
      for _ in xrange(num_iters):
        sess.run(r)
    return time.time() - start_time
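A hedged invocation sketch for the benchmark in Example #52, comparing the fused DecodeAndCropJpeg path against separate decode and crop ops from inside the benchmark class. The image name, crop window, and iteration counts are illustrative placeholders only:

# offset_height, offset_width, target_height, target_width
crop = [10, 10, 224, 224]
fused = self._evalDecodeJpeg('medium', parallelism=4, num_iters=50,
                             crop_during_decode=True, crop_window=crop)
split = self._evalDecodeJpeg('medium', parallelism=4, num_iters=50,
                             crop_during_decode=False, crop_window=crop)
print('fused decode+crop: %.3fs  separate: %.3fs' % (fused, split))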
Example #53
0
 def f2():
     # Some operations that XLA cannot compile.
     image_ops.decode_image(io_ops.read_file('/tmp/bmp'))
     return array_ops.constant(31)
Example #54
0
import tensorflow as tf
import numpy as np
from tensorflow.python.ops import io_ops
from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
from scipy.io import wavfile

filename = '/Users/tsingh1/Developer/kaggle/speech/data/train/audio/bed/d78858d9_nohash_1.wav'
filenameTensor = tf.constant(filename)
with tf.Session() as sess:
    wav_loader = io_ops.read_file(filenameTensor)
    wav_decoder = contrib_audio.decode_wav(wav_loader,
                                           desired_channels=1,
                                           desired_samples=5)
    x = wav_decoder.audio.eval().flatten()
    print('x1', x)
    print('x1', x.shape)

_, wav = wavfile.read(filename)
wav1 = wav.astype(np.float32) / np.iinfo(np.int16).max
print('w', wav)
print('w1', wav1)
print('w1', wav1.shape)
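One caveat on the comparison in Example #54 (hedged, since the exact scaling depends on the TensorFlow version): decode_wav and the manual int16 normalization may differ by a factor on the order of 32767/32768, because one path may divide by 32768 while the other divides by np.iinfo(np.int16).max. Compare with a loose tolerance rather than exact equality:

# Hedged check: allow for the 32767-vs-32768 scaling difference.
np.testing.assert_allclose(x, wav1[:5], atol=1e-4)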
Example #55
0
 def load_image(self, image_file, sess):
     image_op = image_ops.decode_png(io_ops.read_file(image_file),
                                     dtype=dtypes.uint8,
                                     channels=4)[:, :, 0:3]
     return sess.run(image_op)