Example #1
 def testStaticShapeInference_NegativeChannelCountInvalid(self):
   with self.test_session():
     with six.assertRaisesRegex(self, Exception,
                                r'channel_count must be positive'):
       ffmpeg.decode_audio(b'~~~ wave ~~~',
                           file_format='wav',
                           samples_per_second=44100,
                           channel_count=-2)
Example #2
 def testStaticShapeInference_NegativeChannelCountInvalid(self):
   with self.test_session():
     with six.assertRaisesRegex(self, Exception,
                                r'channel_count must be positive'):
       ffmpeg.decode_audio(b'~~~ wave ~~~',
                           file_format='wav',
                           samples_per_second=44100,
                           channel_count=-2)
 def testInvalidFile(self):
   with self.test_session():
     contents = 'invalid file'
     audio_op = ffmpeg.decode_audio(contents, file_format='wav',
                                    samples_per_second=10000, channel_count=2)
     audio = audio_op.eval()
     self.assertEqual(audio.shape, (0, 0))
Example #4
  def testStaticShapeInference_ConstantChannelCount(self):
    with self.test_session():
      audio_op = ffmpeg.decode_audio(b'~~~ wave ~~~',
                                     file_format='wav',
                                     samples_per_second=44100,
                                     channel_count=2)
      self.assertEqual([None, 2], audio_op.shape.as_list())
  def _loadFileAndTest(self, filename, file_format, duration_sec,
                       samples_per_second, channel_count):
    """Loads an audio file and validates the output tensor.

    Args:
      filename: The filename of the input file.
      file_format: The format of the input file.
      duration_sec: The duration of the audio contained in the file in seconds.
      samples_per_second: The desired sample rate in the output tensor.
      channel_count: The desired channel count in the output tensor.
    """
    with self.test_session():
      path = os.path.join(resource_loader.get_data_files_path(), 'testdata',
                          filename)
      with open(path, 'rb') as f:
        contents = f.read()

      audio_op = ffmpeg.decode_audio(
          contents,
          file_format=file_format,
          samples_per_second=samples_per_second,
          channel_count=channel_count)
      audio = audio_op.eval()
      self.assertEqual(len(audio.shape), 2)
      self.assertNear(
          duration_sec * samples_per_second,
          audio.shape[0],
          # Duration should be specified within 10%:
          0.1 * audio.shape[0])
      self.assertEqual(audio.shape[1], channel_count)
Example #6
  def testStaticShapeInference_ConstantChannelCount(self):
    with self.test_session():
      audio_op = ffmpeg.decode_audio(b'~~~ wave ~~~',
                                     file_format='wav',
                                     samples_per_second=44100,
                                     channel_count=2)
      self.assertEqual([None, 2], audio_op.shape.as_list())

  def _loadFileAndTest(self, filename, file_format, duration_sec,
                       samples_per_second, channel_count):
    """Loads an audio file and validates the output tensor.

    Args:
      filename: The filename of the input file.
      file_format: The format of the input file.
      duration_sec: The duration of the audio contained in the file in seconds.
      samples_per_second: The desired sample rate in the output tensor.
      channel_count: The desired channel count in the output tensor.
    """
    with self.test_session():
      path = os.path.join(resource_loader.get_data_files_path(), 'testdata',
                          filename)
      with open(path, 'rb') as f:
        contents = f.read()

      audio_op = ffmpeg.decode_audio(
          contents,
          file_format=file_format,
          samples_per_second=samples_per_second,
          channel_count=channel_count)
      audio = audio_op.eval()
      self.assertEqual(len(audio.shape), 2)
      self.assertNear(
          duration_sec * samples_per_second,
          audio.shape[0],
          # Duration should be specified within 10%:
          0.1 * audio.shape[0])
      self.assertEqual(audio.shape[1], channel_count)
Example #8
 def testStaticShapeInference_NonConstantChannelCount(self):
   with self.test_session():
     channel_count = array_ops.placeholder(dtypes.int32)
     audio_op = ffmpeg.decode_audio(b'~~~ wave ~~~',
                                    file_format='wav',
                                    samples_per_second=44100,
                                    channel_count=channel_count)
     self.assertEqual([None, None], audio_op.shape.as_list())
Example #10
 def testInvalidFile(self):
     with self.test_session():
         contents = 'invalid file'
         audio_op = ffmpeg.decode_audio(contents,
                                        file_format='wav',
                                        samples_per_second=10000,
                                        channel_count=2)
         audio = audio_op.eval()
         self.assertEqual(audio.shape, (0, 0))
 def testRoundTripWithPlaceholderSampleRate(self):
     with self.test_session():
         placeholder = array_ops.placeholder(dtypes.int32)
         audio_op = ffmpeg.decode_audio(self._contents,
                                        file_format='wav',
                                        samples_per_second=placeholder,
                                        channel_count=1)
         encode_op = ffmpeg.encode_audio(audio_op,
                                         file_format='wav',
                                         samples_per_second=placeholder)
         encoded_contents = encode_op.eval(feed_dict={placeholder: 10000})
         self._compareWavFiles(self._contents, encoded_contents)
 def testRoundTrip(self):
     """Reads a wav file, writes it, and compares them."""
     with self.test_session():
         audio_op = ffmpeg.decode_audio(self._contents,
                                        file_format='wav',
                                        samples_per_second=10000,
                                        channel_count=1)
         encode_op = ffmpeg.encode_audio(audio_op,
                                         file_format='wav',
                                         samples_per_second=10000)
         encoded_contents = encode_op.eval()
         self._compareWavFiles(self._contents, encoded_contents)
Example #13
 def testRoundTrip(self):
   """Reads a wav file, writes it, and compares them."""
   with self.cached_session():
     audio_op = ffmpeg.decode_audio(
         self._contents,
         file_format='wav',
         samples_per_second=10000,
         channel_count=1)
     encode_op = ffmpeg.encode_audio(
         audio_op, file_format='wav', samples_per_second=10000)
     encoded_contents = encode_op.eval()
     self._compareWavFiles(self._contents, encoded_contents)
Example #14
 def testRoundTripWithPlaceholderSampleRate(self):
   with self.cached_session():
     placeholder = array_ops.placeholder(dtypes.int32)
     audio_op = ffmpeg.decode_audio(
         self._contents,
         file_format='wav',
         samples_per_second=placeholder,
         channel_count=1)
     encode_op = ffmpeg.encode_audio(
         audio_op, file_format='wav', samples_per_second=placeholder)
     encoded_contents = encode_op.eval(feed_dict={placeholder: 10000})
     self._compareWavFiles(self._contents, encoded_contents)
Example #15
    def mp3_tensors_from_directory(directory,
                                   batch_size,
                                   channels=2,
                                   format='mp3',
                                   seconds=30,
                                   bitrate=16384):
        filenames = glob.glob(directory + "/**/*." + format)
        labels, total_labels = build_labels(sorted(glob.glob(directory +
                                                             "/*")))
        num_examples_per_epoch = 10000

        # Create a queue that produces the filenames to read.
        classes = [labels[f.split('/')[-2]] for f in filenames]
        print("Found files", len(filenames))

        filenames = tf.convert_to_tensor(filenames, dtype=tf.string)
        classes = tf.convert_to_tensor(classes, dtype=tf.int32)
        print("[0]", filenames[0], classes[0])

        input_queue = tf.train.slice_input_producer([filenames, classes])

        # Read examples from files in the filename queue.
        print("INPUT_QUEUE", input_queue[0])
        value = tf.read_file(input_queue[0])
        #preprocess = tf.read_file(input_queue[0]+'.preprocess')

        print("Preloaded data", value)
        #print("Loaded data", data)

        label = input_queue[1]

        min_fraction_of_examples_in_queue = 0.4
        min_queue_examples = int(num_examples_per_epoch *
                                 min_fraction_of_examples_in_queue)

        #data = tf.cast(data, tf.float32)
        data = ffmpeg.decode_audio(value,
                                   file_format=format,
                                   samples_per_second=bitrate,
                                   channel_count=channels)
        data = shared.resize_audio_patch.resize_audio_with_crop_or_pad(
            data, seconds * bitrate * channels, 0, True)
        #data = tf.slice(data, [0,0], [seconds*bitrate, channels])
        tf.Tensor.set_shape(data, [seconds * bitrate, channels])
        #data = tf.minimum(data, 1)
        #data = tf.maximum(data, -1)
        data = data / tf.reduce_max(tf.reshape(tf.abs(data), [-1]))
        print("DATA IS", data)
        x, y = _get_data(data, label, min_queue_examples, batch_size)

        return x, y, total_labels, num_examples_per_epoch
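Because the pipeline above is built on tf.train.slice_input_producer, the returned tensors only yield data once the TF 1.x queue runners are started. A minimal driver sketch, assuming a hypothetical dataset directory and batch size:

import tensorflow as tf

x, y, total_labels, num_examples = mp3_tensors_from_directory('dataset/', batch_size=32)
with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    batch_x, batch_y = sess.run([x, y])  # one batch of decoded audio and labels
    coord.request_stop()
    coord.join(threads)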
Example #16
    def _loadFileAndTest(self,
                         filename,
                         file_format,
                         duration_sec,
                         samples_per_second,
                         channel_count,
                         samples_per_second_tensor=None,
                         feed_dict=None,
                         stream=None):
        """Loads an audio file and validates the output tensor.

        Args:
          filename: The filename of the input file.
          file_format: The format of the input file.
          duration_sec: The duration of the audio contained in the file in seconds.
          samples_per_second: The desired sample rate in the output tensor.
          channel_count: The desired channel count in the output tensor.
          samples_per_second_tensor: The value to pass to the corresponding
            parameter in the instantiated `decode_audio` op. If not provided,
            will default to a constant value of `samples_per_second`. Useful
            for providing a placeholder.
          feed_dict: Used when evaluating the `decode_audio` op. If not
            provided, will be empty. Useful when providing a placeholder for
            `samples_per_second_tensor`.
          stream: A string specifying which stream from the content file
            should be decoded. The default value is '' which leaves the
            decision to ffmpeg.
        """
        if samples_per_second_tensor is None:
            samples_per_second_tensor = samples_per_second
        with self.test_session():
            path = os.path.join(resource_loader.get_data_files_path(),
                                'testdata', filename)
            with open(path, 'rb') as f:
                contents = f.read()

            audio_op = ffmpeg.decode_audio(
                contents,
                file_format=file_format,
                samples_per_second=samples_per_second_tensor,
                channel_count=channel_count,
                stream=stream)
            audio = audio_op.eval(feed_dict=feed_dict or {})
            self.assertEqual(len(audio.shape), 2)
            self.assertNear(
                duration_sec * samples_per_second,
                audio.shape[0],
                # Duration should be specified within 10%:
                0.1 * audio.shape[0])
            self.assertEqual(audio.shape[1], channel_count)
Example #17
  def testRoundTrip(self):
    """Reads a wav file, writes it, and compares them."""
    with self.test_session():
      path = os.path.join(
          resource_loader.get_data_files_path(), 'testdata/mono_10khz.wav')
      with open(path, 'rb') as f:
        original_contents = f.read()

      audio_op = ffmpeg.decode_audio(
          original_contents, file_format='wav', samples_per_second=10000,
          channel_count=1)
      encode_op = ffmpeg.encode_audio(
          audio_op, file_format='wav', samples_per_second=10000)
      encoded_contents = encode_op.eval()
      self._compareWavFiles(original_contents, encoded_contents)
Example #18
  def testRoundTrip(self):
    """Fabricates some audio, creates a wav file, reverses it, and compares."""
    with self.test_session():
      path = os.path.join(
          resource_loader.get_data_files_path(), 'testdata/mono_10khz.wav')
      with open(path, 'rb') as f:
        original_contents = f.read()

      audio_op = ffmpeg.decode_audio(
          original_contents, file_format='wav', samples_per_second=10000,
          channel_count=1)
      encode_op = ffmpeg.encode_audio(
          audio_op, file_format='wav', samples_per_second=10000)
      encoded_contents = encode_op.eval()
      self.assertEqual(original_contents, encoded_contents)
Example #19
def get_songs(folder, sample_rate):
    """
    Gather up all the singy-songs you want to train the net on
    :param sample_rate: An integer representing the samples per second
    :param folder: String of the path to the folder containing the data, put a / at the end
    :return: returns a TensorArray with the decoded audio
    """
    files = listdir(folder)
    songs = tf.TensorArray(tf.float32, size=len(files))

    for i, file_name in enumerate(files):
        file = tf.read_file(folder + file_name)
        # I set the channel count to 1 because I am unsure how to make more work
        waveform = decode_audio(file, 'wav', sample_rate, channel_count=1)
        songs = songs.write(i, waveform)
    return songs
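A usage sketch for the helper above, assuming a hypothetical folder of wav files that all decode to the same shape (the TensorArray is created with shape inference enabled, so mixed lengths would fail):

import tensorflow as tf

songs = get_songs('training_wavs/', sample_rate=16000)
first_song = songs.read(0)  # Tensor of shape (samples, 1)

with tf.Session() as sess:
    print(sess.run(first_song).shape)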
Example #20
    def testRoundTrip(self):
        """Fabricates some audio, creates a wav file, reverses it, and compares."""
        with self.test_session():
            path = os.path.join(resource_loader.get_data_files_path(),
                                'testdata/mono_10khz.wav')
            with open(path, 'rb') as f:
                original_contents = f.read()

            audio_op = ffmpeg.decode_audio(original_contents,
                                           file_format='wav',
                                           samples_per_second=10000,
                                           channel_count=1)
            encode_op = ffmpeg.encode_audio(audio_op,
                                            file_format='wav',
                                            samples_per_second=10000)
            encoded_contents = encode_op.eval()
            self.assertEqual(original_contents, encoded_contents)
Example #21
  def _loadFileAndTest(self, filename, file_format, duration_sec,
                       samples_per_second, channel_count,
                       samples_per_second_tensor=None, feed_dict=None,
                       stream=None):
    """Loads an audio file and validates the output tensor.

    Args:
      filename: The filename of the input file.
      file_format: The format of the input file.
      duration_sec: The duration of the audio contained in the file in seconds.
      samples_per_second: The desired sample rate in the output tensor.
      channel_count: The desired channel count in the output tensor.
      samples_per_second_tensor: The value to pass to the corresponding
        parameter in the instantiated `decode_audio` op. If not
        provided, will default to a constant value of
        `samples_per_second`. Useful for providing a placeholder.
      feed_dict: Used when evaluating the `decode_audio` op. If not
        provided, will be empty. Useful when providing a placeholder for
        `samples_per_second_tensor`.
      stream: A string specifying which stream from the content file
        should be decoded. The default value is '' which leaves the
        decision to ffmpeg.
    """
    if samples_per_second_tensor is None:
      samples_per_second_tensor = samples_per_second
    with self.test_session():
      path = os.path.join(resource_loader.get_data_files_path(), 'testdata',
                          filename)
      with open(path, 'rb') as f:
        contents = f.read()

      audio_op = ffmpeg.decode_audio(
          contents,
          file_format=file_format,
          samples_per_second=samples_per_second_tensor,
          channel_count=channel_count, stream=stream)
      audio = audio_op.eval(feed_dict=feed_dict or {})
      self.assertEqual(len(audio.shape), 2)
      self.assertNear(
          duration_sec * samples_per_second,
          audio.shape[0],
          # Duration should be specified within 10%:
          0.1 * audio.shape[0])
      self.assertEqual(audio.shape[1], channel_count)
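The samples_per_second_tensor and feed_dict arguments exist so the sample rate can be fed through a placeholder; a sketch of a test exercising that path (the test data filename and duration here are hypothetical):

  def testWavWithPlaceholderSampleRate(self):
    placeholder = array_ops.placeholder(dtypes.int32)
    self._loadFileAndTest('mono_10khz.wav', 'wav', 0.57, 10000, 1,
                          samples_per_second_tensor=placeholder,
                          feed_dict={placeholder: 10000})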
Example #22
    def mp3_tensors_from_directory(directory, batch_size, channels=2,
                                   format='mp3', seconds=30, bitrate=16384):
      filenames = glob.glob(directory + "/**/*." + format)
      labels, total_labels = build_labels(sorted(glob.glob(directory + "/*")))
      num_examples_per_epoch = 10000

      # Create a queue that produces the filenames to read.
      classes = [labels[f.split('/')[-2]] for f in filenames]
      print("Found files", len(filenames))

      filenames = tf.convert_to_tensor(filenames, dtype=tf.string)
      classes = tf.convert_to_tensor(classes, dtype=tf.int32)
      print("[0]", filenames[0], classes[0])

      input_queue = tf.train.slice_input_producer([filenames, classes])

      # Read examples from files in the filename queue.
      print("INPUT_QUEUE", input_queue[0])
      value = tf.read_file(input_queue[0])
      #preprocess = tf.read_file(input_queue[0]+'.preprocess')

      print("Preloaded data", value)
      #print("Loaded data", data)

      label = input_queue[1]

      min_fraction_of_examples_in_queue = 0.4
      min_queue_examples = int(num_examples_per_epoch *
                               min_fraction_of_examples_in_queue)

      #data = tf.cast(data, tf.float32)
      data = ffmpeg.decode_audio(value,
                                 file_format=format,
                                 samples_per_second=bitrate,
                                 channel_count=channels)
      data = shared.resize_audio_patch.resize_audio_with_crop_or_pad(
          data, seconds * bitrate * channels, 0, True)
      #data = tf.slice(data, [0,0], [seconds*bitrate, channels])
      tf.Tensor.set_shape(data, [seconds * bitrate, channels])
      #data = tf.minimum(data, 1)
      #data = tf.maximum(data, -1)
      data = data / tf.reduce_max(tf.reshape(tf.abs(data), [-1]))
      print("DATA IS", data)
      x, y = _get_data(data, label, min_queue_examples, batch_size)

      return x, y, total_labels, num_examples_per_epoch
Example #23
def loadfiles(fname):
    binary = tf.read_file(fname)
    print("binary is:     ", binary)
    return ffmpeg.decode_audio(binary, file_format='wav',
                               samples_per_second=48000, channel_count=2)
def create_inference_graph(wanted_words,
                           sample_rate,
                           clip_duration_ms,
                           window_size_ms,
                           window_stride_ms,
                           dct_coefficient_count,
                           model_architecture,
                           model_size_info=None):
    """Creates an audio model with the nodes needed for inference.
    Uses the supplied arguments to create a model, and inserts the input and
    output nodes that are needed to use the graph for inference.
    Args:
      wanted_words: Comma-separated list of the words we're trying to recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: Length of each audio clip to analyze, in milliseconds.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      dct_coefficient_count: Number of frequency bands to analyze.
      model_architecture: Name of the kind of model to generate.
      model_size_info: Optional architecture-specific size parameters.
    """

    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count)
    if model_architecture == 'dnc':
        model_settings['batch_size'] = 1

    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    audio_binary = tf.read_file(wav_data_placeholder)
    decoded_sample_data = ffmpeg.decode_audio(
        audio_binary,
        file_format='wav',
        samples_per_second=model_settings['desired_samples'],
        channel_count=1)
    # audio_spectrogram expects a 2-D [samples, channels] tensor, so keep the
    # channel dimension when forcing the static sample count.
    decoded_sample_data = tf.reshape(
        decoded_sample_data, shape=[model_settings['desired_samples'], 1])
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    fingerprint_input = contrib_audio.mfcc(
        spectrogram,
        # ffmpeg.decode_audio returns a plain tensor with no sample_rate
        # attribute, so pass the function's sample_rate argument instead.
        sample_rate,
        dct_coefficient_count=dct_coefficient_count)
    fingerprint_frequency_size = model_settings['dct_coefficient_count']
    fingerprint_time_size = model_settings['spectrogram_length']
    reshaped_input = tf.reshape(
        fingerprint_input,
        [-1, fingerprint_time_size * fingerprint_frequency_size])

    logits = models.create_model(reshaped_input,
                                 model_settings,
                                 model_architecture,
                                 model_size_info=model_size_info,
                                 is_training=False)

    # Create an output to use for inference.
    tf.nn.softmax(logits, name='labels_softmax')
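A hedged sketch of driving the graph above for a single prediction; the checkpoint path, word list, and settings below are hypothetical, while the 'wav_data' placeholder and 'labels_softmax' output node come from the function itself:

import tensorflow as tf

create_inference_graph('yes,no', 16000, 1000, 30.0, 10.0, 40, 'dnc')
with tf.Session() as sess:
    tf.train.Saver().restore(sess, 'model.ckpt')  # hypothetical checkpoint
    probabilities = sess.run('labels_softmax:0',
                             feed_dict={'wav_data:0': 'test.wav'})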
def parse_csv_line(line, vocabulary, config):
    # tf.decode_csv converts CSV records (strings) to tensors; it does not read CSV files.
    # The standard way to read a file is with tf.data.TextLineDataset.
    # After reading the file into a tensor (NUM_LINES x 1), we interpret the tensor as CSV data:
    # each line in that tensor is a scalar string, and we assume every row (one line of the file)
    # has multiple columns delimited by the specified delimiter.
    # The output we get is a tensor of shape (NUM_LINES, NUM_COLUMNS).
    fields = tf.decode_csv(line, config['data']['csv_column_defaults'])

    # Note that INPUT_CSV_COLUMNS is (1 x NUM_COLUMNS) while fields is (NUM_LINES, NUM_COLUMNS)
    # So zipping gives NUM_COLUMNS tuples (COLUMN_NAME, (NUM_LINES x 1)), from which we create a dict
    features = dict(zip(config['data']['csv_columns'], fields))

    # Split string into characters
    # IMPORTANT NOTE: tf.string_split returns a SparseTensor of rank 2,
    # the strings split according to the delimiter. Read more about how SparseTensors are represented
    text = tf.string_split([features[config['data']['csv_columns'][0]]],
                           delimiter="")

    # Once we have character SparseTensors, we need to encode the characters as numbers.
    # The traditional way is one-hot encoding, or one-hot encoding plus an embedding matrix.
    # With one-hot encoding plus an embedding matrix you are effectively selecting a row of the
    # embedding matrix, so to make this faster TensorFlow expects the input to an embedding layer
    # to be the row index, instead of a one-hot vector to be multiplied with the embedding matrix.
    # So we maintain a vocabulary where every character we care about maps 1-to-1 to a number.
    # This looks like a map operation, for which TensorFlow has tf.map_fn.

    # Now note that SparseTensors do not support all usual Tensor operations
    # To use tf.map_fn on a SparseTensor, we have to create a new SparseTensor in the following way

    # Also note that embedding layer will expect indexes of dtype tf.int64
    # Also, the vocabulary dict stores values as int64

    text_idx = tf.SparseTensor(
        text.indices,
        tf.map_fn(vocabulary.text2idx, text.values, dtype=tf.int64),
        text.dense_shape)

    # We have to convert this SparseTensor back to dense to support future operations
    text_idx = tf.sparse_tensor_to_dense(text_idx)  # Shape - (1, T)
    text_idx = tf.squeeze(text_idx)  # Shape - (T,)

    # We also require the length of every input sequence as an input to the model.
    # This is because we will create batches of variable-length inputs
    # where all sequences are forced to the same length by padding at the end with 0s.
    # The batch will be passed to a dynamic RNN which will use the sequence lengths
    # to mask the outputs appropriately (the RNN is still unrolled to the common length).
    # This method enables us to do mini-batch SGD on variable-length inputs.

    input_sequence_lengths = tf.size(text_idx)  # Scalar

    # We are done with processing the text (which is our input to Tacotron).
    # Let's move on to the audio (which will be our targets).
    # This part is standard code for obtaining MFCCs from audio, as given in the TF documentation.
    # You can read up on what Fourier transforms, spectrograms and MFCCs are to get an idea.

    audio_binary = tf.read_file(features[config['data']['csv_columns'][1]])

    # Sample rate used in paper is 16000, channel count should be 1 for tacotron 2
    # STFT configuration values specified in paper
    waveform = ffmpeg.decode_audio(
        audio_binary,
        file_format='wav',
        samples_per_second=config['data']['wav_sample_rate'],
        channel_count=1)

    stfts = tf.contrib.signal.stft(tf.transpose(waveform),
                                   frame_length=config['data']['frame_length'],
                                   frame_step=config['data']['frame_step'],
                                   fft_length=config['data']['fft_length'])
    magnitude_spectrograms = tf.abs(stfts)
    num_spectrogram_bins = magnitude_spectrograms.shape[-1].value

    # These are to be set according to human speech. Values specified in the paper
    lower_edge_hertz, upper_edge_hertz, num_mel_bins = config['data']['lower_edge_hertz'], \
                                                       config['data']['upper_edge_hertz'], \
                                                       config['data']['num_mel_bins']

    linear_to_mel_weight_matrix = tf.contrib.signal.linear_to_mel_weight_matrix(
        num_mel_bins, num_spectrogram_bins, config['data']['wav_sample_rate'],
        lower_edge_hertz, upper_edge_hertz)
    mel_spectrograms = tf.tensordot(magnitude_spectrograms,
                                    linear_to_mel_weight_matrix, 1)

    mel_spectrograms = tf.squeeze(
        mel_spectrograms)  # Removes all dimensions that are 1

    # This finishes processing of audio
    # Now we build the targets and inputs to the decoder

    # We append a frame of 0s at the end of targets to signal end of target
    end_tensor = tf.tile([[0.0]],
                         multiples=[1, tf.shape(mel_spectrograms)[-1]])
    targets = tf.concat([mel_spectrograms, end_tensor], axis=0)

    # We append a frame of 0s at the start of decoder_inputs to set input at t=1
    start_tensor = tf.tile([[0.0]],
                           multiples=[1, tf.shape(mel_spectrograms)[-1]])
    target_inputs = tf.concat([start_tensor, mel_spectrograms], axis=0)

    # Again, we require the length of every target sequence as an input to the model.
    # This is because we will create batches of variable-length inputs
    # where all sequences are forced to the same length by padding at the end with 0s.
    # The batch will be passed to a dynamic RNN which will use the sequence lengths
    # to mask the outputs appropriately (the RNN is still unrolled to the common length).
    # This method enables us to do mini-batch SGD on variable-length inputs.
    target_sequence_lengths = tf.shape(targets)[0]

    # Now we return the values that our model requires as a dict (just like old feed_dict structure)
    return {
        'inputs': text_idx,
        'targets': targets,
        'input_sequence_lengths': input_sequence_lengths,
        'target_sequence_lengths': target_sequence_lengths,
        'target_inputs': target_inputs,
        'debug_data': waveform
    }
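The comments above point at tf.data.TextLineDataset as the standard way to read the CSV file; a minimal sketch of wiring parse_csv_line into such a pipeline (the CSV path, vocabulary, and config objects are assumed to exist):

import tensorflow as tf

dataset = tf.data.TextLineDataset('metadata.csv')  # hypothetical path
dataset = dataset.map(lambda line: parse_csv_line(line, vocabulary, config))
iterator = dataset.make_one_shot_iterator()
features = iterator.get_next()  # dict with 'inputs', 'targets', lengths, etc.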
Example #26
#https://www.tensorflow.org/api_guides/python/contrib.ffmpeg
import tensorflow as tf
from tensorflow.contrib import ffmpeg

audio_binary = tf.read_file('shibuya.mp3')
waveform = ffmpeg.decode_audio(audio_binary,
                               file_format='mp3',
                               samples_per_second=44100,
                               channel_count=2)
uncompressed_binary = ffmpeg.encode_audio(waveform,
                                          file_format='wav',
                                          samples_per_second=44100)

print(waveform)
print(uncompressed_binary)
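The snippet above only builds the graph and prints the symbolic tensors; to actually decode and re-encode the audio you still have to run the ops in a session, for example (output filename is arbitrary):

with tf.Session() as sess:
    wav_bytes = sess.run(uncompressed_binary)
    with open('shibuya.wav', 'wb') as f:
        f.write(wav_bytes)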