Example #1
    def call(self, audio_data, sample_rate=None):
        """
    Caculate mfcc features of audio data.
    :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
    :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
    :return: A float tensor of size (num_channels, num_frames, num_frequencies) containing
            mfcc features of every frame in speech.
    """
        p = self.config
        with tf.name_scope('mfcc'):

            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            with tf.control_dependencies([assert_op]):

                spectrum_feats = self.spect(audio_data, sample_rate)
                spectrum_feats = tf.expand_dims(spectrum_feats, 0)
                fbank_feats = self.fbank(audio_data, sample_rate)
                mfcc = py_x_ops.mfcc(fbank_feats,
                                     spectrum_feats,
                                     sample_rate,
                                     use_energy=p.use_energy,
                                     cepstral_lifter=p.cepstral_lifter,
                                     coefficient_count=p.coefficient_count)
                return mfcc
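
The py_x_ops.mfcc kernel above is a compiled op. As a rough point of reference only (not the DELTA implementation), a similar feature can be sketched with stock tf.signal ops; frame sizes, mel-bin count, and coefficient count below are illustrative assumptions, not values taken from the config.

import tensorflow as tf

def mfcc_reference(audio, sample_rate=16000, frame_ms=25, step_ms=10,
                   num_mel_bins=40, coefficient_count=13):
  """audio: float32 tensor of shape (N,), values in [-1, 1]."""
  frame_length = int(sample_rate * frame_ms / 1000)
  frame_step = int(sample_rate * step_ms / 1000)
  stft = tf.signal.stft(audio, frame_length, frame_step)  # (num_frames, fft_bins)
  power = tf.abs(stft) ** 2
  mel_matrix = tf.signal.linear_to_mel_weight_matrix(
      num_mel_bins=num_mel_bins,
      num_spectrogram_bins=power.shape[-1],
      sample_rate=sample_rate)
  log_mel = tf.math.log(tf.matmul(power, mel_matrix) + 1e-6)
  return tf.signal.mfccs_from_log_mel_spectrograms(log_mel)[..., :coefficient_count]

# e.g. mfcc_reference(tf.random.normal([16000])) -> shape (num_frames, 13)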
Example #2
  def call(self, audio_data, sample_rate=None):
    """
    Caculate cepstrum of audio data.
    :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
    :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
    :return:A float tensor of size (num_frames, ceps_subband_num) containing normalized cepstrum
          (tag_ceps_mean_norm = True) or cepstrum (tag_ceps_mean_norm = False) of every frame in speech.
    """

    p = self.config

    with tf.name_scope('cepstrum'):

      if sample_rate is None:
        sample_rate = tf.constant(p.sample_rate, dtype=float)

      assert_op = tf.assert_equal(
          tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=float))
      with tf.control_dependencies([assert_op]):

        cepstrum = py_x_ops.cepstrum(
            audio_data,
            sample_rate,
            window_length=p.window_length,
            frame_length=p.frame_length,
            ceps_subband_num=p.ceps_subband_num,
            tag_ceps_mean_norm=p.tag_ceps_mean_norm)

        return cepstrum
Example #3
    def call(self, audio_data, sample_rate=None):
        """
    Caculate power spectrum and phase spectrum of audio data.
    :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
    :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
    :return: Two returns:
        power spectrum —— A float tensor of size (num_frames, num_frequencies) containing
            power spectrum and of every frame in speech.
        phase spectrum —— A float tensor of size (num_frames, num_frequencies) containing
            phase spectrum and of every frame in speech.
    """

        p = self.config
        with tf.name_scope('analyfiltbank'):

            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            with tf.control_dependencies([assert_op]):

                sample_rate = tf.cast(sample_rate, dtype=float)
                power_spectrum, phase_spectrum = py_x_ops.analyfiltbank(
                    audio_data,
                    sample_rate,
                    window_length=p.window_length,
                    frame_length=p.frame_length)

                return power_spectrum, phase_spectrum
Example #4
    def call(self, audio_data, sample_rate=None):
        """
    Caculate power spectrum or log power spectrum of audio data.
    :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
    :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
    :return: A float tensor of size (num_frames, num_frequencies) containing power spectrum (output_type=1)
        or log power spectrum (output_type=2) of every frame in speech.
    """

        p = self.config
        with tf.name_scope('spectrum'):

            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            with tf.control_dependencies([assert_op]):

                sample_rate = tf.cast(sample_rate, dtype=float)
                spectrum = py_x_ops.spectrum(
                    audio_data,
                    sample_rate,
                    window_length=p.window_length,
                    frame_length=p.frame_length,
                    output_type=p.output_type,
                    snip_edges=p.snip_edges,
                    raw_energy=p.raw_energy,
                    preEph_coeff=p.preeph_coeff,
                    window_type=p.window_type,
                    remove_dc_offset=p.remove_dc_offset,
                    is_fbank=p.is_fbank)

                return spectrum
Example #5
  def call(self, audio_data, sample_rate=None):
    """
        Caculate power spectrum or log power spectrum of audio data.
        :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
        :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
        :return: A float tensor of size N containing add-noise audio.
        """

    p = self.config
    with tf.name_scope('add_rir_noise_aecres'):
      if sample_rate is None:
        sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

      assert_op = tf.assert_equal(
          tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
      with tf.control_dependencies([assert_op]):
        sample_rate = tf.cast(sample_rate, dtype=float)
        add_rir_noise_aecres_out = py_x_ops.add_rir_noise_aecres(
            audio_data,
            sample_rate,
            if_add_rir=p.if_add_rir,
            rir_filelist=p.rir_filelist,
            if_add_noise=p.if_add_noise,
            snr_min=p.snr_min,
            snr_max=p.snr_max,
            noise_filelist=p.noise_filelist,
            if_add_aecres=p.if_add_aecres,
            aecres_filelist=p.aecres_filelist)

        return tf.squeeze(add_rir_noise_aecres_out)
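
snr_min and snr_max bound the signal-to-noise ratio drawn for each utterance; the actual mixing happens inside the compiled kernel. The sketch below only illustrates the standard way a target SNR in dB maps to a noise gain (the names here are illustrative, not part of py_x_ops).

import tensorflow as tf

def mix_at_snr(signal, noise, snr_db):
  """Scale `noise` so that signal + scaled noise has the requested SNR in dB."""
  signal_power = tf.reduce_mean(tf.square(signal))
  noise_power = tf.reduce_mean(tf.square(noise)) + 1e-12
  # SNR(dB) = 10 * log10(signal_power / (gain**2 * noise_power))
  gain = tf.sqrt(signal_power / (noise_power * tf.pow(10.0, snr_db / 10.0)))
  return signal + gain * noise

# A per-utterance SNR would typically be drawn uniformly from [snr_min, snr_max].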
Example #6
    def call(self, audio_data, sample_rate=None):
        """
    Caculate fbank && pitch(concat) features of wav.
    :param audio_data: the audio signal from which to compute spectrum.
                       Should be an (1, N) tensor.
    :param sample_rate: the samplerate of the signal we working with.
    :return: A tensor with shape (num_frames, dim_features), containing
            fbank && pitch feature of every frame in speech.
    """

        p = self.config
        with tf.name_scope('fbank_pitch'):

            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            with tf.control_dependencies([assert_op]):

                fbank_feats = tf.squeeze(self.fbank(audio_data, sample_rate))
                pitch_feats = tf.squeeze(self.pitch(audio_data, sample_rate))
                fbank_pitch_feats = tf.concat([fbank_feats, pitch_feats], 1)

                return fbank_pitch_feats
Example #7
    def call(self, power_spectrum, phase_spectrum, sample_rate=None):
        """
    Implement frequency domain to time domain conversion.
    :param power_spectrum: a float tensor of size (num_frames, num_frequencies).
    :param phase_spectrum: a float tensor of size (num_frames, num_frequencies).
    :param sample_rate: a scalar tensor.
    :return: audio data
    """

        p = self.config
        with tf.name_scope('synthfiltbank'):

            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            with tf.control_dependencies([assert_op]):

                audio_data = py_x_ops.synthfiltbank(
                    power_spectrum,
                    phase_spectrum,
                    sample_rate,
                    window_length=p.window_length,
                    frame_length=p.frame_length)

                return audio_data
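
analyfiltbank (power + phase) and synthfiltbank form an analysis/synthesis pair: keeping the phase spectrum is what makes time-domain reconstruction possible. The round trip below uses stock tf.signal ops to illustrate the same idea; it is not the py_x_ops filterbank, and the frame sizes are illustrative.

import tensorflow as tf

audio = tf.random.normal([16000])    # 1 s of fake 16 kHz audio
frame_length, frame_step = 400, 160  # 25 ms window, 10 ms hop

stft = tf.signal.stft(audio, frame_length, frame_step)
power_spectrum = tf.abs(stft) ** 2   # analysis output #1
phase_spectrum = tf.math.angle(stft) # analysis output #2

# Rebuild the complex spectrum from power + phase, then invert it.
magnitude = tf.sqrt(power_spectrum)
complex_spec = tf.complex(magnitude, tf.zeros_like(magnitude)) * tf.exp(
    tf.complex(tf.zeros_like(phase_spectrum), phase_spectrum))
reconstructed = tf.signal.inverse_stft(
    complex_spec, frame_length, frame_step,
    window_fn=tf.signal.inverse_stft_window_fn(frame_step))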
Example #8
    def call(self, audio_data, sample_rate=None):
        """
    Caculate fbank features of audio data.
    :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
    :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
    :return: A float tensor of size (num_channels, num_frames, num_frequencies) containing
            fbank features of every frame in speech.
    """
        p = self.config
        with tf.name_scope('fbank'):

            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            if p.upper_frequency_limit <= 0:
                p.upper_frequency_limit = p.sample_rate / 2.0 + p.upper_frequency_limit
            elif (p.upper_frequency_limit <= p.lower_frequency_limit) or (
                    p.upper_frequency_limit > p.sample_rate / 2.0):
                p.upper_frequency_limit = p.sample_rate / 2.0

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            with tf.control_dependencies([assert_op]):

                spectrum = self.spect(audio_data, sample_rate)
                spectrum = tf.expand_dims(spectrum, 0)

                fbank = py_x_ops.fbank(
                    spectrum,
                    sample_rate,
                    upper_frequency_limit=p.upper_frequency_limit,
                    lower_frequency_limit=p.lower_frequency_limit,
                    filterbank_channel_count=p.filterbank_channel_count)

                return fbank
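
Before the op runs, p.upper_frequency_limit is normalized: a value less than or equal to zero is read as an offset below the Nyquist frequency, and a value not above lower_frequency_limit or above Nyquist falls back to Nyquist. A plain-Python restatement of that branch (the function name is just for illustration):

def normalize_upper_limit(upper, lower, sample_rate):
  nyquist = sample_rate / 2.0
  if upper <= 0:
    # e.g. upper = -400 at 16 kHz means 8000 - 400 = 7600 Hz
    return nyquist + upper
  if upper <= lower or upper > nyquist:
    return nyquist
  return upper

# normalize_upper_limit(0, 20, 16000)    -> 8000.0
# normalize_upper_limit(-400, 20, 16000) -> 7600.0
# normalize_upper_limit(10, 20, 16000)   -> 8000.0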
Example #9
    def call(self, audio_data, sample_rate=None):
        """
    Calculate the zero-crossing rate of speech.
    :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
    :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
    :return: A tensor with shape (1, num_frames), containing zero-crossing rate of every frame in speech.
    """

        p = self.config
        with tf.name_scope('zcr'):

            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            with tf.control_dependencies([assert_op]):

                sample_rate = tf.cast(sample_rate, dtype=float)
                zcr = py_x_ops.zcr(audio_data,
                                   sample_rate,
                                   window_length=p.window_length,
                                   frame_length=p.frame_length)

                return zcr
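
Zero-crossing rate is the fraction of adjacent samples whose signs differ within each analysis frame. A stock-TensorFlow approximation, illustrative only (the compiled op handles framing and edge cases itself; this sketch assumes window_length/frame_length are expressed in seconds):

import tensorflow as tf

def zcr_reference(audio, sample_rate=16000, window_s=0.025, step_s=0.010):
  frame_length = int(sample_rate * window_s)
  frame_step = int(sample_rate * step_s)
  frames = tf.signal.frame(audio, frame_length, frame_step)  # (num_frames, frame_length)
  sign_change = tf.not_equal(tf.sign(frames[:, 1:]), tf.sign(frames[:, :-1]))
  return tf.reduce_mean(tf.cast(sign_change, tf.float32), axis=-1)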
Example #10
  def call(self, audio_data, sample_rate=None):
    """
        Caculate power of every frame in speech.
        :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
        :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
        :return:A float tensor of size (1, num_frames) containing power of every frame in speech.
        """

    p = self.config
    with tf.name_scope('framepow'):

      if sample_rate is None:
        sample_rate = tf.constant(p.sample_rate, dtype=float)

      assert_op = tf.assert_equal(
          tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=float))
      with tf.control_dependencies([assert_op]):

        framepow = py_x_ops.frame_pow(
            audio_data,
            sample_rate,
            window_length=p.window_length,
            frame_length=p.frame_length)

        return framepow
Example #11
    def call(self, audio_data, sample_rate=None):
        """
    Caculate plp features of audio data.
    :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
    :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
    :return:A float tensor of size (num_frames, (plp_order + 1)) containing plp features of every frame in speech.
    """

        p = self.config
        with tf.name_scope('plp'):

            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            with tf.control_dependencies([assert_op]):

                sample_rate = tf.cast(sample_rate, dtype=float)
                plp = py_x_ops.plp(audio_data,
                                   sample_rate,
                                   window_length=p.window_length,
                                   frame_length=p.frame_length,
                                   plp_order=p.plp_order)
                return plp
Example #12
def accuracy(logits, labels):
    ''' Compute accuracy of logits against labels.
    params:
        logits: [B, ..., D]
        labels: [B, ...]
    return:
        accuracy tensor
    '''
    with tf.name_scope('accuracy'):
        assert_rank = tf.assert_equal(tf.rank(logits), tf.rank(labels) + 1)
        assert_shape = tf.assert_equal(tf.shape(logits)[:-1], tf.shape(labels))
        with tf.control_dependencies([assert_rank, assert_shape]):
            predictions = tf.argmax(logits, axis=-1, output_type=tf.int64)
            labels = tf.cast(labels, tf.int64)
            return tf.reduce_mean(
                tf.cast(tf.equal(predictions, labels), dtype=tf.float32))
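
A quick check of the shape convention: logits carry one extra class dimension relative to labels, and the mean is taken over every leading dimension. For example:

import tensorflow as tf

logits = tf.constant([[[2.0, 0.1], [0.3, 1.5]],
                      [[0.2, 0.9], [3.0, 0.1]]])  # [B=2, T=2, D=2]
labels = tf.constant([[0, 1],
                      [1, 0]])                    # [B=2, T=2]
acc = accuracy(logits, labels)                    # -> 1.0, all four predictions match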
Example #13
    def call(self, audio_data, sample_rate=None):
        """
    Caculate pitch features of audio data.
    :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
    :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
    :return: A float tensor of size (1, num_frames) containing pitch features of every frame in speech.
    """

        p = self.config
        with tf.name_scope('pitch'):

            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=float)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=float))
            with tf.control_dependencies([assert_op]):

                pitch = py_x_ops.pitch(audio_data,
                                       sample_rate,
                                       window_length=p.window_length,
                                       frame_length=p.frame_length,
                                       thres_autoc=p.thres_autoc)

                pitch = tf.squeeze(pitch)
                pitch = tf.transpose(pitch[None, :])
                return pitch
Example #14
 def call(self, wavfile):
     """
 Get audio data and sample rate from a wavfile.
 :param wavfile: filepath of wav
 :return: 2 values. The first is a Tensor of audio data. The second return value is the sample rate of the input wav
     file, which is a tensor with float dtype.
 """
     p = self.config
     contents = tf.io.read_file(wavfile)
     audio_data, sample_rate = tf.audio.decode_wav(
         contents, desired_channels=p.audio_channels)
     assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                 tf.cast(sample_rate, dtype=float))
     with tf.control_dependencies([assert_op]):
         return tf.squeeze(audio_data, axis=-1), tf.cast(sample_rate,
                                                         dtype=float)
Example #15
  def call(self, filename, audio_data, sample_rate=None):
    """
    Write wav using audio_data[tensor].
    :param filename: filepath of wav.
    :param audio_data: a tensor containing data of a wav.
    :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
    :return: write wav opration.
    """
    p = self.config
    filename = tf.constant(filename)

    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      audio_data = tf.cast(audio_data, dtype=tf.float32)
      contents = tf.audio.encode_wav(
          tf.expand_dims(audio_data, 1), tf.cast(sample_rate, dtype=tf.int32))
      w = tf.io.write_file(filename, contents)

    return w
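
tf.audio.encode_wav and tf.audio.decode_wav make it easy to round-trip these last two layers. The self-contained sketch below writes a synthetic tone and reads it back with the same TF ops used above (the file path and tone parameters are only illustrative):

import numpy as np
import tensorflow as tf

sample_rate = 16000
t = np.arange(sample_rate, dtype=np.float32) / sample_rate
tone = tf.constant(0.5 * np.sin(2 * np.pi * 440.0 * t).astype(np.float32))  # 1 s, 440 Hz

# write: encode_wav expects shape (num_samples, num_channels)
contents = tf.audio.encode_wav(tf.expand_dims(tone, 1), sample_rate)
tf.io.write_file('/tmp/tone.wav', contents)

# read back
audio, rate = tf.audio.decode_wav(tf.io.read_file('/tmp/tone.wav'), desired_channels=1)
audio = tf.squeeze(audio, axis=-1)  # back to shape (num_samples,)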