Example #1
def cross_entropy(logits,
                  labels,
                  input_length=None,
                  label_length=None,
                  smoothing=0.0,
                  reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS):
    '''
    Cross entropy function for classification and sequence classification.
    :param label_length: for sequence tasks, the target sequence length,
        e.g. "a b c </s>" -> 4
    '''
    del input_length

    onehot_labels = tf.cond(pred=tf.equal(
        tf.rank(logits) - tf.rank(labels), 1),
                            true_fn=lambda: tf.one_hot(
                                labels, tf.shape(logits)[-1], dtype=tf.int32),
                            false_fn=lambda: labels)

    if label_length is not None:
        weights = utils.len_to_mask(label_length)
    else:
        weights = 1.0

    loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels,
                                           logits=logits,
                                           weights=weights,
                                           label_smoothing=smoothing,
                                           reduction=reduction)

    return loss
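
A minimal usage sketch (not part of the original source), assuming TF 1.x graph mode and the surrounding module's utils.len_to_mask helper; the example_* names, shapes and values are illustrative only.

# Hypothetical usage: sequence classification with label smoothing.
import tensorflow as tf

example_logits = tf.random.normal([2, 4, 10])      # (B=2, T=4, D=10)
example_labels = tf.constant([[1, 2, 3, 0],
                              [4, 5, 0, 0]], dtype=tf.int32)   # (B, T)
example_label_length = tf.constant([4, 2], dtype=tf.int32)     # valid steps per sequence

seq_loss = cross_entropy(example_logits,
                         example_labels,
                         label_length=example_label_length,
                         smoothing=0.1)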
Example #2
    def test_generate_data(self):
        for batch_mode in [True, False]:
            task_name = self.config['data']['task']['name']
            self.config['data']['task']['batch_mode'] = batch_mode
            self.config['data']['task']['dummy'] = False
            task = registers.task[task_name](self.config, self.mode)

            with self.cached_session(use_gpu=False, force_gpu=False):
                for uttid, feats, src_lens, targets, tgt_lens in task.generate_data():
                    logging.debug('uttid : {}'.format(uttid))
                    logging.debug("feats : {}, shape : {}".format(
                        feats, feats.shape))
                    logging.debug("targets : {}, shape : {}".format(
                        targets, targets.shape))
                    logging.debug('src_len : {}'.format(src_lens))
                    logging.debug('tgt_len : {}'.format(tgt_lens))
                    self.assertDTypeEqual(feats, np.float32)
                    self.assertDTypeEqual(src_lens, np.int64)
                    self.assertDTypeEqual(targets, np.int64)
                    self.assertDTypeEqual(tgt_lens, np.int64)

                    if batch_mode:
                        self.assertEqual(len(uttid.shape), 1)
                        self.assertEqual(len(feats.shape), 3)
                        self.assertEqual(len(targets.shape), 2)
                        self.assertEqual(len(src_lens.shape), 1)
                        self.assertEqual(len(tgt_lens.shape), 1)
                    else:
                        self.assertEqual(tf.rank(uttid).numpy(), 0)
                        self.assertEqual(len(feats.shape), 2)
                        self.assertEqual(len(targets.shape), 1)
                        self.assertEqual(tf.rank(src_lens).numpy(), 0)
                        self.assertEqual(tf.rank(tgt_lens).numpy(), 0)
Example #3
  def test_delta_delta(self):
    ''' test delta delta'''
    with self.cached_session(use_gpu=False, force_gpu=False):
      feat = tf.constant(self.data[None, :], dtype=tf.float32)
      output = py_x_ops.delta_delta(feat, order=self.order, window=self.window)
      self.assertEqual(tf.rank(output).eval(), tf.rank(feat).eval())
      self.assertEqual(output.shape, (1, self.feat_dim * (self.order + 1)))
      self.assertAllClose(output.eval(), self.output_true[None, :])
Example #4
    def test_framepow(self):
        wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))

        with self.cached_session(use_gpu=False, force_gpu=False):
            read_wav = ReadWav.params().instantiate()
            input_data, sample_rate = read_wav(wav_path)
            input_data = input_data / 32768

            framepow = Framepow.params({
                'window_length': 0.025,
                'frame_length': 0.010
            }).instantiate()
            framepow_test = framepow(input_data, sample_rate)

            output_true = np.array([
                0.000018, 0.000011, 0.000010, 0.000010, 0.000010, 0.000010,
                0.000008, 0.000009, 0.000009, 0.000009, 0.000009, 0.000011,
                0.090164, 0.133028, 0.156547, 0.053551, 0.056670, 0.097706,
                0.405659, 2.119505, 4.296845, 6.139090, 6.623638, 6.136467,
                7.595072, 7.904415, 7.655983, 6.771016, 5.706427, 4.220942,
                3.259599, 2.218259, 1.911394, 2.234246, 3.056905, 2.534153,
                0.464354, 0.013493, 0.021231, 0.148362, 0.364829, 0.627266,
                0.494912, 0.366029, 0.315408, 0.312441, 0.323796, 0.267505,
                0.152856, 0.045305
            ])

            self.assertEqual(tf.rank(framepow_test).eval(), 1)
            self.assertAllClose(framepow_test.eval().flatten()[:50],
                                output_true)
Example #5
    def test_sfb(self):
        ''' test sfb op'''
        with self.cached_session(use_gpu=False, force_gpu=False):
            sample_rate, input_data = feat_lib.load_wav(self.wavpath, sr=16000)

            power_spc, phase_spc = py_x_ops.analyfiltbank(
                input_data, sample_rate)

            logging.info('Shape of power_spc: {}'.format(
                power_spc.eval().shape))
            logging.info('Shape of phase_spc: {}'.format(
                phase_spc.eval().shape))

            output = py_x_ops.synthfiltbank(power_spc.eval(), phase_spc.eval(),
                                            sample_rate)

            self.assertEqual(tf.rank(output).eval(), 1)
            logging.info('Shape of recovered signal: {}'.format(
                output.eval().shape))

            # the first ~400 samples differ due to overlap-and-add reconstruction
            self.assertAllClose(output.eval().flatten()[500:550],
                                input_data[500:550],
                                rtol=1e-4,
                                atol=1e-4)
Example #6
  def test_fbank(self):
    wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))

    with self.cached_session(use_gpu=False, force_gpu=False):
      read_wav = ReadWav.params().instantiate()
      input_data, sample_rate = read_wav(wav_path)
      config = {
          'window_length': 0.025,
          'output_type': 1,
          'frame_length': 0.010,
          'snip_edges': True
      }
      fbank = Fbank.params(config).instantiate()
      fbank_test = fbank(input_data, sample_rate)

      self.assertEqual(tf.rank(fbank_test).eval(), 3)

      real_fbank_feats = np.array(
          [[3.768338, 4.946218, 6.289874, 6.330853, 6.761764, 6.884573],
           [3.803553, 5.450971, 6.547878, 5.796172, 6.397846, 7.242926]])

      self.assertAllClose(
          np.squeeze(fbank_test.eval()[0:2, 0:6, 0]),
          real_fbank_feats,
          rtol=1e-05,
          atol=1e-05)
Example #7
def accuracy(logits, labels):
    ''' Compute accuracy.
    params:
      logits: [B, ..., D]
      labels: [B, ...]
    return:
      accuracy tensor (scalar)
    '''
    with tf.name_scope('accuracy'):
        assert_rank = tf.assert_equal(tf.rank(logits), tf.rank(labels) + 1)
        assert_shape = tf.assert_equal(tf.shape(logits)[:-1], tf.shape(labels))
        with tf.control_dependencies([assert_rank, assert_shape]):
            predictions = tf.argmax(logits, axis=-1, output_type=tf.int64)
            labels = tf.cast(labels, tf.int64)
            return tf.reduce_mean(
                tf.cast(tf.equal(predictions, labels), dtype=tf.float32))
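
A short usage sketch (illustrative, not from the original source); the example_* tensors are random placeholders.

# Hypothetical usage: token-level accuracy over a (B, T, D) logits tensor.
import tensorflow as tf

example_logits = tf.random.normal([2, 5, 8])                          # [B, T, D]
example_labels = tf.random.uniform([2, 5], maxval=8, dtype=tf.int64)  # [B, T]
acc = accuracy(example_logits, example_labels)   # scalar in [0, 1]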
Example #8
  def test_spectrum(self):
    wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))

    with self.cached_session(use_gpu=False, force_gpu=False):
      read_wav = ReadWav.params().instantiate()
      input_data, sample_rate = read_wav(wav_path)

      pitch = Pitch.params({
          'window_length': 0.025,
          'soft_min_f0': 10.0
      }).instantiate()
      pitch_test = pitch(input_data, sample_rate)

      self.assertEqual(tf.rank(pitch_test).eval(), 2)

      output_true = np.array(
          [[0.03881124, 0.3000031, -0.02324523],
           [0.006756478, 0.3000097, 0.01047742],
           [0.02455365, 0.3000154, 0.00695902],
           [0.02453586, 0.3000221, 0.008448198],
           [0.03455311, 0.3000307, -0.07547269],
           [0.04293294, 0.3000422, -0.04193667]])
Example #9
  def test_plp(self):
    wav_path = str(
        Path(PACKAGE_ROOT_DIR).joinpath('layers/ops/data/sm1_cln.wav'))

    with self.cached_session(use_gpu=False, force_gpu=False):
      read_wav = ReadWav.params().instantiate()
      input_data, sample_rate = read_wav(wav_path)
      input_data = input_data / 32768

      plp = Plp.params({
          'window_length': 0.025,
          'frame_length': 0.010,
          'plp_order': 12
      }).instantiate()
      plp_test = plp(input_data, sample_rate)

      output_true = np.array(
          [[-0.209490, -0.326126, 0.010536, -0.027167, -0.117118],
           [-0.020293, -0.454695, -0.104243, 0.001560, -0.234854],
           [-0.015118, -0.444044, -0.156695, -0.086221, -0.319310],
           [-0.031856, -0.130708, 0.047435, -0.089916, -0.160247],
           [0.052763, -0.271487, 0.011329, 0.025320, 0.012851]])

      self.assertEqual(tf.rank(plp_test).eval(), 2)
      # Loose tolerance because the Povey window is used instead of the Hamming window in spectrum.
      self.assertAllClose(
          plp_test.eval()[50:55, 5:10], output_true, rtol=1e-02, atol=1e-02)
Example #10
def ctc_lambda_loss(logits, labels, input_length, label_length, blank_index=0):
  '''
  CTC loss function.
  :param logits: (B, T, D)
  :param input_length: (B,) or (B, 1), input length of the encoder
  :param labels: (B, T)
  :param label_length: (B,) or (B, 1), label length used to convert dense labels to sparse
  :return: per-utterance loss, shape (B,)
  '''
  ilen = tf.cond(
      pred=tf.equal(tf.rank(input_length), 1),
      true_fn=lambda: input_length,
      false_fn=lambda: tf.squeeze(input_length),
  )
  ilen = tf.cast(ilen, tf.int32)

  olen = tf.cond(
      pred=tf.equal(tf.rank(label_length), 1),
      true_fn=lambda: label_length,
      false_fn=lambda: tf.squeeze(label_length))
  olen = tf.cast(olen, tf.int32)

  deps = [
      tf.assert_rank(labels, 2, name='label_rank_check'),
      tf.assert_rank(logits, 3, name='logits_rank_check'),
      tf.assert_rank(ilen, 1, name='src_len_rank_check'),  # input_length
      tf.assert_rank(olen, 1, name='tgt_len_rank_check'),  # output_length
  ]

  labels, logits = ctc_data_transform(labels, logits, blank_index)

  with tf.control_dependencies(deps):
    # per-utterance loss of shape (B,)
    # blank index is zero, consistent with ESPnet
    batch_loss = tf.nn.ctc_loss(
        labels=labels,
        inputs=logits,
        sequence_length=ilen,
        time_major=False,
        preprocess_collapse_repeated=False,
        ctc_merge_repeated=True,
        ignore_longer_outputs_than_inputs=False)
  return batch_loss
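
A usage sketch under assumed shapes (not from the original source); it relies on ctc_data_transform from the same module, and the example_* names and batch/time/label sizes are made up.

# Hypothetical usage: B=2 utterances, T=50 encoder frames, D=29 symbols (blank at index 0).
import tensorflow as tf

example_logits = tf.random.normal([2, 50, 29])                                     # (B, T, D)
example_labels = tf.random.uniform([2, 10], minval=1, maxval=29, dtype=tf.int32)   # (B, T_label)
example_input_length = tf.constant([50, 45], dtype=tf.int32)                       # (B,)
example_label_length = tf.constant([10, 8], dtype=tf.int32)                        # (B,)

per_utt_loss = ctc_lambda_loss(example_logits, example_labels,
                               example_input_length, example_label_length,
                               blank_index=0)
loss = tf.reduce_mean(per_utt_loss)   # reduce the (B,) losses to a scalar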
Example #11
  def test_FbankPitch(self):
    wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))

    with self.cached_session(use_gpu=False, force_gpu=False):
      read_wav = ReadWav.params().instantiate()
      input_data, sample_rate = read_wav(wav_path)
      config = {'window_length': 0.025, 'output_type': 1, 'frame_length': 0.010}
      fbank_pitch = FbankPitch.params(config).instantiate()
      fbank_pitch_test = fbank_pitch(input_data, sample_rate)

      self.assertEqual(tf.rank(fbank_pitch_test).eval(), 2)
      print(fbank_pitch_test.eval()[0:2])
Example #12
  def test_fbank(self):
    wav_path = str(
        Path(PACKAGE_ROOT_DIR).joinpath(
            'layers/ops/data/sm1_cln.wav'))

    with self.cached_session(use_gpu=False, force_gpu=False):
      read_wav = ReadWav.params().instantiate()
      input_data, sample_rate = read_wav(wav_path)
      config = {'window_length': 0.025, 'output_type': 1, 'frame_length': 0.010}
      fbank = Fbank.params(config).instantiate()
      fbank_test = fbank(input_data, sample_rate)

      self.assertEqual(tf.rank(fbank_test).eval(), 3)
Example #13
def compute_mel_filterbank_features(waveforms,
                                    sample_rate=16000,
                                    preemphasis=0.97,
                                    frame_length=0.025,
                                    frame_step=0.010,
                                    fft_length=None,
                                    lower_edge_hertz=80.0,
                                    upper_edge_hertz=7600.0,
                                    num_mel_bins=80,
                                    log_noise_floor=1e-3,
                                    apply_mask=True):
    """Implement mel-filterbank extraction using tf ops.
  Args:
    waveforms: float32 tensor with shape [max_len, nchannels]
    sample_rate: sampling rate of the waveform
    preemphasis: waveform high-pass filtering constant
    frame_length: frame length in ms
    frame_step: frame_Step in ms
    fft_length: number of fft bins
    lower_edge_hertz: lowest frequency of the filterbank
    upper_edge_hertz: highest frequency of the filterbank
    num_mel_bins: filterbank size
    log_noise_floor: clip small values to prevent numeric overflow in log
    apply_mask: When working on a batch of samples, set padding frames to zero
  Returns:
    filterbanks: a float32 tensor with shape [nchannles, max_len, num_bins]
  """
    del log_noise_floor, apply_mask
    spectrogram = powspec_feat(waveforms,
                               sr=sample_rate,
                               nfft=512 if not fft_length else fft_length,
                               winlen=frame_length,
                               winstep=frame_step,
                               lowfreq=lower_edge_hertz,
                               highfreq=upper_edge_hertz,
                               preemph=preemphasis)

    # [channels, time, feat_dim]
    fbank = fbank_feat(spectrogram,
                       sr=sample_rate,
                       feature_size=num_mel_bins,
                       nfft=512 if not fft_length else fft_length,
                       lowfreq=lower_edge_hertz,
                       highfreq=upper_edge_hertz)

    # [time, feat_dim]
    fbank = tf.cond(tf.equal(tf.rank(fbank), 3),
                    true_fn=lambda: fbank[0, :, :],
                    false_fn=lambda: fbank)
    return fbank
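
A usage sketch (illustrative only), assuming the module's powspec_feat and fbank_feat helpers are importable; the waveform here is random noise rather than real audio.

# Hypothetical usage: one second of single-channel 16 kHz audio.
import tensorflow as tf

waveforms = tf.random.normal([16000, 1])          # [max_len, nchannels]
mel_fbank = compute_mel_filterbank_features(waveforms,
                                            sample_rate=16000,
                                            num_mel_bins=80)
# mel_fbank: [time, 80] after the rank-3 branch selects channel 0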
Example #14
def delta_delta(feat, order=2):
    '''
    params:
      feat: a tensor of shape [nframe, nfbank] or [nframe, nfbank, 1]
    return: [nframe, nfbank, order + 1], e.g. [nframe, nfbank, 3] for order=2
    '''
    feat = tf.cond(tf.equal(tf.rank(feat), 3),
                   true_fn=lambda: feat[:, :, 0],
                   false_fn=lambda: feat)

    shape = tf.shape(feat)
    # delta: [nframe, nfbank * (order + 1)]
    nframe = shape[0]
    nfbank = shape[1]
    delta = py_x_ops.delta_delta(feat, order=order)
    feat_with_delta_delta = tf.reshape(delta, (nframe, nfbank, (order + 1)))
    return feat_with_delta_delta
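
A usage sketch (illustrative only), assuming py_x_ops is available from the surrounding package; the feature tensor is random.

# Hypothetical usage: 100 frames of 40-dimensional filterbank features.
import tensorflow as tf

feat = tf.random.normal([100, 40])        # [nframe, nfbank]
feat_dd = delta_delta(feat, order=2)      # [nframe, nfbank, 3]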
Example #15
    def test_FbankPitch(self):
        wav_path = str(
            Path(os.environ['MAIN_ROOT']).joinpath(
                'delta/layers/ops/data/sm1_cln.wav'))

        with self.cached_session(use_gpu=False, force_gpu=False):
            read_wav = ReadWav.params().instantiate()
            input_data, sample_rate = read_wav(wav_path)
            config = {
                'window_length': 0.025,
                'output_type': 1,
                'frame_length': 0.010,
                'thres_autoc': 0.4
            }
            fbank_pitch = FbankPitch.params(config).instantiate()
            fbank_pitch_test = fbank_pitch(input_data)

            self.assertEqual(tf.rank(fbank_pitch_test).eval(), 2)
Example #16
    def test_framepow(self):
        wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))

        with self.cached_session(use_gpu=False, force_gpu=False):
            read_wav = ReadWav.params().instantiate()
            input_data, sample_rate = read_wav(wav_path)

            framepow = Framepow.params({
                'window_length': 0.025,
                'frame_length': 0.010
            }).instantiate()
            framepow_test = framepow(input_data, sample_rate)

            real_framepow_feats = np.array(
                [9.819611, 9.328745, 9.247337, 9.26451, 9.266059])

            self.assertEqual(tf.rank(framepow_test).eval(), 1)
            self.assertAllClose(framepow_test.eval()[0:5], real_framepow_feats)
Example #17
    def test_cepstrum(self):
        ''' test cepstrum op'''
        with self.cached_session(use_gpu=False, force_gpu=False):
            sample_rate, input_data = feat_lib.load_wav(self.wavpath, sr=16000)

            output = py_x_ops.cepstrum(input_data, sample_rate)

            #pylint: disable=bad-whitespace
            output_true = np.array(
                [[0.525808, 0.579537, 0.159656, 0.014726, -0.1866810],
                 [0.225988, 1.557304, 3.381828, 0.132935, 0.7128600],
                 [-1.832759, -1.045178, 0.753158, 0.116107, -0.9307780],
                 [-0.696277, 1.333355, 1.590942, 2.041829, -0.0805630],
                 [-0.377375, 2.984320, 0.036302, 3.676640, 1.1709290]])
            #pylint: enable=bad-whitespace

            self.assertEqual(tf.rank(output).eval(), 2)
            logging.info('Shape of cepstrum: {}'.format(output.shape))
            self.assertAllClose(output.eval()[15:20, 7:12], output_true)
Example #18
    def test_fbank(self):
        ''' test fbank op'''
        with self.cached_session(use_gpu=False, force_gpu=False):
            data = np.arange(513)
            spectrogram = tf.constant(data[None, None, :], dtype=tf.float32)
            sample_rate = tf.constant(22050, tf.int32)
            output = py_x_ops.fbank(spectrogram,
                                    sample_rate,
                                    filterbank_channel_count=20)

            output_true = np.array([
                1.887894, 2.2693727, 2.576507, 2.8156495, 3.036504, 3.2296343,
                3.4274294, 3.5987632, 3.771217, 3.937401, 4.0988584, 4.2570987,
                4.4110703, 4.563661, 4.7140336, 4.8626432, 5.009346, 5.1539173,
                5.2992935, 5.442024
            ])
            self.assertEqual(tf.rank(output).eval(), 3)
            self.assertEqual(output.shape, (1, 1, 20))
            self.assertAllClose(output.eval(), output_true[None, None, :])
Example #19
  def test_frmpow(self):
    ''' test frame_power op'''
    with self.cached_session(use_gpu=False, force_gpu=False):
      sample_rate, input_data = feat_lib.load_wav(self.wavpath, sr=16000)

      output = py_x_ops.frame_pow(input_data, sample_rate)

      output_true = np.array([
          0.000018, 0.000011, 0.000010, 0.000010, 0.000010, 0.000010, 0.000008,
          0.000009, 0.000009, 0.000009, 0.000009, 0.000011, 0.090164, 0.133028,
          0.156547, 0.053551, 0.056670, 0.097706, 0.405659, 2.119505, 4.296845,
          6.139090, 6.623638, 6.136467, 7.595072, 7.904415, 7.655983, 6.771016,
          5.706427, 4.220942, 3.259599, 2.218259, 1.911394, 2.234246, 3.056905,
          2.534153, 0.464354, 0.013493, 0.021231, 0.148362, 0.364829, 0.627266,
          0.494912, 0.366029, 0.315408, 0.312441, 0.323796, 0.267505, 0.152856,
          0.045305
      ])
      self.assertEqual(tf.rank(output).eval(), 1)
      logging.info('Shape of frame_power: {}'.format(output.eval().shape))
      self.assertAllClose(output.eval().flatten()[:50], output_true)
Example #20
    def test_spectrum(self):
        wav_path = str(
            Path(PACKAGE_ROOT_DIR).joinpath('layers/ops/data/sm1_cln.wav'))

        with self.cached_session(use_gpu=False, force_gpu=False):
            read_wav = ReadWav.params().instantiate()
            input_data, sample_rate = read_wav(wav_path)

            spectrum = Spectrum.params({'window_length': 0.025}).instantiate()
            spectrum_test = spectrum(input_data, sample_rate)

            output_true = np.array(
                [[-16.863441, -16.910473, -17.077059, -16.371634, -16.845686],
                 [-17.922068, -20.396345, -19.396944, -17.331493, -16.118851],
                 [-17.017776, -17.551350, -20.332376, -17.403994, -16.617926],
                 [-19.873854, -17.644503, -20.679525, -17.093716, -16.535091],
                 [-17.074402, -17.295971, -16.896650, -15.995432, -16.560730]])

            self.assertEqual(tf.rank(spectrum_test).eval(), 2)
            self.assertAllClose(spectrum_test.eval()[4:9, 4:9], output_true)
Example #21
  def test_plp(self):
    ''' test plp op'''
    with self.cached_session(use_gpu=False, force_gpu=False):
      sample_rate, input_data = feat_lib.load_wav(self.wavpath, sr=16000)

      output = py_x_ops.plp(input_data, sample_rate)

      #pylint: disable=bad-whitespace
      output_true = np.array(
          [[-0.209490, -0.326126, 0.010536, -0.027167, -0.117118],
           [-0.020293, -0.454695, -0.104243, 0.001560, -0.234854],
           [-0.015118, -0.444044, -0.156695, -0.086221, -0.319310],
           [-0.031856, -0.130708, 0.047435, -0.089916, -0.160247],
           [0.052763, -0.271487, 0.011329, 0.025320, 0.012851]])
      #pylint: enable=bad-whitespace

      self.assertEqual(tf.rank(output).eval(), 2)
      logging.info('Shape of PLP: {}'.format(output.shape))
      self.assertAllClose(
          output.eval()[50:55, 5:10], output_true, rtol=1e-05, atol=1e-05)
Example #22
    def test_mfcc(self):
        wav_path = str(
            Path(PACKAGE_ROOT_DIR).joinpath('layers/ops/data/sm1_cln.wav'))

        with self.session():
            read_wav = ReadWav.params().instantiate()
            input_data, sample_rate = read_wav(wav_path)
            # config = {}
            mfcc = Mfcc.params().instantiate()
            mfcc_test = mfcc(input_data, sample_rate)

            self.assertEqual(tf.rank(mfcc_test).eval(), 3)

            real_mfcc_feats = np.array(
                [[-30.58736, -7.088838, -10.67966, -1.646479, -4.36086],
                 [-30.73371, -6.128432, -7.930599, 3.208357, -1.086456]])

            self.assertAllClose(np.squeeze(mfcc_test.eval()[0, 0:2, 1:6]),
                                real_mfcc_feats,
                                rtol=1e-05,
                                atol=1e-05)
Example #23
    def test_fbank(self):
        wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))

        with self.cached_session(use_gpu=False, force_gpu=False):
            read_wav = ReadWav.params().instantiate()
            input_data, sample_rate = read_wav(wav_path)
            config = {
                'window_length': 0.025,
                'output_type': 1,
                'frame_length': 0.010,
                'snip_edges': True
            }
            fbank = Fbank.params(config).instantiate()
            fbank_test = fbank(input_data, sample_rate)

            self.assertEqual(tf.rank(fbank_test).eval(), 3)

            if tf.executing_eagerly():
                print(fbank_test.numpy()[0:2, 0:6, 0])
            else:
                print(fbank_test.eval()[0:2, 0:6, 0])
Example #24
  def test_spectrum(self):
    wav_path = str(
        Path(PACKAGE_ROOT_DIR).joinpath('layers/ops/data/sm1_cln.wav'))

    with self.cached_session(use_gpu=False, force_gpu=False):
      read_wav = ReadWav.params().instantiate()
      input_data, sample_rate = read_wav(wav_path)

      spectrum = Spectrum.params({
          'window_length': 0.025,
          'snip_edges': 1
      }).instantiate()
      spectrum_test = spectrum(input_data, sample_rate)

      output_true = np.array(
          [[9.819611, 2.84503, 3.660894, 2.7779, 1.212233],
           [9.328745, 2.553949, 3.276319, 3.000918, 2.499342]])

      self.assertEqual(tf.rank(spectrum_test).eval(), 2)
      self.assertAllClose(
          spectrum_test.eval()[0:2, 0:5], output_true, rtol=1e-05, atol=1e-05)
Example #25
  def test_spectrum(self):
    wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))

    with self.cached_session(use_gpu=False, force_gpu=False):
      read_wav = ReadWav.params().instantiate()
      input_data, sample_rate = read_wav(wav_path)

      pitch = Pitch.params({
          'window_length': 0.025,
          'soft_min_f0': 10.0
      }).instantiate()
      pitch_test = pitch(input_data, sample_rate)

      self.assertEqual(tf.rank(pitch_test).eval(), 2)

      output_true = [[-0.1366025, 143.8855], [-0.0226383, 143.8855],
                     [-0.08464742, 143.8855], [-0.08458386, 143.8855],
                     [-0.1208689, 143.8855]]

      self.assertAllClose(
          pitch_test.eval()[0:5, :], output_true, rtol=1e-05, atol=1e-05)
Example #26
  def test_pitch(self):
    ''' test pitch op'''
    with self.cached_session(use_gpu=False, force_gpu=False):
      # read wave
      sample_rate, input_data = feat_lib.load_wav(self.wavpath, sr=16000)

      output = py_x_ops.pitch(input_data, sample_rate)

      output_true = np.array([
          0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
          0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
          0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
          122.823532, 117.647057, 116.788322, 116.788322, 119.402985,
          119.402985, 119.402985, 119.402985, 119.402985, 123.076920,
          124.031006, 125.000000, 132.065216, 139.130432, 139.130432,
          137.931030, 126.108368, 114.285713, 115.107910, 122.070084,
          129.032257, 130.081299, 130.081299, 129.032257, 130.081299,
          131.147537, 129.032257, 125.000000, 120.300751, 115.107910
      ])
      self.assertEqual(tf.rank(output).eval(), 1)
      logging.info('Shape of pitch: {}'.format(output.eval().shape))
      self.assertAllClose(output.eval().flatten()[:50], output_true)
Example #27
    def test_zcr(self):
        ''' test zcr op'''
        with self.cached_session(use_gpu=False, force_gpu=False):
            sample_rate, input_data = feat_lib.load_wav(self.wavpath, sr=16000)

            output = py_x_ops.zcr(input_data, sample_rate)

            output_true = np.array([
                0.406250, 0.418750, 0.425000, 0.407500, 0.393750, 0.392500,
                0.388750, 0.417500, 0.427500, 0.456250, 0.447500, 0.386250,
                0.357500, 0.282500, 0.232500, 0.262500, 0.282500, 0.295000,
                0.220000, 0.157500, 0.125000, 0.107500, 0.100000, 0.092500,
                0.092500, 0.095000, 0.097500, 0.105000, 0.100000, 0.112500,
                0.120000, 0.132500, 0.130000, 0.135000, 0.112500, 0.120000,
                0.090000, 0.080000, 0.070000, 0.080000, 0.087500, 0.092500,
                0.097500, 0.097500, 0.112500, 0.090000, 0.065000, 0.087500,
                0.175000, 0.240000
            ])
            self.assertEqual(tf.rank(output).eval(), 1)
            logging.info('Shape of zero-cross-rate: {}'.format(
                output.eval().shape))
            self.assertAllClose(output.eval().flatten()[:50], output_true)
Example #28
    def test_mfcc(self):
        wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))

        with self.cached_session(use_gpu=False, force_gpu=False):
            read_wav = ReadWav.params().instantiate()
            input_data, sample_rate = read_wav(wav_path)
            config = {'use_energy': True}
            mfcc = Mfcc.params(config).instantiate()
            mfcc_test = mfcc(input_data, sample_rate)

            self.assertEqual(tf.rank(mfcc_test).eval(), 3)

            real_mfcc_feats = np.array([[
                9.819611, -30.58736, -7.088838, -10.67966, -1.646479, -4.36086
            ], [
                9.328745, -30.73371, -6.128432, -7.930599, 3.208357, -1.086456
            ]])

            self.assertAllClose(np.squeeze(mfcc_test.eval()[0, 0:2, 0:6]),
                                real_mfcc_feats,
                                rtol=1e-05,
                                atol=1e-05)
Example #29
    def test_spectrum(self):
        ''' test spectrum op'''
        with self.cached_session(use_gpu=False, force_gpu=False):
            sample_rate, input_data = feat_lib.load_wav(self.wavpath, sr=16000)
            logging.info(
                f"input shape: {input_data.shape}, sample rate dtype: {sample_rate.dtype}"
            )
            self.assertEqual(sample_rate, 16000)

            output = py_x_ops.spectrum(input_data, sample_rate)

            #pylint: disable=bad-whitespace
            output_true = np.array(
                [[-16.863441, -16.910473, -17.077059, -16.371634, -16.845686],
                 [-17.922068, -20.396345, -19.396944, -17.331493, -16.118851],
                 [-17.017776, -17.551350, -20.332376, -17.403994, -16.617926],
                 [-19.873854, -17.644503, -20.679525, -17.093716, -16.535091],
                 [-17.074402, -17.295971, -16.896650, -15.995432, -16.560730]])
            #pylint: enable=bad-whitespace

            self.assertEqual(tf.rank(output).eval(), 2)
            logging.info('Shape of spectrum: {}'.format(output.shape))
            self.assertAllClose(output.eval()[4:9, 4:9], output_true)
Example #30
def fbank_feat(powspec,
               sr=8000,
               feature_size=40,
               nfft=512,
               lowfreq=0,
               highfreq=None):
    ''' powspec: [audio_channels, spectrogram_length, spectrogram_feat_dim]
        return: [audio_channels, nframe, nfbank]
    '''
    del nfft

    true_fn = lambda: tf.expand_dims(powspec, 0)
    false_fn = lambda: powspec
    powspec = tf.cond(tf.equal(tf.rank(powspec), 2), true_fn, false_fn)

    feat = py_x_ops.fbank(
        powspec,
        sr,
        filterbank_channel_count=feature_size,
        lower_frequency_limit=lowfreq,
        upper_frequency_limit=highfreq,
    )
    return feat
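
A usage sketch (illustrative only); the input power spectrum is synthetic and the sample rate and frequency limit are assumptions.

# Hypothetical usage: single-channel power spectrum, 200 frames, 257 FFT bins.
import tensorflow as tf

powspec = tf.square(tf.random.normal([200, 257]))   # rank-2 input is expanded to rank 3
fbank = fbank_feat(powspec, sr=16000, feature_size=40, highfreq=8000)
# fbank: [audio_channels, nframe, 40]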