Exemplo n.º 1
0
    def test_sfb(self):
        ''' analysis filter bank followed by synthesis filter bank round trip'''
        with self.cached_session(use_gpu=False, force_gpu=False):
            sample_rate, input_data = feat_lib.load_wav(self.wavpath, sr=16000)

            # analysis step: power and phase spectra of the waveform
            power_spc, phase_spc = py_x_ops.analyfiltbank(
                input_data, sample_rate)

            power_shape = power_spc.eval().shape
            logging.info('Shape of power_spc: {}'.format(power_shape))
            phase_shape = phase_spc.eval().shape
            logging.info('Shape of phase_spc: {}'.format(phase_shape))

            # synthesis step: the op is fed the evaluated spectra
            power_val = power_spc.eval()
            phase_val = phase_spc.eval()
            recovered = py_x_ops.synthfiltbank(power_val, phase_val,
                                               sample_rate)

            self.assertEqual(tf.rank(recovered).eval(), 1)
            recovered_shape = recovered.eval().shape
            logging.info(
                'Shape of recovered signal: {}'.format(recovered_shape))

            # beginning 400 samples are different, due to the overlap and add,
            # so only a later window is compared with the input signal
            window = slice(500, 550)
            self.assertAllClose(recovered.eval().flatten()[window],
                                input_data[window],
                                rtol=1e-4,
                                atol=1e-4)
Exemplo n.º 2
0
    def test_afb(self):
        ''' analyfiltbank op: first 50 power/phase values vs. reference'''
        with self.session():
            sample_rate, input_data = feat_lib.load_wav(self.wavpath, sr=16000)

            power_spc, phase_spc = py_x_ops.analyfiltbank(
                input_data, sample_rate)

            # reference values for the first 50 flattened power entries
            expected_power = np.array([
                0.000421823002, 0.000014681223, 0.000088715387, 0.000011405386,
                0.000029108920, 0.000016433882, 0.000009128947, 0.000016150383,
                0.000068095047, 0.000016092306, 0.000088840192, 0.000021255839,
                0.000033152886, 0.000005644561, 0.000012678992, 0.000009685464,
                0.000022561202, 0.000004176219, 0.000032476772, 0.000063007421,
                0.000001721088, 0.000003773108, 0.000012991571, 0.000006143227,
                0.000005361593, 0.000019796202, 0.000012828057, 0.000040009807,
                0.000009260243, 0.000060815764, 0.000036184814, 0.000018079394,
                0.000004533325, 0.000008295409, 0.000033129665, 0.000022150667,
                0.000020058087, 0.000000962711, 0.000017114238, 0.000007549510,
                0.000023227087, 0.000037615722, 0.000007189777, 0.000006701076,
                0.000016871410, 0.000018671506, 0.000006927207, 0.000004177695,
                0.000005777914, 0.000002745287
            ])

            # reference values for the first 50 flattened phase entries
            expected_phase = np.array([
                3.141592741013, 0.017522372305, 2.614648103714, 1.024240016937,
                -0.082203239202, 0.177630946040, -0.947744905949,
                1.557014584541, -2.254315614700, -0.327101945877,
                -2.747241020203, -1.865882754326, -2.847117424011,
                -0.581349492073, -3.014511823654, 2.957268953323,
                1.846585988998, -1.926323652267, -2.718185901642,
                -2.704042911530, -0.473446547985, -2.938575029373,
                2.915200233459, -1.540565252304, -3.052149772644,
                2.665060997009, -2.724275827408, -2.989539623260,
                -2.875509977341, -2.549245357513, 2.585565090179,
                1.503721714020, 1.570051312447, 1.980712175369, 2.068141937256,
                -1.657162785530, 2.774835824966, -1.669888973236,
                -2.816159725189, 3.112393617630, -0.539753019810,
                2.466773271561, 2.961024999619, -1.002810001373,
                2.275165081024, -2.257984638214, -2.611628055573,
                -2.753412723541, -2.071642875671, -2.972373962402
            ])

            # both outputs are expected to be rank-1 tensors
            self.assertEqual(tf.rank(power_spc).eval(), 1)
            self.assertEqual(tf.rank(phase_spc).eval(), 1)

            num_points = 50
            power_head = power_spc.eval().flatten()[:num_points]
            self.assertAllClose(power_head, expected_power)
            phase_head = phase_spc.eval().flatten()[:num_points]
            self.assertAllClose(phase_head, expected_phase)
Exemplo n.º 3
0
    def test_plp(self):
        ''' plp op: compare a 5x5 patch of the output with reference values'''
        with self.session():
            sample_rate, input_data = feat_lib.load_wav(self.wavpath, sr=16000)
            plp_feat = py_x_ops.plp(input_data, sample_rate)

            # reference for rows 50..54, columns 5..9 of the op output
            expected_patch = np.array([
                [-0.209490, -0.326126, 0.010536, -0.027167, -0.117118],
                [-0.020293, -0.454695, -0.104243, 0.001560, -0.234854],
                [-0.015118, -0.444044, -0.156695, -0.086221, -0.319310],
                [-0.031856, -0.130708, 0.047435, -0.089916, -0.160247],
                [0.052763, -0.271487, 0.011329, 0.025320, 0.012851],
            ])

            self.assertEqual(tf.rank(plp_feat).eval(), 2)
            logging.info('Shape of PLP: {}'.format(plp_feat.shape))

            actual_patch = plp_feat.eval()[50:55, 5:10]
            self.assertAllClose(actual_patch, expected_patch)
Exemplo n.º 4
0
    def test_cepstrum(self):
        ''' cepstrum op: compare a 5x5 patch of the output with reference'''
        with self.session():
            sample_rate, input_data = feat_lib.load_wav(self.wavpath, sr=16000)
            cepstrum_feat = py_x_ops.cepstrum(input_data, sample_rate)

            # reference for rows 15..19, columns 7..11 of the op output
            expected_patch = np.array([
                [0.525808, 0.579537, 0.159656, 0.014726, -0.1866810],
                [0.225988, 1.557304, 3.381828, 0.132935, 0.7128600],
                [-1.832759, -1.045178, 0.753158, 0.116107, -0.9307780],
                [-0.696277, 1.333355, 1.590942, 2.041829, -0.0805630],
                [-0.377375, 2.984320, 0.036302, 3.676640, 1.1709290],
            ])

            self.assertEqual(tf.rank(cepstrum_feat).eval(), 2)
            logging.info('Shape of cepstrum: {}'.format(cepstrum_feat.shape))

            actual_patch = cepstrum_feat.eval()[15:20, 7:12]
            self.assertAllClose(actual_patch, expected_patch)
Exemplo n.º 5
0
    def test_spectrum(self):
        ''' spectrum op: compare a 5x5 patch of the output with reference'''
        with self.session():
            sample_rate, input_data = feat_lib.load_wav(self.wavpath, sr=16000)
            spectrum_feat = py_x_ops.spectrum(input_data, sample_rate)

            # reference for rows 4..8, columns 4..8 of the op output
            expected_patch = np.array([
                [-16.863441, -16.910473, -17.077059, -16.371634, -16.845686],
                [-17.922068, -20.396345, -19.396944, -17.331493, -16.118851],
                [-17.017776, -17.551350, -20.332376, -17.403994, -16.617926],
                [-19.873854, -17.644503, -20.679525, -17.093716, -16.535091],
                [-17.074402, -17.295971, -16.896650, -15.995432, -16.560730],
            ])

            self.assertEqual(tf.rank(spectrum_feat).eval(), 2)
            logging.info('Shape of spectrum: {}'.format(spectrum_feat.shape))

            actual_patch = spectrum_feat.eval()[4:9, 4:9]
            self.assertAllClose(actual_patch, expected_patch)
    def test_frmpow(self):
        ''' frame_pow op: first 50 output values vs. reference'''
        with self.session():
            sample_rate, input_data = feat_lib.load_wav(self.wavpath, sr=16000)
            frame_power = py_x_ops.frame_pow(input_data, sample_rate)

            # reference for the first 50 flattened output values
            expected_head = np.array([
                0.000018, 0.000011, 0.000010, 0.000010, 0.000010, 0.000010,
                0.000008, 0.000009, 0.000009, 0.000009, 0.000009, 0.000011,
                0.090164, 0.133028, 0.156547, 0.053551, 0.056670, 0.097706,
                0.405659, 2.119505, 4.296845, 6.139090, 6.623638, 6.136467,
                7.595072, 7.904415, 7.655983, 6.771016, 5.706427, 4.220942,
                3.259599, 2.218259, 1.911394, 2.234246, 3.056905, 2.534153,
                0.464354, 0.013493, 0.021231, 0.148362, 0.364829, 0.627266,
                0.494912, 0.366029, 0.315408, 0.312441, 0.323796, 0.267505,
                0.152856, 0.045305
            ])

            self.assertEqual(tf.rank(frame_power).eval(), 1)

            num_points = 50
            actual_head = frame_power.eval().flatten()[:num_points]
            self.assertAllClose(actual_head, expected_head)
Exemplo n.º 7
0
  def test_pitch(self):
    ''' pitch op: first 50 output values vs. reference'''
    with self.session():
      # load the test waveform
      sample_rate, input_data = feat_lib.load_wav(self.wavpath, sr=16000)
      pitch_track = py_x_ops.pitch(input_data, sample_rate)

      # the first 20 reference values are zero, the remaining 30 are listed
      leading_zeros = [0.000000] * 20
      nonzero_tail = [
          122.823532, 117.647057, 116.788322, 116.788322, 119.402985,
          119.402985, 119.402985, 119.402985, 119.402985, 123.076920,
          124.031006, 125.000000, 132.065216, 139.130432, 139.130432,
          137.931030, 126.108368, 114.285713, 115.107910, 122.070084,
          129.032257, 130.081299, 130.081299, 129.032257, 130.081299,
          131.147537, 129.032257, 125.000000, 120.300751, 115.107910
      ]
      expected_head = np.array(leading_zeros + nonzero_tail)

      self.assertEqual(tf.rank(pitch_track).eval(), 1)

      num_points = 50
      actual_head = pitch_track.eval().flatten()[:num_points]
      self.assertAllClose(actual_head, expected_head)
Exemplo n.º 8
0
    def test_plp(self):
        ''' plp op: first 50 flattened output values vs. reference'''
        with self.session():
            sample_rate, input_data = feat_lib.load_wav(self.wavpath, sr=16000)
            plp_feat = py_x_ops.plp(input_data, sample_rate)

            # reference for the first 50 flattened output values
            expected_head = np.array([
                -0.000000, -0.959257, -0.095592, -0.219479, -0.104977,
                -0.185207, -0.153651, -0.081711, -0.156977, -0.072177,
                0.077400, 0.027594, 0.040156, -0.000000, -0.956464, -0.086729,
                -0.211084, -0.062403, -0.212304, -0.240348, -0.081032,
                -0.036527, -0.071906, 0.025969, 0.004119, 0.003473, -0.000000,
                -0.952486, -0.094521, -0.143834, -0.133079, -0.244882,
                -0.175419, -0.040801, -0.071001, -0.134758, 0.061415, 0.085666,
                0.012909, -0.000000, -0.928211, -0.108592, -0.249340,
                -0.141225, -0.199109, -0.081247, -0.044329, -0.140386,
                -0.174557, -0.045552
            ])

            # this version of the test expects a rank-1 output
            self.assertEqual(tf.rank(plp_feat).eval(), 1)

            num_points = 50
            actual_head = plp_feat.eval().flatten()[:num_points]
            self.assertAllClose(actual_head, expected_head)
    def test_spectrum(self):
        ''' spectrum op: first 50 flattened output values vs. reference'''
        with self.session():
            sample_rate, input_data = feat_lib.load_wav(self.wavpath, sr=16000)
            spectrum_feat = py_x_ops.spectrum(input_data, sample_rate)

            # reference for the first 50 flattened output values
            expected_head = np.array([
                -16.018925, -16.491777, -16.903442, -18.108875, -19.477205,
                -19.039738, -17.066263, -16.530647, -16.033670, -15.492795,
                -15.347169, -16.443783, -15.385968, -15.631793, -16.286760,
                -16.555447, -15.107640, -15.158586, -16.397518, -14.803325,
                -15.173873, -15.785010, -15.551179, -15.487743, -15.732930,
                -15.610220, -15.314099, -14.765355, -14.572725, -13.482535,
                -13.463938, -14.457010, -16.253452, -15.444997, -13.472414,
                -12.852523, -13.163157, -13.957175, -14.148843, -13.527264,
                -12.840333, -13.056757, -14.582790, -13.900843, -13.864534,
                -14.037180, -15.386706, -16.500109, -16.309618, -13.585808
            ])

            # this version of the test expects a rank-1 output
            self.assertEqual(tf.rank(spectrum_feat).eval(), 1)

            num_points = 50
            actual_head = spectrum_feat.eval().flatten()[:num_points]
            self.assertAllClose(actual_head, expected_head)
Exemplo n.º 10
0
    def test_zcr(self):
        ''' zcr op: first 50 output values vs. reference'''
        with self.session():
            sample_rate, input_data = feat_lib.load_wav(self.wavpath, sr=16000)
            zcr_feat = py_x_ops.zcr(input_data, sample_rate)

            # reference for the first 50 flattened output values
            expected_head = np.array([
                0.406250, 0.418750, 0.425000, 0.407500, 0.393750, 0.392500,
                0.388750, 0.417500, 0.427500, 0.456250, 0.447500, 0.386250,
                0.357500, 0.282500, 0.232500, 0.262500, 0.282500, 0.295000,
                0.220000, 0.157500, 0.125000, 0.107500, 0.100000, 0.092500,
                0.092500, 0.095000, 0.097500, 0.105000, 0.100000, 0.112500,
                0.120000, 0.132500, 0.130000, 0.135000, 0.112500, 0.120000,
                0.090000, 0.080000, 0.070000, 0.080000, 0.087500, 0.092500,
                0.097500, 0.097500, 0.112500, 0.090000, 0.065000, 0.087500,
                0.175000, 0.240000
            ])

            self.assertEqual(tf.rank(zcr_feat).eval(), 1)

            zcr_shape = zcr_feat.eval().shape
            logging.info('Shape of zero-cross-rate: {}'.format(zcr_shape))

            num_points = 50
            actual_head = zcr_feat.eval().flatten()[:num_points]
            self.assertAllClose(actual_head, expected_head)
Exemplo n.º 11
0
    def test_spectrum(self):
        ''' spectrum op on CPU: compare a 5x5 patch with reference values'''
        with self.cached_session(use_gpu=False, force_gpu=False):
            sample_rate, input_data = feat_lib.load_wav(self.wavpath, sr=16000)
            logging.info(
                f"input shape: {input_data.shape}, sample rate dtype: {sample_rate.dtype}"
            )
            # the wav is requested at 16 kHz above
            self.assertEqual(sample_rate, 16000)

            spectrum_feat = py_x_ops.spectrum(input_data, sample_rate)

            # reference for rows 4..8, columns 4..8 of the op output
            #pylint: disable=bad-whitespace
            expected_patch = np.array([
                [-16.863441, -16.910473, -17.077059, -16.371634, -16.845686],
                [-17.922068, -20.396345, -19.396944, -17.331493, -16.118851],
                [-17.017776, -17.551350, -20.332376, -17.403994, -16.617926],
                [-19.873854, -17.644503, -20.679525, -17.093716, -16.535091],
                [-17.074402, -17.295971, -16.896650, -15.995432, -16.560730],
            ])
            #pylint: enable=bad-whitespace

            self.assertEqual(tf.rank(spectrum_feat).eval(), 2)
            logging.info('Shape of spectrum: {}'.format(spectrum_feat.shape))

            actual_patch = spectrum_feat.eval()[4:9, 4:9]
            self.assertAllClose(actual_patch, expected_patch)
    def test_afb(self):
        ''' analyfiltbank op: 10x10 corner of the transposed outputs vs. reference'''
        with self.session():
            sample_rate, input_data = feat_lib.load_wav(self.wavpath, sr=16000)

            power_spc, phase_spc = py_x_ops.analyfiltbank(
                input_data, sample_rate)

            # reference for the top-left 10x10 corner of the transposed power
            expected_power = np.array([
                [
                    4.2182300e-04, 3.6964193e-04, 3.9906241e-05, 2.8196722e-05,
                    3.3976138e-04, 3.7671626e-04, 2.2727624e-04, 7.2495081e-05,
                    4.3451786e-05, 3.4654513e-06
                ],
                [
                    1.4681223e-05, 2.8831255e-05, 3.5616580e-05,
                    3.9359711e-05, 1.2714787e-04, 1.2794189e-04,
                    3.6509471e-05, 1.7578101e-05, 5.9672035e-05, 2.9785692e-06
                ],
                [
                    8.8715387e-05, 6.0998322e-05, 2.7695101e-05,
                    1.6866413e-04, 4.6845453e-05, 3.3532990e-05,
                    5.7005627e-06, 5.1852752e-05, 1.8390550e-05, 8.3459439e-05
                ],
                [
                    1.1405386e-05, 1.8942148e-06, 1.6338145e-06,
                    1.8362705e-05, 8.4106450e-06, 4.4174294e-06,
                    3.6533682e-05, 5.0541588e-05, 1.6701326e-06, 1.8736981e-05
                ],
                [
                    2.9108920e-05, 1.6862698e-05, 3.3437627e-05,
                    6.9332527e-05, 5.0028186e-05, 5.9426224e-05,
                    2.1895030e-06, 2.3780794e-06, 4.7786685e-05, 7.3811811e-05
                ],
                [
                    1.6433882e-05, 9.5777386e-07, 2.0980822e-06,
                    4.8990279e-07, 1.4232077e-05, 1.5986938e-05,
                    2.9042780e-05, 1.1719906e-05, 2.4548817e-06, 5.3594176e-06
                ],
                [
                    9.1289467e-06, 9.4249899e-06, 7.4781286e-07,
                    1.8923520e-05, 6.5740237e-06, 4.3209452e-06,
                    3.9396346e-06, 1.2287317e-05, 4.6807354e-06, 5.8512210e-06
                ],
                [
                    1.6150383e-05, 2.6649790e-05, 1.8610657e-05,
                    2.2872716e-06, 1.4209920e-05, 2.3279742e-06,
                    6.6038615e-06, 2.6169775e-05, 2.8335158e-05, 1.7595910e-06
                ],
                [
                    6.8095047e-05, 9.1859045e-05, 2.6713702e-05,
                    3.0580850e-05, 1.4539381e-05, 4.2510033e-05,
                    2.2579852e-05, 1.4843822e-05, 2.0883192e-05, 6.0624756e-05
                ],
                [
                    1.6092306e-05, 1.4245335e-05, 2.4250150e-05,
                    6.0177539e-05, 6.7926321e-06, 3.4922948e-07,
                    2.1843030e-06, 8.5554876e-07, 2.6831965e-06, 2.0012436e-05
                ],
            ])

            # reference for the top-left 10x10 corner of the transposed phase
            expected_phase = np.array([
                [
                    3.1415927, 3.1415927, 3.1415927, 0.0, 0.0, 0.0, 0.0, 0.0,
                    0.0, 3.1415927
                ],
                [
                    0.01752237, 1.6688037, 1.4971976, 1.4470094, 2.0516894,
                    -2.3112175, -0.7115377, 2.9614341, -1.2494497, -0.7055688
                ],
                [
                    2.614648, 0.63351387, -2.0660093, 1.7626916, -1.1257634,
                    3.017448, -2.892095, -1.2209401, 1.7407895, -1.0281658
                ],
                [
                    1.02424, -1.8967879, -0.6139833, 2.587602, 3.0070715,
                    1.5781559, -1.899145, -1.1459525, -0.24284656, -0.8106653
                ],
                [
                    -0.08220324, 0.5497215, 1.7031444, -2.8960562, -1.3680246,
                    0.4349923, 2.0676146, 1.2389332, 2.6312854, -1.7511902
                ],
                [
                    0.17763095, 2.7475302, -0.20671827, 1.0719725, -2.388657,
                    1.189566, -1.0643665, 2.5955305, -0.69036585, -0.5287417
                ],
                [
                    -0.9477449, -2.7059674, 0.53469753, 1.9289348, 0.24833842,
                    0.03517391, -1.4778724, -0.16577117, -1.7509687,
                    -0.46875867
                ],
                [
                    1.5570146, -2.9596932, -0.7975963, 3.0060582, -1.038453,
                    0.14911443, -1.5873562, 0.7229206, 2.679422, -1.1890441
                ],
                [
                    -2.2543156, 0.47845784, -2.8412538, -0.5494534, 1.6583048,
                    -1.4567885, 1.0724461, -2.70243, -0.2690962, 1.8831034
                ],
                [
                    -0.32710192, 0.01503609, 0.29720783, -0.7409194,
                    -2.183623, 2.3637679, 0.6405145, 1.4975713, 0.18241015,
                    2.2659144
                ],
            ])

            # this version of the test expects rank-2 outputs
            self.assertEqual(tf.rank(power_spc).eval(), 2)
            self.assertEqual(tf.rank(phase_spc).eval(), 2)
            logging.info('power_spc shape: {}'.format(power_spc.shape))
            logging.info('phase_spc shape: {}'.format(phase_spc.shape))

            # outputs are transposed before taking the corner to compare
            power_corner = power_spc.eval().T[:10, :10]
            self.assertAllClose(power_corner, expected_power)
            phase_corner = phase_spc.eval().T[:10, :10]
            self.assertAllClose(phase_corner, expected_phase)
Exemplo n.º 13
0
    def generate_data(self):
        ''' Generate the examples of one epoch, one example at a time.

        Increments `self._epoch`, shuffles `self.data_items` in place and then,
        for every `(filename, examples)` pair, loads the file once and yields
        one tuple per `(label, seg, clip_id)` entry of `examples`. `seg` is
        used as `(start, end, pad)`: the data is padded at the end by `pad`
        and then sliced with `start:end`.

        Yields:
          tuple `(inputs, text2id, labelid, filename, clip_id, soft_label)`:
            inputs: the segment of raw samples when `self._file_suffix` is
              '.wav', otherwise the segment of the loaded feature array.
            text2id: result of `self._word_table_lookup` on the transcript
              when text is enabled, otherwise an array of
              `self._max_text_len` zeros.
            labelid: `self.class_id(label)`.
            soft_label: `self.teacher(feat)` when distilling, otherwise a
              list of `class_num` zeros.
          When text is enabled, only entries with `clip_id == 0` are yielded.

        Raises:
          ValueError: distilling is enabled together with '.wav' input.
          AssertionError: a segment does not have the expected length.
        '''
        use_text = self.taskconf['text']['enable']
        self._epoch += 1  # epoch counts from 1

        np.random.shuffle(self.data_items)
        for filename, examples in self.data_items:
            # convert txt to ids
            if use_text:
                # the transcript path is the file name without its last suffix
                text = _load_text('.'.join(filename.split('.')[:-1]))
                text2id = self._word_table_lookup(text)
            else:
                text2id = np.array([0] * self._max_text_len)

            # examples of the current file; collected first so that nothing
            # of a file is yielded if one of its segments fails an assertion
            batch = []

            # gen audio or load feat
            if self._file_suffix == '.wav':
                _, raw_samples = feat_lib.load_wav(filename)
                for label, seg, clip_id in examples:
                    samples = raw_samples
                    if seg[2]:
                        samples = np.pad(samples, [0, seg[2]], mode='constant')
                    samples = samples[seg[0]:seg[1]]
                    assert len(samples) == self.example_len, "{} {}".format(
                        filename, seg)

                    labelid = self.class_id(label)

                    if self.use_distilling:
                        # The original code called `self.teacher(feat)` here,
                        # but no features exist on the raw-wav path (`feat`
                        # was unbound), so fail with an explicit error.
                        raise ValueError(
                            "Distilling is not supported for *.wav input")
                    soft_label = [0] * self.taskconf['classes']['num']

                    # with text, only the first clip of a file is emitted
                    if not use_text or clip_id == 0:
                        batch.append((samples, text2id, labelid, filename,
                                      clip_id, soft_label))

            else:
                feat = np.load(filename)

                # shape : [nframe, feat_size, 3]
                if self._feature_type:
                    fbank = feat_lib.add_delta_delta(feat,
                                                     self._feature_size,
                                                     order=2)
                    if self._input_channels == 1:
                        # keep only the first channel
                        fbank = fbank[:, :, 0:1]
                else:
                    fbank = feat_lib.delta_delta(feat)

                # expected number of frames of one example
                example_frames = self.sample_to_frame(self.example_len)

                for label, seg, clip_id in examples:
                    feat = fbank

                    # segment boundaries are given in samples
                    seg = list(map(self.sample_to_frame, seg))
                    if seg[2]:
                        # need padding
                        feat = np.pad(feat, [(0, seg[2]), (0, 0), (0, 0)],
                                      mode='constant')

                    feat = feat[seg[0]:seg[1], :, :]
                    assert len(feat) == example_frames, \
                        "{} {} {} {} {} {}".format(
                            filename, seg, len(feat), self.example_len,
                            example_frames, seg[2])

                    if self.use_distilling:
                        soft_label = self.teacher(feat)
                    else:
                        soft_label = [0] * self.taskconf['classes']['num']

                    # convert string label to int label
                    labelid = self.class_id(label)

                    # with text, only the first clip of a file is emitted
                    if not use_text or clip_id == 0:
                        batch.append((feat, text2id, labelid, filename,
                                      clip_id, soft_label))

            yield from batch

        logging.info("Out of range")
        # A generator must finish by returning: `raise StopIteration` inside a
        # generator is turned into RuntimeError (PEP 479, Python 3.7+).
        return