示例#1
0
def _create_topk_unique(inputs, k):
    """Select the k largest distinct values of every row, largest first.

    NaN entries are treated as -inf. After a value has been selected, every
    entry greater than or equal to it is masked out, so the selected values
    of a row are strictly decreasing; slots left over once a row runs out of
    distinct values hold -inf.

    Args:
      inputs: rank-2 float tensor whose static shape provides height/width.
      k: Python int, number of values to extract per row.

    Returns:
      A pair of [height, k] tensors: the selected float32 values, and an
      int32 tensor holding the low ceil(log2(width)) bits of each selected
      value's bit pattern (presumably a column index packed there by the
      caller -- TODO confirm).
    """
    num_rows = inputs.shape[0]
    num_cols = inputs.shape[1]
    neg_inf_scalar = tf.constant(-np.inf, dtype=tf.float32)
    neg_inf_matrix = tf.ones([num_rows, num_cols],
                             dtype=tf.float32) * neg_inf_scalar
    # NaN never wins a max-reduction once it has been turned into -inf.
    inputs = tf.where(tf.is_nan(inputs), neg_inf_matrix, inputs)

    remaining = inputs
    top_values = tf.zeros([num_rows, k], dtype=tf.float32)
    for slot in range(k):
        row_max = tf.reduce_max(remaining, axis=1, keepdims=True)
        # Boolean mask that is True only in column `slot` of the output.
        is_slot = tf.equal(tf.range(k), tf.fill([k], slot))
        slot_mask = tf.tile(tf.expand_dims(is_slot, 0), [num_rows, 1])
        top_values = tf.where(slot_mask, tf.tile(row_max, [1, k]),
                              top_values)
        # Hide everything at least as large as the value just selected.
        at_least_max = tf.greater_equal(inputs,
                                        tf.tile(row_max, [1, num_cols]))
        remaining = tf.where(at_least_max, neg_inf_matrix, inputs)

    index_bits = int(math.ceil(math.log(float(int(num_cols)), 2)))
    low_bits_mask = (1 << index_bits) - 1
    mask_matrix = tf.fill([num_rows, k], tf.constant(low_bits_mask))
    top_values_as_int = tf.bitcast(top_values, tf.int32)
    top_indices = tf.bitwise.bitwise_and(top_values_as_int, mask_matrix)
    return top_values, top_indices
示例#2
0
    def test_labels_blankid_to_last(self):
        ''' Check labels_blankid_to_last: error on missing num_class, then
        the sparse result for blank_index 0 and 2 with num_class 6. '''
        with self.cached_session():

            # num_class=None must be rejected with the documented message.
            with self.assertRaises(AssertionError) as assert_err:
                ctc_utils.labels_blankid_to_last(labels=self.labels,
                                                 blank_index=0,
                                                 num_class=None)
            self.assertEqual(str(assert_err.exception),
                             'The num_class should not be None!')

            # Indices and dense shape are the same for both blank positions;
            # only the remapped values differ.
            expected_indices = np.asarray([[0, 0], [0, 1], [0, 2], [0, 3],
                                           [1, 0], [1, 1], [1, 2]])
            expected_shape = np.asarray([2, 4])
            cases = (
                (0, np.asarray([0, 0, 0, 2, 0, 0, 0])),
                (2, np.asarray([1, 1, 1, 2, 1, 1, 1])),
            )
            for blank_index, expected_values in cases:
                converted = ctc_utils.labels_blankid_to_last(
                    labels=tf.constant(self.labels),
                    blank_index=blank_index,
                    num_class=6)
                sparse_value = converted.eval()
                self.assertAllEqual(sparse_value.values, expected_values)
                self.assertAllEqual(sparse_value.indices, expected_indices)
                self.assertAllEqual(sparse_value.dense_shape, expected_shape)
示例#3
0
    def call(self, audio_data, sample_rate=None):
        """Calculate concatenated fbank and pitch features of a wav signal.

        :param audio_data: the audio signal from which to compute features.
                           Should be an (1, N) tensor.
        :param sample_rate: the sample rate of the signal we are working
                            with. When None, the configured ``sample_rate``
                            is used. It is checked against the configured
                            rate with ``tf.assert_equal``.
        :return: A tensor with shape (num_frames, dim_features), containing
                 fbank && pitch feature of every frame in speech.
        """

        p = self.config
        with tf.name_scope('fbank_pitch'):

            # Identity test: `== None` would dispatch to the argument's own
            # `__eq__`, which need not return a plain bool for tensor or
            # array inputs.
            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            # The feature ops below only run once the rate check has passed.
            with tf.control_dependencies([assert_op]):

                fbank_feats = tf.squeeze(self.fbank(audio_data, sample_rate))
                pitch_feats = tf.squeeze(self.pitch(audio_data, sample_rate))
                # Join both feature sets along axis 1.
                fbank_pitch_feats = tf.concat([fbank_feats, pitch_feats], 1)

                return fbank_pitch_feats
示例#4
0
    def call(self, audio_data, sample_rate=None):
        """Calculate pitch features of audio data.

        :param audio_data: the audio signal from which to compute spectrum.
                           Should be an (1, N) tensor.
        :param sample_rate: [option] the sample rate of the signal we are
                            working with. When None, the configured
                            ``sample_rate`` is used (as a float constant).
        :return: A float tensor containing pitch features of every frame in
                 speech. The op output is squeezed, given a leading axis and
                 then transposed. NOTE(review): earlier docs describe the
                 result as (1, num_frames); the final transpose suggests a
                 column layout instead -- confirm.
        """

        p = self.config
        with tf.name_scope('pitch'):

            # Identity test: `== None` would dispatch to the argument's own
            # `__eq__`, which need not return a plain bool for tensor or
            # array inputs.
            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=float)

            # NOTE(review): the left-hand constant takes its dtype from
            # p.sample_rate while the right-hand side is cast to float;
            # presumably the configured rate is a float -- confirm.
            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=float))
            # The pitch op only runs once the rate check has passed.
            with tf.control_dependencies([assert_op]):

                pitch = py_x_ops.pitch(audio_data,
                                       sample_rate,
                                       window_length=p.window_length,
                                       frame_length=p.frame_length,
                                       thres_autoc=p.thres_autoc)

                pitch = tf.squeeze(pitch)
                pitch = tf.transpose(pitch[None, :])
                return pitch
示例#5
0
    def call(self, power_spectrum, phase_spectrum, sample_rate=None):
        """Implement frequency domain to time domain conversion.

        :param power_spectrum: a float tensor of size
                               (num_frames, num_frequencies).
        :param phase_spectrum: a float tensor of size
                               (num_frames, num_frequencies).
        :param sample_rate: a scalar tensor. When None, the configured
                            ``sample_rate`` is used. It is checked against
                            the configured rate with ``tf.assert_equal``.
        :return: audio data produced by the ``synthfiltbank`` op.
        """

        p = self.config
        with tf.name_scope('synthfiltbank'):

            # Identity test: `== None` would dispatch to the argument's own
            # `__eq__`, which need not return a plain bool for tensor or
            # array inputs.
            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            # The synthesis op only runs once the rate check has passed.
            with tf.control_dependencies([assert_op]):

                audio_data = py_x_ops.synthfiltbank(
                    power_spectrum,
                    phase_spectrum,
                    sample_rate,
                    window_length=p.window_length,
                    frame_length=p.frame_length)

                return audio_data
示例#6
0
    def call(self, audio_data, sample_rate=None):
        """Calculate fbank features of audio data.

        The configured ``upper_frequency_limit`` is normalised first and the
        normalised value is written back to the config object.

        :param audio_data: the audio signal from which to compute spectrum.
                           Should be an (1, N) tensor.
        :param sample_rate: [option] the sample rate of the signal we are
                            working with. When None, the configured
                            ``sample_rate`` is used.
        :return: A float tensor of size
                 (num_channels, num_frames, num_frequencies) containing
                 fbank features of every frame in speech.
        """
        p = self.config
        with tf.name_scope('fbank'):

            # Identity test: `== None` would dispatch to the argument's own
            # `__eq__`, which need not return a plain bool for tensor or
            # array inputs.
            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            # A non-positive limit is an offset from half the sample rate;
            # a limit at or below the lower bound, or above half the sample
            # rate, is replaced by half the sample rate.
            if p.upper_frequency_limit <= 0:
                p.upper_frequency_limit = p.sample_rate / 2.0 + p.upper_frequency_limit
            elif (p.upper_frequency_limit <= p.lower_frequency_limit) or (
                    p.upper_frequency_limit > p.sample_rate / 2.0):
                p.upper_frequency_limit = p.sample_rate / 2.0

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            # The feature ops only run once the rate check has passed.
            with tf.control_dependencies([assert_op]):

                spectrum = self.spect(audio_data, sample_rate)
                # Add a leading axis before handing the spectrum to the op.
                spectrum = tf.expand_dims(spectrum, 0)

                fbank = py_x_ops.fbank(
                    spectrum,
                    sample_rate,
                    upper_frequency_limit=p.upper_frequency_limit,
                    lower_frequency_limit=p.lower_frequency_limit,
                    filterbank_channel_count=p.filterbank_channel_count)

                return fbank
示例#7
0
    def call(self, audio_data, sample_rate=None):
        """Calculate power spectrum or log power spectrum of audio data.

        :param audio_data: the audio signal from which to compute spectrum.
                           Should be an (1, N) tensor.
        :param sample_rate: [option] the sample rate of the signal we are
                            working with. When None, the configured
                            ``sample_rate`` is used.
        :return: A float tensor of size (num_frames, num_frequencies)
                 containing power spectrum (output_type=1) or log power
                 spectrum (output_type=2) of every frame in speech.
        """

        p = self.config
        with tf.name_scope('spectrum'):

            # Identity test: `== None` would dispatch to the argument's own
            # `__eq__`, which need not return a plain bool for tensor or
            # array inputs.
            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            # The spectrum op only runs once the rate check has passed.
            with tf.control_dependencies([assert_op]):

                # The rate is compared as int32 but handed to the op as float.
                sample_rate = tf.cast(sample_rate, dtype=float)
                spectrum = py_x_ops.spectrum(
                    audio_data,
                    sample_rate,
                    window_length=p.window_length,
                    frame_length=p.frame_length,
                    output_type=p.output_type,
                    snip_edges=p.snip_edges,
                    raw_energy=p.raw_energy,
                    preEph_coeff=p.preeph_coeff,
                    window_type=p.window_type,
                    remove_dc_offset=p.remove_dc_offset,
                    is_fbank=p.is_fbank)

                return spectrum
示例#8
0
    def call(self, audio_data, sample_rate=None):
        """Calculate power spectrum and phase spectrum of audio data.

        :param audio_data: the audio signal from which to compute spectrum.
                           Should be an (1, N) tensor.
        :param sample_rate: [option] the sample rate of the signal we are
                            working with. When None, the configured
                            ``sample_rate`` is used.
        :return: Two returns:
            power spectrum -- A float tensor of size
                (num_frames, num_frequencies) containing the power spectrum
                of every frame in speech.
            phase spectrum -- A float tensor of size
                (num_frames, num_frequencies) containing the phase spectrum
                of every frame in speech.
        """

        p = self.config
        with tf.name_scope('analyfiltbank'):

            # Identity test: `== None` would dispatch to the argument's own
            # `__eq__`, which need not return a plain bool for tensor or
            # array inputs.
            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            # The analysis op only runs once the rate check has passed.
            with tf.control_dependencies([assert_op]):

                # The rate is compared as int32 but handed to the op as float.
                sample_rate = tf.cast(sample_rate, dtype=float)
                power_spectrum, phase_spectrum = py_x_ops.analyfiltbank(
                    audio_data,
                    sample_rate,
                    window_length=p.window_length,
                    frame_length=p.frame_length)

                return power_spectrum, phase_spectrum
示例#9
0
    def call(self, audio_data, sample_rate=None):
        """Calculate the zero-crossing rate of speech.

        :param audio_data: the audio signal from which to compute spectrum.
                           Should be an (1, N) tensor.
        :param sample_rate: [option] the sample rate of the signal we are
                            working with. When None, the configured
                            ``sample_rate`` is used.
        :return: A tensor with shape (1, num_frames), containing the
                 zero-crossing rate of every frame in speech.
        """

        p = self.config
        with tf.name_scope('zcr'):

            # Identity test: `== None` would dispatch to the argument's own
            # `__eq__`, which need not return a plain bool for tensor or
            # array inputs.
            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            # The zcr op only runs once the rate check has passed.
            with tf.control_dependencies([assert_op]):

                # The rate is compared as int32 but handed to the op as float.
                sample_rate = tf.cast(sample_rate, dtype=float)
                zcr = py_x_ops.zcr(audio_data,
                                   sample_rate,
                                   window_length=p.window_length,
                                   frame_length=p.frame_length)

                return zcr
示例#10
0
    def test_labels_last_to_blankid(self):
        ''' Check the sparse result of labels_last_to_blankid for
        blank_index 0 and 2 with num_class left as None. '''
        with self.cached_session():

            # Indices and dense shape are the same for both blank positions;
            # only the remapped values differ.
            expected_indices = np.asarray([[0, 0], [0, 1], [0, 2], [0, 3],
                                           [1, 0], [1, 1], [1, 2]])
            expected_shape = np.asarray([2, 4])
            cases = (
                (0, np.asarray([2, 2, 2, 4, 2, 2, 2])),
                (2, np.asarray([1, 1, 1, 4, 1, 1, 1])),
            )
            for blank_index, expected_values in cases:
                converted = ctc_utils.labels_last_to_blankid(
                    labels=tf.constant(self.labels),
                    blank_index=blank_index,
                    num_class=None)
                sparse_value = converted.eval()
                self.assertAllEqual(sparse_value.values, expected_values)
                self.assertAllEqual(sparse_value.indices, expected_indices)
                self.assertAllEqual(sparse_value.dense_shape, expected_shape)
示例#11
0
def generate_synthetic_data(input_shape,
                            input_value=0,
                            input_dtype=None,
                            label_shape=None,
                            label_value=0,
                            label_dtype=None,
                            nepoch=None):
    """Build a repeating dataset whose single element is constant-valued.

    Args:
      input_shape: a tf.TensorShape object or nested tf.TensorShapes. The
        shape of the input data.
      input_value: Value of each input element.
      input_dtype: Input dtype. If None, it is inferred from `input_value`.
      label_shape: a tf.TensorShape object or nested tf.TensorShapes. The
        shape of the label data. A falsy value means "no labels".
      label_value: Value of each label element.
      label_dtype: Label dtype. If None, it is inferred from `label_value`.
      nepoch: number of repetitions passed to `Dataset.repeat`. If None,
        the dataset repeats forever.

    Returns:
      Dataset of tensors, or of (input, label) tuples of tensors when
      `label_shape` is set.
    """
    # TODO(kathywu): Replace with SyntheticDataset once it is in contrib.

    def _input_constant(shape):
        return tf.constant(input_value, input_dtype, shape)

    def _label_constant(shape):
        return tf.constant(label_value, label_dtype, shape)

    # map_structure mirrors any nesting of the shape argument in the result.
    inputs = nest.map_structure(_input_constant, input_shape)

    if not label_shape:
        sample = inputs
    else:
        sample = (inputs, nest.map_structure(_label_constant, label_shape))

    return tf.data.Dataset.from_tensors(sample).repeat(nepoch)
示例#12
0
  def call(self, audio_data, sample_rate=None):
    """Run the ``add_rir_noise_aecres`` op on audio data.

    The op is configured from the ``if_add_rir`` / ``rir_filelist``,
    ``if_add_noise`` / ``snr_min`` / ``snr_max`` / ``noise_filelist`` and
    ``if_add_aecres`` / ``aecres_filelist`` config entries.

    :param audio_data: the audio signal to process. Should be an (1, N)
                       tensor.
    :param sample_rate: [option] the sample rate of the signal we are
                        working with. When None, the configured
                        ``sample_rate`` is used.
    :return: A float tensor of size N containing add-noise audio (the op
             output with size-1 axes squeezed away).
    """

    p = self.config
    with tf.name_scope('add_rir_noise_aecres'):
      # Identity test: `== None` would dispatch to the argument's own
      # `__eq__`, which need not return a plain bool for tensor or array
      # inputs.
      if sample_rate is None:
        sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

      assert_op = tf.assert_equal(
          tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
      # The op only runs once the rate check has passed.
      with tf.control_dependencies([assert_op]):
        # The rate is compared as int32 but handed to the op as float.
        sample_rate = tf.cast(sample_rate, dtype=float)
        add_rir_noise_aecres_out = py_x_ops.add_rir_noise_aecres(
            audio_data,
            sample_rate,
            if_add_rir=p.if_add_rir,
            rir_filelist=p.rir_filelist,
            if_add_noise=p.if_add_noise,
            snr_min=p.snr_min,
            snr_max=p.snr_max,
            noise_filelist=p.noise_filelist,
            if_add_aecres=p.if_add_aecres,
            aecres_filelist=p.aecres_filelist)

        return tf.squeeze(add_rir_noise_aecres_out)
示例#13
0
    def call(self, audio_data, sample_rate=None):
        """Calculate mfcc features of audio data.

        :param audio_data: the audio signal from which to compute spectrum.
                           Should be an (1, N) tensor.
        :param sample_rate: [option] the sample rate of the signal we are
                            working with. When None, the configured
                            ``sample_rate`` is used.
        :return: A float tensor of size
                 (num_channels, num_frames, num_frequencies) containing
                 mfcc features of every frame in speech.
        """
        p = self.config
        with tf.name_scope('mfcc'):

            # Identity test: `== None` would dispatch to the argument's own
            # `__eq__`, which need not return a plain bool for tensor or
            # array inputs.
            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            # The feature ops only run once the rate check has passed.
            with tf.control_dependencies([assert_op]):

                spectrum_feats = self.spect(audio_data, sample_rate)
                # Add a leading axis before handing the spectrum to the op.
                spectrum_feats = tf.expand_dims(spectrum_feats, 0)
                fbank_feats = self.fbank(audio_data, sample_rate)
                mfcc = py_x_ops.mfcc(fbank_feats,
                                     spectrum_feats,
                                     sample_rate,
                                     use_energy=p.use_energy,
                                     cepstral_lifter=p.cepstral_lifter,
                                     coefficient_count=p.coefficient_count)
                return mfcc
示例#14
0
  def call(self, audio_data, sample_rate=None):
    """Calculate power of every frame in speech.

    :param audio_data: the audio signal from which to compute spectrum.
                       Should be an (1, N) tensor.
    :param sample_rate: [option] the sample rate of the signal we are
                        working with. When None, the configured
                        ``sample_rate`` is used (as a float constant).
    :return: A float tensor of size (1, num_frames) containing power of
             every frame in speech.
    """

    p = self.config
    with tf.name_scope('framepow'):

      # Identity test: `== None` would dispatch to the argument's own
      # `__eq__`, which need not return a plain bool for tensor or array
      # inputs.
      if sample_rate is None:
        sample_rate = tf.constant(p.sample_rate, dtype=float)

      # NOTE(review): the left-hand constant takes its dtype from
      # p.sample_rate while the right-hand side is cast to float; presumably
      # the configured rate is a float -- confirm.
      assert_op = tf.assert_equal(
          tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=float))
      # The op only runs once the rate check has passed.
      with tf.control_dependencies([assert_op]):

        framepow = py_x_ops.frame_pow(
            audio_data,
            sample_rate,
            window_length=p.window_length,
            frame_length=p.frame_length)

        return framepow
示例#15
0
  def call(self, audio_data, sample_rate=None):
    """Calculate cepstrum of audio data.

    :param audio_data: the audio signal from which to compute spectrum.
                       Should be an (1, N) tensor.
    :param sample_rate: [option] the sample rate of the signal we are
                        working with. When None, the configured
                        ``sample_rate`` is used (as a float constant).
    :return: A float tensor of size (num_frames, ceps_subband_num)
             containing normalized cepstrum (tag_ceps_mean_norm = True) or
             cepstrum (tag_ceps_mean_norm = False) of every frame in speech.
    """

    p = self.config

    with tf.name_scope('cepstrum'):

      # Identity test: `== None` would dispatch to the argument's own
      # `__eq__`, which need not return a plain bool for tensor or array
      # inputs.
      if sample_rate is None:
        sample_rate = tf.constant(p.sample_rate, dtype=float)

      # NOTE(review): the left-hand constant takes its dtype from
      # p.sample_rate while the right-hand side is cast to float; presumably
      # the configured rate is a float -- confirm.
      assert_op = tf.assert_equal(
          tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=float))
      # The op only runs once the rate check has passed.
      with tf.control_dependencies([assert_op]):

        cepstrum = py_x_ops.cepstrum(
            audio_data,
            sample_rate,
            window_length=p.window_length,
            frame_length=p.frame_length,
            ceps_subband_num=p.ceps_subband_num,
            tag_ceps_mean_norm=p.tag_ceps_mean_norm)

        return cepstrum
示例#16
0
    def call(self, audio_data, sample_rate=None):
        """Calculate plp features of audio data.

        :param audio_data: the audio signal from which to compute spectrum.
                           Should be an (1, N) tensor.
        :param sample_rate: [option] the sample rate of the signal we are
                            working with. When None, the configured
                            ``sample_rate`` is used.
        :return: A float tensor of size (num_frames, (plp_order + 1))
                 containing plp features of every frame in speech.
        """

        p = self.config
        with tf.name_scope('plp'):

            # Identity test: `== None` would dispatch to the argument's own
            # `__eq__`, which need not return a plain bool for tensor or
            # array inputs.
            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            # The plp op only runs once the rate check has passed.
            with tf.control_dependencies([assert_op]):

                # The rate is compared as int32 but handed to the op as float.
                sample_rate = tf.cast(sample_rate, dtype=float)
                plp = py_x_ops.plp(audio_data,
                                   sample_rate,
                                   window_length=p.window_length,
                                   frame_length=p.frame_length,
                                   plp_order=p.plp_order)
                return plp
示例#17
0
  def get_learning_rate(self):
    """Build the learning rate selected by the solver configuration.

    Reads ``config['solver']['optimizer']['learning_rate']``; its ``type``
    entry selects the schedule:

      * ``exp_decay``: staircase exponential decay driven by ``decay_steps``
        and ``decay_rate``.
      * ``piecewise``: piecewise-constant schedule from ``boundaries`` and
        ``values``.
      * ``warmup``: linear decay to zero over
        ceil(train_data_size * epochs / batch_size) steps, additionally
        scaled by ``global_step / num_warmup_steps`` while the global step
        is below ``num_warmup_steps``.
      * ``const``: the configured ``rate`` unchanged.

    The result is also recorded as the ``lr`` scalar summary.

    Returns:
      The learning rate: a tensor for the scheduled types, the raw
      configured value for ``const``.

    Raises:
      AssertionError: if ``values`` is not one longer than ``boundaries``
        for the ``piecewise`` type.
      ValueError: if the configured type is not supported.
    """
    lrconf = self.config['solver']['optimizer']['learning_rate']
    learning_rate = lrconf['rate']
    learning_type = lrconf['type']

    #pylint: disable=invalid-name
    if learning_type == 'exp_decay':
      lr = tf.train.exponential_decay(
          learning_rate,
          tf.train.get_or_create_global_step(),
          lrconf['decay_steps'],
          lrconf['decay_rate'],
          staircase=True)
    elif learning_type == 'piecewise':
      # e.g. boundaries = [15000, 30000], values = [1e-3, 1e-4, 1e-5]
      boundaries = lrconf['boundaries']
      values = lrconf['values']
      assert len(values) == len(
          boundaries) + 1, 'values len must equal boundaries len plus one'
      lr = tf.train.piecewise_constant(
          tf.train.get_or_create_global_step(),
          boundaries=boundaries,
          values=values)
    elif learning_type == 'warmup':
      learning_rate = tf.constant(
          value=learning_rate, shape=[], dtype=tf.float32)
      global_step = tf.train.get_or_create_global_step()
      data_size = self.config['data']['train_data_size']
      num_epochs = self.config["data"]["task"]['epochs']
      batch_size = self.config["data"]["task"]['batch_size']
      # Total number of training steps over all epochs.
      num_batch = int(math.ceil(data_size * num_epochs / batch_size))
      learning_rate = tf.train.polynomial_decay(
          learning_rate,
          global_step,
          num_batch,
          end_learning_rate=0.0,
          power=1.0,
          cycle=False)

      num_warmup_steps = lrconf['num_warmup_steps']
      if num_warmup_steps:
        global_steps_int = tf.cast(global_step, tf.int32)
        warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

        global_steps_float = tf.cast(global_steps_int, tf.float32)
        warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

        warmup_percent_done = global_steps_float / warmup_steps_float
        warmup_learning_rate = learning_rate * warmup_percent_done

        # is_warmup is 1.0 during warmup and 0.0 afterwards, so it selects
        # between the scaled and the plain decayed rate.
        is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
        lr = ((1.0 - is_warmup) * learning_rate +
              is_warmup * warmup_learning_rate)
      else:
        # No warmup configured. Blending with a division by zero warmup
        # steps would produce inf/NaN, and 0 * NaN is still NaN, so the
        # decayed rate is used directly.
        lr = learning_rate
    elif learning_type == 'const':
      lr = learning_rate
    else:
      raise ValueError(
          "Not support learning rate type: {}".format(learning_type))
    tf.summary.scalar('lr', lr)
    return lr
示例#18
0
 def begin(self):
     """Cache the global-step tensor and derive the epoch tensor from it.

     Raises:
       RuntimeError: if no global-step tensor is available.
     """
     global_step = tf.train.get_or_create_global_step()
     self._global_step_tensor = global_step
     if global_step is None:
         raise RuntimeError(
             "Global step should be created to use StopAtStepHook.")
     # NOTE(review): this multiplies the step by the examples-per-epoch count
     # and divides by the global batch size; confirm this is the intended
     # step-to-epoch conversion.
     scaled_step = global_step * tf.constant(self._num_examples_per_epoch)
     self._epoch_tensor = scaled_step / tf.constant(self._global_batch_size)
示例#19
0
    def curvature_range(self):
        """Build the ops that track the min/max curvature estimates.

        Stores ``_grad_norm_squared + EPS`` in a circular window of length
        ``_curv_win_width``, takes the min and max over the filled part of
        the window (optionally in log space), feeds both to the moving
        averager and exposes the smoothed values as ``self._h_min`` and
        ``self._h_max``.

        Returns:
          A one-element list holding the moving-average update op.
        """
        # set up the curvature window
        self._curv_win = tf.Variable(np.zeros([
            self._curv_win_width,
        ]),
                                     dtype=tf.float32,
                                     name="curv_win",
                                     trainable=False)
        # we can use log smoothing for curvature range to follow trend faster
        # self._curv_win = tf.scatter_update(
        #   self._curv_win, self._global_step % self._curv_win_width,
        #   tf.log(self._grad_norm_squared + EPS))
        # Overwrite the slot selected by the step counter modulo the window
        # width, so the window acts as a circular buffer.
        self._curv_win = tf.scatter_update(
            self._curv_win, self._global_step % self._curv_win_width,
            self._grad_norm_squared + EPS)
        # note here the iterations start from iteration 0
        # Only the first min(window width, step + 1) entries are sliced out,
        # i.e. the slots that have been written so far.
        valid_window = tf.slice(
            self._curv_win, tf.constant([
                0,
            ]),
            tf.expand_dims(tf.minimum(tf.constant(self._curv_win_width),
                                      self._global_step + 1),
                           dim=0))

        # Window extremes, taken in log space when log smoothing is enabled.
        if self._h_min_log_smooth:
            self._h_min_t = tf.log(tf.reduce_min(valid_window) + EPS)
        else:
            self._h_min_t = tf.reduce_min(valid_window)
        if self._h_max_log_smooth:
            self._h_max_t = tf.log(tf.reduce_max(valid_window) + EPS)
        else:
            self._h_max_t = tf.reduce_max(valid_window)

        curv_range_ops = []
        with tf.control_dependencies([self._h_min_t, self._h_max_t]):
            avg_op = self._moving_averager.apply(
                [self._h_min_t, self._h_max_t])
            # Read the averages only after the averager has been updated;
            # log-smoothed values are mapped back with exp.
            with tf.control_dependencies([avg_op]):
                if self._h_min_log_smooth:
                    self._h_min = tf.exp(
                        tf.identity(
                            self._moving_averager.average(self._h_min_t)))
                else:
                    self._h_min = \
                      tf.identity(self._moving_averager.average(self._h_min_t))
                if self._h_max_log_smooth:
                    self._h_max = tf.exp(
                        tf.identity(
                            self._moving_averager.average(self._h_max_t)))
                else:
                    self._h_max = \
                      tf.identity(self._moving_averager.average(self._h_max_t))
            # Optional rescaling of both estimates by the sparsity average.
            if self._sparsity_debias:
                self._h_min = self._h_min * self._sparsity_avg
                self._h_max = self._h_max * self._sparsity_avg
        curv_range_ops.append(avg_op)
        return curv_range_ops
示例#20
0
    def apply_gradients(self, grads_tvars, global_step=None, name=None):
        """Clip the gradients, update the hyper-parameters and apply them.

        Args:
          grads_tvars: iterable of (gradient, variable) pairs; pairs whose
            gradient is None are dropped.
          global_step: forwarded to the wrapped optimizer's
            ``apply_gradients``.
          name: forwarded to the wrapped optimizer's ``apply_gradients``.

        Returns:
          A grouped op covering the pre-apply op, the hyper-parameter
          update, the gradient application, the two adaptive-clip
          assignments and the increment of the internal step counter.
        """
        self._grads, self._tvars = zip(*[(g, t) for g, t in grads_tvars
                                         if g is not None])

        # for manual gradient clipping
        if self._clip_thresh_var is not None:
            self._grads, self._grads_norm = tf.clip_by_global_norm(
                self._grads, self._clip_thresh_var)

        # loosely adaptive clipping of gradient in case exploding gradient ruins statistics
        # While tuning is active the threshold is
        # sqrt(stat_protect_fac * adapt_grad_clip_thresh**2); otherwise a
        # very large constant is used as the threshold.
        if self._use_adapt_grad_clip:
            thresh = tf.cond(
                self._do_tune, lambda: tf.sqrt(self._stat_protect_fac * self.
                                               _adapt_grad_clip_thresh**2),
                lambda: tf.to_float(tf.constant(LARGE_FLOAT_VAL)))
            self._grads, self._grads_norm = tf.clip_by_global_norm(
                self._grads, thresh)

        with tf.variable_scope("before_apply"):
            before_apply_op = self.before_apply()

        # The hyper-parameter update is ordered after the pre-apply op.
        with tf.variable_scope("update_hyper"):
            with tf.control_dependencies([before_apply_op]):
                update_hyper_op = self.update_hyper_param()

        # The gradients are applied only after the hyper-parameter update.
        with tf.variable_scope("apply_updates"):
            with tf.control_dependencies([update_hyper_op]):

                # clip exploding gradient according to h_max
                if self._use_adapt_grad_clip:
                    thresh = tf.cond(
                        tf.greater(tf.global_norm(self._grads),
                                   self._adapt_grad_clip_thresh),
                        lambda: self._adapt_grad_clip_target_val,
                        lambda: tf.to_float(tf.constant(LARGE_FLOAT_VAL)))
                    self._grads, self._grads_norm = tf.clip_by_global_norm(
                        self._grads, thresh)

                apply_grad_op = self._optimizer.apply_gradients(
                    zip(self._grads, self._tvars), global_step, name)

        # Bookkeeping that is ordered after the gradients have been applied:
        # advance the internal step counter and refresh both adaptive-clip
        # variables from sqrt(h_max).
        with tf.control_dependencies([apply_grad_op]):
            self._increment_global_step_op = tf.assign(self._global_step,
                                                       self._global_step + 1)

            self._adapt_grad_clip_thresh_op = \
              tf.assign(self._adapt_grad_clip_thresh, tf.sqrt(self._h_max) )
            self._adapt_grad_clip_target_val_op = \
              tf.assign(self._adapt_grad_clip_target_val, tf.sqrt(self._h_max) )
            # self._adapt_grad_clip_target_val_op = \
            #   tf.assign(self._adapt_grad_clip_target_val, tf.sqrt(tf.sqrt(self._h_max * self._h_min)))

        return tf.group(before_apply_op, update_hyper_op, apply_grad_op,
                        self._adapt_grad_clip_thresh_op,
                        self._adapt_grad_clip_target_val_op,
                        self._increment_global_step_op)
示例#21
0
    def on_epoch_end(self, epoch, logs=None):
        '''Compute the token error on the evaluation data after an epoch.

        Every evaluation batch is decoded (greedy CTC decoding when
        ``decoder_type`` is 'argmax', beam search otherwise), the decoded
        sequences are compared with the targets, and the resulting token
        error is stored in ``logs['val_token_err']`` and logged.

        :param epoch: zero-based epoch index; it is logged as ``epoch + 1``.
        :param logs: metrics dict to update in place. ``val_loss`` and
                     ``loss`` are logged when present. When None, a fresh
                     dict is used.
        '''
        # A fresh dict per call: a shared `{}` default would carry
        # 'val_token_err' from one call into the next.
        if logs is None:
            logs = {}

        cur_session = K.get_session()
        target_seq_list, predict_seq_list = [], []

        # The evaluation source is either a dataset, an iterator, or an
        # indexable Python sequence.
        is_py_sequence = True
        if isinstance(self.eval_ds,
                      (dataset_ops.DatasetV2, dataset_ops.DatasetV1)):
            eval_gen = self.eval_ds.make_one_shot_iterator()
            self.next_batch_gen = eval_gen.get_next()[0]
            is_py_sequence = False
        elif isinstance(self.eval_ds,
                        (iterator_ops.IteratorV2, iterator_ops.Iterator)):
            # Read from the iterator that was just type-checked.
            self.next_batch_gen = self.eval_ds.get_next()[0]
            is_py_sequence = False

        for index in range(len(self.eval_task)):
            if is_py_sequence:
                batch_data = self.eval_ds[index][0]
            else:
                batch_data = cur_session.run(self.next_batch_gen)
            batch_input = batch_data['inputs']
            batch_target = batch_data['targets'].tolist()
            batch_predict = self.func(batch_input)[0]

            if self.decoder_type == 'argmax':
                predict_seq_list += py_ctc.ctc_greedy_decode(batch_predict,
                                                             0,
                                                             unique=True)
            else:
                sequence_lens = [
                    len(pre_sequence) for pre_sequence in batch_predict
                ]
                batch_decoder, _ = tf_ctc.ctc_beam_search_decode(
                    tf.constant(batch_predict),
                    tf.constant(sequence_lens),
                    beam_width=3,
                    top_paths=3)
                # Only the first of the returned paths is scored.
                predict_seq_list += cur_session.run(batch_decoder)[0].tolist()
            target_seq_list += batch_target

        val_token_errors = metrics_lib.token_error(
            predict_seq_list=predict_seq_list,
            target_seq_list=target_seq_list,
            eos_id=0)
        logs['val_token_err'] = val_token_errors

        if 'val_loss' in logs:
            logging.info("Epoch {}: on eval, val_loss is {}.".format(
                epoch + 1, logs['val_loss']))
        logging.info("Epoch {}: on eval, token_err is {}.".format(
            epoch + 1, val_token_errors))
        # 'loss' is optional, like 'val_loss': its absence must not fail
        # the callback.
        if 'loss' in logs:
            logging.info("Epoch {}: loss on train is {}".format(
                epoch + 1, logs['loss']))
示例#22
0
    def test_ctc_beam_search_decode(self):
        ''' Beam search with width 1 and a single path must decode the
        fixture logits to [[1], [1]]. '''

        with self.cached_session():
            logits = tf.constant(self.logits)
            sequence_lens = tf.constant(self.sequence_lens)
            decoded_paths, _ = tf_ctc.ctc_beam_search_decode(logits,
                                                             sequence_lens,
                                                             beam_width=1,
                                                             top_paths=1)
            best_path = decoded_paths[0].eval()
            self.assertAllEqual(best_path, [[1], [1]])
示例#23
0
    def test_logits_blankid_to_last(self):
        '''Unit test for the ctc_utils.logits_blankid_to_last interface.

        Covers two cases:
          * blank_index=10 must raise ValueError with the documented message;
          * blank_index=0 must produce the values stored in logits_transform.
        '''
        with self.cached_session():

            # An invalid blank index has to be rejected before any
            # transformation is attempted.
            with self.assertRaises(ValueError) as value_err:
                ctc_utils.logits_blankid_to_last(
                    logits=tf.constant(self.logits), blank_index=10)
            self.assertEqual(
                str(value_err.exception),
                'blank_index must be less than or equal to num_class - 1')

            logits = ctc_utils.logits_blankid_to_last(
                logits=tf.constant(self.logits), blank_index=0)
            logits_transform = np.asarray(
                [
                    [
                        [0.221185, 0.0917319, 0.0129757, 0.0142857,
                         0.0260553, 0.633766],
                        [0.588392, 0.278779, 0.0055756, 0.00569609,
                         0.010436, 0.111121],
                        [0.633813, 0.321418, 0.00249248, 0.00272882,
                         0.0037688, 0.0357786],
                        [0.643849, 0.280111, 0.00283995, 0.0035545,
                         0.00331533, 0.0663296],
                        [0.196634, 0.123377, 0.50648837, 0.00903441,
                         0.00623107, 0.158235],
                    ],
                    [
                        [0.28562, 0.0831517, 0.0862751, 0.0816851,
                         0.161508, 0.30176],
                        [0.397533, 0.0557226, 0.0546814, 0.0557528,
                         0.19549, 0.24082],
                        [0.450868, 0.0389607, 0.038309, 0.0391602,
                         0.202456, 0.230246],
                        [0.429522, 0.0326593, 0.0339046, 0.0326856,
                         0.190345, 0.280884],
                        [0.315517, 0.0338439, 0.0393744, 0.0339315,
                         0.154046, 0.423286],
                    ],
                ],
                dtype=np.float32)
            # Compare against the expected array rather than the tensor
            # itself, otherwise the assertion can never fail.
            self.assertAllClose(logits.eval(), logits_transform)
示例#24
0
文件: utils.py 项目: youisbaby/delta
 def pad_tail_tensor():
     """Pad origin_t with padding_token by (new_length - cur_length).

     The paddings tensor is built with tf.scatter_nd: for dim == 1 it has
     shape [1, 2] with the amount written at position [0, 1]; otherwise it
     has shape [2, 2] with the amount written at position [1, 1].
     All of dim, new_length, cur_length, origin_t and padding_token come
     from the enclosing scope.
     """
     if dim == 1:
         shape_values, index_values = [1, 2], [[0, 1]]
     else:
         shape_values, index_values = [2, 2], [[1, 1]]
     paddings_shape = tf.constant(shape_values)
     scatter_index = tf.constant(index_values)
     pad_amount = [new_length - cur_length]
     paddings = tf.scatter_nd(scatter_index, pad_amount, paddings_shape)
     return tf.pad(origin_t,
                   paddings,
                   "CONSTANT",
                   constant_values=padding_token)
示例#25
0
  def test_crf_loss(self):
    '''Check loss_utils.crf_log_likelihood against a reference loss value.

    Uses one sequence of five steps over three tags with a constant 0.5
    transition matrix.
    '''
    with self.cached_session():
      loss_true = np.float32(5.5096426)
      logits = np.asarray([[[0.3, 0.4, 0.3], [0.1, 0.9, 0.0], [0.2, 0.7, 0.1],
                            [0.3, 0.2, 0.5], [0.6, 0.2, 0.2]]],
                          dtype=np.float32)  # [1,5,3]
      trans_params = tf.fill([3, 3], 0.5, name='trans_params')
      labels = np.asarray([[0, 1, 2, 0, 1]], dtype=np.int32)  # shape=[1,5]
      sequence_lengths = np.asarray([5], dtype=np.int32)  # shape=[1,]
      loss, _ = loss_utils.crf_log_likelihood(
          tf.constant(logits), tf.constant(labels),
          tf.constant(sequence_lengths), trans_params)

      # The loss is a computed float32; compare with a tolerance because
      # bit-exact equality is not stable across kernels and platforms.
      self.assertAllClose(loss.eval(), loss_true)
示例#26
0
 def get_lr_tensor(self):
     """Return the learning-rate tensor derived from self._mu and self._h_min.

     The base rate is (1 - sqrt(mu))^2 / (h_min + EPS). The result is the
     minimum of that base rate and the base rate scaled by
     (global_step + 1) / 10 / curv_win_width.
     """
     base_lr = (1.0 - tf.sqrt(self._mu))**2 / (self._h_min + EPS)
     steps_done = tf.to_float(self._global_step) + 1.0
     scaled_lr = base_lr * steps_done / 10.0
     window_width = tf.to_float(tf.constant(self._curv_win_width))
     return tf.minimum(base_lr, scaled_lr / window_width)
示例#27
0
 def grad_variance(self):
     """Build the gradient moving-average op and set self._grad_var.

     Side effects: assigns self._grad_avg, self._grad_avg_squared and
     self._grad_var.

     Returns:
         A one-element list holding the op returned by
         self._moving_averager.apply for the (densified) gradients.
     """

     def _as_dense(var, grad):
         # IndexedSlices gradients are accumulated by index with
         # unsorted_segment_sum and reshaped to the variable's shape;
         # anything else is passed through untouched.
         if not isinstance(grad, ops.IndexedSlices):
             return grad
         summed = tf.unsorted_segment_sum(grad.values, grad.indices,
                                          grad.dense_shape[0])
         return tf.reshape(summed, shape=var.get_shape())

     dense_grads = [
         _as_dense(var, grad) for var, grad in zip(self._tvars, self._grads)
     ]
     avg_op = self._moving_averager.apply(dense_grads)

     # Read the averages only after the moving-average update has run.
     with tf.control_dependencies([avg_op]):
         self._grad_avg = [
             self._moving_averager.average(grad) for grad in dense_grads
         ]
         self._grad_avg_squared = [tf.square(avg) for avg in self._grad_avg]

     # Variance = averaged squared norm minus squared norm of the average,
     # floored at EPS.
     floor = tf.constant(EPS, dtype=self._grad_norm_squared_avg.dtype)
     squared_avg_total = tf.add_n(
         [tf.reduce_sum(sq) for sq in self._grad_avg_squared])
     self._grad_var = tf.maximum(
         floor, self._grad_norm_squared_avg - squared_avg_total)
     if self._sparsity_debias:
         self._grad_var *= self._sparsity_avg
     return [avg_op]
示例#28
0
    def test_delta_delta(self):
        '''Check the output shape of tffeat.delta_delta with order=2.

        Mel filterbank features are computed from self.wavpath and then
        passed through delta_delta; the evaluated result must have shape
        (11, 40, 3).
        '''
        params = tffeat.speech_params(sr=self.sr_true,
                                      bins=40,
                                      cmvn=False,
                                      audio_desired_samples=1000,
                                      add_delta_deltas=False)

        with self.cached_session(use_gpu=False, force_gpu=False):
            wav_path_t = tf.constant(self.wavpath)
            # Only the waveform is used; the returned sample rate is dropped
            # and the rate from params is passed to the feature extractor.
            audio, _ = tffeat.read_wav(wav_path_t, self.hp)

            fbank_options = dict(
                sample_rate=params.audio_sample_rate,
                preemphasis=params.audio_preemphasis,
                frame_length=params.audio_frame_length,
                frame_step=params.audio_frame_step,
                lower_edge_hertz=params.audio_lower_edge_hertz,
                upper_edge_hertz=params.audio_upper_edge_hertz,
                num_mel_bins=params.audio_num_mel_bins,
                apply_mask=False)
            fbank = tffeat.compute_mel_filterbank_features(
                audio, **fbank_options)

            with_deltas = tffeat.delta_delta(fbank, order=2)
            self.assertEqual(with_deltas.eval().shape, (11, 40, 3))
示例#29
0
def compute_mfcc():
    """Compute MFCC features for every utterance and write them out.

    Command-line arguments are read through get_parser(). Utterances are
    read from args.rspecifier (optionally restricted by args.segments) and
    the evaluated features are written to args.wspecifier keyed by
    utterance id.
    """
    parser = get_parser()
    args = parser.parse_args()

    config = {}
    config['sample_rate'] = int(args.sample_rate)
    config['upper_frequency_limit'] = float(args.upper_frequency_limit)
    config['lower_frequency_limit'] = float(args.lower_frequency_limit)
    config['filterbank_channel_count'] = float(args.filterbank_channel_count)
    config['window_length'] = args.window_length
    config['frame_length'] = args.frame_length
    config['output_type'] = args.output_type
    config['window_type'] = args.window_type
    config['snip_edges'] = args.snip_edges
    config['preeph_coeff'] = args.preeph_coeff
    config['remove_dc_offset'] = args.remove_dc_offset
    config['is_fbank'] = args.is_fbank
    config['cepstral_lifter'] = args.cepstral_lifter
    config['coefficient_count'] = args.coefficient_count

    mfcc = Mfcc.params(config).instantiate()

    # A single session is shared by all utterances and closed on exit,
    # instead of opening a new, never-closed session per utterance.
    with kaldiio.ReadHelper(args.rspecifier,
                            segments=args.segments) as reader, \
          KaldiWriter(args.wspecifier, write_num_frames=args.write_num_frames,
                      compress=args.compress, compression_method=args.compression_method) as writer, \
          tf.Session() as sess:
        for utt_id, (sample_rate, array) in reader:
            # The rate reported by the reader takes precedence over the
            # command-line value.
            if sample_rate != args.sample_rate:
                args.sample_rate = sample_rate
            array = array.astype(np.float32)
            audio_data = tf.constant(array, dtype=tf.float32)
            mfcc_test = tf.squeeze(mfcc(audio_data, args.sample_rate))
            mfcc_feats = mfcc_test.eval(session=sess)
            writer[utt_id] = mfcc_feats
示例#30
0
def compute_spectrum():
    """Compute spectrum features for every utterance and write them out.

    Command-line arguments are read through get_parser(). Utterances are
    read from args.rspecifier (optionally restricted by args.segments) and
    the evaluated features are written to args.wspecifier keyed by
    utterance id.
    """
    parser = get_parser()
    args = parser.parse_args()

    config = {}
    config['sample_rate'] = float(args.sample_rate)
    config['output_type'] = int(args.output_type)
    config['window_length'] = args.window_length
    config['frame_length'] = args.frame_length

    spectrum = Spectrum.params(config).instantiate()

    # A single session is shared by all utterances and closed on exit,
    # instead of opening a new, never-closed session per utterance.
    with kaldiio.ReadHelper(args.rspecifier,
                            segments=args.segments) as reader, \
          KaldiWriter(args.wspecifier, write_num_frames=args.write_num_frames,
                      compress=args.compress, compression_method=args.compression_method) as writer, \
          tf.compat.v1.Session() as sess:
        for utt_id, (sample_rate, array) in reader:
            # The rate reported by the reader takes precedence over the
            # command-line value.
            if sample_rate != args.sample_rate:
                args.sample_rate = sample_rate
            array = array.astype(np.float32)
            audio_data = tf.constant(array, dtype=tf.float32)
            spectrum_test = spectrum(audio_data, args.sample_rate)
            spectrum_feats = spectrum_test.eval(session=sess)
            writer[utt_id] = spectrum_feats