Example #1
def extract_feature(waveforms, params):
    '''Extract log-fbank features with delta-deltas and apply CMVN.

    waveforms: [batch, samples]
    '''
    p = params
    with tf.variable_scope('feature_extractor'):
        mel_fbanks = extract_logfbank_with_delta(waveforms, params)
        # shape: [batch, nframes, nbins, nchannels]
        fbank_size = utils.shape_list(mel_fbanks)
        # assert fbank_size[0] == 1

        # Per-utterance CMVN: estimate mean and variance over the time axis
        # when no precomputed global statistics are provided.
        if not p.audio_global_cmvn:
            mean = tf.reduce_mean(mel_fbanks, keepdims=True, axis=1)
            variance = tf.reduce_mean(tf.square(mel_fbanks - mean),
                                      keepdims=True,
                                      axis=1)
        else:
            assert p.audio_cmvn_path, p.audio_cmvn_path
            mean, variance = utils.load_cmvn(p.audio_cmvn_path)

        var_epsilon = 1e-09
        mel_fbanks = utils.apply_cmvn(mel_fbanks, mean, variance, var_epsilon)

        # Append num_zeropad_frames all-zero frames along the time axis so
        # downstream models get extra trailing context.
        batch_size = fbank_size[0]
        feats = tf.concat([
            tf.reshape(
                mel_fbanks,
                [batch_size, fbank_size[1], fbank_size[2], fbank_size[3]]),
            tf.zeros((batch_size, p.num_zeropad_frames, fbank_size[2],
                      fbank_size[3]))
        ], 1)
    return feats  # shape: [batch_size, nframes, feature_size, channels]
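For reference, the per-utterance CMVN branch above boils down to the following self-contained computation. This is a minimal sketch in plain TensorFlow (TF2, eager mode) standing in for the repo's utils.apply_cmvn helper, whose exact signature is not shown in this example:

import numpy as np
import tensorflow as tf

# Fake log-fbank batch: [batch, nframes, nbins, nchannels].
mel_fbanks = tf.constant(np.random.randn(2, 100, 40, 3), dtype=tf.float32)

# Per-utterance CMVN over the time axis (axis=1), mirroring the branch above.
mean = tf.reduce_mean(mel_fbanks, keepdims=True, axis=1)
variance = tf.reduce_mean(tf.square(mel_fbanks - mean), keepdims=True, axis=1)
normalized = (mel_fbanks - mean) * tf.math.rsqrt(variance + 1e-9)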
Example #2
    def se_module(self, x, channels, reduction, name=''):
        '''Squeeze-and-Excitation block: global average pooling, a two-layer
        1x1-conv bottleneck, then sigmoid gating of the input channels.'''
        input_t = x
        x = tf.reduce_mean(x, [1, 2], name=name + '_avg', keepdims=True)
        x = tf.layers.conv2d(
            x,
            channels // reduction, (1, 1),
            use_bias=False,
            name=name + '_1x1_down',
            strides=(1, 1),
            padding='valid',
            data_format='channels_last',
            activation=None,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            bias_initializer=tf.zeros_initializer())  # inert: use_bias=False
        x = tf.nn.relu(x, name=name + '_1x1_down_relu')

        x = tf.layers.conv2d(
            x,
            channels, (1, 1),
            use_bias=False,
            name=name + '_1x1_up',
            strides=(1, 1),
            padding='valid',
            data_format='channels_last',
            activation=None,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            bias_initializer=tf.zeros_initializer())
        x = tf.nn.sigmoid(x, name=name + '_1x1_up_sigmoid')
        return tf.multiply(input_t, x, name=name + '_mul')
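A minimal usage sketch (assuming TF1.x graph mode, since tf.layers and tf.contrib are gone in TF2; net is a hypothetical instance of the enclosing class):

import tensorflow as tf

# NHWC feature map from a preceding conv block.
feature_map = tf.placeholder(tf.float32, shape=[None, 56, 56, 64])
# Channel-wise recalibration; reduction controls the bottleneck width.
recalibrated = net.se_module(feature_map, channels=64, reduction=16, name='block1_se')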
Example #3
def accuracy(logits, labels):
    '''Compute classification accuracy.

    params:
        logits: [B, ..., D]
        labels: [B, ...]
    return:
        scalar accuracy tensor
    '''
    with tf.name_scope('accuracy'):
        assert_rank = tf.assert_equal(tf.rank(logits), tf.rank(labels) + 1)
        assert_shape = tf.assert_equal(tf.shape(logits)[:-1], tf.shape(labels))
        with tf.control_dependencies([assert_rank, assert_shape]):
            predictions = tf.argmax(logits, axis=-1, output_type=tf.int64)
            labels = tf.cast(labels, tf.int64)
            return tf.reduce_mean(
                tf.cast(tf.equal(predictions, labels), dtype=tf.float32))
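A minimal usage sketch (assuming TF2 eager mode, where every op used above is still available):

import tensorflow as tf

logits = tf.constant([[2.0, 0.5], [0.1, 1.2], [3.0, 0.3]])  # [B=3, D=2]
labels = tf.constant([0, 1, 1])                              # [B=3]
acc = accuracy(logits, labels)  # predictions are [0, 1, 0] -> accuracy = 2/3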
Example #4
import tensorflow as tf
import tensorflow_addons as tfa


def crf_log_likelihood(tags_scores, labels, input_length, transitions):
    '''
    :param tags_scores: [batch_size, max_seq_len, num_tags]
    :param labels: [batch_size, max_seq_len]
    :param input_length: [batch_size,]
    :param transitions: [num_tags, num_tags]
    :return: loss, transition_params
    '''
    log_likelihood, transition_params = tfa.text.crf_log_likelihood(
        inputs=tags_scores,
        tag_indices=labels,
        sequence_lengths=input_length,
        transition_params=transitions)

    loss = tf.reduce_mean(-log_likelihood)

    return loss, transition_params
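A minimal usage sketch with random inputs, building on the imports and definition above (assuming TF2 with the tensorflow_addons package installed):

batch_size, max_seq_len, num_tags = 2, 5, 4
tags_scores = tf.random.normal([batch_size, max_seq_len, num_tags])
labels = tf.random.uniform(
    [batch_size, max_seq_len], maxval=num_tags, dtype=tf.int32)
input_length = tf.constant([5, 3])  # second sequence is padded past step 3
transitions = tf.random.normal([num_tags, num_tags])

loss, transition_params = crf_log_likelihood(
    tags_scores, labels, input_length, transitions)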
Example #5
    def get_loss(self):
        '''Dummy CTC loss: the real CTC loss is computed inside a Keras layer,
        so this just averages the layer's output.'''
        loss = {'ctc': lambda y_true, y_pred: tf.reduce_mean(y_pred)}
        return loss