def call(self, inputs, training=None, mask=None):  # pylint: disable=too-many-locals
  input_left = inputs["input_x_left"]
  input_right = inputs["input_x_right"]
  embedding = self.embed
  embed_left = embedding(input_left)
  embed_right = embedding(input_right)
  encoded_left = self.lstm_left(embed_left)
  encoded_right = self.lstm_right(embed_right)
  encoded_right = tf.transpose(encoded_right, [0, 2, 1])
  left_right_sim = tf.matmul(encoded_left, encoded_right)
  shape_list = left_right_sim.get_shape()
  newdim = shape_list[1] * shape_list[2]
  sim_matrix = tf.reshape(left_right_sim, [-1, newdim], name="sim_matrix")
  dropout = self.dropout(sim_matrix)
  out = self.outlayer(dropout)
  scores = self.final_dense(out)
  return scores
def call(self, audio_data, sample_rate=None):
  """
  Calculate pitch features of audio data.
  :param audio_data: the audio signal from which to compute spectrum.
                     Should be a (1, N) tensor.
  :param sample_rate: [optional] the sample rate of the signal we are working
                      with; default is 16kHz.
  :return: A float tensor of size (1, num_frames) containing pitch features
           of every frame in speech.
  """
  p = self.config
  with tf.name_scope('pitch'):
    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=float)

    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=float))
    with tf.control_dependencies([assert_op]):
      pitch = py_x_ops.pitch(
          audio_data,
          sample_rate,
          window_length=p.window_length,
          frame_length=p.frame_length,
          thres_autoc=p.thres_autoc)

      pitch = tf.squeeze(pitch)
      pitch = tf.transpose(pitch[None, :])
      return pitch
def _dpool_index(one_length_left, one_length_right, fixed_length_left,
                 fixed_length_right):
  logging.info("fixed_length_left: {}".format(fixed_length_left))
  logging.info("fixed_length_right: {}".format(fixed_length_right))
  if one_length_left == 0:
    stride_left = fixed_length_left
  else:
    stride_left = 1.0 * fixed_length_left / tf.cast(
        one_length_left, dtype=tf.float32)

  if one_length_right == 0:
    stride_right = fixed_length_right
  else:
    stride_right = 1.0 * fixed_length_right / tf.cast(
        one_length_right, dtype=tf.float32)

  one_idx_left = [
      tf.cast(i / stride_left, dtype=tf.int32)
      for i in range(fixed_length_left)
  ]
  one_idx_right = [
      tf.cast(i / stride_right, dtype=tf.int32)
      for i in range(fixed_length_right)
  ]
  mesh1, mesh2 = tf.meshgrid(one_idx_left, one_idx_right)
  index_one = tf.transpose(tf.stack([mesh1, mesh2]), (2, 1, 0))
  return index_one
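# A minimal sketch (not part of the original code) of how the (fixed_length_left,
# fixed_length_right, 2) index built by _dpool_index is typically consumed:
# gather a variable-sized similarity map into a fixed-size grid with
# tf.gather_nd before pooling. Assumes TensorFlow 1.x, python-int lengths, and
# a hypothetical per-pair similarity map `sim` of shape (rows, cols, channels)
# whose rows/cols cover the true lengths.
def _example_dpool_gather(sim, one_length_left, one_length_right,
                          fixed_length_left, fixed_length_right):
  index_one = _dpool_index(one_length_left, one_length_right,
                           fixed_length_left, fixed_length_right)
  # index_one[i, j] = (row, col) to read from `sim`; the gathered result always
  # has shape (fixed_length_left, fixed_length_right, channels).
  return tf.gather_nd(sim, index_one)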
def splice(feat, left_context, right_context):
  ''' splice frame with context
    param: feat, tf.float32, [batch, time, feat]
    return: feat, tf.float32, [batch, time, feat*(left_context + 1 + right_context)]
    reference:
      https://github.com/kaldi-asr/kaldi/src/feat/feature-functions.cc#L205:6
  '''

  def _loop_continue(time, end_time, context, unused_left_context,
                     right_context, unused_output_tas):
    del unused_output_tas
    del unused_left_context
    return time < end_time

  def _loop_body(time, end_time, context, left_context, right_context,
                 output_tas):
    shape = tf.shape(context)
    B, _, D = shape[0], shape[1], shape[2]
    N = (1 + left_context + right_context) * D

    new_feat = context[:, time:time + left_context + 1 + right_context, :]
    new_feat = tf.reshape(new_feat, [B, N])
    new_output_tas = output_tas.write(time, new_feat)
    return (time + 1, end_time, context, left_context, right_context,
            new_output_tas)

  with tf.control_dependencies([
      tf.assert_greater_equal(left_context, 0),
      tf.assert_greater_equal(right_context, 0)
  ]):
    T = tf.shape(feat)[1]
    output_tas = _new_tensor_array('splice_feat_ta', T, dtype=tf.float32)
    time = tf.constant(0, tf.int32)
    first = tf.tile(feat[:, 0:1, :], [1, left_context, 1])
    last = tf.tile(feat[:, -1:, :], [1, right_context, 1])
    context = tf.concat([first, feat], axis=1)
    context = tf.concat([context, last], axis=1)

    loop_vars = (time, T, context, left_context, right_context, output_tas)
    parallel_iterations = 10
    shape_invariants = tf.nest.map_structure(
        lambda t: tf.TensorShape(None), loop_vars)

    (time, end_time, context, left_context, right_context,
     output_tas) = tf.while_loop(
         _loop_continue,
         _loop_body,
         loop_vars=loop_vars,
         shape_invariants=shape_invariants,
         parallel_iterations=parallel_iterations,
         swap_memory=False)

    del context
    del left_context
    del right_context

    batch_spliced_feats = output_tas.stack()
    batch_spliced_feats = tf.transpose(batch_spliced_feats, [1, 0, 2])
  return batch_spliced_feats
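# A minimal usage sketch (not from the original code), assuming TensorFlow 1.x
# and the _new_tensor_array helper defined elsewhere in this module. With
# left_context=2 and right_context=2, each frame is concatenated with its two
# left and two right neighbours, so the feature dimension grows by a factor of 5.
def _example_splice_usage():
  feat = tf.random.uniform([4, 100, 40], dtype=tf.float32)  # (B, T, D)
  spliced = splice(feat, left_context=2, right_context=2)
  # spliced: (4, 100, 40 * (2 + 1 + 2)) = (4, 100, 200)
  return spliced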
def attention(inputs, attention_size, time_major=False, return_alphas=False):
  """Attention layer."""
  if isinstance(inputs, tuple):
    # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
    inputs = tf.concat(inputs, 2)

  if time_major:
    # (T,B,D) => (B,T,D)
    inputs = tf.transpose(inputs, [1, 0, 2])

  time_size = inputs.shape[1].value  # T value - time size of the RNN layer
  hidden_size = inputs.shape[2].value  # D value - hidden size of the RNN layer

  # Trainable parameters
  W_omega = tf.get_variable(
      name='W_omega',
      initializer=tf.random_normal([hidden_size, attention_size], stddev=0.1))
  b_omega = tf.get_variable(
      name='b_omega',
      initializer=tf.random_normal([attention_size], stddev=0.1))
  u_omega = tf.get_variable(
      name='u_omega',
      initializer=tf.random_normal([attention_size, 1], stddev=0.1))

  # Applying fully connected layer with non-linear activation to each of the B*T timestamps;
  # the shape of `v` is (B,T,D)*(D,A)=(B,T,A), where A=attention_size
  #v = tf.tanh(tf.tensordot(inputs, W_omega, axes=1) + b_omega)
  #v = tf.sigmoid(tf.tensordot(inputs, W_omega, axes=1) + b_omega)
  # (B, T, D) dot (D, Atten)
  logging.info('attention inputs: {}'.format(inputs.shape))
  inputs_reshaped = tf.reshape(inputs, [-1, hidden_size])
  dot = tf.matmul(inputs_reshaped, W_omega)
  dot = tf.reshape(dot, [-1, time_size, attention_size])
  v = tf.sigmoid(dot + b_omega)
  logging.info(f'attention vector: {v.shape}')

  # For each of the timestamps its vector of size A from `v` is reduced with `u` vector
  # (B, T, Atten) dot (Atten)
  #vu = tf.tensordot(v, u_omega, axes=1)   # (B,T) shape
  v = tf.reshape(v, [-1, attention_size])
  vu = tf.matmul(v, u_omega)  # (B,T) shape
  vu = tf.squeeze(vu, axis=-1)
  vu = tf.reshape(vu, [-1, time_size])
  logging.info(f'attention energy: {vu.shape}')
  alphas = tf.nn.softmax(vu)  # (B,T) shape also

  # Output of (Bi-)RNN is reduced with attention vector; the result has (B,D) shape
  # [batch, time] -> [batch, time, 1]
  alphas = tf.expand_dims(alphas, -1)
  # [batch, time, dim] -> [batch, dim]
  output = tf.reduce_sum(inputs * alphas, 1)

  if not return_alphas:
    return output

  return output, alphas
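# A minimal usage sketch (not from the original code), assuming TensorFlow 1.x
# graph mode. The layer needs a statically known time and hidden dimension
# because it reads inputs.shape[1].value and inputs.shape[2].value.
def _example_attention_usage():
  with tf.variable_scope('attention_example'):
    rnn_outputs = tf.placeholder(tf.float32, [None, 50, 128])  # (B, T, D)
    pooled, alphas = attention(
        rnn_outputs, attention_size=64, return_alphas=True)
    # pooled: (B, 128) weighted sum over time; alphas: (B, 50, 1) weights.
    return pooled, alphas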
def split_heads(self, x, batch_size):
  """
  Split hidden_size into depth (hidden_size // num_heads) for multi-head attention.
  Args:
    x: (batch_size, seq_len_x, hidden_size)
    batch_size
  Returns:
    split_x: (batch_size, num_heads, seq_len_x, depth)
  """
  x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
  split_x = tf.transpose(x, perm=[0, 2, 1, 3])
  return split_x
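# A minimal shape check (not from the original code), assuming a hypothetical
# attention layer instance `mha_layer` with num_heads=8 and depth=64,
# i.e. hidden_size=512. It only illustrates the reshape/transpose above.
def _example_split_heads(mha_layer):
  x = tf.zeros([2, 10, 512])  # (batch_size, seq_len, hidden_size)
  split_x = mha_layer.split_heads(x, batch_size=2)
  # split_x: (2, 8, 10, 64) -> one (seq_len, depth) slice per head.
  return split_x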
def call(self, inputs, training=None, mask=None):
  """
  The implementation of multi-headed attention.
  Args:
    inputs = (v, k, q)
    q: (batch_size, seq_len_q, hidden_size)
    k: (batch_size, seq_len_k, hidden_size)
    v: (batch_size, seq_len_v, hidden_size)
    mask: (batch_size, seq_len_q, seq_len_k)
  Returns:
    output: (batch_size, seq_len_q, hidden_size)
    attention_weights: (batch_size, num_heads, seq_len_q, seq_len_k)
  """
  q, k, v = inputs
  batch_size = tf.shape(q)[0]

  q = self.wq(q)  # (batch_size, seq_len_q, hidden_size)
  k = self.wk(k)  # (batch_size, seq_len_k, hidden_size)
  v = self.wv(v)  # (batch_size, seq_len_v, hidden_size)

  q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
  k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
  v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

  # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
  # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
  scaled_attention, attention_weights = self.scaled_dot_product_attention(
      q, k, v, mask)

  scaled_attention = tf.transpose(
      scaled_attention,
      perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)
  concat_attention = tf.reshape(
      scaled_attention,
      (batch_size, -1, self.hidden_size))  # (batch_size, seq_len_q, hidden_size)
  output = self.dense(concat_attention)  # (batch_size, seq_len_q, hidden_size)
  return output, attention_weights
def splice_layer(x, name, context):
  ''' Splice a tensor along the last dimension with context.
  e.g.:
    t = [[[1, 2, 3],
          [4, 5, 6],
          [7, 8, 9]]]
    splice_tensor(t, [0, 1]) =
      [[[1, 2, 3, 4, 5, 6],
        [4, 5, 6, 7, 8, 9],
        [7, 8, 9, 7, 8, 9]]]

  Args:
    tensor: a tf.Tensor with shape (B, T, D) a.k.a. (N, H, W)
    context: a list of context offsets

  Returns:
    spliced tensor with shape (..., D * len(context))
  '''
  with tf.variable_scope(name):
    input_shape = tf.shape(x)
    B, T = input_shape[0], input_shape[1]
    context_len = len(context)
    array = tf.TensorArray(x.dtype, size=context_len)
    for idx, offset in enumerate(context):
      begin = offset
      end = T + offset
      if begin < 0:
        begin = 0
        sliced = x[:, begin:end, :]
        tiled = tf.tile(x[:, 0:1, :], [1, abs(offset), 1])
        final = tf.concat((tiled, sliced), axis=1)
      else:
        end = T
        sliced = x[:, begin:end, :]
        tiled = tf.tile(x[:, -1:, :], [1, abs(offset), 1])
        final = tf.concat((sliced, tiled), axis=1)
      array = array.write(idx, final)
    spliced = array.stack()
    spliced = tf.transpose(spliced, (1, 2, 0, 3))
    spliced = tf.reshape(spliced, (B, T, -1))
  return spliced
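# A minimal usage sketch reproducing the docstring example above (assuming
# TensorFlow 1.x graph mode): context [0, 1] concatenates each frame with the
# next one, repeating the last frame at the right boundary.
def _example_splice_layer_usage():
  t = tf.constant([[[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]]])  # (1, 3, 3)
  spliced = splice_layer(t, 'splice_example', [0, 1])
  # spliced == [[[1, 2, 3, 4, 5, 6],
  #              [4, 5, 6, 7, 8, 9],
  #              [7, 8, 9, 7, 8, 9]]]   -> shape (1, 3, 6)
  return spliced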
def pooling_layer(self, x, time_len):
  ''' pooling layer'''
  with tf.variable_scope('time_pooling'):
    if self.attention:
      x, self.alphas = common_layers.attention(
          x, self.netconf['attention_size'], return_alphas=True)
      #alphas shape [batch, time, 1] -> [1, batch, time, 1]-> [1, time, batch, 1]
      tf.summary.image(
          'alignment',
          tf.transpose(tf.expand_dims(self.alphas, 0), [0, 2, 1, 3]))
    else:
      if self.netconf['use_lstm_layer']:
        x = tf.concat(x, 2)
      # [batch, seq_len, dim, 1]
      x = tf.expand_dims(x, axis=-1)
      seq_len = time_len
      x = common_layers.max_pool(x, ksize=[seq_len, 1], strides=[seq_len, 1])
      if self.netconf['use_lstm_layer']:
        x = tf.reshape(x, [-1, 2 * self.netconf['cell_num']])
      else:
        x = tf.reshape(x, [-1, self.netconf['linear_num']])
    return x
def call(self, inputs, training=None, mask=None):
  query, key, value = self._unpack(inputs)

  query_mask, key_mask, _ = self._unpack(mask)

  batch_size = tf.shape(query)[0]
  dimension_query = query.get_shape().as_list()[-1]
  seq_len = tf.shape(query)[-2]
  key_len = tf.shape(key)[-2]
  feature_dim = tf.shape(value)[-1]

  query = tf.matmul(
      query,
      tf.tile(tf.expand_dims(self.kernel_query, 0), [batch_size, 1, 1]))
  key = tf.matmul(
      key, tf.tile(tf.expand_dims(self.kernel_key, 0), [batch_size, 1, 1]))
  value = tf.matmul(
      value, tf.tile(tf.expand_dims(self.kernel_value, 0), [batch_size, 1, 1]))
  if self.use_bias:
    query += self.b_query
    key += self.b_key
    value += self.b_value

  def _reshape_multihead(origin_input):
    """
    reshape for multi head
      Input shape: (Batch size, steps, features)
      Output shape: (Batch size * head num, steps, features // head num)
    """
    return tf.concat(tf.split(origin_input, self.head_num, axis=2), axis=0)

  def _reshape_mask(mask):
    """
    repeat mask for multi head
      Input shape: (Batch size, steps)
      Output shape: (Batch size * head num, steps)
    """
    if mask is None:
      return None
    seq_len = tf.shape(mask)[1]
    mask = tf.expand_dims(mask, axis=1)
    mask = tf.tile(mask, [1, self.head_num, 1])
    return tf.reshape(mask, shape=(-1, seq_len))

  query_ = _reshape_multihead(query)
  key_ = _reshape_multihead(key)
  value_ = _reshape_multihead(value)

  key_mask = _reshape_mask(key_mask)

  # (Batch size * head num, query steps, key steps)
  similaritys = tf.matmul(query_, tf.transpose(key_, [0, 2, 1]))
  # scale
  similaritys /= tf.sqrt(tf.cast(dimension_query, tf.float32))

  if self.sequence_mask:
    ones = tf.ones((seq_len, key_len))
    similaritys -= (ones - tf.matrix_band_part(ones, -1, 0)) * 1e9
  if key_mask is not None:
    similaritys -= (1.0 - tf.cast(tf.expand_dims(key_mask, axis=-2),
                                  tf.float32)) * 1e9

  attention_weights = tf.keras.activations.softmax(similaritys)
  attention_outputs = tf.matmul(attention_weights, value_)
  attention_outputs = tf.reshape(
      attention_outputs,
      (-1, self.head_num, seq_len, feature_dim // self.head_num))
  attention_outputs = tf.transpose(attention_outputs, [0, 2, 1, 3])
  attention_outputs = tf.reshape(attention_outputs,
                                 (-1, seq_len, feature_dim))

  attention_outputs = tf.matmul(
      attention_outputs,
      tf.tile(tf.expand_dims(self.kernel_project, 0), [batch_size, 1, 1]))
  if self.use_bias:
    attention_outputs += self.b_project
  if self.activation is not None:
    attention_outputs = self.activation(attention_outputs)

  if query_mask is not None:
    attention_outputs *= tf.cast(tf.expand_dims(query_mask, axis=-1),
                                 tf.float32)

  return attention_outputs
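# A small standalone illustration (not from the original code) of the
# sequence-mask trick used above: subtract a large constant from the upper
# triangle of the score matrix so that softmax assigns (near-)zero weight to
# future positions. Assumes TensorFlow 1.x; the all-zero scores are only a
# stand-in for real query-key similarities.
def _example_sequence_mask(seq_len=4):
  ones = tf.ones((seq_len, seq_len))
  causal = tf.matrix_band_part(ones, -1, 0)   # lower-triangular ones
  scores = tf.zeros((seq_len, seq_len))
  masked = scores - (ones - causal) * 1e9
  return tf.nn.softmax(masked)  # row i attends only to positions <= i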