def decode(inputs, memory, is_training=True, scope='decoder_layers', reuse=None):
    """Build the decoder stack and project it to ``len(char2idx)`` units.

    The inputs pass through ``prenet``, then ``attention_decoder`` (which
    also receives ``memory``), then two GRU layers whose outputs are added
    back onto their own inputs, and finally a dense layer.

    Args:
      inputs: Decoder input tensor, handed to ``prenet``.
      memory: Encoder output, handed to ``attention_decoder``.
      is_training: Forwarded to ``prenet``.
      scope: Name of the enclosing ``tf.variable_scope``.
      reuse: Forwarded to ``tf.variable_scope``.

    Returns:
      The output of the final dense layer; its last dimension is
      ``len(char2idx)``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        hidden = prenet(inputs, is_training=is_training)
        hidden = attention_decoder(hidden, memory, embed_size)

        # Two stacked GRU layers, each with a residual connection.
        for gru_scope in ('gru1', 'gru2'):
            hidden += gru(hidden, embed_size, False, scope=gru_scope)

        projected = tf.layers.dense(hidden, len(char2idx))
    return projected
def embedding(self, x, is_training=False):
    """Map a batch of frame sequences to one fixed-size vector per sequence.

    The frames go through a dense layer, a conv1d bank, max pooling, two
    conv1d projections with a residual connection, ``self.num_highway``
    highway layers and a GRU. The GRU output at the last time step is
    then projected to ``self.num_classes`` units.

    :param x: shape=(n, t, n_mels)
    :param is_training: forwarded to ``conv1d_banks`` and ``normalize``
    :return: embedding. shape=(n, e), with e = ``self.num_classes``
    """
    # frame-level embedding
    x = tf.layers.dense(x, units=self.hidden_units, activation=tf.nn.relu)  # (n, t, h)

    out = conv1d_banks(x, K=self.num_banks, num_units=self.hidden_units,
                       norm_type=self.norm_type,
                       is_training=is_training)  # (n, t, k * h)
    out = tf.layers.max_pooling1d(out, 2, 1, padding="same")  # (n, t, k * h)

    out = conv1d(out, self.hidden_units, 3, scope="conv1d_1")  # (n, t, h)
    out = normalize(out, type=self.norm_type, is_training=is_training,
                    activation_fn=tf.nn.relu)
    out = conv1d(out, self.hidden_units, 3, scope="conv1d_2")  # (n, t, h)
    out += x  # (n, t, h) # residual connections

    for i in range(self.num_highway):
        out = highwaynet(out, num_units=self.hidden_units,
                         scope='highwaynet_{}'.format(i))  # (n, t, h)

    out = gru(out, self.hidden_units, False)  # (n, t, h)

    # take the last output
    # Index the time axis explicitly: `out[..., -1]` would instead pick the
    # last feature channel and produce (n, t) rather than (n, h).
    out = out[:, -1, :]  # (n, h)

    # embedding
    out = tf.layers.dense(out, self.num_classes, name='projection')  # (n, e)
    out = tf.identity(out, name="embedding")
    return out
def decode2(inputs, is_training=True, scope="decoder2", reuse=None):
    '''Second-stage decoder: prenet, CBHG-style stack, then a dense projection.

    Args:
      inputs: A 3d float32 tensor. The original documentation gives its
        shape as [N, T', C'] with C' = hp.n_mels*hp.r -- TODO confirm
        against the caller.
      is_training: Whether or not the layers are in training mode.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      Output of the final dense layer; its last dimension is
      (1 + hp.n_fft // 2) * hp.r.
    '''
    half_embed = hp.embed_size // 2
    out_dim = (1 + hp.n_fft // 2) * hp.r

    with tf.variable_scope(scope, reuse=reuse):
        # Decoder pre-net; its output is also the residual branch below.
        prenet_out = mod.prenet(inputs, is_training=is_training)

        # Conv1D bank followed by stride-1 max pooling.
        net = mod.conv1d_banks(prenet_out, K=hp.decoder_num_banks,
                               is_training=is_training)
        net = tf.layers.max_pooling1d(net, 2, 1, padding="same")

        # Conv1D projections: hp.embed_size channels, then half of that.
        net = mod.conv1d(net, hp.embed_size, 3, scope="conv1d_1")
        net = mod.normalize(net, type=hp.norm_type, is_training=is_training,
                            activation_fn=tf.nn.relu, scope="norm1")
        net = mod.conv1d(net, half_embed, 3, scope="conv1d_2")
        net = mod.normalize(net, type=hp.norm_type, is_training=is_training,
                            activation_fn=None, scope="norm2")

        # Residual connection back to the pre-net output.
        net += prenet_out

        # Four highway layers.
        for i in range(4):
            highway_scope = 'highwaynet_{}'.format(i)
            net = mod.highwaynet(net, num_units=half_embed, scope=highway_scope)

        # GRU with the third positional argument set to True (labelled
        # "Bidirectional" by the original comments).
        net = mod.gru(net, half_embed, True)

        outputs = tf.layers.dense(net, out_dim)
    return outputs
def decode1(decoder_inputs, memory, is_training=True, scope="decoder1", reuse=None):
    '''First-stage decoder: prenet, attention RNN, two residual GRUs, dense.

    Args:
      decoder_inputs: A 3d float32 tensor. The original documentation gives
        its shape as [N, T', C'] with C' = hp.n_mels*hp.r -- TODO confirm
        against the caller. Its static last dimension must be known,
        because it sets the output width.
      memory: Encoder output handed to `mod.attention_decoder`.
      is_training: Whether or not the layers are in training mode.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      Output of the final dense layer; its last dimension equals the last
      dimension of `decoder_inputs`.
    '''
    # The projection mirrors the static feature width of the inputs.
    out_dim = decoder_inputs.get_shape().as_list()[-1]

    with tf.variable_scope(scope, reuse=reuse):
        # Decoder pre-net.
        hidden = mod.prenet(decoder_inputs, is_training=is_training)

        # Attention RNN over the encoder memory.
        hidden = mod.attention_decoder(hidden, memory, num_units=hp.embed_size)

        # Two decoder GRUs, each added back onto its own input.
        for gru_scope in ("decoder_gru1", "decoder_gru2"):
            hidden += mod.gru(hidden, hp.embed_size, False, scope=gru_scope)

        outputs = tf.layers.dense(hidden, out_dim)
    return outputs
def encode(inputs, is_training=True, scope='encoder', reuse=None):
    """Build the encoder: prenet followed by a CBHG-style stack.

    Args:
      inputs: Input tensor handed to ``prenet``.
      is_training: Forwarded to ``prenet`` and ``conv1d_banks``.
      scope: Name of the enclosing ``tf.variable_scope``.
      reuse: Forwarded to ``tf.variable_scope``.

    Returns:
      The output of the final GRU (the encoder "memory").
    """
    half_embed = embed_size // 2

    with tf.variable_scope(scope, reuse=reuse):
        prenet_out = prenet(inputs, scope='prenet', is_training=is_training)

        # Conv1D bank followed by stride-1 max pooling.
        hidden = conv1d_banks(prenet_out, K=encoder_num_banks,
                              is_training=is_training)
        hidden = tf.layers.max_pooling1d(hidden, 2, 1, padding='same')

        # Two conv projections, each followed by normalize_in with ReLU.
        for conv_scope in ('conv1d_1', 'conv1d_2'):
            hidden = conv1d(hidden, half_embed, 3, scope=conv_scope)
            hidden = normalize_in(hidden, activation_fn=tf.nn.relu)

        # Residual connection back to the pre-net output.
        hidden += prenet_out

        for i in range(num_highway_blocks):
            hidden = highwaynet(hidden, units=half_embed,
                                scope='highwaynet_%d' % (i))

        memory = gru(hidden, half_embed, True)
    return memory
def init_inference(self, config, is_training=False):
    """Build the inference graph and store its endpoints on ``self``.

    Reads ``num_banks``, ``hidden_units``, ``num_highway``, ``norm_type``,
    ``batch_size``, ``num_rnn_layer``, ``input_dim`` and ``alphabet_size``
    from ``config``.

    Sets ``_input_dim``, ``_output_dim``, the placeholders ``_inputs`` and
    ``_seq_lens`` (also aliased as ``_out_lens``), the non-trainable
    variables ``_mean`` and ``_std``, the RNN handles ``_initial_state`` and
    ``_rnn_state``, the outputs ``_logits`` and ``_probas`` (both reshaped to
    (-1, batch_size, output_dim)), and finally ``_init_inference = True``.

    :param config: mapping holding the keys listed above
    :param is_training: forwarded to ``conv1d_banks``, ``normalize``, ``gru``
    """
    num_banks = config['num_banks']
    hidden_units = config['hidden_units']
    num_highway = config['num_highway']
    norm_type = config['norm_type']
    batch_size = config['batch_size']
    num_rnn_layer = config['num_rnn_layer']

    self._input_dim = input_dim = config['input_dim']
    self._output_dim = output_dim = config['alphabet_size']

    self._inputs = tf.placeholder(tf.float32, [batch_size, None, input_dim])
    self._seq_lens = tf.placeholder(tf.int32, shape=batch_size)
    self._out_lens = self._seq_lens

    # TODO, awni, for now on the client to remember to initialize these.
    self._mean = tf.get_variable("mean", shape=input_dim, trainable=False)
    self._std = tf.get_variable("std", shape=input_dim, trainable=False)

    std_inputs = (self._inputs - self._mean) / self._std

    # Feed the standardized features into the network. Previously the raw
    # placeholder was used here, leaving `std_inputs` (and therefore the
    # mean/std variables) without any effect on the output.
    x = conv1d(std_inputs, hidden_units, 1, scope="conv1d")

    out = conv1d_banks(x, K=num_banks, num_units=hidden_units,
                       norm_type=norm_type,
                       is_training=is_training)  # (n, t, k * h)
    out = tf.layers.max_pooling1d(out, 2, 1, padding="same")  # (n, t, k * h)

    out = conv1d(out, hidden_units, 3, scope="conv1d_1")  # (n, t, h)
    out = normalize(out, type=norm_type, is_training=is_training,
                    activation_fn=tf.nn.relu)
    out = conv1d(out, hidden_units, 3, scope="conv1d_2")  # (n, t, h)
    out += x  # (n, t, h) # residual connections

    for i in range(num_highway):
        out = highwaynet(out, num_units=hidden_units,
                         scope='highwaynet_{}'.format(i))  # (n, t, h)

    rnn_out, state, initial_state = gru(
        out, hidden_units, False,
        seqlens=self._seq_lens,
        num_layers=num_rnn_layer,
        is_training=is_training)  # (n, t, h)

    self._initial_state = initial_state
    self._rnn_state = state

    # Swap the first two axes (batch-major -> time-major).
    rnn_out = tf.transpose(rnn_out, [1, 0, 2])

    # Collapse time and batch dims pre softmax.
    rnn_out = tf.reshape(rnn_out, (-1, hidden_units))
    logits, probas = _add_softmax_linear(
        rnn_out, hidden_units, output_dim,
        initializer=tf.contrib.layers.xavier_initializer())

    # Reshape to time-major.
    self._logits = tf.reshape(logits, (-1, batch_size, output_dim))
    self._probas = tf.reshape(probas, (-1, batch_size, output_dim))

    self._init_inference = True
def encode(inputs, is_training=True, scope="encoder", reuse=None):
    '''Encoder: spectrogram front end, prenet, then a CBHG-style stack.

    Args:
      inputs: Tensor handed to `mod.pre_spectro`. NOTE(review): the earlier
        docstring described a 2d int32 tensor of character ids, but the
        code feeds `inputs` to `pre_spectro` -- confirm the expected
        shape and dtype with the caller.
      is_training: Whether or not the layers are in training mode.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      The output of the final GRU (the encoder "memory").
    '''
    half_embed = hp.embed_size // 2

    with tf.variable_scope(scope, reuse=reuse):
        # Front end applied before the pre-net.
        features = mod.pre_spectro(inputs, is_training=is_training)

        # Encoder pre-net; its output is also the residual branch below.
        prenet_out = mod.prenet(features, is_training=is_training)

        # Conv1D bank followed by stride-1 max pooling.
        enc = mod.conv1d_banks(prenet_out, K=hp.encoder_num_banks,
                               is_training=is_training)
        enc = tf.layers.max_pooling1d(enc, 2, 1, padding="same")

        # Conv1D projections, both hp.embed_size // 2 channels wide.
        enc = mod.conv1d(enc, half_embed, 3, scope="conv1d_1")
        enc = mod.normalize(enc, type=hp.norm_type, is_training=is_training,
                            activation_fn=tf.nn.relu, scope="norm1")
        enc = mod.conv1d(enc, half_embed, 3, scope="conv1d_2")
        enc = mod.normalize(enc, type=hp.norm_type, is_training=is_training,
                            activation_fn=None, scope="norm2")

        # Residual connection back to the pre-net output.
        enc += prenet_out

        # Highway layers.
        for i in range(hp.num_highwaynet_blocks):
            highway_scope = 'highwaynet_{}'.format(i)
            enc = mod.highwaynet(enc, num_units=half_embed, scope=highway_scope)

        # NOTE(review): the third positional argument is False here, while
        # the earlier comments called this GRU "Bidirectional" with an
        # (N, T, E) output -- confirm which is intended.
        memory = mod.gru(enc, half_embed, False)
    return memory