def __init__(self, nuser, nloc, ntime, nquadkey, user_dim, loc_dim, time_dim, reg_dim,
             nhid, nhead_enc, nhead_dec, nlayers, dropout=0.5, **extra_config):
    super(QuadKeyLocPredictor, self).__init__()
    self.emb_user = embedding(nuser, user_dim, zeros_pad=True, scale=True)
    self.emb_loc = embedding(nloc, loc_dim, zeros_pad=True, scale=True)
    self.emb_reg = embedding(nquadkey, reg_dim, zeros_pad=True, scale=True)
    self.emb_time = embedding(ntime, time_dim, zeros_pad=True, scale=True)
    ninp = user_dim
    pos_encoding = extra_config.get("position_encoding", "transformer")
    if pos_encoding == "embedding":
        self.pos_encoder = PositionalEmbedding(loc_dim + reg_dim, dropout)
    elif pos_encoding == "transformer":
        self.pos_encoder = PositionalEncoding(loc_dim + reg_dim, dropout)
    self.enc_layer = TransformerEncoderLayer(loc_dim + reg_dim, nhead_enc,
                                             loc_dim + reg_dim, dropout)
    self.encoder = TransformerEncoder(self.enc_layer, nlayers)
    self.region_pos_encoder = PositionalEmbedding(reg_dim, dropout, max_len=20)
    self.region_enc_layer = TransformerEncoderLayer(reg_dim, 1, reg_dim, dropout=dropout)
    self.region_encoder = TransformerEncoder(self.region_enc_layer, 2)
    if not extra_config.get("use_location_only", False):
        if extra_config.get("embedding_fusion", "multiply") == "concat":
            if extra_config.get("user_embedding", False):
                self.lin = nn.Linear(user_dim + loc_dim + reg_dim + time_dim, ninp)
            else:
                self.lin = nn.Linear(loc_dim + reg_dim, ninp)
    ident_mat = torch.eye(ninp)
    self.register_buffer('ident_mat', ident_mat)
    self.layer_norm = nn.LayerNorm(ninp)
    self.extra_config = extra_config
    self.dropout = dropout
def __init__(self, nuser, nloc, ntime, nreg, user_dim, loc_dim, time_dim, reg_dim,
             nhid, nhead_enc, nhead_dec, nlayers, dropout=0.5, **extra_config):
    super(LocPredictor, self).__init__()
    self.emb_user = embedding(nuser, user_dim, zeros_pad=True, scale=True)
    self.emb_loc = embedding(nloc, loc_dim, zeros_pad=True, scale=True)
    self.emb_reg = embedding(nreg, reg_dim, zeros_pad=True, scale=True)
    self.emb_time = embedding(ntime, time_dim, zeros_pad=True, scale=True)
    if not ((user_dim == loc_dim) and (user_dim == time_dim) and (user_dim == reg_dim)):
        raise Exception('user, location, time and region must have the same embedding size')
    ninp = user_dim
    pos_encoding = extra_config.get("position_encoding", "transformer")
    if pos_encoding == "embedding":
        self.pos_encoder = PositionalEmbedding(ninp, dropout)
    elif pos_encoding == "transformer":
        self.pos_encoder = PositionalEncoding(ninp, dropout)
    self.enc_layer = TransformerEncoderLayer(ninp, nhead_enc, nhid, dropout)
    self.encoder = TransformerEncoder(self.enc_layer, nlayers)
    if not extra_config.get("use_location_only", False):
        if extra_config.get("embedding_fusion", "multiply") == "concat":
            if extra_config.get("user_embedding", False):
                self.lin = nn.Linear(user_dim + loc_dim + reg_dim + time_dim, ninp)
            else:
                self.lin = nn.Linear(loc_dim + reg_dim + time_dim, ninp)
    ident_mat = torch.eye(ninp)
    self.register_buffer('ident_mat', ident_mat)
    self.layer_norm = nn.LayerNorm(ninp)
    self.extra_config = extra_config
    self.dropout = dropout
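# The two constructors above call a PyTorch-side embedding() helper that is not
# shown here (distinct from the TensorFlow embedding(inputs, vocab_size=...) used
# in later snippets). A minimal sketch of what it plausibly does, assuming the
# usual conventions behind its flags: zeros_pad reserves index 0 as an all-zero
# padding row, and scale multiplies lookups by sqrt(dim) as in "Attention Is All
# You Need". The wrapper class is illustrative, not the repository's actual code.
import math

import torch
import torch.nn as nn


class _ScaledEmbedding(nn.Module):
    """Embedding lookup scaled by sqrt(dim)."""

    def __init__(self, num_embeddings, dim, padding_idx):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings, dim, padding_idx=padding_idx)
        self.scale = math.sqrt(dim)

    def forward(self, x):
        return self.emb(x) * self.scale


def embedding(num_embeddings, dim, zeros_pad=True, scale=True):
    padding_idx = 0 if zeros_pad else None
    if scale:
        return _ScaledEmbedding(num_embeddings, dim, padding_idx)
    return nn.Embedding(num_embeddings, dim, padding_idx=padding_idx)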
def build_embedding_layer(self, inputs, reuse=None):
    self.emb_char = embedding(inputs,
                              vocab_size=self.vocab_size,
                              num_units=self.hidden_units,
                              scale=True,
                              scope="emb_char",
                              reuse=reuse)
    self.emb_char_pos = self.emb_char
    if self.emb_pos_type == 'sin':
        self.emb_char_pos += positional_encoding(inputs,
                                                 num_units=self.hidden_units,
                                                 zero_pad=False,
                                                 scale=False,
                                                 scope="emb_pos",
                                                 reuse=reuse)
    else:
        self.emb_char_pos += embedding(tf.tile(
            tf.expand_dims(tf.range(tf.shape(inputs)[1]), 0),
            [tf.shape(inputs)[0], 1]),
                                       vocab_size=self.maxlen,
                                       num_units=self.hidden_units,
                                       zero_pad=False,
                                       scale=False,
                                       scope="emb_pos",
                                       reuse=reuse)
    self.emb = tf.layers.dropout(self.emb_char_pos, rate=self.dropout)
    return self.emb
def train(self):
    self.text, self.refer_mel, self.mel, self.linear = get_next_batch()
    self.encoder_inputs = embedding(self.text, scope='embedding', reuse=self.reuse)
    # Shift the mel targets right by one frame so the decoder sees the
    # previous frame at each step (teacher forcing).
    self.decoder_inputs = tf.concat(
        (tf.zeros_like(self.mel[:, :1, :]), self.mel[:, :-1, :]), 1)
    self.decoder_inputs = self.decoder_inputs[:, :, -hp.N_MELS:]
    with tf.variable_scope(self.scope_name):
        self.text_outputs = encoder(self.encoder_inputs, is_training=self.is_training)
        self.vae_outputs, self.mu, self.log_var = vae(self.refer_mel,
                                                      is_training=self.is_training)
        self.encoder_outputs = self.text_outputs + self.vae_outputs
        self.mel_hat, self.alignments = decoder(self.decoder_inputs,
                                                self.encoder_outputs,
                                                is_training=self.is_training)
        self.linear_hat = postnet(self.mel_hat, is_training=self.is_training)
    if self.mode in ['train', 'eval']:
        self.global_step = tf.get_variable('global_step', initializer=0,
                                           dtype=tf.int32, trainable=False)
        self.lr = tf.train.exponential_decay(learning_rate=hp.LR,
                                             global_step=self.global_step,
                                             decay_steps=hp.DECAY_STEPS,
                                             decay_rate=hp.DECAY_RATE)
        self.optimizer = tf.train.AdamOptimizer(self.lr)
        self.mel_loss = tf.reduce_mean(tf.abs(self.mel_hat - self.mel))
        self.linear_loss = tf.reduce_mean(tf.abs(self.linear_hat - self.linear))
        # KL divergence between the VAE posterior and a standard normal prior.
        self.kl_loss = -0.5 * tf.reduce_sum(
            1 + self.log_var - tf.pow(self.mu, 2) - tf.exp(self.log_var))
        self.vae_loss_weight = control_weight(self.global_step)
        self.loss = self.mel_loss + self.linear_loss + self.vae_loss_weight * self.kl_loss
        # The source truncates mid-statement here; minimizing the total loss
        # with the optimizer above is the assumed completion.
        self.train_op = self.optimizer.minimize(self.loss, global_step=self.global_step)
def build_model(self):
    # define decoder inputs: prepend <S> (id 2) and drop the last target token
    self.decoder_inputs = tf.concat(
        (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)  # 2:<S>

    # Encoder
    with tf.variable_scope("encoder"):
        ## Embedding
        self.enc = embedding(self.x,
                             vocab_size=len(self.de2idx),
                             num_units=hp.emb_dim,
                             scale=True,
                             scope="enc_embed")
        sign = tf.sign(tf.reduce_sum(tf.abs(self.enc), axis=-1))
        key_masks = tf.expand_dims(sign, -1)

        ## Positional Encoding
        if hp.sinusoid:
            self.enc += positional_encoding(self.x,
                                            num_units=hp.emb_dim,
                                            zero_pad=False,
                                            scale=False,
                                            scope="enc_pe")
        else:
            self.enc += embedding(tf.tile(
                tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                [tf.shape(self.x)[0], 1]),
                                  vocab_size=hp.maxlen,
                                  num_units=hp.emb_dim,
                                  zero_pad=False,
                                  scale=False,
                                  scope="enc_pe")
        self.enc *= key_masks

        ## Dropout
        self.enc = tf.layers.dropout(self.enc,
                                     rate=hp.dropout_rate,
                                     training=tf.convert_to_tensor(self.is_training))

        ## Blocks
        for i in range(hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i)):
                ### Multihead Attention
                self.enc = multihead_attention(queries=self.enc,
                                               keys=self.enc,
                                               num_units=hp.emb_dim,
                                               num_heads=hp.num_heads,
                                               dropout_rate=hp.dropout_rate,
                                               is_training=self.is_training,
                                               causality=False)
                ### Feed Forward
                self.enc = feedforward(self.enc,
                                       num_units=[4 * hp.emb_dim, hp.emb_dim])

    # Decoder
    with tf.variable_scope("decoder"):
        ## Embedding
        self.dec = embedding(self.decoder_inputs,
                             vocab_size=len(self.en2idx),
                             num_units=hp.emb_dim,
                             scale=True,
                             scope="dec_embed")
        key_masks = tf.expand_dims(
            tf.sign(tf.reduce_sum(tf.abs(self.dec), axis=-1)), -1)

        ## Positional Encoding
        if hp.sinusoid:
            self.dec += positional_encoding(self.decoder_inputs,
                                            num_units=hp.emb_dim,
                                            zero_pad=False,
                                            scale=False,
                                            scope="dec_pe")
        else:
            self.dec += embedding(tf.tile(
                tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                [tf.shape(self.decoder_inputs)[0], 1]),
                                  vocab_size=hp.maxlen,
                                  num_units=hp.emb_dim,
                                  zero_pad=False,
                                  scale=False,
                                  scope="dec_pe")
        self.dec *= key_masks

        ## Dropout
        self.dec = tf.layers.dropout(self.dec,
                                     rate=hp.dropout_rate,
                                     training=tf.convert_to_tensor(self.is_training))

        ## Blocks
        for i in range(hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i)):
                ## Multihead Attention (self-attention)
                self.dec = multihead_attention(queries=self.dec,
                                               keys=self.dec,
                                               num_units=hp.emb_dim,
                                               num_heads=hp.num_heads,
                                               dropout_rate=hp.dropout_rate,
                                               is_training=self.is_training,
                                               causality=True,
                                               scope="self_attention")
                ## Multihead Attention (vanilla attention)
                self.dec = multihead_attention(queries=self.dec,
                                               keys=self.enc,
                                               num_units=hp.emb_dim,
                                               num_heads=hp.num_heads,
                                               dropout_rate=hp.dropout_rate,
                                               is_training=self.is_training,
                                               causality=False,
                                               scope="vanilla_attention")
                ## Feed Forward
                self.dec = feedforward(self.dec,
                                       num_units=[4 * hp.emb_dim, hp.emb_dim])

    # Final linear projection
    self.logits = tf.layers.dense(self.dec, len(self.en2idx))
    self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
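# For reference, the hp.sinusoid branch above adds the fixed sinusoidal table
# from "Attention Is All You Need":
#   PE(pos, 2i)   = sin(pos / 10000^(2i/d))
#   PE(pos, 2i+1) = cos(pos / 10000^(2i/d))
# A small NumPy sketch of that table (illustrative; the repo's
# positional_encoding() builds the same values as a TF op):
import numpy as np

def sinusoid_table(max_len, dim):
    pos = np.arange(max_len)[:, None]        # [max_len, 1]
    i = np.arange(dim)[None, :]              # [1, dim]
    angle = pos / np.power(10000.0, (2 * (i // 2)) / dim)
    table = np.zeros((max_len, dim))
    table[:, 0::2] = np.sin(angle[:, 0::2])  # even dimensions: sine
    table[:, 1::2] = np.cos(angle[:, 1::2])  # odd dimensions: cosine
    return table                             # added to the token embeddings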
def __init__(self, is_training=True):
    self.graph = tf.Graph()
    with self.graph.as_default():
        if is_training:
            self.x, self.y, self.num_batch = get_batch_data()
        else:
            self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
            self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

        # define decoder inputs
        # id 2 stands for <S>, the initial decoder input. This shifts y right:
        # e.g. y = [["i", "love", "china", "deeply"], ["can", "you", "speak", "chinese"]]
        # becomes [["<s>", "i", "love", "china"], ["<s>", "can", "you", "speak"]],
        # which is what the decoder self-attention sees first.
        # At training time decoder_inputs is built as above; at inference the true
        # y is unknown, so an all-zero tensor of shape [batch_size, max_length] is
        # fed instead, giving [["<s>", 0, 0, 0], ...]: take the first prediction,
        # feed it back in, take the first two predictions, and so on.
        self.decoder_inputs = tf.concat(
            (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)

        de2idx, idx2de = load_de_vocab()
        en2idx, idx2en = load_en_vocab()

        with tf.variable_scope("encoder"):
            # Embedding
            self.enc = embedding(
                self.x,
                vocab_size=len(de2idx),
                num_units=hp.hidden_units,
                # row 0 holds the padding embedding; True zeroes that row out
                # (random initialization would not give exact zeros)
                zero_pad=True,
                scale=True,
                scope="enc_embed")

            ## Positional Encoding
            if hp.sinusoid:
                self.enc += positional_encoding(self.x,
                                                num_units=hp.hidden_units,
                                                zero_pad=False,
                                                scale=False,
                                                scope='enc_pe')
            else:
                self.enc += embedding(tf.tile(
                    tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                    [tf.shape(self.x)[0], 1]),
                                      vocab_size=hp.maxlen,
                                      num_units=hp.hidden_units,
                                      zero_pad=False,
                                      scale=False,
                                      scope="enc_pe")

            ## Dropout
            self.enc = tf.layers.dropout(
                self.enc,
                rate=hp.dropout_rate,
                training=tf.convert_to_tensor(is_training))

            ## Blocks: stack hp.num_blocks (6) encoder blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ### MultiHead Attention
                    self.enc = multihead_attention(
                        queries=self.enc,
                        keys=self.enc,
                        num_units=hp.hidden_units,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=is_training,
                        causality=False)
                    self.enc = feedforward(
                        self.enc,
                        num_units=[4 * hp.hidden_units, hp.hidden_units])

        with tf.variable_scope("decoder"):
            # Embedding
            self.dec = embedding(self.decoder_inputs,
                                 vocab_size=len(en2idx),
                                 num_units=hp.hidden_units,
                                 scale=True,
                                 scope="dec_embed")

            # Positional Encoding
            if hp.sinusoid:
                self.dec += positional_encoding(self.decoder_inputs,
                                                num_units=hp.hidden_units,
                                                zero_pad=False,
                                                scale=False,
                                                scope="dec_pe")
            else:
                self.dec += embedding(tf.tile(
                    tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                    [tf.shape(self.decoder_inputs)[0], 1]),
                                      vocab_size=hp.maxlen,
                                      num_units=hp.hidden_units,
                                      zero_pad=False,
                                      scale=False,
                                      scope="dec_pe")

            # Dropout
            self.dec = tf.layers.dropout(
                self.dec,
                rate=hp.dropout_rate,
                training=tf.convert_to_tensor(is_training))

            # Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ## Multihead Attention (self-attention)
                    self.dec = multihead_attention(
                        queries=self.dec,
                        keys=self.dec,
                        num_units=hp.hidden_units,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=is_training,
                        causality=True,
                        scope="self_attention")
                    ## Multihead Attention (vanilla attention)
                    self.dec = multihead_attention(
                        queries=self.dec,
                        keys=self.enc,
                        num_units=hp.hidden_units,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=is_training,
                        causality=False,
                        scope="vanilla_attention")
                    ## Feed Forward
                    self.dec = feedforward(
                        self.dec,
                        num_units=[4 * hp.hidden_units, hp.hidden_units])

        # Final linear projection: a classifier over the vocabulary
        self.logits = tf.layers.dense(self.dec, len(en2idx))
        self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
        self.istarget = tf.to_float(tf.not_equal(self.y, 0))
        self.acc = tf.reduce_sum(
            tf.to_float(tf.equal(self.preds, self.y)) * self.istarget /
            (tf.reduce_sum(self.istarget)))

        if is_training:
            # Loss
            # Label smoothing: the 0s of the one-hot become a small value and
            # the 1 becomes a value close to 1.
            self.y_smoothed = label_smoothing(
                tf.one_hot(self.y, depth=len(en2idx)))
            self.loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.logits, labels=self.y_smoothed)
            self.mean_loss = tf.reduce_sum(
                self.loss * self.istarget) / (tf.reduce_sum(self.istarget))
            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                    beta1=0.9,
                                                    beta2=0.98,
                                                    epsilon=1e-8)
            self.train_op = self.optimizer.minimize(
                self.mean_loss, global_step=self.global_step)
            tf.summary.scalar('mean_loss', self.mean_loss)
            self.merged = tf.summary.merge_all()
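# The label_smoothing() helper above is not shown. A sketch of the common
# implementation that matches the comment (the epsilon value and exact form
# are assumptions): the 1 of the one-hot becomes 1 - epsilon + epsilon/K and
# each 0 becomes epsilon/K, where K is the number of classes.
import numpy as np

def label_smoothing_sketch(one_hot, epsilon=0.1):
    K = one_hot.shape[-1]  # number of classes (here, the vocabulary size)
    return (1.0 - epsilon) * one_hot + epsilon / K

# e.g. label_smoothing_sketch(np.eye(4)[[2]]) -> [[0.025, 0.025, 0.925, 0.025]]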
def __init__(self, nloc, loc_dim, num_layers=1, dropout=0.0):
    super(GRU4Rec, self).__init__()
    self.emb_loc = embedding(nloc, loc_dim, zeros_pad=True, scale=True)
    self.encoder = torch.nn.GRU(input_size=loc_dim,
                                hidden_size=loc_dim,
                                num_layers=num_layers,
                                dropout=dropout)
    # Learned initial hidden state, broadcast across the batch at run time
    # (nn.Parameter already implies requires_grad=True).
    self.h_0 = nn.Parameter(torch.randn(num_layers, 1, loc_dim))
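# A plausible forward pass for GRU4Rec under the definitions above (the
# sequence-first layout, the torch.nn.GRU default, and returning the full
# output sequence are assumptions): embed the location ids, expand the
# learned initial state across the batch, and run the GRU.
def forward(self, loc_seq):
    # loc_seq: [seq_len, batch] of location ids
    x = self.emb_loc(loc_seq)                        # [seq_len, batch, loc_dim]
    h0 = self.h_0.expand(-1, loc_seq.size(1), -1).contiguous()
    output, _ = self.encoder(x, h0)                  # [seq_len, batch, loc_dim]
    return output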
def __init__(self, d_model, dropout=0.1, max_len=120):
    super(PositionalEmbedding, self).__init__()
    self.pos_emb_table = embedding(max_len, d_model, zeros_pad=False, scale=False)
    pos_vector = torch.arange(max_len)
    self.dropout = nn.Dropout(p=dropout)
    self.register_buffer('pos_vector', pos_vector)
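# A hypothetical forward() consistent with the buffers registered above: look
# up one learned vector per position and add it to the inputs. The seq-first
# layout [seq_len, batch, d_model] is an assumption, not confirmed by the source.
def forward(self, x):
    seq_len = x.size(0)
    pos = self.pos_vector[:seq_len]                 # [seq_len]
    pos_emb = self.pos_emb_table(pos).unsqueeze(1)  # [seq_len, 1, d_model]
    return self.dropout(x + pos_emb)                # broadcast over the batch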
def single_model(self, gpu_id):
    if self.mode == "train" or self.mode == "eval":
        inputs_transcript = self.inputs_transcript[gpu_id]
        inputs_reference = self.inputs_reference[gpu_id]
        inputs_ref_lens = self.inputs_ref_lens[gpu_id]
        inputs_speaker = self.inputs_speaker[gpu_id]
        inputs_decoder = self.inputs_decoder[gpu_id]
    training = True if self.mode == "train" else False

    # Encoder
    # transcript encoder
    text = modules.transcript_encoder(
        inputs=inputs_transcript,
        embed_size=Hp.charac_embed_size,
        K=Hp.num_encoder_banks,
        highway_layers=Hp.num_enc_highway_layers,
        training=training)  # outputs: [batch_size, text_length, 256]
    text = tf.identity(text, name="text_enc")

    # reference encoder
    if self.mode == "train":
        batch_size = Hp.train_batch_size // Hp.num_gpus
    elif self.mode == "eval":
        batch_size = Hp.eval_batch_size // Hp.num_gpus
    else:
        batch_size = Hp.synthes_batch_size // Hp.num_gpus
    inputs_reference_reshape = tf.reshape(inputs_reference,
                                          [batch_size, -1, Hp.num_mels])
    # expand inputs_reference from 3 dims [batch, Ty, n_mels] to 4 dims
    # [batch, Ty, n_mels, 1] for conv2d
    inputs_reference_reshape = tf.expand_dims(inputs_reference_reshape, -1)
    prosody = modules.reference_encoder(inputs=inputs_reference_reshape,
                                        training=training)  # [batch, 128]
    prosody = tf.expand_dims(prosody, 1)  # [batch, 1, 128]
    # replicate prosody for all Tx steps -> [batch, Tx, 128]
    prosody = tf.tile(prosody, [1, Hp.num_charac, 1], name="prosody_enc")

    # speaker
    speaker = modules.embedding(
        inputs=inputs_speaker,
        charac_size=Hp.num_speakers,
        embed_size=Hp.speaker_embed_size,
        scope="speaker")  # [batch, 1, speaker_embed_size], e.g. [32, 1, 16]
    speaker = tf.tile(speaker, [1, Hp.num_charac, 1], name="speaker_embed")

    memory = tf.concat([text, prosody, speaker], axis=-1,
                       name="memory")  # [batch, Tx, Dt+Ds+Dp]
    # self.memory.append(memory)

    # Spectrogram decoder
    # Prepend a zero frame and drop the last frame of the original mel
    # spectrogram, since the last frame is never fed to the decoder.
    if self.mode == "train":
        inputs_decoder = tf.concat(
            (tf.zeros_like(inputs_decoder[:, :1, :]), inputs_decoder[:, :-1, :]),
            1)  # [batch, Ty/r, num_mels*r]
    mel_hat, alignments = modules.attention_gru_decoder(
        inputs=inputs_decoder,
        inputs_lengths=inputs_ref_lens,
        memory=memory,
        attention_rnn_nodes=Hp.num_attention_nodes,
        decoder_rnn_nodes=Hp.num_decoder_nodes,
        num_mels=Hp.num_mels,
        reduction_factor=Hp.reduction_factor,
        max_iters=self.max_len_per_batch,
        training=training)  # [batch, Ty/r, num_mels*r]
    alignments = tf.identity(alignments, name="alignments")
    mel_hat = tf.identity(mel_hat, name="melspectrogrom_pred")

    mag_hat = modules.cbhg_postprocessing(
        inputs=mel_hat,
        num_mels=Hp.num_mels,
        num_fft=Hp.num_fft,
        K=Hp.num_post_banks,
        highway_layers=Hp.num_post_highway_layers,
        training=training)  # [batch, Ty, 1+n_fft//2]
    mag_hat = tf.identity(mag_hat, name="magnitude_pred")
    # generate a sample to listen to:
    # wavform = tf.py_func(signal_process.Spectrogrom2Wav, [mag_hat[0]],
    #                      tf.float32, name="wavform")
    return mel_hat, alignments, mag_hat
def __init__(self, is_training=True):
    self.graph = tf.Graph()
    with self.graph.as_default():
        if is_training:
            self.x, self.y, self.num_batch = get_batch_data()
        else:
            # x: (32, 10), y: (32, 10): a batch of 32 sentences, each of length 10
            self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
            self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

        """
        Define the decoder input. If the true target is
            i am a student </S>
        the decoder input should be
            <S> i am a student
        """
        self.decoder_inputs = tf.concat(
            (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]),
            -1)  # 2 stands for <S>, the initial decoder input

        # vocabularies
        de2idx, idx2de = load_de_vocab()
        en2idx, idx2en = load_en_vocab()

        with tf.variable_scope("encoder"):
            # Embedding
            self.enc = embedding(
                self.x,
                vocab_size=len(de2idx),
                num_units=hp.hidden_units,
                zero_pad=True,  # keep the padding row at zero
                scale=True,
                scope="enc_embed")

            ## Positional Encoding
            if hp.sinusoid:
                self.enc += positional_encoding(self.x,
                                                num_units=hp.hidden_units,
                                                zero_pad=False,
                                                scale=False,
                                                scope='enc_pe')
            else:
                self.enc += embedding(tf.tile(
                    tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                    [tf.shape(self.x)[0], 1]),
                                      vocab_size=hp.maxlen,
                                      num_units=hp.hidden_units,
                                      zero_pad=False,
                                      scale=False,
                                      scope="enc_pe")

            ## Dropout
            self.enc = tf.layers.dropout(
                self.enc,
                rate=hp.dropout_rate,
                training=tf.convert_to_tensor(is_training))

            ## Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ### MultiHead Attention
                    self.enc = multihead_attention(
                        queries=self.enc,
                        keys=self.enc,
                        num_units=hp.hidden_units,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=is_training,
                        causality=False)
                    self.enc = feedforward(
                        self.enc,
                        num_units=[4 * hp.hidden_units, hp.hidden_units])

        with tf.variable_scope("decoder"):
            # Embedding
            self.dec = embedding(self.decoder_inputs,
                                 vocab_size=len(en2idx),
                                 num_units=hp.hidden_units,
                                 scale=True,
                                 scope="dec_embed")

            ## Positional Encoding
            if hp.sinusoid:
                # (the stray vocab_size argument is dropped here; the
                # positional_encoding call in the encoder branch takes none)
                self.dec += positional_encoding(self.decoder_inputs,
                                                num_units=hp.hidden_units,
                                                zero_pad=False,
                                                scale=False,
                                                scope="dec_pe")
            else:
                self.dec += embedding(tf.tile(
                    tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                    [tf.shape(self.decoder_inputs)[0], 1]),
                                      vocab_size=hp.maxlen,
                                      num_units=hp.hidden_units,
                                      zero_pad=False,
                                      scale=False,
                                      scope="dec_pe")

            # Dropout
            self.dec = tf.layers.dropout(
                self.dec,
                rate=hp.dropout_rate,
                training=tf.convert_to_tensor(is_training))

            ## Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ## Multihead Attention (self-attention)
                    self.dec = multihead_attention(
                        queries=self.dec,
                        keys=self.dec,
                        num_units=hp.hidden_units,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=is_training,
                        causality=True,
                        scope="self_attention")
                    ## Multihead Attention (vanilla attention)
                    self.dec = multihead_attention(
                        queries=self.dec,
                        keys=self.enc,
                        num_units=hp.hidden_units,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=is_training,
                        causality=False,
                        scope="vanilla_attention")
                    ## Feed Forward
                    self.dec = feedforward(
                        self.dec,
                        num_units=[4 * hp.hidden_units, hp.hidden_units])

        # Final linear projection
        self.logits = tf.layers.dense(self.dec, len(en2idx))
        self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
        self.istarget = tf.to_float(tf.not_equal(self.y, 0))
        self.acc = tf.reduce_sum(
            tf.to_float(tf.equal(self.preds, self.y)) * self.istarget /
            (tf.reduce_sum(self.istarget)))

        if is_training:
            # Loss
            # Label smoothing: the 0s of the one-hot become a small value and
            # the 1 becomes a value close to 1.
            self.y_smoothed = label_smoothing(
                tf.one_hot(self.y, depth=len(en2idx)))
            self.loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.logits, labels=self.y_smoothed)
            self.mean_loss = tf.reduce_sum(
                self.loss * self.istarget) / (tf.reduce_sum(self.istarget))
            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                    beta1=0.9,
                                                    beta2=0.98,
                                                    epsilon=1e-8)
            self.train_op = self.optimizer.minimize(
                self.mean_loss, global_step=self.global_step)
            tf.summary.scalar('mean_loss', self.mean_loss)
            self.merged = tf.summary.merge_all()
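# The istarget weighting above implements a padding-aware mean: padding
# positions (id 0) contribute nothing to the loss or accuracy. The same idea
# in a few lines of NumPy (illustrative only):
import numpy as np

def masked_mean(per_token_values, labels, pad_id=0):
    istarget = (labels != pad_id).astype(np.float32)
    return float((per_token_values * istarget).sum() / istarget.sum())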
def build_model(self):
    # define decoder inputs
    self.decoder_inputs = tf.concat(
        (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)  # 2:<S>

    # Encoder
    with tf.variable_scope("encoder"):
        ## Embedding
        self.enc = embedding(self.x,
                             vocab_size=len(self.de2idx),
                             num_units=hp.emb_dim,
                             scale=True,
                             scope="enc_embed")
        sign = tf.sign(tf.reduce_sum(tf.abs(self.enc), axis=-1))
        key_masks = tf.expand_dims(sign, -1)

        ## Positional Encoding
        if hp.sinusoid:
            self.enc += positional_encoding(self.x,
                                            num_units=hp.emb_dim,
                                            zero_pad=False,
                                            scale=False,
                                            scope="enc_pe")
        else:
            # RNN-derived position information instead of a fixed table
            cells = self.rnn_cell()
            encoder_output, _encoder_state = tf.nn.dynamic_rnn(
                cells, self.enc, sequence_length=self.x_len, dtype=tf.float32)
            self.enc = tf.concat([self.enc, encoder_output], axis=-1)
            self.enc = tf.layers.dense(self.enc, hp.emb_dim, activation=tf.nn.relu)
        self.enc *= key_masks

        ## Dropout
        self.enc = tf.layers.dropout(self.enc,
                                     rate=hp.dropout_rate,
                                     training=tf.convert_to_tensor(self.is_training))

        ## Blocks
        for i in range(hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i)):
                pos_emb = tf.get_variable(
                    'enc_pos_emb',
                    dtype=tf.float32,
                    shape=[self.enc.shape[1]],
                    initializer=tf.contrib.layers.xavier_initializer())
                ### Multihead Attention
                self.enc = multihead_attention(queries=self.enc,
                                               keys=self.enc,
                                               pos_emb=pos_emb,
                                               num_units=hp.emb_dim,
                                               num_heads=hp.num_heads,
                                               dropout_rate=hp.dropout_rate,
                                               is_training=self.is_training,
                                               causality=False)
                ### Feed Forward
                self.enc = feedforward(self.enc,
                                       num_units=[4 * hp.emb_dim, hp.emb_dim])

    # Decoder
    with tf.variable_scope("decoder"):
        ## Embedding
        self.dec = embedding(self.decoder_inputs,
                             vocab_size=len(self.en2idx),
                             num_units=hp.emb_dim,
                             scale=True,
                             scope="dec_embed")
        key_masks = tf.expand_dims(
            tf.sign(tf.reduce_sum(tf.abs(self.dec), axis=-1)), -1)

        ## Positional Encoding
        if hp.sinusoid:
            self.dec += positional_encoding(self.decoder_inputs,
                                            num_units=hp.emb_dim,
                                            zero_pad=False,
                                            scale=False,
                                            scope="dec_pe")
        else:
            cells = self.rnn_cell()
            decoder_output, _decoder_state = tf.nn.dynamic_rnn(
                cells, self.dec, sequence_length=self.y_len, dtype=tf.float32)
            self.dec = tf.concat([self.dec, decoder_output], axis=-1)
            self.dec = tf.layers.dense(self.dec, hp.emb_dim, activation=tf.nn.relu)
        self.dec *= key_masks

        ## Dropout
        self.dec = tf.layers.dropout(self.dec,
                                     rate=hp.dropout_rate,
                                     training=tf.convert_to_tensor(self.is_training))

        ## Blocks
        for i in range(hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i)):
                dec_dec_pos_emb = tf.get_variable(
                    'dec_de_pos_emb',
                    dtype=tf.float32,
                    shape=[self.dec.shape[1]],
                    initializer=tf.contrib.layers.xavier_initializer())
                dec_enc_pos_emb = tf.get_variable(
                    'dec_enc_pos_emb',
                    dtype=tf.float32,
                    shape=[self.enc.shape[1]],
                    initializer=tf.contrib.layers.xavier_initializer())
                ## Multihead Attention (self-attention)
                self.dec = multihead_attention(queries=self.dec,
                                               keys=self.dec,
                                               pos_emb=dec_dec_pos_emb,
                                               num_units=hp.emb_dim,
                                               num_heads=hp.num_heads,
                                               dropout_rate=hp.dropout_rate,
                                               is_training=self.is_training,
                                               causality=True,
                                               scope="self_attention")
                ## Multihead Attention (vanilla attention)
                self.dec = multihead_attention(queries=self.dec,
                                               keys=self.enc,
                                               pos_emb=dec_enc_pos_emb,
                                               num_units=hp.emb_dim,
                                               num_heads=hp.num_heads,
                                               dropout_rate=hp.dropout_rate,
                                               is_training=self.is_training,
                                               causality=False,
                                               scope="vanilla_attention")
                ## Feed Forward
                self.dec = feedforward(self.dec,
                                       num_units=[4 * hp.emb_dim, hp.emb_dim])

    # Final linear projection
    self.logits = tf.layers.dense(self.dec, len(self.en2idx))
    self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
def __init__(self):
    self.graph = tf.Graph()
    with self.graph.as_default():
        # placeholders for inputs and outputs
        B, N, M, C = (param.batch_size, param.max_context_words,
                      param.max_question_words, param.max_chars)

        # get inputs and outputs
        self.x_c_w, self.x_c_c, self.x_q_w, self.x_q_c, self.y = my.get_batch_data()
        '''
        #can also use placeholders as below if needed:
        #input sequence of word vocabulary indices of the context
        self.x_c_w = tf.placeholder(tf.int32, shape=[B, N], name="context_words")
        #input sequence of char vocabulary indices (0 to 25) of the words of the context
        self.x_c_c = tf.placeholder(tf.int32, shape=[B, N, C], name="context_word_chars")
        #input sequence of question vocabulary indices of the context
        self.x_q_w = tf.placeholder(tf.int32, shape=[B, M], name="question_words")
        #input sequence of char vocabulary indices (0 to 25) of the words of the question
        self.x_q_c = tf.placeholder(tf.int32, shape=[B, M, C], name="context_question_chars")
        #output as a one hot encoding of the start position and end position indices over the context words
        self.y = tf.placeholder(tf.int32, shape=[B, N, 2], name="out")
        '''

        ''' part1: an embedding layer '''
        VW, VC, DW, DC = (param.word_vocab_size, param.char_vocab_size,
                          param.word_emb_dim, param.char_emb_dim)
        # word embeddings of the context words via 300-dimensional GloVe vectors
        self.x_c_w_emb = my.embedding(inputs=self.x_c_w, shape=[VW, DW],
                                      scope="word_embedding", reuse=None)
        # word embeddings of the question words via 300-dimensional GloVe vectors
        self.x_q_w_emb = my.embedding(inputs=self.x_q_w,
                                      scope="word_embedding", reuse=True)
        # character embeddings of the context words
        self.x_c_c_emb = my.embedding(inputs=self.x_c_c, shape=[VC, DC],
                                      scope="char_embedding", reuse=None)
        # character embeddings of the question words
        self.x_q_c_emb = my.embedding(inputs=self.x_q_c,
                                      scope="char_embedding", reuse=True)
        # max pooling over character embeddings to get a fixed-size embedding per word
        self.x_c_c_emb = tf.reduce_max(self.x_c_c_emb, axis=[2])
        # concatenate GloVe embedding with character embedding
        self.x_c_emb = tf.concat(values=[self.x_c_w_emb, self.x_c_c_emb],
                                 axis=2, name="x_context_emb")
        # max pooling over character embeddings to get a fixed-size embedding per word
        self.x_q_c_emb = tf.reduce_max(self.x_q_c_emb, axis=[2])
        # concatenate GloVe embedding with character embedding
        self.x_q_emb = tf.concat(values=[self.x_q_w_emb, self.x_q_c_emb],
                                 axis=2, name="x_question_emb")
        # apply a 2-layer highway network on top of the computed embeddings
        self.x_c_emb = my.highway_network(inputs=self.x_c_emb,
                                          num_layers=param.highway_num_layers,
                                          use_bias=True, transform_bias=-1.0,
                                          scope='highway_net', reuse=None)
        self.x_q_emb = my.highway_network(inputs=self.x_q_emb,
                                          num_layers=param.highway_num_layers,
                                          use_bias=True, transform_bias=-1.0,
                                          scope='highway_net', reuse=True)

        ''' part2: an embedding encoder layer '''
        # single encoder block: convolution_layer x # + self_attention_layer + feed_forward_layer
        # apply 1 encoder stack of 1 encoder block on the context embedding
        self.x_c_enc = my.encoder_block(inputs=self.x_c_emb, num_conv_layer=4,
                                        filters=128, kernel_size=7,
                                        num_att_head=8, scope='encoder_block',
                                        reuse=None)
        # apply 1 encoder stack of 1 encoder block on the question embedding
        self.x_q_enc = my.encoder_block(inputs=self.x_q_emb, num_conv_layer=4,
                                        filters=128, kernel_size=7,
                                        num_att_head=8, scope='encoder_block',
                                        reuse=True)

        ''' part3: a context-query attention layer '''
        # compute context-to-query attention and query-to-context attention
        self.att_a, self.att_b = my.context_query_attention(
            context=self.x_c_enc, query=self.x_q_enc,
            scope='context_query_att', reuse=None)

        ''' part4: a model encoder layer '''
        # apply 3 encoder stacks of 7 encoder blocks each
        # prepare input as [c, a, c dot a, c dot b] where a and b are rows of
        # attention matrices A (att_a) and B (att_b)
        # computing c dot a
        self.c_mult_att_a = tf.multiply(self.x_c_enc, self.att_a)
        # computing c dot b
        self.c_mult_att_b = tf.multiply(self.x_c_enc, self.att_b)
        # NOTE: there is an ambiguity here. Since the encoder blocks share
        # weights, the input dimension of each block must stay the same, but
        # the starting input is described as a concatenation of four
        # 128-dimensional hidden states (= 512) [c, a, c dot a, c dot b],
        # while blocks above the first would get 128-dimensional inputs (the
        # 1D convolution maps the 512-dimensional input to 128 dimensions).
        # To work around this, an average instead of a concat is taken over
        # (c, a, c dot a, c dot b).
        # dimension = [B, N, d] ([batch_size, max_words_context, hidden_dimension=128])
        self.model_enc = tf.reduce_mean(
            tf.concat([tf.expand_dims(self.x_c_enc, 2),
                       tf.expand_dims(self.att_a, 2),
                       tf.expand_dims(self.c_mult_att_a, 2),
                       tf.expand_dims(self.c_mult_att_b, 2)], axis=2),
            axis=2, name="model_enc_inp")

        # for each encoder stack
        for i in range(3):
            # for each encoder block within the stack
            for j in range(7):
                if (i == 0):
                    # the first stack creates the weight tensors (reuse=None)
                    self.model_enc = my.encoder_block(
                        inputs=self.model_enc, num_conv_layer=2, filters=128,
                        kernel_size=5, num_att_head=8,
                        scope='model_enc_block_{}'.format(j), reuse=None)
                else:
                    # later stacks reuse them block-for-block, since the three
                    # stacks share weights (reuse=True)
                    self.model_enc = my.encoder_block(
                        inputs=self.model_enc, num_conv_layer=2, filters=128,
                        kernel_size=5, num_att_head=8,
                        scope='model_enc_block_{}'.format(j), reuse=True)
            # store the stack outputs as M0, M1, M2 (tf.identity copies a tensor)
            if (i == 0):
                # after the first stack: 7 blocks executed
                self.out_m0 = tf.identity(self.model_enc)
            elif (i == 1):
                # after the second stack: 14 blocks executed
                self.out_m1 = tf.identity(self.model_enc)
            else:
                # after the third stack: 21 blocks executed
                self.out_m2 = tf.identity(self.model_enc)

        ''' part5: an output layer '''
        # feature vector for position 1 is [M0;M1]
        self.inp_pos1 = tf.concat((self.out_m0, self.out_m1), axis=2)
        # feature vector for position 2 is [M0;M2]
        self.inp_pos2 = tf.concat((self.out_m0, self.out_m2), axis=2)
        # softmax over context positions (axis 1) for being the start position
        self.pos1 = tf.nn.softmax(tf.layers.dense(self.inp_pos1, 1,
                                                  activation=tf.tanh,
                                                  name='dense_pos1'), axis=1)
        # softmax over context positions (axis 1) for being the end position
        self.pos2 = tf.nn.softmax(tf.layers.dense(self.inp_pos2, 1,
                                                  activation=tf.tanh,
                                                  name='dense_pos2'), axis=1)
        # concatenate both prediction vectors
        # dimensions = [B, N, 2] ([batch_size, max_context_words, 2])
        self.pred = tf.concat((self.pos1, self.pos2), axis=-1)

        # loss = -mean(log(p1) + log(p2)) = mean(-log(p1*p2))
        self.loss = tf.reduce_mean(-tf.log(
            tf.reduce_prod(tf.reduce_sum(self.pred * tf.cast(self.y, 'float'), 1), 1)
            + param.epsilon_1))

        # training scheme
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        # ADAM optimizer with beta1=0.8, beta2=0.999 and epsilon=1e-7
        self.optimizer = tf.train.AdamOptimizer(learning_rate=param.lr,
                                                beta1=param.beta1,
                                                beta2=param.beta2,
                                                epsilon=param.epsilon_2)
        self.train_op = self.optimizer.minimize(self.loss,
                                                global_step=self.global_step)
        # loss summary
        tf.summary.scalar('loss', self.loss)
        self.merged = tf.summary.merge_all()
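# Restating the span loss above in NumPy to make the reduction order explicit
# (illustrative only): the sum over positions picks out the predicted
# probability at the true start/end, the product joins the two, and the
# negative log is averaged over the batch.
import numpy as np

def span_loss(pred, y, eps=1e-7):
    # pred, y: [batch, N, 2]; y is one-hot over start (col 0) and end (col 1)
    p = (pred * y).sum(axis=1)  # [batch, 2]: probability at the true positions
    return float(np.mean(-np.log(p.prod(axis=1) + eps)))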
def build_graph(self):
    # Define input
    with tf.name_scope("input_ph"):
        self.X_ind = tf.placeholder(dtype=tf.int32,
                                    shape=[None, self.field_size],
                                    name="X_index")
        self.label = tf.placeholder(dtype=tf.float32, shape=[None], name="label")
        self.is_training = tf.placeholder(dtype=tf.bool, shape=(), name="is_training")

    # look up and process the embedding
    with tf.name_scope("embedding"):
        self.emb = embedding(inputs=self.X_ind,
                             vocab_size=self.feat_size,
                             num_units=self.embedding_dim,
                             scale=self.scale_embedding,
                             scope="embedding_process")

    # self.emb: raw embedding; features: used below
    features = self.emb

    with tf.name_scope("Multilayer_attn"):
        with tf.variable_scope("attention_head") as scope:
            features, _ = multihead_attention(
                queries=features,
                keys=features,
                num_units=self.attention_size * self.num_head,
                num_heads=self.num_head,
                dropout_rate=self.dropout_rate,
                is_training=self.is_training,
                scope="multihead_attention")
            features = feedforward(
                inputs=features,
                num_units=[4 * self.embedding_dim, self.embedding_dim],
                scope="feed_forward")  # [N, T, dim]

    # multi-head features aggregated into the 1st-order feature
    with tf.name_scope("Agg_first_order") as scope:
        ctx_order_1 = tf.get_variable(name="context_order_1",
                                      shape=(self.attention_size),
                                      dtype=tf.float32)
        agg_feat_1, self.attn_1 = agg_attention(
            query=ctx_order_1,
            keys=features,
            values=features,
            attention_size=self.attention_size,
            regularize_scale=self.regularization_weight)  # [N, dim]

    # build the second-order cross
    with tf.name_scope("Second_order") as scope:
        feat_2 = tf.multiply(features, tf.expand_dims(agg_feat_1, axis=1))  # [N, T, dim]
        feat_2 += features  # add the residual, [N, T, dim]
        ctx_order_2 = tf.get_variable(name="context_order_2",
                                      shape=(self.attention_size),
                                      dtype=tf.float32)
        agg_feat_2, self.attn_2 = agg_attention(
            query=ctx_order_2,
            keys=feat_2,
            values=feat_2,
            attention_size=self.attention_size,
            regularize_scale=self.regularization_weight)

    # build the third-order cross
    with tf.name_scope("Third_order") as scope:
        feat_3 = tf.multiply(features, tf.expand_dims(agg_feat_2, axis=1))  # [N, T, dim]
        feat_3 += feat_2  # add the residual, [N, T, dim]
        ctx_order_3 = tf.get_variable(name="context_order_3",
                                      shape=(self.attention_size),
                                      dtype=tf.float32)
        agg_feat_3, self.attn_3 = agg_attention(
            query=ctx_order_3,
            keys=feat_3,
            values=feat_3,
            attention_size=self.attention_size,
            regularize_scale=self.regularization_weight)

    with tf.name_scope("Merged_features"):
        # concatenate [enc, second_cross, third_cross]
        # TODO: can + multihead_features
        all_features = tf.stack([agg_feat_1, agg_feat_2, agg_feat_3],
                                axis=1, name="concat_feature")  # (N, k, C)
        # map C to pool_filter_size dimensions
        mapped_all_feature = tf.layers.conv1d(
            inputs=all_features,
            filters=self.pool_filter_size,
            kernel_size=1,
            use_bias=True,
            name="Mapped_all_feature")  # (N, k, pf_size)
        # apply the context vector
        feature_weights = tf.nn.softmax(
            tf.squeeze(
                tf.layers.dense(mapped_all_feature, units=1,
                                activation=None, use_bias=False),  # (N, k, 1)
                [2]))  # (N, k)
        self.attn_k = feature_weights
        # weighted sum
        weighted_sum_feat = tf.reduce_sum(
            tf.multiply(all_features,
                        tf.expand_dims(feature_weights, axis=2)),  # (N, k, C)
            axis=[1],
            name="Attn_weighted_sum_feature")  # (N, C)
        # last non-linearity
        hidden_logits = tf.layers.dense(
            weighted_sum_feat,
            units=self.embedding_dim // 2,
            activation=tf.nn.relu,
            use_bias=False,
            name="HiddenLogits")  # (N, C/2)
        # the last dense layer for logits
        logits = tf.squeeze(
            tf.layers.dense(hidden_logits, units=1, activation=None,
                            use_bias=False, name="Logits"),  # (N, 1)
            axis=[1])  # (N,)

    # sigmoid of the logits
    self.sigmoid_logits = tf.nn.sigmoid(logits)
    # regularization term
    self.regularization_loss = tf.losses.get_regularization_loss()
    self.logloss = tf.reduce_sum(
        tf.nn.sigmoid_cross_entropy_with_logits(
            labels=tf.expand_dims(self.label, -1),
            logits=tf.expand_dims(logits, -1),
            name="SumLogLoss"))
    self.mean_logloss = tf.divide(self.logloss,
                                  tf.to_float(self.batch_size),
                                  name="MeanLogLoss")
    # overall loss
    self.overall_loss = tf.add(self.mean_logloss,
                               self.regularization_loss,
                               name="OverallLoss")

    tf.summary.scalar("Mean_LogLoss", self.mean_logloss)
    tf.summary.scalar("Reg_Loss", self.regularization_loss)
    tf.summary.scalar("Overall_Loss", self.overall_loss)

    self.train_op = self.optimizer.minimize(self.overall_loss,
                                            global_step=self.global_step)
    self.merged = tf.summary.merge_all()
def __init__(self):
    self.graph = tf.Graph()
    self.tensor_info = {}
    self.build_inputs()
    with self.graph.as_default():
        self.saver = tf.train.Saver(max_to_keep=1)

        # DIEN
        with tf.name_scope('rnn_1'):
            rnn_outputs, _ = dynamic_rnn(GRUCell(HIDDEN_SIZE),
                                         inputs=self.item_his_eb,
                                         sequence_length=self.seq_len_ph,
                                         dtype=tf.float32,
                                         scope="gru1")
        with tf.name_scope('Attention_layer_1'):
            att_outputs, alphas = din_fcn_attention(self.item_eb, rnn_outputs,
                                                    ATTENTION_SIZE, self.mask_ph,
                                                    softmax_stag=1, stag='1_1',
                                                    mode='LIST', return_alphas=True)
        with tf.name_scope('rnn_2'):
            rnn_outputs2, final_state2 = dynamic_rnn(
                VecAttGRUCell(HIDDEN_SIZE),
                inputs=rnn_outputs,
                att_scores=tf.expand_dims(alphas, -1),
                sequence_length=self.seq_len_ph,
                dtype=tf.float32,
                scope="gru2")

        # DSIN
        # with tf.name_scope("Self_Attention_layer"):
        hidden_units = 512
        num_blocks = 6
        num_heads = 8
        dropout_rate = 0.1
        with tf.variable_scope("encoder"):
            # Embedding
            self.enc = embedding(
                self.recent_behavior_ph,
                vocab_size=USER_API_SUM,  # len(de2idx), 200
                num_units=hidden_units,  # 128
                zero_pad=True,  # keep the padding row at zero
                scale=True,
                scope="enc_embed")
            # self.enc = self.user_api_all_eb  # FLAGS.batch_size, USER_API_LEN
            batch = self.recent_behavior_ph.get_shape().as_list()
            batch = tf.shape(self.recent_behavior_ph)
            self.enc += tf.cast(
                positional_encoding(N=tf.shape(self.recent_behavior_ph)[0],
                                    T=USER_API_LEN,
                                    num_units=hidden_units,
                                    zero_pad=False,
                                    scale=False,
                                    scope='enc_pe'),
                tf.float32)

            ## Dropout
            # self.enc = tf.layers.dropout(self.enc, rate=dropout_rate,
            #                              training=tf.convert_to_tensor(is_training))

            ## Blocks
            for i in range(num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ### Multi-head attention; the shape [128, 10, 512] is unchanged
                    self.enc = multihead_attention(
                        queries=self.enc,
                        keys=self.enc,
                        num_units=hidden_units,
                        num_heads=num_heads,
                        dropout_rate=dropout_rate,
                        # is_training=is_training,
                        causality=False)
                    self.enc = feedforward(self.enc,
                                           num_units=[4 * hidden_units, hidden_units])
            # Final linear projection
            # self.logits = tf.layers.dense(self.dec, USER_API_LEN * 3)
            # print(self.enc.get_shape().as_list())
            # print(tf.shape(self.enc))
        self.user_api_eb_sum = tf.reduce_sum(self.enc, -2)

        inp = tf.concat([
            self.item_eb, self.item_his_eb_sum,
            self.item_eb * self.item_his_eb_sum, final_state2,
            self.mobile_embedded, self.province_embedded, self.city_embedded,
            self.grade_embedded, self.chinese_embedded, self.math_embedded,
            self.english_embedded, self.purchase_embedded,
            self.activity_embedded, self.freshness_embedded,
            self.hour_embedded, self.ad_img_eb_sum, self.user_api_eb_sum
        ], -1)
        self.build_fcn_net(inp, use_dice=True)
class Model(object):
    def __init__(self, mode):
        self.models = []
        self.inputs_transcript = []
        self.inputs_reference = []
        self.inputs_reference_lengths = []
        self.inputs_speaker = []
        self.inputs_decoder = []
        self.labels = []
        self.memory = []
        self.mel_hat = []
        self.alignments = []
        self.mag_hat = []
        self.wavform = []
        for gpu_id in range(Hp.num_gpus):
            with tf.device('/gpu:%d' % gpu_id):
                with tf.name_scope('tower_%d' % gpu_id) as scope:
                    with tf.variable_scope('cpu_variables', reuse=gpu_id > 0):
                        # text [batch, Tx]
                        self.inputs_transcript.append(
                            tf.placeholder(tf.int32,
                                           shape=[None, Hp.num_charac],
                                           name="inputs_transcript"))
                        # reference audio mel spectrogram [batch, Ty(?)//r, n_mels*r]
                        self.inputs_reference.append(
                            tf.placeholder(tf.float32,
                                           shape=[None, None,
                                                  Hp.num_mels * Hp.reduction_factor],
                                           name="inputs_reference"))
                        self.inputs_reference_lengths.append(
                            tf.placeholder(tf.float32,
                                           shape=[None, 1],
                                           name="inputs_reference_lengths"))
                        # speaker id [batch, 1]
                        self.inputs_speaker.append(
                            tf.placeholder(tf.int32,
                                           shape=[None, 1],
                                           name="inputs_speaker"))
                        # decoder mel spectrogram [batch, Ty//r, n_mels*r]
                        self.inputs_decoder.append(
                            tf.placeholder(tf.float32,
                                           shape=[None, None,
                                                  Hp.num_mels * Hp.reduction_factor],
                                           name="inputs_decoder"))
                        # magnitude spectrogram
                        self.labels.append(
                            tf.placeholder(tf.float32,
                                           shape=[None, None, Hp.num_fft // 2 + 1]))

                        training = True if mode == "train" else False

                        # Encoder
                        # transcript encoder
                        text = modules.transcript_encoder(
                            inputs=self.inputs_transcript[gpu_id],
                            embed_size=Hp.charac_embed_size,
                            K=Hp.num_encoder_banks,
                            highway_layers=Hp.num_enc_highway_layers,
                            training=training)  # outputs: [batch_size, text_length, 256]
                        text = tf.identity(text, name="text_enc")

                        # reference encoder
                        if mode == "train":
                            batch_size = Hp.train_batch_size
                        elif mode == "eval":
                            batch_size = Hp.eval_batch_size
                        else:
                            batch_size = Hp.synthes_batch_size
                        inputs_reference = tf.reshape(self.inputs_reference[gpu_id],
                                                      [batch_size, -1, Hp.num_mels])
                        # expand inputs_reference from 3 dims [batch, Ty, n_mels]
                        # to 4 dims [batch, Ty, n_mels, 1] for conv2d
                        inputs_reference = tf.expand_dims(inputs_reference, -1)
                        prosody = modules.reference_encoder(
                            inputs=inputs_reference,
                            training=training)  # [batch, 128]
                        prosody = tf.expand_dims(prosody, 1)  # [batch, 1, 128]
                        # replicate prosody for all Tx steps -> [batch, Tx, 128]
                        prosody = tf.tile(prosody, [1, Hp.num_charac, 1],
                                          name="prosody_enc")

                        # speaker
                        speaker = modules.embedding(
                            inputs=self.inputs_speaker[gpu_id],
                            charac_size=Hp.num_speakers,
                            embed_size=Hp.speaker_embed_size)  # [batch, speaker_embed_size]
                        speaker = tf.expand_dims(speaker, 1)
                        speaker = tf.tile(speaker, [1, Hp.num_charac, 1],
                                          name="speaker_embed")

                        memory = tf.concat([text, prosody, speaker], axis=-1,
                                           name="memory")  # [batch, Tx, Dt+Ds+Dp]
                        self.memory.append(memory)

                        # Spectrogram decoder
                        # Prepend a zero frame and drop the last frame of the
                        # original mel spectrogram, since the last frame is
                        # never fed to the decoder.
                        inputs_decoder = tf.concat(
                            (tf.zeros_like(self.inputs_decoder[gpu_id][:, :1, :]),
                             self.inputs_decoder[gpu_id][:, :-1, :]),
                            1)  # [batch, Ty/r, num_mels*r]
                        mel_hat, alignments = attention_gru_decoder(
                            inputs=inputs_decoder,
                            inputs_lengths=self.inputs_reference_lengths[gpu_id],
                            memory=memory,
                            attention_rnn_nodes=Hp.num_attention_nodes,
                            decoder_rnn_nodes=Hp.num_decoder_nodes,
                            num_mels=Hp.num_mels,
                            reduction_factor=Hp.reduction_factor,
                            max_iters=Hp.max_iters,
                            training=training)  # [batch, Ty/r, num_mels*r]
                        alignments = tf.identity(alignments, name="alignments")
                        mel_hat = tf.identity(mel_hat, name="melspectrogrom_pred")

                        mag_hat = modules.cbhg_postprocessing(
                            inputs=mel_hat,
                            num_mels=Hp.num_mels,
                            num_fft=Hp.num_fft,
                            K=Hp.num_post_banks,
                            highway_layers=Hp.num_post_highway_layers,
                            training=training)  # [batch, Ty, 1+n_fft//2]
                        mag_hat = tf.identity(mag_hat, name="magnitude_pred")
                        # generate a sample to listen to
                        wavform = tf.py_func(signal_process.Spectrogrom2Wav,
                                             [mag_hat[0]], tf.float32,
                                             name="wavform")
                        self.mel_hat.append(mel_hat)
                        self.alignments.append(alignments)
                        self.mag_hat.append(mag_hat)
                        self.wavform.append(wavform)