Example #1
    def __init__(self, nuser, nloc, ntime, nquadkey, user_dim, loc_dim, time_dim, reg_dim, nhid, nhead_enc, nhead_dec, nlayers, dropout=0.5, **extra_config):
        super(QuadKeyLocPredictor, self).__init__()
        self.emb_user = embedding(nuser, user_dim, zeros_pad=True, scale=True)
        self.emb_loc = embedding(nloc, loc_dim, zeros_pad=True, scale=True)
        self.emb_reg = embedding(nquadkey, reg_dim, zeros_pad=True, scale=True)
        self.emb_time = embedding(ntime, time_dim, zeros_pad=True, scale=True)
        ninp = user_dim
        pos_encoding = extra_config.get("position_encoding", "transformer")
        if pos_encoding == "embedding":
            self.pos_encoder = PositionalEmbedding(loc_dim + reg_dim, dropout)
        elif pos_encoding == "transformer":
            self.pos_encoder = PositionalEncoding(loc_dim + reg_dim, dropout)
        self.enc_layer = TransformerEncoderLayer(loc_dim + reg_dim, nhead_enc, loc_dim + reg_dim, dropout)
        self.encoder = TransformerEncoder(self.enc_layer, nlayers)
        self.region_pos_encoder = PositionalEmbedding(reg_dim, dropout, max_len=20)
        self.region_enc_layer = TransformerEncoderLayer(reg_dim, 1, reg_dim, dropout=dropout)
        self.region_encoder = TransformerEncoder(self.region_enc_layer, 2)
        if not extra_config.get("use_location_only", False):
            if extra_config.get("embedding_fusion", "multiply") == "concat":
                if extra_config.get("user_embedding", False):
                    self.lin = nn.Linear(user_dim + loc_dim + reg_dim + time_dim, ninp)
                else:
                    self.lin = nn.Linear(loc_dim + reg_dim, ninp)

        ident_mat = torch.eye(ninp)
        self.register_buffer('ident_mat', ident_mat)

        self.layer_norm = nn.LayerNorm(ninp)
        self.extra_config = extra_config
        self.dropout = dropout
Example #2
    def __init__(self, nuser, nloc, ntime, nreg, user_dim, loc_dim, time_dim, reg_dim, nhid, nhead_enc, nhead_dec, nlayers, dropout=0.5, **extra_config):
        super(LocPredictor, self).__init__()
        self.emb_user = embedding(nuser, user_dim, zeros_pad=True, scale=True)
        self.emb_loc = embedding(nloc, loc_dim, zeros_pad=True, scale=True)
        self.emb_reg = embedding(nreg, reg_dim, zeros_pad=True, scale=True)
        self.emb_time = embedding(ntime, time_dim, zeros_pad=True, scale=True)
        if not ((user_dim == loc_dim) and (user_dim == time_dim) and (user_dim == reg_dim)):
            raise Exception('user, location, time and region should have the same embedding size')
        ninp = user_dim
        pos_encoding = extra_config.get("position_encoding", "transformer")
        if pos_encoding == "embedding":
            self.pos_encoder = PositionalEmbedding(ninp, dropout)
        elif pos_encoding == "transformer":
            self.pos_encoder = PositionalEncoding(ninp, dropout)
        self.enc_layer = TransformerEncoderLayer(ninp, nhead_enc, nhid, dropout)
        self.encoder = TransformerEncoder(self.enc_layer, nlayers)
        if not extra_config.get("use_location_only", False):
            if extra_config.get("embedding_fusion", "multiply") == "concat":
                if extra_config.get("user_embedding", False):
                    self.lin = nn.Linear(user_dim + loc_dim + reg_dim + time_dim, ninp)
                else:
                    self.lin = nn.Linear(loc_dim + reg_dim + time_dim, ninp)

        ident_mat = torch.eye(ninp)
        self.register_buffer('ident_mat', ident_mat)

        self.layer_norm = nn.LayerNorm(ninp)
        self.extra_config = extra_config
        self.dropout = dropout
Example #3
    def build_embedding_layer(self, inputs, reuse=None):
        self.emb_char = embedding(inputs,
                                  vocab_size=self.vocab_size,
                                  num_units=self.hidden_units,
                                  scale=True,
                                  scope="emb_char",
                                  reuse=reuse)
        self.emb_char_pos = self.emb_char
        if self.emb_pos_type == 'sin':
            self.emb_char_pos += positional_encoding(inputs,
                                                     num_units=self.hidden_units,
                                                     zero_pad=False,
                                                     scale=False,
                                                     scope="emb_pos",
                                                     reuse=reuse)
        else:
            self.emb_char_pos += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(inputs)[1]), 0), [tf.shape(inputs)[0], 1]),
                                           vocab_size=self.maxlen,
                                           num_units=self.hidden_units,
                                           zero_pad=False,
                                           scale=False,
                                           scope="emb_pos",
                                           reuse=reuse)

        self.emb = tf.layers.dropout(self.emb_char_pos, rate=self.dropout,)

        return self.emb
Example #4
 def train(self):
     self.text, self.refer_mel, self.mel, self.linear = get_next_batch()
     self.encoder_inputs = embedding(self.text, scope='embedding', reuse=self.reuse)
     self.decoder_inputs = tf.concat((tf.zeros_like(self.mel[:, :1, :]), self.mel[:, :-1, :]), 1)
     self.decoder_inputs = self.decoder_inputs[:, :, -hp.N_MELS:]
     with tf.variable_scope(self.scope_name):
         self.text_outputs = encoder(self.encoder_inputs, is_training=self.is_training)
         self.vae_outputs, self.mu, self.log_var = vae(self.refer_mel, is_training=self.is_training)
         self.encoder_outputs = self.text_outputs + self.vae_outputs
         self.mel_hat, self.alignments = decoder(self.decoder_inputs,
                                                self.encoder_outputs,
                                                is_training=self.is_training)
         self.linear_hat = postnet(self.mel_hat, is_training=self.is_training)
     if self.mode in ['train', 'eval']:
         self.global_step = tf.get_variable('global_step', initializer=0, dtype=tf.int32, trainable=False)
         self.lr = tf.train.exponential_decay(learning_rate=hp.LR, global_step=self.global_step,
                                              decay_steps=hp.DECAY_STEPS,
                                              decay_rate=hp.DECAY_RATE)
         self.optimizer = tf.train.AdamOptimizer(self.lr)
         self.mel_loss = tf.reduce_mean(tf.abs(self.mel_hat - self.mel))
         self.linear_loss = tf.reduce_mean(tf.abs(self.linear_hat - self.linear))
         self.kl_loss = - 0.5 * tf.reduce_sum(1 + self.log_var - tf.pow(self.mu, 2) - tf.exp(self.log_var))
         self.vae_loss_weight = control_weight(self.global_step)
         self.loss = self.mel_loss + self.linear_loss + self.vae_loss_weight * self.kl_loss
         self.train_op = self.optimizer.minimize(self.loss, global_step=self.global_step)
Example #5
    def build_model(self):
        # define decoder inputs
        self.decoder_inputs = tf.concat(
            (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)  # 2:<S>

        # Encoder
        with tf.variable_scope("encoder"):
            ## Embedding
            self.enc = embedding(self.x,
                                 vocab_size=len(self.de2idx),
                                 num_units=hp.emb_dim,
                                 scale=True,
                                 scope="enc_embed")
            sign = tf.sign(tf.reduce_sum(tf.abs(self.enc), axis=-1))
            key_masks = tf.expand_dims(sign, -1)

            ## Positional Encoding
            if hp.sinusoid:
                self.enc += positional_encoding(self.x,
                                                num_units=hp.emb_dim,
                                                zero_pad=False,
                                                scale=False,
                                                scope="enc_pe")
            else:
                self.enc += embedding(tf.tile(
                    tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                    [tf.shape(self.x)[0], 1]),
                                      vocab_size=hp.maxlen,
                                      num_units=hp.emb_dim,
                                      zero_pad=False,
                                      scale=False,
                                      scope="enc_pe")

            self.enc *= key_masks

            ## Dropout
            self.enc = tf.layers.dropout(self.enc,
                                         rate=hp.dropout_rate,
                                         training=tf.convert_to_tensor(
                                             self.is_training))

            ## Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ### Multihead Attention
                    self.enc = multihead_attention(
                        queries=self.enc,
                        keys=self.enc,
                        num_units=hp.emb_dim,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=self.is_training,
                        causality=False)

                    ### Feed Forward
                    self.enc = feedforward(
                        self.enc, num_units=[4 * hp.emb_dim, hp.emb_dim])

        # Decoder
        with tf.variable_scope("decoder"):
            ## Embedding
            self.dec = embedding(self.decoder_inputs,
                                 vocab_size=len(self.en2idx),
                                 num_units=hp.emb_dim,
                                 scale=True,
                                 scope="dec_embed")

            key_masks = tf.expand_dims(
                tf.sign(tf.reduce_sum(tf.abs(self.dec), axis=-1)), -1)

            ## Positional Encoding
            if hp.sinusoid:
                self.dec += positional_encoding(self.decoder_inputs,
                                                num_units=hp.emb_dim,
                                                zero_pad=False,
                                                scale=False,
                                                scope="dec_pe")
            else:
                self.dec += embedding(tf.tile(
                    tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]),
                                   0), [tf.shape(self.decoder_inputs)[0], 1]),
                                      vocab_size=hp.maxlen,
                                      num_units=hp.emb_dim,
                                      zero_pad=False,
                                      scale=False,
                                      scope="dec_pe")
            self.dec *= key_masks

            ## Dropout
            self.dec = tf.layers.dropout(self.dec,
                                         rate=hp.dropout_rate,
                                         training=tf.convert_to_tensor(
                                             self.is_training))

            ## Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ## Multihead Attention ( self-attention)
                    self.dec = multihead_attention(
                        queries=self.dec,
                        keys=self.dec,
                        num_units=hp.emb_dim,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=self.is_training,
                        causality=True,
                        scope="self_attention")

                    ## Multihead Attention ( vanilla attention)
                    self.dec = multihead_attention(
                        queries=self.dec,
                        keys=self.enc,
                        num_units=hp.emb_dim,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=self.is_training,
                        causality=False,
                        scope="vanilla_attention")

                    ## Feed Forward
                    self.dec = feedforward(
                        self.dec, num_units=[4 * hp.emb_dim, hp.emb_dim])

        # Final linear projection
        self.logits = tf.layers.dense(self.dec, len(self.en2idx))
        self.preds = tf.to_int32(tf.argmax(self.logits, dimension=-1))
Example #6
    def __init__(self, is_training=True):
        self.graph = tf.Graph()

        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data()
            else:
                self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

            # define decoder inputs
            # id 2 stands for <S>, the initial decoder input. This step shifts the target y right by one position,
            # e.g. y = [["i", "love", "china", "deeply"], ["can", "you", "speak", "chinese"]] becomes
            # [["<s>", "i", "love", "china"], ["<s>", "can", "you", "speak"]], which is what the decoder's self-attention sees first.
            # During training decoder_inputs is built as above; at inference the true y is unknown, so an all-zero tensor of
            # shape [batch_size, max_length] is fed in, which after this step looks like [["<s>", 0, 0, 0]]. At each step the
            # first prediction is taken, fed back in, then the first two, and so on.
            self.decoder_inputs = tf.concat(
                (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)

            de2idx, idx2de = load_de_vocab()
            en2idx, idx2en = load_en_vocab()

            with tf.variable_scope("encoder"):
                # Embedding
                self.enc = embedding(
                    self.x,
                    vocab_size=len(de2idx),
                    num_units=hp.hidden_units,
                    zero_pad=True,  # the row for id 0 is the padding embedding; True zeroes that row out (random init may not be 0)
                    scale=True,
                    scope="enc_embed")

                ## Positional Encoding
                if hp.sinusoid:
                    self.enc += positional_encoding(self.x,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope='enc_pe')

                else:
                    self.enc += embedding(tf.tile(
                        tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                        [tf.shape(self.x)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="enc_pe")

                ## Dropout
                self.enc = tf.layers.dropout(
                    self.enc,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                ## Blocks: stack hp.num_blocks (6) identical encoder blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ### MultiHead Attention
                        self.enc = multihead_attention(
                            queries=self.enc,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False)
                        self.enc = feedforward(
                            self.enc,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            with tf.variable_scope("decoder"):
                # Embedding
                self.dec = embedding(self.decoder_inputs,
                                     vocab_size=len(en2idx),
                                     num_units=hp.hidden_units,
                                     scale=True,
                                     scope="dec_embed")

                # Positional Encoding
                if hp.sinusoid:
                    self.dec += positional_encoding(self.decoder_inputs,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope="dec_pe")
                else:
                    self.dec += embedding(tf.tile(
                        tf.expand_dims(
                            tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                        [tf.shape(self.decoder_inputs)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="dec_pe")

                # Dropout
                self.dec = tf.layers.dropout(
                    self.dec,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                # Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ## Multihead Attention ( self-attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.dec,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=True,
                            scope="self_attention")

                        ## Multihead Attention ( vanilla attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False,
                            scope="vanilla_attention")

                        ## Feed Forward
                        self.dec = feedforward(
                            self.dec,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            # Final linear projection: a classification layer whose output size is the target vocabulary size
            self.logits = tf.layers.dense(self.dec, len(en2idx))
            self.preds = tf.to_int32(tf.argmax(self.logits, dimension=-1))
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))
            self.acc = tf.reduce_sum(
                tf.to_float(tf.equal(self.preds, self.y)) * self.istarget /
                (tf.reduce_sum(self.istarget)))

            if is_training:
                # Loss
                # Label smoothing: turn the 0s of the one-hot targets into a small positive value and the 1s into a value slightly below 1.
                self.y_smoothed = label_smoothing(
                    tf.one_hot(self.y, depth=len(en2idx)))
                self.loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.y_smoothed)
                self.mean_loss = tf.reduce_sum(
                    self.loss * self.istarget) / (tf.reduce_sum(self.istarget))

                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9,
                                                        beta2=0.98,
                                                        epsilon=1e-8)
                self.train_op = self.optimizer.minimize(
                    self.mean_loss, global_step=self.global_step)

                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()
Example #7
 def __init__(self, nloc, loc_dim, num_layers=1, dropout=0.0):
     super(GRU4Rec, self).__init__()
     self.emb_loc = embedding(nloc, loc_dim, zeros_pad=True, scale=True)
     self.encoder = torch.nn.GRU(input_size=loc_dim, hidden_size=loc_dim, num_layers=num_layers, dropout=dropout)
     self.h_0 = nn.Parameter(torch.randn((num_layers, 1, loc_dim), requires_grad=True))
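The example above only shows the constructor. A minimal forward-pass sketch follows; it is an assumption about the intended usage (loc is taken to be a (seq_len, batch) tensor of location ids, and emb_loc is assumed to behave like nn.Embedding), not part of the original model.

 def forward(self, loc):
     # look up location embeddings: (seq_len, batch, loc_dim)
     loc_emb = self.emb_loc(loc)
     # broadcast the learned initial hidden state over the batch dimension
     h_0 = self.h_0.expand(-1, loc.size(1), -1).contiguous()
     # run the GRU over the visit sequence
     output, _ = self.encoder(loc_emb, h_0)
     return output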
Example #8
 def __init__(self, d_model, dropout=0.1, max_len=120):
     super(PositionalEmbedding, self).__init__()
     self.pos_emb_table = embedding(max_len, d_model, zeros_pad=False, scale=False)
     pos_vector = torch.arange(max_len)
     self.dropout = nn.Dropout(p=dropout)
     self.register_buffer('pos_vector', pos_vector)
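The example stops at the constructor. A minimal forward sketch follows as an assumption, mirroring how sinusoidal positional-encoding modules are usually applied (x is taken to be a (seq_len, batch, d_model) tensor, and pos_emb_table is assumed to behave like nn.Embedding).

 def forward(self, x):
     # take the first seq_len position ids and add a broadcast dimension: (seq_len, 1)
     pos = self.pos_vector[:x.size(0)].unsqueeze(1)
     # look up position embeddings and broadcast them over the batch dimension
     x = x + self.pos_emb_table(pos)
     return self.dropout(x)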
Example #9
    def single_model(self, gpu_id):
        if self.mode == "train" or self.mode == "eval":
            inputs_transcript = self.inputs_transcript[gpu_id]
            inputs_reference = self.inputs_reference[gpu_id]
            inputs_ref_lens = self.inputs_ref_lens[gpu_id]
            inputs_speaker = self.inputs_speaker[gpu_id]
            inputs_decoder = self.inputs_decoder[gpu_id]

        training = True if self.mode == "train" else False

        # Encoder
        # transcript encoder
        text = modules.transcript_encoder(
            inputs=inputs_transcript,
            embed_size=Hp.charac_embed_size,
            K=Hp.num_encoder_banks,
            highway_layers=Hp.num_enc_highway_layers,
            training=training)  # outputs: [Batch_size, Text length, 256]

        text = tf.identity(text, name="text_enc")

        # reference encoder
        if self.mode == "train":
            batch_size = Hp.train_batch_size // Hp.num_gpus
        elif self.mode == "eval":
            batch_size = Hp.eval_batch_size // Hp.num_gpus
        else:
            batch_size = Hp.synthes_batch_size // Hp.num_gpus

        inputs_reference_reshape = tf.reshape(inputs_reference,
                                              [batch_size, -1, Hp.num_mels])
        # expand inputs_reference from rank 3 [batch, Ty, n_mels] to rank 4 [batch, Ty, n_mels, 1] for conv2d
        inputs_reference_reshape = tf.expand_dims(inputs_reference_reshape, -1)

        prosody = modules.reference_encoder(inputs=inputs_reference_reshape,
                                            training=training)  #[batch, 128]
        prosody = tf.expand_dims(prosody, 1)  #[batch, 1 ,128]

        #[batch, Tx, 128] replicate prosody for all Tx steps
        prosody = tf.tile(prosody, [1, Hp.num_charac, 1], name="prosody_enc")

        # speaker
        speaker = modules.embedding(
            inputs=inputs_speaker,
            charac_size=Hp.num_speakers,
            embed_size=Hp.speaker_embed_size,
            scope="speaker")  # [batch, 1, speaker_embed_size] [32,1,16]

        speaker = tf.tile(speaker, [1, Hp.num_charac, 1], name="speaker_embed")

        memory = tf.concat([text, prosody, speaker], axis=-1,
                           name="memory")  # [batch, Tx, Dt+Ds+Dp ]
        #self.memory.append(memory)

        # Spectrogram decoder
        # prepend a zero frame and drop the last frame of the original mel-spectrogram, since it is not fed to the decoder
        if self.mode == "train":
            inputs_decoder = tf.concat((tf.zeros_like(
                inputs_decoder[:, :1, :]), inputs_decoder[:, :-1, :]),
                                       1)  #[batch, Ty/r, num_mels*r]

        mel_hat, alignments = modules.attention_gru_decoder(
            inputs=inputs_decoder,
            inputs_lengths=inputs_ref_lens,
            memory=memory,
            attention_rnn_nodes=Hp.num_attention_nodes,
            decoder_rnn_nodes=Hp.num_decoder_nodes,
            num_mels=Hp.num_mels,
            reduction_factor=Hp.reduction_factor,
            max_iters=self.max_len_per_batch,
            training=training)  #[batch, Ty/r, num_mels*r]

        alignments = tf.identity(alignments, name="alignments")
        mel_hat = tf.identity(mel_hat, name="melspectrogrom_pred")

        mag_hat = modules.cbhg_postprocessing(
            inputs=mel_hat,
            num_mels=Hp.num_mels,
            num_fft=Hp.num_fft,
            K=Hp.num_post_banks,
            highway_layers=Hp.num_post_highway_layers,
            training=training)  # [batch, Ty, 1+n_fft//2]

        mag_hat = tf.identity(mag_hat, name="magnitude_pred")

        #wavform = tf.py_func(signal_process.Spectrogrom2Wav, [mag_hat[0]], tf.float32, name = "wavform")  # generate a sample to listen to

        return mel_hat, alignments, mag_hat
Example #10
    def __init__(self, is_training=True):
        self.graph = tf.Graph()

        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data()
            else:
                # x: (32, 10), y: (32, 10): one batch of 32 sentences, each of length 10
                self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
            """
            定义decoder部分的input
            
             假设真实翻译后的输出为 i am a student </S>
             
             decoder部分的input应为: <S> i am a student
            """
            self.decoder_inputs = tf.concat(
                (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]),
                -1)  # id 2 stands for <S>, the initial decoder input

            # vocabularies
            de2idx, idx2de = load_de_vocab()
            en2idx, idx2en = load_en_vocab()

            with tf.variable_scope("encoder"):
                # Embedding
                self.enc = embedding(
                    self.x,
                    vocab_size=len(de2idx),
                    num_units=hp.hidden_units,
                    zero_pad=True,  # keep the padding row at zero
                    scale=True,
                    scope="enc_embed")

                ## Positional Encoding
                if hp.sinusoid:
                    self.enc += positional_encoding(self.x,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope='enc_pe')

                else:
                    self.enc += embedding(tf.tile(
                        tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                        [tf.shape(self.x)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="enc_pe")

                ## Dropout
                self.enc = tf.layers.dropout(
                    self.enc,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ### MultiHead Attention
                        self.enc = multihead_attention(
                            queries=self.enc,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False)
                        self.enc = feedforward(
                            self.enc,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            with tf.variable_scope("decoder"):
                # Embedding
                self.dec = embedding(self.decoder_inputs,
                                     vocab_size=len(en2idx),
                                     num_units=hp.hidden_units,
                                     scale=True,
                                     scope="dec_embed")

                ## Positional Encoding
                if hp.sinusoid:
                    self.dec += positional_encoding(self.decoder_inputs,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope="dec_pe")
                else:
                    self.dec += embedding(tf.tile(
                        tf.expand_dims(
                            tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                        [tf.shape(self.decoder_inputs)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="dec_pe")

                # Dropout
                self.dec = tf.layers.dropout(
                    self.dec,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ## Multihead Attention ( self-attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.dec,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=True,
                            scope="self_attention")

                        ## Multihead Attention ( vanilla attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False,
                            scope="vanilla_attention")

                        ## Feed Forward
                        self.dec = feedforward(
                            self.dec,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            # Final linear projection
            self.logits = tf.layers.dense(self.dec, len(en2idx))
            self.preds = tf.to_int32(tf.argmax(self.logits, dimension=-1))
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))
            self.acc = tf.reduce_sum(
                tf.to_float(tf.equal(self.preds, self.y)) * self.istarget /
                (tf.reduce_sum(self.istarget)))

            if is_training:
                # Loss
                # Label smoothing: turn the 0s of the one-hot targets into a small positive value and the 1s into a value slightly below 1.
                self.y_smoothed = label_smoothing(
                    tf.one_hot(self.y, depth=len(en2idx)))
                self.loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.y_smoothed)
                self.mean_loss = tf.reduce_sum(
                    self.loss * self.istarget) / (tf.reduce_sum(self.istarget))

                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9,
                                                        beta2=0.98,
                                                        epsilon=1e-8)
                self.train_op = self.optimizer.minimize(
                    self.mean_loss, global_step=self.global_step)

                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()
Example #11
    def build_model(self):
        # define decoder inputs
        self.decoder_inputs = tf.concat(
            (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)  # 2:<S>

        # Encoder
        with tf.variable_scope("encoder"):
            ## Embedding
            self.enc = embedding(self.x,
                                 vocab_size=len(self.de2idx),
                                 num_units=hp.emb_dim,
                                 scale=True,
                                 scope="enc_embed")
            sign = tf.sign(tf.reduce_sum(tf.abs(self.enc), axis=-1))
            key_masks = tf.expand_dims(sign, -1)

            ## Positional Encoding
            if hp.sinusoid:
                self.enc += positional_encoding(self.x,
                                                num_units=hp.emb_dim,
                                                zero_pad=False,
                                                scale=False,
                                                scope="enc_pe")
            else:
                cells = self.rnn_cell()
                encoder_output, _encoder_state = tf.nn.dynamic_rnn(
                    cells,
                    self.enc,
                    sequence_length=self.x_len,
                    dtype=tf.float32)
                self.enc = tf.concat([self.enc, encoder_output], axis=-1)
                self.enc = tf.layers.dense(self.enc,
                                           hp.emb_dim,
                                           activation="relu")

            self.enc *= key_masks

            ## Dropout
            self.enc = tf.layers.dropout(self.enc,
                                         rate=hp.dropout_rate,
                                         training=tf.convert_to_tensor(
                                             self.is_training))

            ## Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    pos_emb = tf.get_variable(
                        'enc_pos_emb',
                        dtype=tf.float32,
                        shape=[self.enc.shape[1]],
                        initializer=tf.contrib.layers.xavier_initializer())
                    ### Multihead Attention
                    self.enc = multihead_attention(
                        queries=self.enc,
                        keys=self.enc,
                        pos_emb=pos_emb,
                        num_units=hp.emb_dim,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=self.is_training,
                        causality=False)

                    ### Feed Forward
                    self.enc = feedforward(
                        self.enc, num_units=[4 * hp.emb_dim, hp.emb_dim])

        # Decoder
        with tf.variable_scope("decoder"):
            ## Embedding
            self.dec = embedding(self.decoder_inputs,
                                 vocab_size=len(self.en2idx),
                                 num_units=hp.emb_dim,
                                 scale=True,
                                 scope="dec_embed")

            key_masks = tf.expand_dims(
                tf.sign(tf.reduce_sum(tf.abs(self.dec), axis=-1)), -1)

            ## Positional Encoding
            if hp.sinusoid:
                self.dec += positional_encoding(self.decoder_inputs,
                                                num_units=hp.emb_dim,
                                                zero_pad=False,
                                                scale=False,
                                                scope="dec_pe")
            else:
                cells = self.rnn_cell()
                decoder_output, _decoder_state = tf.nn.dynamic_rnn(
                    cells,
                    self.dec,
                    sequence_length=self.y_len,
                    dtype=tf.float32)
                self.dec = tf.concat([self.dec, decoder_output], axis=-1)
                self.dec = tf.layers.dense(self.dec,
                                           hp.emb_dim,
                                           activation="relu")

            self.dec *= key_masks

            ## Dropout
            self.dec = tf.layers.dropout(self.dec,
                                         rate=hp.dropout_rate,
                                         training=tf.convert_to_tensor(
                                             self.is_training))

            ## Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    dec_dec_pos_emb = tf.get_variable(
                        'dec_de_pos_emb',
                        dtype=tf.float32,
                        shape=[self.dec.shape[1]],
                        initializer=tf.contrib.layers.xavier_initializer())
                    dec_enc_pos_emb = tf.get_variable(
                        'dec_enc_pos_emb',
                        dtype=tf.float32,
                        shape=[self.enc.shape[1]],
                        initializer=tf.contrib.layers.xavier_initializer())
                    ## Multihead Attention ( self-attention)
                    self.dec = multihead_attention(
                        queries=self.dec,
                        keys=self.dec,
                        pos_emb=dec_dec_pos_emb,
                        num_units=hp.emb_dim,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=self.is_training,
                        causality=True,
                        scope="self_attention")

                    ## Multihead Attention ( vanilla attention)
                    self.dec = multihead_attention(
                        queries=self.dec,
                        keys=self.enc,
                        pos_emb=dec_enc_pos_emb,
                        num_units=hp.emb_dim,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=self.is_training,
                        causality=False,
                        scope="vanilla_attention")

                    ## Feed Forward
                    self.dec = feedforward(
                        self.dec, num_units=[4 * hp.emb_dim, hp.emb_dim])

        # Final linear projection
        self.logits = tf.layers.dense(self.dec, len(self.en2idx))
        self.preds = tf.to_int32(tf.argmax(self.logits, dimension=-1))
Example #12
 def __init__(self):
     self.graph = tf.Graph()
     with self.graph.as_default():
         # placeholders for inputs and outputs
         B, N, M, C = param.batch_size, param.max_context_words, param.max_question_words, param.max_chars
         #get inputs and outputs
         self.x_c_w, self.x_c_c, self.x_q_w, self.x_q_c, self.y = my.get_batch_data()
         '''
         #can also use placeholders as below if needed:
         #input sequence of word vocabulary indices of the context
         self.x_c_w = tf.placeholder(tf.int32, shape=[B, N], name="context_words")
         #input sequence of char vocabulary indices (0 to 25) of the words of the context
         self.x_c_c = tf.placeholder(tf.int32, shape=[B, N, C], name="context_word_chars")
         #input sequence of question vocabulary indices of the context
         self.x_q_w =  tf.placeholder(tf.int32, shape=[B, M], name="question_words")
         #input sequence of char vocabulary indices (0 to 25) of the words of the question
         self.x_q_c = tf.placeholder(tf.int32, shape=[B, M, C], name="context_question_chars")
         #output as a one hot encoding of the start position and end position indices over the context words
         self.y = tf.placeholder(tf.int32, shape=[B, N, 2], name="out")
         '''
                    
                    
                    
         '''          
         part1: an embedding layer
         '''
         VW, VC, DW, DC = param.word_vocab_size, param.char_vocab_size, param.word_emb_dim, param.char_emb_dim 
         #compute word embeddings of the context words through 300 dimensional GloVe embedding
         self.x_c_w_emb = my.embedding(inputs=self.x_c_w, shape=[VW, DW], scope="word_embedding", reuse=None)
         #compute word embeddings of the question words through 300 dimensional GloVe embedding
         self.x_q_w_emb = my.embedding(inputs=self.x_q_w, scope="word_embedding", reuse=True)
         #compute through character embeddings of the context words
         self.x_c_c_emb = my.embedding(inputs=self.x_c_c, shape=[VC, DC], scope="char_embedding", reuse=None)
         #compute character embeddings of the question words
         self.x_q_c_emb = my.embedding(inputs=self.x_q_c, scope="char_embedding", reuse=True)
         
         #max pooling over character embeddings to get fixed size embedding of each word
         self.x_c_c_emb = tf.reduce_max(self.x_c_c_emb, reduction_indices=[2])
         #concatenate GloVe embedding with character embedding
         self.x_c_emb = tf.concat(values=[self.x_c_w_emb, self.x_c_c_emb], axis=2, name="x_context_emb")
         #max pooling over character embeddings to get fixed size embedding of each word
         self.x_q_c_emb = tf.reduce_max(self.x_q_c_emb, reduction_indices=[2])
         #concatenate GloVe embedding with character embedding
         self.x_q_emb = tf.concat(values=[self.x_q_w_emb, self.x_q_c_emb], axis=2, name="x_question_emb")            
         
         #apply a highway network of 2 layers on top of computed embedding
         self.x_c_emb = my.highway_network(inputs=self.x_c_emb, num_layers=param.highway_num_layers, use_bias=True, transform_bias=-1.0, scope='highway_net', reuse=None)
         self.x_q_emb = my.highway_network(inputs=self.x_q_emb, num_layers=param.highway_num_layers, use_bias=True, transform_bias=-1.0, scope='highway_net',  reuse=True)            
         
         
         '''
         part2: an embedding encoder layer
         '''
         #single encoder block: convolution_layer X # + self_attention_layer + feed_forward_layer
         #apply 1 encoder stack of 1 encoder block on context embedding
         self.x_c_enc = my.encoder_block(inputs=self.x_c_emb, num_conv_layer=4, filters=128, kernel_size=7, num_att_head=8, scope='encoder_block', reuse=None)
         #apply 1 encoder stack of 1 encoder block on question embedding
         self.x_q_enc = my.encoder_block(inputs=self.x_q_emb, num_conv_layer=4, filters=128, kernel_size=7, num_att_head=8, scope='encoder_block', reuse=True)
         
         
         '''           
         part3: a context-query attention layer
         '''
         #apply a context-query attention layer to compute context-to-query attention and query-to-context attention
         self.att_a, self.att_b = my.context_query_attention(context=self.x_c_enc, query=self.x_q_enc, scope='context_query_att', reuse=None)
         
         
         '''
         part4: a model encoder layer
         ''' 
         #apply 3 encoder stacks of 7 encoder blocks            
         #prepare input as [c, a, c dot a, c dot b] where a and b are rows of attention matrix A (att_a) and B (att_b)
         #computing c dot a
         self.c_mult_att_a = tf.multiply(self.x_c_enc, self.att_a)
         #computing c dot b
         self.c_mult_att_b = tf.multiply(self.x_c_enc, self.att_b)
         #computing [c, a, c dot a, c dot b] 
         #NOTE: there is an ambiguity here. Since the encoder blocks share weights, the input dimension of every block must stay the same.
         #However, the stated first input is a concatenation of four 128-dimensional hidden states [c, a, c dot a, c dot b] (512 dims),
         #while the blocks above the first receive 128-dimensional inputs, because the first block's 1D convolution maps 512 to 128.
         #To avoid this mismatch, an average over (c, a, c dot a, c dot b) is used here instead of a concatenation.
         #compute average of [c, a, c dot a, c dot b] tensors 
         #dimension=[B, N, d] ([batch_size, max_words_context, hidden_dimension=128])
         self.model_enc = tf.reduce_mean(tf.concat([tf.expand_dims(self.x_c_enc, 2), tf.expand_dims(self.att_a, 2), tf.expand_dims(self.c_mult_att_a, 2), tf.expand_dims(self.c_mult_att_b, 2)], axis=2), axis=2, name="model_enc_inp")            
         #for each encoder stack
         for i in range(3):     
             #for each encoder block within each stack           
             for j in range(7):
                 #blocks in the first stack use reuse=None so new weight tensors are created for each block scope
                 if (i == 0):
                     self.model_enc = my.encoder_block(inputs=self.model_enc, num_conv_layer=2, filters=128, kernel_size=5, num_att_head=8, scope='model_enc_block_{}'.format(j), reuse=None)
                 #blocks in the later stacks use reuse=True, since the three stacks share weights block by block
                 else:
                     self.model_enc = my.encoder_block(inputs=self.model_enc, num_conv_layer=2, filters=128, kernel_size=5, num_att_head=8, scope='model_enc_block_{}'.format(j), reuse=True)
             #after completion of the first encoder stack (7 blocks run), store its output as M0
             if i == 0:
                 #using tf.identity to copy a tensor
                 self.out_m0 = tf.identity(self.model_enc)
             #after completion of the second encoder stack (14 blocks run), store its output as M1
             elif i == 1:
                 self.out_m1 = tf.identity(self.model_enc)
             #after completion of the third encoder stack (21 blocks run), store its output as M2
             else:
                 self.out_m2 = tf.identity(self.model_enc)
                 
                 
         '''        
         part5: an output layer      
         '''
         #feature vector for position 1 is [M0;M1]
         self.inp_pos1 = tf.concat((self.out_m0, self.out_m1), axis=2)
         #feature vector for position 2 is [M0;M2]
         self.inp_pos2 = tf.concat((self.out_m0, self.out_m2), axis=2)              
         #compute softmax probability scores over context word positions for being position 1 (softmax over the word axis, not the singleton last axis)
         self.pos1 = tf.nn.softmax(tf.layers.dense(self.inp_pos1, 1, activation=tf.tanh, name='dense_pos1'), axis=1)
         #compute softmax probability scores over context word positions for being position 2
         self.pos2 = tf.nn.softmax(tf.layers.dense(self.inp_pos2, 1, activation=tf.tanh, name='dense_pos2'), axis=1)
         #concatenate both prediction vectors
         #dimensions=[B, N, 2] ([batch_size, max_context_words, 2])
         self.pred = tf.concat((self.pos1, self.pos2), axis = -1)
         #loss = -mean(log(p1) + log(p2)) = mean(-log(p1*p2))
         self.loss = tf.reduce_mean(-tf.log(tf.reduce_prod(tf.reduce_sum(self.pred * tf.cast(self.y, 'float'), 1), 1) + param.epsilon_1))            
         
         #training scheme
         self.global_step = tf.Variable(0, name='global_step', trainable=False)
         #using ADAM optimizer with beta1=0.8, beta2=0.999 and epsilon=1e-7
         self.optimizer = tf.train.AdamOptimizer(learning_rate=param.lr, beta1=param.beta1, beta2=param.beta2, epsilon=param.epsilon_2)
         self.train_op = self.optimizer.minimize(self.loss, global_step=self.global_step)
         #loss summary
         tf.summary.scalar('loss', self.loss)
         self.merged = tf.summary.merge_all()
Example #13
    def build_graph(self):

        # Define input
        with tf.name_scope("input_ph"):
            self.X_ind = tf.placeholder(dtype=tf.int32, 
                                        shape=[None, self.field_size], 
                                        name="X_index")
            self.label = tf.placeholder(dtype=tf.float32,
                                        shape=[None],
                                        name="label")
            self.is_training = tf.placeholder(dtype=tf.bool, 
                                              shape=(), 
                                              name="is_training")

        # lookup and process embedding
        with tf.name_scope("embedding"):
            self.emb = embedding(inputs=self.X_ind,
                                 vocab_size=self.feat_size,
                                 num_units=self.embedding_dim,
                                 scale=self.scale_embedding,
                                 scope="embedding_process")

        # self.emb keeps the raw embedding; features is the working copy used by the layers below
        features = self.emb

        with tf.name_scope("Multilayer_attn"):
            with tf.variable_scope("attention_head") as scope:
                features, _ = multihead_attention(
                    queries=features,
                    keys=features,
                    num_units=self.attention_size*self.num_head,
                    num_heads=self.num_head,
                    dropout_rate=self.dropout_rate,
                    is_training=self.is_training,
                    scope="multihead_attention"
                )

                features = feedforward(
                    inputs=features,
                    num_units=[4 * self.embedding_dim,
                               self.embedding_dim],
                    scope="feed_forward"
                )  # [N, T, dim]

        # multi-head feature to agg 1st order feature
        with tf.name_scope("Agg_first_order") as scope:
            ctx_order_1 = tf.get_variable(
                name="context_order_1",
                shape=(self.attention_size),
                dtype=tf.float32)

            agg_feat_1, self.attn_1 = agg_attention(
                query=ctx_order_1,
                keys=features,
                values=features,
                attention_size=self.attention_size,
                regularize_scale=self.regularization_weight
                )  # [N, dim]

        # build second order cross
        with tf.name_scope("Second_order") as scope:
            feat_2 = tf.multiply(
                features,
                tf.expand_dims(agg_feat_1, axis=1)
                )  # [N, T, dim]

            feat_2 += features  # Add the residual, [N, T, dim]

            ctx_order_2 = tf.get_variable(
                name="context_order_2",
                shape=(self.attention_size),
                dtype=tf.float32
                )

            agg_feat_2, self.attn_2 = agg_attention(
                query=ctx_order_2,
                keys=feat_2,
                values=feat_2,
                attention_size=self.attention_size,
                regularize_scale=self.regularization_weight
                )

        # build third order cross
        with tf.name_scope("Third_order") as scope:
            feat_3 = tf.multiply(
                features,
                tf.expand_dims(agg_feat_2, axis=1)
                )  # [N, T, dim]

            feat_3 += feat_2  # Add the residual, [N, T, dim]

            ctx_order_3 = tf.get_variable(
                name="context_order_3",
                shape=(self.attention_size),
                dtype=tf.float32
                )

            agg_feat_3, self.attn_3 = agg_attention(
                query=ctx_order_3,
                keys=feat_3,
                values=feat_3,
                attention_size=self.attention_size,
                regularize_scale=self.regularization_weight
                )

        with tf.name_scope("Merged_features"):

            # concatenate [enc, second_cross, third_cross]
            # TODO: can + multihead_features
            all_features = tf.stack([
                agg_feat_1,
                agg_feat_2,
                agg_feat_3,
                ],
                axis=1, name="concat_feature")  # (N, k, C)

        # map C to pool_filter_size dimension
        mapped_all_feature = tf.layers.conv1d(
            inputs=all_features,
            filters=self.pool_filter_size,
            kernel_size=1,
            use_bias=True,
            name="Mapped_all_feature"
        )  # (N, k, pf_size)
        
        # apply context vector
        feature_weights = tf.nn.softmax(
            tf.squeeze(
                tf.layers.dense(
                    mapped_all_feature,
                    units=1,
                    activation=None,
                    use_bias=False
                ),  # (N, k, 1),
                [2]
            ), # (N, k)
        )  # (N, k)

        self.attn_k = feature_weights
        
        # weighted sum
        weighted_sum_feat = tf.reduce_sum(
            tf.multiply(
                all_features,
                tf.expand_dims(feature_weights, axis=2),
            ),  # (N, k, C)
            axis=[1],
            name="Attn_weighted_sum_feature"
        )  # (N, C)
        
        # last non-linear
        hidden_logits = tf.layers.dense(
            weighted_sum_feat,
            units=self.embedding_dim // 2,
            activation=tf.nn.relu,
            use_bias=False,
            name="HiddenLogits"
        )  # (N, C/2)

        # the last dense for logits
        logits = tf.squeeze(
            tf.layers.dense(
                hidden_logits,
                units=1,
                activation=None,
                use_bias=False,
                name="Logits"
            ),  # (N, 1)
            axis=[1]
        )  # (N,)

        # sigmoid logits
        self.sigmoid_logits = tf.nn.sigmoid(logits)

        # regularization term
        self.regularization_loss = tf.losses.get_regularization_loss()

        self.logloss = tf.reduce_sum(
            tf.nn.sigmoid_cross_entropy_with_logits(
                labels=tf.expand_dims(self.label, -1),
                logits=tf.expand_dims(logits, -1),
                name="SumLogLoss"))

        self.mean_logloss = tf.divide(
            self.logloss,
            tf.to_float(self.batch_size),
            name="MeanLogLoss"
            )

        # overall loss
        self.overall_loss = tf.add(
            self.mean_logloss,
            self.regularization_loss,
            name="OverallLoss"
        )
        
        tf.summary.scalar("Mean_LogLoss", self.mean_logloss)
        tf.summary.scalar("Reg_Loss", self.regularization_loss)
        tf.summary.scalar("Overall_Loss", self.overall_loss)

        self.train_op = self.optimizer.minimize(self.overall_loss, 
                                                global_step=self.global_step)
        self.merged = tf.summary.merge_all()
Example #14
    def __init__(self):
        self.graph = tf.Graph()
        self.tensor_info = {}

        self.build_inputs()

        with self.graph.as_default():
            self.saver = tf.train.Saver(max_to_keep=1)

            #dien
            with tf.name_scope('rnn_1'):
                rnn_outputs, _ = dynamic_rnn(GRUCell(HIDDEN_SIZE),
                                             inputs=self.item_his_eb,
                                             sequence_length=self.seq_len_ph,
                                             dtype=tf.float32,
                                             scope="gru1")
            with tf.name_scope('Attention_layer_1'):
                att_outputs, alphas = din_fcn_attention(self.item_eb,
                                                        rnn_outputs,
                                                        ATTENTION_SIZE,
                                                        self.mask_ph,
                                                        softmax_stag=1,
                                                        stag='1_1',
                                                        mode='LIST',
                                                        return_alphas=True)
            with tf.name_scope('rnn_2'):
                rnn_outputs2, final_state2 = dynamic_rnn(
                    VecAttGRUCell(HIDDEN_SIZE),
                    inputs=rnn_outputs,
                    att_scores=tf.expand_dims(alphas, -1),
                    sequence_length=self.seq_len_ph,
                    dtype=tf.float32,
                    scope="gru2")

            #dsin
            #with tf.name_scope("Self_Attention_layer"):

            hidden_units = 512
            num_blocks = 6
            num_heads = 8
            dropout_rate = 0.1

            with tf.variable_scope("encoder"):
                # Embedding
                self.enc = embedding(
                    self.recent_behavior_ph,
                    vocab_size=USER_API_SUM,  #   len(de2idx), 200
                    num_units=hidden_units,  #128
                    zero_pad=True,  # keep the padding index embedded as all zeros
                    scale=True,
                    scope="enc_embed")
                #self.enc = self.user_api_all_eb
                #FLAGS.batch_size,USER_API_LEN
                # dynamic shape of the input batch: (FLAGS.batch_size, USER_API_LEN)
                batch = tf.shape(self.recent_behavior_ph)
                self.enc += tf.cast(
                    positional_encoding(N=tf.shape(self.recent_behavior_ph)[0],
                                        T=USER_API_LEN,
                                        num_units=hidden_units,
                                        zero_pad=False,
                                        scale=False,
                                        scope='enc_pe'), tf.float32)

                ##Drop out
                #self.enc = tf.layers.dropout(self.enc,rate = dropout_rate,
                #                             training = tf.convert_to_tensor(is_training))

                ## Blocks
                for i in range(num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ### Multi-head attention, shape [128, 10, 512] stays unchanged
                        self.enc = multihead_attention(
                            queries=self.enc,
                            keys=self.enc,
                            num_units=hidden_units,
                            num_heads=num_heads,
                            dropout_rate=dropout_rate,
                            #is_training = is_training,
                            causality=False)
                        self.enc = feedforward(
                            self.enc,
                            num_units=[4 * hidden_units, hidden_units])

            # Final linear projection
            #self.logits = tf.layers.dense(self.dec,USER_API_LEN*3))
            # print(self.enc.get_shape().as_list())
            # print(tf.shape(self.enc))
            self.user_api_eb_sum = tf.reduce_sum(self.enc, -2)

            inp = tf.concat([
                self.item_eb, self.item_his_eb_sum, self.item_eb *
                self.item_his_eb_sum, final_state2, self.mobile_embedded,
                self.province_embedded, self.city_embedded,
                self.grade_embedded, self.chinese_embedded, self.math_embedded,
                self.english_embedded, self.purchase_embedded,
                self.activity_embedded, self.freshness_embedded,
                self.hour_embedded, self.ad_img_eb_sum, self.user_api_eb_sum
            ], -1)

        self.build_fcn_net(
            inp,
            use_dice=True,
        )
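Example #14 above adds the output of a `positional_encoding` helper to the token embeddings before the self-attention blocks. The helper itself is not shown; the sketch below is a standard sinusoidal positional encoding of shape (N, T, num_units), which is what such a helper typically returns. The exact behaviour of the original (its zero_pad and scale options) is an assumption and is not reproduced here.

import numpy as np
import tensorflow as tf

def sinusoidal_positional_encoding(batch_size, seq_len, num_units):
    # Standard "Attention Is All You Need" sinusoid table, tiled over the batch.
    pos = np.arange(seq_len)[:, None]                           # (T, 1)
    i = np.arange(num_units)[None, :]                           # (1, num_units)
    angle = pos / np.power(10000.0, (2 * (i // 2)) / float(num_units))
    table = np.zeros((seq_len, num_units), dtype=np.float32)
    table[:, 0::2] = np.sin(angle[:, 0::2])
    table[:, 1::2] = np.cos(angle[:, 1::2])
    enc = tf.convert_to_tensor(table)                           # (T, num_units)
    return tf.tile(tf.expand_dims(enc, 0), [batch_size, 1, 1])  # (N, T, num_units)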
Example #15
0
class Model(object):
	def __init__(self, mode):
		self.models = []
		self.inputs_transcript = []
		self.inputs_reference = []
		self.inputs_reference_lengths = []
		self.inputs_speaker = []
		self.inputs_decoder = []
		self.labels = []

		self.memory = []
		self.mel_hat = []
		self.alignments = []
		self.mag_hat = []
		self.wavform = []

		for gpu_id in range(Hp.num_gpus):
			with tf.device('/gpu:%d' % gpu_id):
				with tf.name_scope('tower_%d' % gpu_id) as scope:
					with tf.variable_scope('cpu_variables', reuse=gpu_id > 0):
						self.inputs_transcript.append(tf.placeholder(tf.int32, shape=[None,Hp.num_charac], name = "inputs_transcript") )#text [batch, Tx]
						
						self.inputs_reference.append(tf.placeholder(tf.float32,  #ref audio mel spectrogram [batch, Ty(?)//r, n_mels*r]
							shape=[None, None, Hp.num_mels * Hp.reduction_factor], name = "inputs_reference"))
						
						self.inputs_reference_lengths.append(tf.placeholder(tf.float32, shape=[None,1], name = "inputs_reference_lengths"))
						
						self.inputs_speaker.append(tf.placeholder(tf.int32, shape=[None,1], name = "inputs_speaker")) #speaker id [batch, 1]
						
						self.inputs_decoder.append(tf.placeholder(tf.float32,  # decoder mel spectrogram [batch, Ty//r, n_mels*r]
							shape=[None, None, Hp.num_mels * Hp.reduction_factor], name = "inputs_decoder") )
						
						self.labels.append(tf.placeholder(tf.float32, shape=[None, None, Hp.num_fft//2+1])) #magnitude
					
						training = True if mode == "train" else False

						# Encoder
						# transcript encoder
						text = modules.transcript_encoder(
								inputs = self.inputs_transcript[gpu_id], 
								embed_size = Hp.charac_embed_size, 
								K = Hp.num_encoder_banks, 
								highway_layers = Hp.num_enc_highway_layers, 
								training = training)  # outputs: [Batch_size, Text length, 256]

						text =tf.identity(text, name = "text_enc")

						# reference encoder
						if mode == "train":
							batch_size = Hp.train_batch_size
						elif mode == "eval":
							batch_size = Hp.eval_batch_size
						else:
							batch_size = Hp.synthes_batch_size 

						inputs_reference = tf.reshape(self.inputs_reference[gpu_id], [batch_size, -1, Hp.num_mels])
						# expand the dims inputs_reference [batch, Ty, n_mels] from 3 to 4 for conv2d [batch,Ty, n_mels, 1]
						inputs_reference = tf.expand_dims(inputs_reference, -1)
						
						prosody = modules.reference_encoder(inputs = inputs_reference, training = training) #[batch, 128]
						prosody = tf.expand_dims(prosody,1)  #[batch, 1 ,128]

						#[batch, Tx, 128] replicate prosody for all Tx steps
						prosody = tf.tile(prosody, [1, Hp.num_charac, 1], name = "prosody_enc") 
						

						# speaker
						speaker = modules.embedding(
							inputs = self.inputs_speaker[gpu_id], 
							charac_size = Hp.num_speakers, 
							embed_size = Hp.speaker_embed_size) # [batch, speaker_embed_size]
						speaker = tf.expand_dims(speaker, 1)
						speaker = tf.tile(speaker, [1, Hp.num_charac, 1], name = "speaker_embed")

						memory = tf.concat([text, prosody, speaker], axis = -1, name = "memory")  # [batch, Tx, Dt+Ds+Dp ]
						self.memory.append(memory)

						# Spectrogram Decoder
						# prepend a zero frame and drop the last frame of the original mel spectrogram, since the last frame is never fed to the decoder
						inputs_decoder = tf.concat((tf.zeros_like(self.inputs_decoder[gpu_id][:,:1,:]), self.inputs_decoder[gpu_id][:,:-1,:]), 1) #[batch, Ty/r, num_mels*r]

						mel_hat, alignments = attention_gru_decoder(
							inputs = inputs_decoder, 
							inputs_lengths = self.inputs_reference_lengths[gpu_id],
							memory = memory, 
							attention_rnn_nodes = Hp.num_attention_nodes, 
							decoder_rnn_nodes = Hp.num_decoder_nodes, 
							num_mels = Hp.num_mels, 
							reduction_factor = Hp.reduction_factor,
							max_iters = Hp.max_iters,
							training = training)  #[batch, Ty/r, num_mels*r]

						alignments = tf.identity(alignments, name = "alignments")
						mel_hat =tf.identity(mel_hat, name = "melspectrogrom_pred")

						mag_hat = modules.cbhg_postprocessing(
							inputs = mel_hat, 
							num_mels = Hp.num_mels,
							num_fft = Hp.num_fft,
							K = Hp.num_post_banks,
							highway_layers = Hp.num_post_highway_layers,
							training = training)  # [batch, Ty, 1+n_fft//2]
						
						mag_hat =tf.identity(mag_hat, name = "magnitude_pred")
		
						wavform = tf.py_func(signal_process.Spectrogrom2Wav, [mag_hat[0]], tf.float32, name = "wavform")

						self.mel_hat.append(mel_hat)
						self.alignments.append(alignments)
						self.mag_hat.append(mag_hat)
						self.wavform.append(wavform)
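Example #15 builds one tower per GPU and shares all variables across the towers by reusing a single variable scope (reuse=gpu_id > 0). Stripped of the Tacotron-specific modules, that multi-tower pattern can be sketched as below; `build_tower`, the input shape, and NUM_GPUS are placeholders chosen for illustration.

import tensorflow as tf

NUM_GPUS = 2  # illustrative

def build_tower(inputs):
    # per-tower graph; variables created here are shared across all towers
    return tf.layers.dense(inputs, 128, name="shared_dense")

tower_inputs, tower_outputs = [], []
for gpu_id in range(NUM_GPUS):
    with tf.device('/gpu:%d' % gpu_id):
        with tf.name_scope('tower_%d' % gpu_id):
            # first tower creates the variables, later towers reuse them
            with tf.variable_scope('shared_variables', reuse=gpu_id > 0):
                x = tf.placeholder(tf.float32, [None, 64], name='inputs_%d' % gpu_id)
                tower_inputs.append(x)
                tower_outputs.append(build_tower(x))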